diff --git a/.github/actions/test_unit/action.yml b/.github/actions/test_unit/action.yml index adb8fc5719b4..4e32a581f650 100644 --- a/.github/actions/test_unit/action.yml +++ b/.github/actions/test_unit/action.yml @@ -16,7 +16,7 @@ runs: RUST_TEST_THREADS: "8" RUST_LOG: ERROR RUST_MIN_STACK: 104857600 - # RUST_BACKTRACE: full + # RUST_BACKTRACE: 1 - name: Upload failure if: failure() diff --git a/Cargo.lock b/Cargo.lock index 1e56348fa1a8..0eb783a04eee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3332,6 +3332,7 @@ dependencies = [ "dyn-clone", "goldenfile", "log", + "maplit", "parking_lot 0.12.3", "parquet", "rand 0.8.5", @@ -3571,7 +3572,6 @@ dependencies = [ "libm", "match-template", "md-5", - "multiversion", "naive-cityhash", "num-traits", "once_cell", @@ -4271,14 +4271,15 @@ dependencies = [ "databend-common-catalog", "databend-common-exception", "databend-common-expression", + "databend-common-functions", "databend-common-meta-app", "databend-common-pipeline-core", "databend-common-storage", "databend-common-storages-parquet", + "databend-storages-common-pruner", "databend-storages-common-table-meta", "deltalake", "fastrace", - "maplit", "match-template", "object_store_opendal", "parquet", @@ -4386,7 +4387,6 @@ dependencies = [ "async-recursion", "async-trait", "chrono", - "databend-common-arrow", "databend-common-base", "databend-common-catalog", "databend-common-config", @@ -4400,8 +4400,8 @@ dependencies = [ "databend-common-pipeline-sources", "databend-common-sql", "databend-common-storage", - "databend-storages-common-cache", - "databend-storages-common-index", + "databend-common-storages-parquet", + "databend-storages-common-pruner", "databend-storages-common-table-meta", "fastrace", "faststr", @@ -4409,6 +4409,7 @@ dependencies = [ "hive_metastore", "log", "opendal 0.49.2", + "parquet", "recursive", "serde", "typetag", @@ -4551,6 +4552,7 @@ dependencies = [ "databend-common-settings", "databend-common-sql", "databend-common-storage", + "databend-storages-common-cache", "databend-storages-common-pruner", "databend-storages-common-stage", "databend-storages-common-table-meta", @@ -4650,7 +4652,6 @@ dependencies = [ "serde", "serde_json", "typetag", - "uuid", ] [[package]] @@ -5391,6 +5392,7 @@ dependencies = [ "hex", "log", "parking_lot 0.12.3", + "parquet", "rayon", "rustix 0.38.37", "siphasher", @@ -10176,28 +10178,6 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" -[[package]] -name = "multiversion" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4851161a11d3ad0bf9402d90ffc3967bf231768bfd7aeb61755ad06dbf1a142" -dependencies = [ - "multiversion-macros", - "target-features", -] - -[[package]] -name = "multiversion-macros" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a74ddee9e0c27d2578323c13905793e91622148f138ba29738f9dddb835e90" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", - "target-features", -] - [[package]] name = "mur3" version = "0.1.0" @@ -14922,12 +14902,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" -[[package]] -name = "target-features" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1bbb9f3c5c463a01705937a24fdabc5047929ac764b2d5b9cf681c1f5041ed5" - [[package]] name 
= "target-lexicon" version = "0.12.16" diff --git a/Cargo.toml b/Cargo.toml index d3d9aad56f3f..c8273d4b01bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -351,18 +351,26 @@ useless_format = "allow" mutable_key_type = "allow" result_large_err = "allow" +## DONT'T DELETE THIS: If we want best performance, we should use this profile but it will take longer time to compile. +## Test SQL: +## select sum(number) from numbers_mt(10000000000); ~ 3x performance +## select max(number) from numbers_mt(10000000000); ~ 3x performance +# [profile.release] +# debug = 1 +# lto = "thin" +# overflow-checks = false +# incremental = false +# codegen-units = 1 + [profile.release] debug = 1 lto = "thin" overflow-checks = false +opt-level = "s" ## defaults to be 3 incremental = false -opt-level = "s" - -# codegen-units = 1 # Reduce number of codegen units to increase optimizations. # [profile.release.package] -# arrow2 = { codegen-units = 4 } -# common-functions = { codegen-units = 16 } +# databend-common-arrow = { codegen-units = 16 } # databend-query = { codegen-units = 4 } # databend-binaries = { codegen-units = 4 } diff --git a/src/common/base/src/base/dma.rs b/src/common/base/src/base/dma.rs index 6f92a2043a62..7aeda6307fdd 100644 --- a/src/common/base/src/base/dma.rs +++ b/src/common/base/src/base/dma.rs @@ -182,7 +182,8 @@ struct DmaFile { } impl DmaFile { - async fn open_raw(path: impl AsRef, dio: bool) -> io::Result { + async fn open_raw(path: impl AsRef, #[allow(unused)] dio: bool) -> io::Result { + #[allow(unused_mut)] let mut flags = 0; #[cfg(target_os = "linux")] if dio { @@ -196,7 +197,8 @@ impl DmaFile { .await } - async fn create_raw(path: impl AsRef, dio: bool) -> io::Result { + async fn create_raw(path: impl AsRef, #[allow(unused)] dio: bool) -> io::Result { + #[allow(unused_mut)] let mut flags = OFlags::EXCL; #[cfg(target_os = "linux")] if dio { diff --git a/src/meta/types/src/cluster.rs b/src/meta/types/src/cluster.rs index 3f2b1c5f3e5d..f33d5642304e 100644 --- a/src/meta/types/src/cluster.rs +++ b/src/meta/types/src/cluster.rs @@ -78,6 +78,7 @@ pub struct NodeInfo { pub secret: String, pub cpu_nums: u64, pub version: u32, + pub http_address: String, pub flight_address: String, pub discovery_address: String, pub binary_version: String, @@ -88,6 +89,7 @@ impl NodeInfo { id: String, secret: String, cpu_nums: u64, + http_address: String, flight_address: String, discovery_address: String, binary_version: String, @@ -97,6 +99,7 @@ impl NodeInfo { secret, cpu_nums, version: 0, + http_address, flight_address, discovery_address, binary_version, diff --git a/src/meta/types/tests/it/cluster.rs b/src/meta/types/tests/it/cluster.rs index b9f13bf4d268..c168c897dd67 100644 --- a/src/meta/types/tests/it/cluster.rs +++ b/src/meta/types/tests/it/cluster.rs @@ -21,6 +21,7 @@ fn test_node_info_ip_port() -> anyhow::Result<()> { secret: "".to_string(), cpu_nums: 1, version: 1, + http_address: "7.8.9.10:987".to_string(), flight_address: "1.2.3.4:123".to_string(), discovery_address: "4.5.6.7:456".to_string(), binary_version: "v0.8-binary-version".to_string(), diff --git a/src/query/ast/src/ast/format/syntax/dml.rs b/src/query/ast/src/ast/format/syntax/dml.rs index 702a558f4b7a..fc270b5e43b0 100644 --- a/src/query/ast/src/ast/format/syntax/dml.rs +++ b/src/query/ast/src/ast/format/syntax/dml.rs @@ -236,7 +236,7 @@ pub(crate) fn pretty_copy_into_location(copy_stmt: CopyIntoLocationStmt) -> RcDo .append( RcDoc::line() .append(RcDoc::text("SINGLE = ")) - .append(RcDoc::text(copy_stmt.single.to_string())), + 
.append(RcDoc::text(copy_stmt.options.single.to_string())), ) } diff --git a/src/query/ast/src/ast/query.rs b/src/query/ast/src/ast/query.rs index 1d0ca62a35f8..e7448216dfec 100644 --- a/src/query/ast/src/ast/query.rs +++ b/src/query/ast/src/ast/query.rs @@ -29,6 +29,8 @@ use crate::ast::Identifier; use crate::ast::Lambda; use crate::ast::SelectStageOptions; use crate::ast::WindowDefinition; +use crate::ParseError; +use crate::Result; use crate::Span; /// Root node of a query tree @@ -623,56 +625,82 @@ impl Display for TemporalClause { } } -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, Drive, DriveMut)] -pub enum SampleLevel { - ROW, - BLOCK, -} - #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut)] -pub enum SampleConfig { - Probability(f64), +pub enum SampleRowLevel { RowsNum(f64), + Probability(f64), } -impl Eq for SampleConfig {} - -#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq, Drive, DriveMut)] -pub struct Sample { - pub sample_level: SampleLevel, - pub sample_conf: SampleConfig, -} - -impl Sample { - pub fn sample_probability(&self, stats_rows: Option) -> Option { - let rand = match &self.sample_conf { - SampleConfig::Probability(probability) => probability / 100.0, - SampleConfig::RowsNum(rows) => { +impl SampleRowLevel { + pub fn sample_probability(&self, stats_rows: Option) -> Result> { + let rand = match &self { + SampleRowLevel::Probability(probability) => probability / 100.0, + SampleRowLevel::RowsNum(rows) => { if let Some(row_num) = stats_rows { if row_num > 0 { rows / row_num as f64 } else { - return None; + return Ok(None); } } else { - return None; + return Ok(None); } } }; - Some(rand) + if rand > 1.0 { + return Err(ParseError( + None, + format!( + "Sample value should be less than or equal to 100, but got {}", + rand * 100.0 + ), + )); + } + Ok(Some(rand)) + } +} + +impl Eq for SampleRowLevel {} + +#[derive( + serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Default, +)] +pub struct SampleConfig { + pub row_level: Option, + pub block_level: Option, +} + +impl SampleConfig { + pub fn set_row_level_sample(&mut self, value: f64, rows: bool) { + if rows { + self.row_level = Some(SampleRowLevel::RowsNum(value)); + } else { + self.row_level = Some(SampleRowLevel::Probability(value)); + } + } + + pub fn set_block_level_sample(&mut self, probability: f64) { + self.block_level = Some(probability); } } -impl Display for Sample { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { +impl Eq for SampleConfig {} + +impl Display for SampleConfig { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { write!(f, "SAMPLE ")?; - match self.sample_level { - SampleLevel::ROW => write!(f, "ROW ")?, - SampleLevel::BLOCK => write!(f, "BLOCK ")?, + if let Some(block_level) = self.block_level { + write!(f, "BLOCK ({}) ", block_level)?; } - match &self.sample_conf { - SampleConfig::Probability(prob) => write!(f, "({})", prob)?, - SampleConfig::RowsNum(rows) => write!(f, "({} ROWS)", rows)?, + if let Some(row_level) = &self.row_level { + match row_level { + SampleRowLevel::RowsNum(rows) => { + write!(f, "ROW ({} ROWS)", rows)?; + } + SampleRowLevel::Probability(probability) => { + write!(f, "ROW ({})", probability)?; + } + } } Ok(()) } @@ -692,7 +720,7 @@ pub enum TableReference { with_options: Option, pivot: Option>, unpivot: Option>, - sample: Option, + sample: Option, }, // `TABLE(expr)[ AS alias ]` TableFunction { @@ -703,7 +731,7 @@ pub enum TableReference 
{ params: Vec, named_params: Vec<(Identifier, Expr)>, alias: Option, - sample: Option, + sample: Option, }, // Derived table, which can be a subquery or joined tables or combination of them Subquery { diff --git a/src/query/ast/src/ast/statements/copy.rs b/src/query/ast/src/ast/statements/copy.rs index 4c1629457043..88d483037ca0 100644 --- a/src/query/ast/src/ast/statements/copy.rs +++ b/src/query/ast/src/ast/statements/copy.rs @@ -143,6 +143,29 @@ impl Display for CopyIntoTableStmt { } } +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Drive, DriveMut, Eq)] +pub struct CopyIntoLocationOptions { + pub single: bool, + pub max_file_size: usize, + pub detailed_output: bool, + pub use_raw_path: bool, + pub include_query_id: bool, + pub overwrite: bool, +} + +impl Default for CopyIntoLocationOptions { + fn default() -> Self { + Self { + single: Default::default(), + max_file_size: Default::default(), + detailed_output: false, + use_raw_path: false, + include_query_id: true, + overwrite: false, + } + } +} + /// CopyIntoLocationStmt is the parsed statement of `COPY into from ...` #[derive(Debug, Clone, PartialEq, Drive, DriveMut)] pub struct CopyIntoLocationStmt { @@ -151,9 +174,7 @@ pub struct CopyIntoLocationStmt { pub src: CopyIntoLocationSource, pub dst: FileLocation, pub file_format: FileFormatOptions, - pub single: bool, - pub max_file_size: usize, - pub detailed_output: bool, + pub options: CopyIntoLocationOptions, } impl Display for CopyIntoLocationStmt { @@ -171,9 +192,12 @@ impl Display for CopyIntoLocationStmt { if !self.file_format.is_empty() { write!(f, " FILE_FORMAT = ({})", self.file_format)?; } - write!(f, " SINGLE = {}", self.single)?; - write!(f, " MAX_FILE_SIZE = {}", self.max_file_size)?; - write!(f, " DETAILED_OUTPUT = {}", self.detailed_output)?; + write!(f, " SINGLE = {}", self.options.single)?; + write!(f, " MAX_FILE_SIZE = {}", self.options.max_file_size)?; + write!(f, " DETAILED_OUTPUT = {}", self.options.detailed_output)?; + write!(f, " INCLUDE_QUERY_ID = {}", self.options.include_query_id)?; + write!(f, " USE_RAW_PATH = {}", self.options.use_raw_path)?; + write!(f, " OVERWRITE = {}", self.options.overwrite)?; Ok(()) } @@ -183,9 +207,12 @@ impl CopyIntoLocationStmt { pub fn apply_option(&mut self, opt: CopyIntoLocationOption) { match opt { CopyIntoLocationOption::FileFormat(v) => self.file_format = v, - CopyIntoLocationOption::Single(v) => self.single = v, - CopyIntoLocationOption::MaxFileSize(v) => self.max_file_size = v, - CopyIntoLocationOption::DetailedOutput(v) => self.detailed_output = v, + CopyIntoLocationOption::Single(v) => self.options.single = v, + CopyIntoLocationOption::MaxFileSize(v) => self.options.max_file_size = v, + CopyIntoLocationOption::DetailedOutput(v) => self.options.detailed_output = v, + CopyIntoLocationOption::IncludeQueryID(v) => self.options.include_query_id = v, + CopyIntoLocationOption::UseRawPath(v) => self.options.use_raw_path = v, + CopyIntoLocationOption::OverWrite(v) => self.options.overwrite = v, } } } @@ -482,7 +509,10 @@ pub enum CopyIntoLocationOption { FileFormat(FileFormatOptions), MaxFileSize(usize), Single(bool), + IncludeQueryID(bool), + UseRawPath(bool), DetailedOutput(bool), + OverWrite(bool), } #[derive(Clone, Debug, PartialEq, Eq, Default, Drive, DriveMut)] diff --git a/src/query/ast/src/ast/statements/dictionary.rs b/src/query/ast/src/ast/statements/dictionary.rs index 88508fb75c19..2ddf086c7e11 100644 --- a/src/query/ast/src/ast/statements/dictionary.rs +++ 
b/src/query/ast/src/ast/statements/dictionary.rs @@ -19,6 +19,7 @@ use std::fmt::Formatter; use derive_visitor::Drive; use derive_visitor::DriveMut; +use super::ShowLimit; use crate::ast::write_comma_separated_list; use crate::ast::write_dot_separated_list; use crate::ast::write_space_separated_string_map; @@ -123,3 +124,22 @@ impl Display for ShowCreateDictionaryStmt { ) } } + +#[derive(Debug, Clone, PartialEq, Drive, DriveMut)] +pub struct ShowDictionariesStmt { + pub database: Option, + pub limit: Option, +} + +impl Display for ShowDictionariesStmt { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "SHOW DICTIONARIES")?; + if let Some(database) = &self.database { + write!(f, " FROM {database}")?; + } + if let Some(limit) = &self.limit { + write!(f, " {limit}")?; + } + Ok(()) + } +} diff --git a/src/query/ast/src/ast/statements/mod.rs b/src/query/ast/src/ast/statements/mod.rs index 9eae6298eca8..b6fc9c0cf3ef 100644 --- a/src/query/ast/src/ast/statements/mod.rs +++ b/src/query/ast/src/ast/statements/mod.rs @@ -42,6 +42,7 @@ mod replace; mod script; mod sequence; mod set; +mod settings; mod show; mod stage; mod statement; @@ -85,6 +86,7 @@ pub use replace::*; pub use script::*; pub use sequence::*; pub use set::*; +pub use settings::*; pub use show::*; pub use stage::*; pub use statement::*; diff --git a/src/query/ast/src/ast/statements/set.rs b/src/query/ast/src/ast/statements/set.rs index 489ab0014199..6869dfdbd629 100644 --- a/src/query/ast/src/ast/statements/set.rs +++ b/src/query/ast/src/ast/statements/set.rs @@ -32,4 +32,6 @@ pub enum SetType { pub enum SetValues { Expr(Vec>), Query(Box), + // None means Unset Stmt + None, } diff --git a/src/query/ast/src/ast/statements/settings.rs b/src/query/ast/src/ast/statements/settings.rs new file mode 100644 index 000000000000..95de13eb4bb8 --- /dev/null +++ b/src/query/ast/src/ast/statements/settings.rs @@ -0,0 +1,77 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Display; +use std::fmt::Formatter; + +use derive_visitor::Drive; +use derive_visitor::DriveMut; + +use crate::ast::Identifier; +use crate::ast::SetType; +use crate::ast::SetValues; + +#[derive(Debug, Clone, PartialEq, Drive, DriveMut)] +pub struct Settings { + pub set_type: SetType, + pub identifiers: Vec, + pub values: SetValues, +} + +impl Display for Settings { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.set_type { + SetType::SettingsGlobal => write!(f, "GLOBAL ")?, + SetType::SettingsSession => write!(f, "SESSION ")?, + SetType::Variable => write!(f, "VARIABLE ")?, + } + + if self.identifiers.len() > 1 { + write!(f, "(")?; + } + for (idx, variable) in self.identifiers.iter().enumerate() { + if idx > 0 { + write!(f, ", ")?; + } + write!(f, "{variable}")?; + } + if self.identifiers.len() > 1 { + write!(f, ")")?; + } + + match &self.values { + SetValues::Expr(exprs) => { + write!(f, " = ")?; + if exprs.len() > 1 { + write!(f, "(")?; + } + + for (idx, value) in exprs.iter().enumerate() { + if idx > 0 { + write!(f, ", ")?; + } + write!(f, "{value}")?; + } + if exprs.len() > 1 { + write!(f, ")")?; + } + } + SetValues::Query(query) => { + write!(f, " = {query}")?; + } + SetValues::None => {} + } + Ok(()) + } +} diff --git a/src/query/ast/src/ast/statements/statement.rs b/src/query/ast/src/ast/statements/statement.rs index 0607f1682d76..86c5b965dc96 100644 --- a/src/query/ast/src/ast/statements/statement.rs +++ b/src/query/ast/src/ast/statements/statement.rs @@ -27,8 +27,8 @@ use super::*; use crate::ast::quote::QuotedString; use crate::ast::statements::connection::CreateConnectionStmt; use crate::ast::statements::pipe::CreatePipeStmt; +use crate::ast::statements::settings::Settings; use crate::ast::statements::task::CreateTaskStmt; -use crate::ast::write_comma_separated_list; use crate::ast::CreateOption; use crate::ast::Identifier; use crate::ast::Query; @@ -86,14 +86,11 @@ pub enum Statement { }, SetStmt { - set_type: SetType, - identifiers: Vec, - values: SetValues, + settings: Settings, }, UnSetStmt { - unset_type: SetType, - identifiers: Vec, + settings: Settings, }, ShowVariables { @@ -158,9 +155,7 @@ pub enum Statement { CreateDictionary(CreateDictionaryStmt), DropDictionary(DropDictionaryStmt), ShowCreateDictionary(ShowCreateDictionaryStmt), - ShowDictionaries { - show_options: Option, - }, + ShowDictionaries(ShowDictionariesStmt), // Columns ShowColumns(ShowColumnsStmt), @@ -498,71 +493,8 @@ impl Display for Statement { } write!(f, " '{object_id}'")?; } - Statement::SetStmt { - set_type, - identifiers, - values, - } => { - write!(f, "SET ")?; - match *set_type { - SetType::SettingsGlobal => write!(f, "GLOBAL ")?, - SetType::SettingsSession => {} - SetType::Variable => write!(f, "VARIABLE ")?, - } - - if identifiers.len() > 1 { - write!(f, "(")?; - } - for (idx, variable) in identifiers.iter().enumerate() { - if idx > 0 { - write!(f, ", ")?; - } - write!(f, "{variable}")?; - } - if identifiers.len() > 1 { - write!(f, ")")?; - } - - match values { - SetValues::Expr(exprs) => { - write!(f, " = ")?; - if exprs.len() > 1 { - write!(f, "(")?; - } - - for (idx, value) in exprs.iter().enumerate() { - if idx > 0 { - write!(f, ", ")?; - } - write!(f, "{value}")?; - } - if exprs.len() > 1 { - write!(f, ")")?; - } - } - SetValues::Query(query) => { - write!(f, " = {query}")?; - } - } - } - Statement::UnSetStmt { - unset_type, - identifiers, - } => { - write!(f, "UNSET ")?; - match *unset_type { - SetType::SettingsSession => write!(f, "SESSION ")?, - 
SetType::SettingsGlobal => write!(f, "GLOBAL ")?, - SetType::Variable => write!(f, "VARIABLE ")?, - } - if identifiers.len() == 1 { - write!(f, "{}", identifiers[0])?; - } else { - write!(f, "(")?; - write_comma_separated_list(f, identifiers)?; - write!(f, ")")?; - } - } + Statement::SetStmt { settings } => write!(f, "SET {}", settings)?, + Statement::UnSetStmt { settings } => write!(f, "UNSET {}", settings)?, Statement::SetRole { is_default, role_name, @@ -613,12 +545,7 @@ impl Display for Statement { Statement::CreateDictionary(stmt) => write!(f, "{stmt}")?, Statement::DropDictionary(stmt) => write!(f, "{stmt}")?, Statement::ShowCreateDictionary(stmt) => write!(f, "{stmt}")?, - Statement::ShowDictionaries { show_options } => { - write!(f, "SHOW DICTIONARIES")?; - if let Some(show_options) = show_options { - write!(f, " {show_options}")?; - } - } + Statement::ShowDictionaries(stmt) => write!(f, "{stmt}")?, Statement::CreateView(stmt) => write!(f, "{stmt}")?, Statement::AlterView(stmt) => write!(f, "{stmt}")?, Statement::DropView(stmt) => write!(f, "{stmt}")?, diff --git a/src/query/ast/src/parser/common.rs b/src/query/ast/src/parser/common.rs index 6ffd8fc4039b..aa1b3ce0ac33 100644 --- a/src/query/ast/src/parser/common.rs +++ b/src/query/ast/src/parser/common.rs @@ -244,26 +244,9 @@ pub fn set_type(i: Input) -> IResult { }, |res| match res { Some(token) => match token.kind { - TokenKind::GLOBAL => SetType::SettingsGlobal, - TokenKind::SESSION => SetType::SettingsSession, - TokenKind::VARIABLE => SetType::Variable, - _ => unreachable!(), - }, - None => SetType::SettingsSession, - }, - )(i) -} - -pub fn unset_type(i: Input) -> IResult { - map( - rule! { - (GLOBAL | SESSION | VARIABLE)? - }, - |res| match res { - Some(token) => match token.kind { - TokenKind::GLOBAL => SetType::SettingsGlobal, - TokenKind::SESSION => SetType::SettingsSession, - TokenKind::VARIABLE => SetType::Variable, + GLOBAL => SetType::SettingsGlobal, + SESSION => SetType::SettingsSession, + VARIABLE => SetType::Variable, _ => unreachable!(), }, None => SetType::SettingsSession, diff --git a/src/query/ast/src/parser/copy.rs b/src/query/ast/src/parser/copy.rs index 83770db5be91..e7a601a4eac3 100644 --- a/src/query/ast/src/parser/copy.rs +++ b/src/query/ast/src/parser/copy.rs @@ -110,9 +110,7 @@ fn copy_into_location(i: Input) -> IResult { src, dst, file_format: Default::default(), - single: Default::default(), - max_file_size: Default::default(), - detailed_output: false, + options: Default::default(), }; for opt in opts { copy_stmt.apply_option(opt); @@ -210,6 +208,18 @@ fn copy_into_location_option(i: Input) -> IResult { rule! { DETAILED_OUTPUT ~ "=" ~ #literal_bool }, |(_, _, detailed_output)| CopyIntoLocationOption::DetailedOutput(detailed_output), ), + map( + rule! { USE_RAW_PATH ~ "=" ~ #literal_bool }, + |(_, _, use_raw_path)| CopyIntoLocationOption::UseRawPath(use_raw_path), + ), + map( + rule! { INCLUDE_QUERY_ID ~ "=" ~ #literal_bool }, + |(_, _, include_query_id)| CopyIntoLocationOption::IncludeQueryID(include_query_id), + ), + map( + rule! { OVERWRITE ~ "=" ~ #literal_bool }, + |(_, _, overwrite)| CopyIntoLocationOption::OverWrite(overwrite), + ), map(rule!
{ #file_format_clause }, |options| { CopyIntoLocationOption::FileFormat(options) }), diff --git a/src/query/ast/src/parser/query.rs b/src/query/ast/src/parser/query.rs index d59bdd2ecd0d..7f2b170edf80 100644 --- a/src/query/ast/src/parser/query.rs +++ b/src/query/ast/src/parser/query.rs @@ -702,7 +702,7 @@ pub enum TableReferenceElement { with_options: Option, pivot: Option>, unpivot: Option>, - sample: Option, + sample: Option, }, // `TABLE(expr)[ AS alias ]` TableFunction { @@ -711,7 +711,7 @@ pub enum TableReferenceElement { name: Identifier, params: Vec, alias: Option, - sample: Option, + sample: Option, }, // Derived table, which can be a subquery or joined tables or combination of them Subquery { @@ -760,7 +760,7 @@ pub fn table_reference_element(i: Input) -> IResult IResult IResult IResult, - level: Option<&Token>, - sample_conf: Option<(&Token, Expr, Option<&Token>, &Token)>, -) -> Option { - let mut table_sample = None; + block_level_sample: Option<(&Token, &Token, Expr, &Token)>, + row_level_sample: Option<(&Token, &Token, Expr, Option<&Token>, &Token)>, +) -> Option { + let mut default_sample_conf = SampleConfig::default(); if sample.is_some() { - let sample_level = match level { - // If the sample level is not specified, it defaults to ROW - Some(level) => match level.kind { - ROW => SampleLevel::ROW, - BLOCK => SampleLevel::BLOCK, - _ => unreachable!(), - }, - None => SampleLevel::ROW, - }; - let mut default_sample_conf = SampleConfig::Probability(100.0); - if let Some((_, Expr::Literal { value, .. }, rows, _)) = sample_conf { - default_sample_conf = if rows.is_some() { - SampleConfig::RowsNum(value.as_double().unwrap_or_default()) - } else { - SampleConfig::Probability(value.as_double().unwrap_or_default()) - }; + if let Some((_, _, Expr::Literal { value, .. }, _)) = block_level_sample { + default_sample_conf.set_block_level_sample(value.as_double().unwrap_or_default()); + } + if let Some((_, _, Expr::Literal { value, .. }, rows, _)) = row_level_sample { + default_sample_conf + .set_row_level_sample(value.as_double().unwrap_or_default(), rows.is_some()); } - table_sample = Some(Sample { - sample_level, - sample_conf: default_sample_conf, - }) - }; - table_sample + return Some(default_sample_conf); + } + None } struct TableReferenceParser; diff --git a/src/query/ast/src/parser/statement.rs b/src/query/ast/src/parser/statement.rs index f2e50528b5e9..683d9f46df7e 100644 --- a/src/query/ast/src/parser/statement.rs +++ b/src/query/ast/src/parser/statement.rs @@ -351,11 +351,14 @@ pub fn statement_body(i: Input) -> IResult { let unset_stmt = map( rule! 
{ - UNSET ~ #unset_type ~ #unset_source + UNSET ~ #set_type ~ #unset_source }, |(_, unset_type, identifiers)| Statement::UnSetStmt { - unset_type, - identifiers, + settings: Settings { + set_type: unset_type, + identifiers, + values: SetValues::None, + }, }, ); @@ -389,9 +392,11 @@ pub fn statement_body(i: Input) -> IResult { SET ~ #set_type ~ #ident ~ "=" ~ #subexpr(0) }, |(_, set_type, var, _, value)| Statement::SetStmt { - set_type, - identifiers: vec![var], - values: SetValues::Expr(vec![Box::new(value)]), + settings: Settings { + set_type, + identifiers: vec![var], + values: SetValues::Expr(vec![Box::new(value)]), + }, }, ), map_res( @@ -402,9 +407,11 @@ pub fn statement_body(i: Input) -> IResult { |(_, set_type, _, ids, _, _, _, values, _)| { if ids.len() == values.len() { Ok(Statement::SetStmt { - set_type, - identifiers: ids, - values: SetValues::Expr(values.into_iter().map(|x| x.into()).collect()), + settings: Settings { + set_type, + identifiers: ids, + values: SetValues::Expr(values.into_iter().map(|x| x.into()).collect()), + }, }) } else { Err(nom::Err::Failure(ErrorKind::Other( @@ -418,9 +425,11 @@ pub fn statement_body(i: Input) -> IResult { SET ~ #set_type ~ #ident ~ "=" ~ #query }, |(_, set_type, var, _, query)| Statement::SetStmt { - set_type, - identifiers: vec![var], - values: SetValues::Query(Box::new(query)), + settings: Settings { + set_type, + identifiers: vec![var], + values: SetValues::Query(Box::new(query)), + }, }, ), map( @@ -428,9 +437,11 @@ pub fn statement_body(i: Input) -> IResult { SET ~ #set_type ~ "(" ~ #comma_separated_list0(ident) ~ ")" ~ "=" ~ #query }, |(_, set_type, _, vars, _, _, query)| Statement::SetStmt { - set_type, - identifiers: vars, - values: SetValues::Query(Box::new(query)), + settings: Settings { + set_type, + identifiers: vars, + values: SetValues::Query(Box::new(query)), + }, }, ), )); @@ -957,9 +968,15 @@ pub fn statement_body(i: Input) -> IResult { ); let show_dictionaries = map( rule! { - SHOW ~ DICTIONARIES ~ #show_options? + SHOW ~ DICTIONARIES ~ ((FROM|IN) ~ #ident)? ~ #show_limit? + }, + |(_, _, db, limit)| { + let database = match db { + Some((_, d)) => Some(d), + _ => None, + }; + Statement::ShowDictionaries(ShowDictionariesStmt { database, limit }) }, - |(_, _, show_options)| Statement::ShowDictionaries { show_options }, ); let show_create_dictionary = map( rule! 
{ diff --git a/src/query/ast/src/parser/token.rs b/src/query/ast/src/parser/token.rs index 33b085c82178..70641dc974db 100644 --- a/src/query/ast/src/parser/token.rs +++ b/src/query/ast/src/parser/token.rs @@ -683,6 +683,8 @@ pub enum TokenKind { IF, #[token("IN", ignore(ascii_case))] IN, + #[token("INCLUDE_QUERY_ID", ignore(ascii_case))] + INCLUDE_QUERY_ID, #[token("INCREMENTAL", ignore(ascii_case))] INCREMENTAL, #[token("INDEX", ignore(ascii_case))] @@ -1064,6 +1066,8 @@ pub enum TokenKind { SYNTAX, #[token("USAGE", ignore(ascii_case))] USAGE, + #[token("USE_RAW_PATH", ignore(ascii_case))] + USE_RAW_PATH, #[token("UPDATE", ignore(ascii_case))] UPDATE, #[token("UPLOAD", ignore(ascii_case))] diff --git a/src/query/ast/tests/it/parser.rs b/src/query/ast/tests/it/parser.rs index 3b5f00a403e0..961c322b92e5 100644 --- a/src/query/ast/tests/it/parser.rs +++ b/src/query/ast/tests/it/parser.rs @@ -224,10 +224,11 @@ fn test_statement() { r#"select * from t sample row (99);"#, r#"select * from t sample block (99);"#, r#"select * from t sample row (10 rows);"#, - r#"select * from t sample block (10 rows);"#, r#"select * from numbers(1000) sample row (99);"#, r#"select * from numbers(1000) sample block (99);"#, r#"select * from numbers(1000) sample row (10 rows);"#, + r#"select * from numbers(1000) sample block (99) row (10 rows);"#, + r#"select * from numbers(1000) sample block (99) row (10);"#, r#"insert into t (c1, c2) values (1, 2), (3, 4);"#, r#"insert into t (c1, c2) values (1, 2);"#, r#"insert into table t select * from t2;"#, diff --git a/src/query/ast/tests/it/testdata/stmt.txt b/src/query/ast/tests/it/testdata/stmt.txt index 49138cf6e33e..9cfaea5263be 100644 --- a/src/query/ast/tests/it/testdata/stmt.txt +++ b/src/query/ast/tests/it/testdata/stmt.txt @@ -9869,11 +9869,13 @@ Query( pivot: None, unpivot: None, sample: Some( - Sample { - sample_level: ROW, - sample_conf: Probability( - 99.0, + SampleConfig { + row_level: Some( + Probability( + 99.0, + ), ), + block_level: None, }, ), }, @@ -9896,7 +9898,7 @@ Query( ---------- Input ---------- select * from t sample block (99); ---------- Output --------- -SELECT * FROM t SAMPLE BLOCK (99) +SELECT * FROM t SAMPLE BLOCK (99) ---------- AST ------------ Query( Query { @@ -9945,9 +9947,9 @@ Query( pivot: None, unpivot: None, sample: Some( - Sample { - sample_level: BLOCK, - sample_conf: Probability( + SampleConfig { + row_level: None, + block_level: Some( 99.0, ), }, @@ -10021,11 +10023,13 @@ Query( pivot: None, unpivot: None, sample: Some( - Sample { - sample_level: ROW, - sample_conf: RowsNum( - 10.0, + SampleConfig { + row_level: Some( + RowsNum( + 10.0, + ), ), + block_level: None, }, ), }, @@ -10046,20 +10050,20 @@ Query( ---------- Input ---------- -select * from t sample block (10 rows); +select * from numbers(1000) sample row (99); ---------- Output --------- -SELECT * FROM t SAMPLE BLOCK (10 ROWS) +SELECT * FROM numbers(1000) SAMPLE ROW (99) ---------- AST ------------ Query( Query { span: Some( - 0..38, + 0..43, ), with: None, body: Select( SelectStmt { span: Some( - 0..38, + 0..43, ), hints: None, distinct: false, @@ -10077,31 +10081,39 @@ Query( }, ], from: [ - Table { + TableFunction { span: Some( - 14..38, + 14..43, ), - catalog: None, - database: None, - table: Identifier { + lateral: false, + name: Identifier { span: Some( - 14..15, + 14..21, ), - name: "t", + name: "numbers", quote: None, ident_type: None, }, + params: [ + Literal { + span: Some( + 22..26, + ), + value: UInt64( + 1000, + ), + }, + ], + named_params: [], alias: 
None, - temporal: None, - with_options: None, - pivot: None, - unpivot: None, sample: Some( - Sample { - sample_level: BLOCK, - sample_conf: RowsNum( - 10.0, + SampleConfig { + row_level: Some( + Probability( + 99.0, + ), ), + block_level: None, }, ), }, @@ -10122,20 +10134,20 @@ Query( ---------- Input ---------- -select * from numbers(1000) sample row (99); +select * from numbers(1000) sample block (99); ---------- Output --------- -SELECT * FROM numbers(1000) SAMPLE ROW (99) +SELECT * FROM numbers(1000) SAMPLE BLOCK (99) ---------- AST ------------ Query( Query { span: Some( - 0..43, + 0..45, ), with: None, body: Select( SelectStmt { span: Some( - 0..43, + 0..45, ), hints: None, distinct: false, @@ -10155,7 +10167,7 @@ Query( from: [ TableFunction { span: Some( - 14..43, + 14..45, ), lateral: false, name: Identifier { @@ -10179,9 +10191,9 @@ Query( named_params: [], alias: None, sample: Some( - Sample { - sample_level: ROW, - sample_conf: Probability( + SampleConfig { + row_level: None, + block_level: Some( 99.0, ), }, @@ -10204,20 +10216,20 @@ Query( ---------- Input ---------- -select * from numbers(1000) sample block (99); +select * from numbers(1000) sample row (10 rows); ---------- Output --------- -SELECT * FROM numbers(1000) SAMPLE BLOCK (99) +SELECT * FROM numbers(1000) SAMPLE ROW (10 ROWS) ---------- AST ------------ Query( Query { span: Some( - 0..45, + 0..48, ), with: None, body: Select( SelectStmt { span: Some( - 0..45, + 0..48, ), hints: None, distinct: false, @@ -10237,7 +10249,91 @@ Query( from: [ TableFunction { span: Some( - 14..45, + 14..48, + ), + lateral: false, + name: Identifier { + span: Some( + 14..21, + ), + name: "numbers", + quote: None, + ident_type: None, + }, + params: [ + Literal { + span: Some( + 22..26, + ), + value: UInt64( + 1000, + ), + }, + ], + named_params: [], + alias: None, + sample: Some( + SampleConfig { + row_level: Some( + RowsNum( + 10.0, + ), + ), + block_level: None, + }, + ), + }, + ], + selection: None, + group_by: None, + having: None, + window_list: None, + qualify: None, + }, + ), + order_by: [], + limit: [], + offset: None, + ignore_result: false, + }, +) + + +---------- Input ---------- +select * from numbers(1000) sample block (99) row (10 rows); +---------- Output --------- +SELECT * FROM numbers(1000) SAMPLE BLOCK (99) ROW (10 ROWS) +---------- AST ------------ +Query( + Query { + span: Some( + 0..59, + ), + with: None, + body: Select( + SelectStmt { + span: Some( + 0..59, + ), + hints: None, + distinct: false, + top_n: None, + select_list: [ + StarColumns { + qualified: [ + Star( + Some( + 7..8, + ), + ), + ], + column_filter: None, + }, + ], + from: [ + TableFunction { + span: Some( + 14..59, ), lateral: false, name: Identifier { @@ -10261,9 +10357,13 @@ Query( named_params: [], alias: None, sample: Some( - Sample { - sample_level: BLOCK, - sample_conf: Probability( + SampleConfig { + row_level: Some( + RowsNum( + 10.0, + ), + ), + block_level: Some( 99.0, ), }, @@ -10286,20 +10386,20 @@ Query( ---------- Input ---------- -select * from numbers(1000) sample row (10 rows); +select * from numbers(1000) sample block (99) row (10); ---------- Output --------- -SELECT * FROM numbers(1000) SAMPLE ROW (10 ROWS) +SELECT * FROM numbers(1000) SAMPLE BLOCK (99) ROW (10) ---------- AST ------------ Query( Query { span: Some( - 0..48, + 0..54, ), with: None, body: Select( SelectStmt { span: Some( - 0..48, + 0..54, ), hints: None, distinct: false, @@ -10319,7 +10419,7 @@ Query( from: [ TableFunction { span: Some( - 14..48, + 14..54, ), 
lateral: false, name: Identifier { @@ -10343,10 +10443,14 @@ Query( named_params: [], alias: None, sample: Some( - Sample { - sample_level: ROW, - sample_conf: RowsNum( - 10.0, + SampleConfig { + row_level: Some( + Probability( + 10.0, + ), + ), + block_level: Some( + 99.0, ), }, ), @@ -14148,7 +14252,7 @@ COPY INTO 's3://mybucket/data.csv' skip_header = 1 ) ---------- Output --------- -COPY INTO 's3://mybucket/data.csv' FROM mytable FILE_FORMAT = (field_delimiter = ',', record_delimiter = '\n', skip_header = 1, type = CSV) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false +COPY INTO 's3://mybucket/data.csv' FROM mytable FILE_FORMAT = (field_delimiter = ',', record_delimiter = '\n', skip_header = 1, type = CSV) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false INCLUDE_QUERY_ID = true USE_RAW_PATH = false OVERWRITE = false ---------- AST ------------ CopyIntoLocation( CopyIntoLocationStmt { @@ -14197,9 +14301,14 @@ CopyIntoLocation( ), }, }, - single: false, - max_file_size: 0, - detailed_output: false, + options: CopyIntoLocationOptions { + single: false, + max_file_size: 0, + detailed_output: false, + use_raw_path: false, + include_query_id: true, + overwrite: false, + }, }, ) @@ -14208,7 +14317,7 @@ CopyIntoLocation( COPY INTO '@my_stage/my data' FROM mytable; ---------- Output --------- -COPY INTO '@my_stage/my data' FROM mytable SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false +COPY INTO '@my_stage/my data' FROM mytable SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false INCLUDE_QUERY_ID = true USE_RAW_PATH = false OVERWRITE = false ---------- AST ------------ CopyIntoLocation( CopyIntoLocationStmt { @@ -14235,9 +14344,14 @@ CopyIntoLocation( file_format: FileFormatOptions { options: {}, }, - single: false, - max_file_size: 0, - detailed_output: false, + options: CopyIntoLocationOptions { + single: false, + max_file_size: 0, + detailed_output: false, + use_raw_path: false, + include_query_id: true, + overwrite: false, + }, }, ) @@ -14252,7 +14366,7 @@ COPY INTO @my_stage skip_header = 1 ); ---------- Output --------- -COPY INTO '@my_stage' FROM mytable FILE_FORMAT = (field_delimiter = ',', record_delimiter = '\n', skip_header = 1, type = CSV) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false +COPY INTO '@my_stage' FROM mytable FILE_FORMAT = (field_delimiter = ',', record_delimiter = '\n', skip_header = 1, type = CSV) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false INCLUDE_QUERY_ID = true USE_RAW_PATH = false OVERWRITE = false ---------- AST ------------ CopyIntoLocation( CopyIntoLocationStmt { @@ -14292,9 +14406,14 @@ CopyIntoLocation( ), }, }, - single: false, - max_file_size: 0, - detailed_output: false, + options: CopyIntoLocationOptions { + single: false, + max_file_size: 0, + detailed_output: false, + use_raw_path: false, + include_query_id: true, + overwrite: false, + }, }, ) @@ -15922,78 +16041,82 @@ Query( ---------- Input ---------- SET max_threads = 10; ---------- Output --------- -SET max_threads = 10 +SET SESSION max_threads = 10 ---------- AST ------------ SetStmt { - set_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 4..15, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - ], - values: Expr( - [ - Literal { + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { span: Some( - 18..20, - ), - value: UInt64( - 10, + 4..15, ), + name: "max_threads", + quote: None, + ident_type: None, }, ], - ), + values: Expr( + [ + Literal { + span: Some( + 18..20, + ), + value: 
UInt64( + 10, + ), + }, + ], + ), + }, } ---------- Input ---------- SET max_threads = 10*2; ---------- Output --------- -SET max_threads = 10 * 2 +SET SESSION max_threads = 10 * 2 ---------- AST ------------ SetStmt { - set_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 4..15, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - ], - values: Expr( - [ - BinaryOp { + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { span: Some( - 20..21, + 4..15, ), - op: Multiply, - left: Literal { - span: Some( - 18..20, - ), - value: UInt64( - 10, - ), - }, - right: Literal { + name: "max_threads", + quote: None, + ident_type: None, + }, + ], + values: Expr( + [ + BinaryOp { span: Some( - 21..22, - ), - value: UInt64( - 2, + 20..21, ), + op: Multiply, + left: Literal { + span: Some( + 18..20, + ), + value: UInt64( + 10, + ), + }, + right: Literal { + span: Some( + 21..22, + ), + value: UInt64( + 2, + ), + }, }, - }, - ], - ), + ], + ), + }, } @@ -16003,73 +16126,75 @@ SET global (max_threads, max_memory_usage) = (10*2, 10*4); SET GLOBAL (max_threads, max_memory_usage) = (10 * 2, 10 * 4) ---------- AST ------------ SetStmt { - set_type: SettingsGlobal, - identifiers: [ - Identifier { - span: Some( - 12..23, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - Identifier { - span: Some( - 25..41, - ), - name: "max_memory_usage", - quote: None, - ident_type: None, - }, - ], - values: Expr( - [ - BinaryOp { + settings: Settings { + set_type: SettingsGlobal, + identifiers: [ + Identifier { span: Some( - 48..49, + 12..23, ), - op: Multiply, - left: Literal { - span: Some( - 46..48, - ), - value: UInt64( - 10, - ), - }, - right: Literal { - span: Some( - 49..50, - ), - value: UInt64( - 2, - ), - }, + name: "max_threads", + quote: None, + ident_type: None, }, - BinaryOp { + Identifier { span: Some( - 54..55, + 25..41, ), - op: Multiply, - left: Literal { + name: "max_memory_usage", + quote: None, + ident_type: None, + }, + ], + values: Expr( + [ + BinaryOp { span: Some( - 52..54, - ), - value: UInt64( - 10, + 48..49, ), - }, - right: Literal { - span: Some( - 55..56, - ), - value: UInt64( - 4, + op: Multiply, + left: Literal { + span: Some( + 46..48, + ), + value: UInt64( + 10, + ), + }, + right: Literal { + span: Some( + 49..50, + ), + value: UInt64( + 2, + ), + }, + }, + BinaryOp { + span: Some( + 54..55, ), + op: Multiply, + left: Literal { + span: Some( + 52..54, + ), + value: UInt64( + 10, + ), + }, + right: Literal { + span: Some( + 55..56, + ), + value: UInt64( + 4, + ), + }, }, - }, - ], - ), + ], + ), + }, } @@ -16079,17 +16204,20 @@ UNSET max_threads; UNSET SESSION max_threads ---------- AST ------------ UnSetStmt { - unset_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 6..17, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - ], + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { + span: Some( + 6..17, + ), + name: "max_threads", + quote: None, + ident_type: None, + }, + ], + values: None, + }, } @@ -16099,17 +16227,20 @@ UNSET session max_threads; UNSET SESSION max_threads ---------- AST ------------ UnSetStmt { - unset_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 14..25, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - ], + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { + span: Some( + 14..25, + ), + name: "max_threads", + quote: None, + ident_type: None, 
+ }, + ], + values: None, + }, } @@ -16119,25 +16250,28 @@ UNSET (max_threads, sql_dialect); UNSET SESSION (max_threads, sql_dialect) ---------- AST ------------ UnSetStmt { - unset_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 7..18, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - Identifier { - span: Some( - 20..31, - ), - name: "sql_dialect", - quote: None, - ident_type: None, - }, - ], + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { + span: Some( + 7..18, + ), + name: "max_threads", + quote: None, + ident_type: None, + }, + Identifier { + span: Some( + 20..31, + ), + name: "sql_dialect", + quote: None, + ident_type: None, + }, + ], + values: None, + }, } @@ -16147,25 +16281,28 @@ UNSET session (max_threads, sql_dialect); UNSET SESSION (max_threads, sql_dialect) ---------- AST ------------ UnSetStmt { - unset_type: SettingsSession, - identifiers: [ - Identifier { - span: Some( - 15..26, - ), - name: "max_threads", - quote: None, - ident_type: None, - }, - Identifier { - span: Some( - 28..39, - ), - name: "sql_dialect", - quote: None, - ident_type: None, - }, - ], + settings: Settings { + set_type: SettingsSession, + identifiers: [ + Identifier { + span: Some( + 15..26, + ), + name: "max_threads", + quote: None, + ident_type: None, + }, + Identifier { + span: Some( + 28..39, + ), + name: "sql_dialect", + quote: None, + ident_type: None, + }, + ], + values: None, + }, } @@ -16175,29 +16312,31 @@ SET variable a = 3 SET VARIABLE a = 3 ---------- AST ------------ SetStmt { - set_type: Variable, - identifiers: [ - Identifier { - span: Some( - 13..14, - ), - name: "a", - quote: None, - ident_type: None, - }, - ], - values: Expr( - [ - Literal { + settings: Settings { + set_type: Variable, + identifiers: [ + Identifier { span: Some( - 17..18, - ), - value: UInt64( - 3, + 13..14, ), + name: "a", + quote: None, + ident_type: None, }, ], - ), + values: Expr( + [ + Literal { + span: Some( + 17..18, + ), + value: UInt64( + 3, + ), + }, + ], + ), + }, } @@ -16241,58 +16380,60 @@ SET variable a = select 3 SET VARIABLE a = SELECT 3 ---------- AST ------------ SetStmt { - set_type: Variable, - identifiers: [ - Identifier { - span: Some( - 13..14, - ), - name: "a", - quote: None, - ident_type: None, - }, - ], - values: Query( - Query { - span: Some( - 17..25, - ), - with: None, - body: Select( - SelectStmt { - span: Some( - 17..25, - ), - hints: None, - distinct: false, - top_n: None, - select_list: [ - AliasedExpr { - expr: Literal { - span: Some( - 24..25, - ), - value: UInt64( - 3, - ), + settings: Settings { + set_type: Variable, + identifiers: [ + Identifier { + span: Some( + 13..14, + ), + name: "a", + quote: None, + ident_type: None, + }, + ], + values: Query( + Query { + span: Some( + 17..25, + ), + with: None, + body: Select( + SelectStmt { + span: Some( + 17..25, + ), + hints: None, + distinct: false, + top_n: None, + select_list: [ + AliasedExpr { + expr: Literal { + span: Some( + 24..25, + ), + value: UInt64( + 3, + ), + }, + alias: None, }, - alias: None, - }, - ], - from: [], - selection: None, - group_by: None, - having: None, - window_list: None, - qualify: None, - }, - ), - order_by: [], - limit: [], - offset: None, - ignore_result: false, - }, - ), + ], + from: [], + selection: None, + group_by: None, + having: None, + window_list: None, + qualify: None, + }, + ), + order_by: [], + limit: [], + offset: None, + ignore_result: false, + }, + ), + }, } @@ -16302,126 +16443,128 @@ SET variable a = (select 
max(number) from numbers(10)) SET VARIABLE a = (SELECT max(number) FROM numbers(10)) ---------- AST ------------ SetStmt { - set_type: Variable, - identifiers: [ - Identifier { - span: Some( - 13..14, - ), - name: "a", - quote: None, - ident_type: None, - }, - ], - values: Expr( - [ - Subquery { + settings: Settings { + set_type: Variable, + identifiers: [ + Identifier { span: Some( - 17..54, + 13..14, ), - modifier: None, - subquery: Query { + name: "a", + quote: None, + ident_type: None, + }, + ], + values: Expr( + [ + Subquery { span: Some( - 18..53, + 17..54, ), - with: None, - body: Select( - SelectStmt { - span: Some( - 18..53, - ), - hints: None, - distinct: false, - top_n: None, - select_list: [ - AliasedExpr { - expr: FunctionCall { - span: Some( - 25..36, - ), - func: FunctionCall { - distinct: false, - name: Identifier { - span: Some( - 25..28, - ), - name: "max", - quote: None, - ident_type: None, - }, - args: [ - ColumnRef { + modifier: None, + subquery: Query { + span: Some( + 18..53, + ), + with: None, + body: Select( + SelectStmt { + span: Some( + 18..53, + ), + hints: None, + distinct: false, + top_n: None, + select_list: [ + AliasedExpr { + expr: FunctionCall { + span: Some( + 25..36, + ), + func: FunctionCall { + distinct: false, + name: Identifier { span: Some( - 29..35, + 25..28, ), - column: ColumnRef { - database: None, - table: None, - column: Name( - Identifier { - span: Some( - 29..35, - ), - name: "number", - quote: None, - ident_type: None, - }, + name: "max", + quote: None, + ident_type: None, + }, + args: [ + ColumnRef { + span: Some( + 29..35, ), + column: ColumnRef { + database: None, + table: None, + column: Name( + Identifier { + span: Some( + 29..35, + ), + name: "number", + quote: None, + ident_type: None, + }, + ), + }, }, - }, - ], - params: [], - window: None, - lambda: None, + ], + params: [], + window: None, + lambda: None, + }, }, + alias: None, }, - alias: None, - }, - ], - from: [ - TableFunction { - span: Some( - 42..53, - ), - lateral: false, - name: Identifier { + ], + from: [ + TableFunction { span: Some( - 42..49, + 42..53, ), - name: "numbers", - quote: None, - ident_type: None, - }, - params: [ - Literal { + lateral: false, + name: Identifier { span: Some( - 50..52, - ), - value: UInt64( - 10, + 42..49, ), + name: "numbers", + quote: None, + ident_type: None, }, - ], - named_params: [], - alias: None, - sample: None, - }, - ], - selection: None, - group_by: None, - having: None, - window_list: None, - qualify: None, - }, - ), - order_by: [], - limit: [], - offset: None, - ignore_result: false, + params: [ + Literal { + span: Some( + 50..52, + ), + value: UInt64( + 10, + ), + }, + ], + named_params: [], + alias: None, + sample: None, + }, + ], + selection: None, + group_by: None, + having: None, + window_list: None, + qualify: None, + }, + ), + order_by: [], + limit: [], + offset: None, + ignore_result: false, + }, }, - }, - ], - ), + ], + ), + }, } @@ -19699,7 +19842,7 @@ CreateTask( ---------- Input ---------- CREATE TASK IF NOT EXISTS MyTask1 SCHEDULE = USING CRON '0 13 * * *' AS COPY INTO @my_internal_stage FROM canadian_city_population FILE_FORMAT = (TYPE = PARQUET) ---------- Output --------- -CREATE TASK IF NOT EXISTS MyTask1 SCHEDULE = USING CRON '0 13 * * *' AS COPY INTO '@my_internal_stage' FROM canadian_city_population FILE_FORMAT = (type = PARQUET) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false +CREATE TASK IF NOT EXISTS MyTask1 SCHEDULE = USING CRON '0 13 * * *' AS COPY INTO '@my_internal_stage' FROM 
canadian_city_population FILE_FORMAT = (type = PARQUET) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false INCLUDE_QUERY_ID = true USE_RAW_PATH = false OVERWRITE = false ---------- AST ------------ CreateTask( CreateTaskStmt { @@ -19721,7 +19864,7 @@ CreateTask( after: [], when_condition: None, sql: SingleStatement( - "COPY INTO '@my_internal_stage' FROM canadian_city_population FILE_FORMAT = (type = PARQUET) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false", + "COPY INTO '@my_internal_stage' FROM canadian_city_population FILE_FORMAT = (type = PARQUET) SINGLE = false MAX_FILE_SIZE = 0 DETAILED_OUTPUT = false INCLUDE_QUERY_ID = true USE_RAW_PATH = false OVERWRITE = false", ), }, ) diff --git a/src/query/catalog/Cargo.toml b/src/query/catalog/Cargo.toml index 922568ad019f..4f1e03024541 100644 --- a/src/query/catalog/Cargo.toml +++ b/src/query/catalog/Cargo.toml @@ -47,6 +47,7 @@ xorf = { version = "0.11.0", default-features = false, features = ["binary-fuse" [dev-dependencies] goldenfile = "1.4" +maplit = "1.0.2" [lints] workspace = true diff --git a/src/query/catalog/src/lib.rs b/src/query/catalog/src/lib.rs index 55c54b31dd96..722d088441b6 100644 --- a/src/query/catalog/src/lib.rs +++ b/src/query/catalog/src/lib.rs @@ -20,6 +20,7 @@ pub mod cluster_info; pub mod database; pub mod lock; pub mod merge_into_join; +pub mod partition_columns; pub mod plan; pub mod query_kind; pub mod runtime_filter_info; diff --git a/src/query/storages/delta/src/partition_columns/mod.rs b/src/query/catalog/src/partition_columns/mod.rs similarity index 94% rename from src/query/storages/delta/src/partition_columns/mod.rs rename to src/query/catalog/src/partition_columns/mod.rs index 1478737f43b1..cb7698f75136 100644 --- a/src/query/storages/delta/src/partition_columns/mod.rs +++ b/src/query/catalog/src/partition_columns/mod.rs @@ -16,4 +16,4 @@ mod pushdown_transform; mod values_serde; pub use pushdown_transform::get_pushdown_without_partition_columns; -pub use values_serde::get_partition_values; +pub use values_serde::str_to_scalar; diff --git a/src/query/storages/delta/src/partition_columns/pushdown_transform.rs b/src/query/catalog/src/partition_columns/pushdown_transform.rs similarity index 96% rename from src/query/storages/delta/src/partition_columns/pushdown_transform.rs rename to src/query/catalog/src/partition_columns/pushdown_transform.rs index 489a122f0539..f95db01e446a 100644 --- a/src/query/storages/delta/src/partition_columns/pushdown_transform.rs +++ b/src/query/catalog/src/partition_columns/pushdown_transform.rs @@ -14,12 +14,13 @@ use std::collections::BTreeMap; -use databend_common_catalog::plan::Projection; -use databend_common_catalog::plan::PushDownInfo; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::FieldIndex; +use crate::plan::Projection; +use crate::plan::PushDownInfo; + pub fn get_pushdown_without_partition_columns( mut pushdown: PushDownInfo, partition_columns: &[FieldIndex], @@ -87,10 +88,9 @@ fn shift_projection(prj: Projection, partition_columns: &[FieldIndex]) -> Result #[cfg(test)] mod tests { - use databend_common_catalog::plan::Projection; - use super::shift_projection; use super::shift_projection_index; + use crate::plan::Projection; #[test] fn test_shift_projection_index() { diff --git a/src/query/storages/delta/src/partition_columns/values_serde.rs b/src/query/catalog/src/partition_columns/values_serde.rs similarity index 83% rename from 
src/query/storages/delta/src/partition_columns/values_serde.rs rename to src/query/catalog/src/partition_columns/values_serde.rs index e283b0f4338e..005c48eed91d 100644 --- a/src/query/storages/delta/src/partition_columns/values_serde.rs +++ b/src/query/catalog/src/partition_columns/values_serde.rs @@ -21,8 +21,6 @@ use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberScalar; use databend_common_expression::Scalar; -use databend_common_expression::TableField; -use deltalake::kernel::Add; pub fn str_to_scalar(value: &str, data_type: &DataType) -> Result { if value.is_empty() { @@ -81,20 +79,3 @@ pub fn str_to_scalar(value: &str, data_type: &DataType) -> Result { ))), } } - -pub fn get_partition_values(add: &Add, fields: &[&TableField]) -> Result> { - let mut values = Vec::with_capacity(fields.len()); - for f in fields { - match add.partition_values.get(&f.name) { - Some(Some(v)) => values.push(str_to_scalar(v, &f.data_type().into())?), - Some(None) => values.push(Scalar::Null), - None => { - return Err(ErrorCode::BadArguments(format!( - "partition value for column {} not found", - &f.name - ))); - } - } - } - Ok(values) -} diff --git a/src/query/catalog/src/plan/datasource/datasource_info/stage.rs b/src/query/catalog/src/plan/datasource/datasource_info/stage.rs index c8816a921c55..06d7219925ab 100644 --- a/src/query/catalog/src/plan/datasource/datasource_info/stage.rs +++ b/src/query/catalog/src/plan/datasource/datasource_info/stage.rs @@ -17,6 +17,7 @@ use std::fmt::Display; use std::fmt::Formatter; use std::sync::Arc; +use databend_common_ast::ast::CopyIntoLocationOptions; use databend_common_exception::Result; use databend_common_expression::RemoteExpr; use databend_common_expression::TableSchema; @@ -40,6 +41,7 @@ pub struct StageTableInfo { // - may need to be purged as well (depends on the copy options) pub duplicated_files_detected: Vec, pub is_select: bool, + pub copy_into_location_options: CopyIntoLocationOptions, } impl StageTableInfo { diff --git a/src/query/catalog/src/plan/pushdown.rs b/src/query/catalog/src/plan/pushdown.rs index 2d934983968c..ce03538de283 100644 --- a/src/query/catalog/src/plan/pushdown.rs +++ b/src/query/catalog/src/plan/pushdown.rs @@ -15,7 +15,7 @@ use std::collections::BTreeMap; use std::fmt::Debug; -use databend_common_ast::ast::Sample; +use databend_common_ast::ast::SampleConfig; use databend_common_expression::types::DataType; use databend_common_expression::types::F32; use databend_common_expression::DataSchema; @@ -143,7 +143,7 @@ pub struct PushDownInfo { pub change_type: Option, pub inverted_index: Option, /// Used by table sample - pub sample: Option, + pub sample: Option, } #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] diff --git a/src/query/config/src/config.rs b/src/query/config/src/config.rs index ab1046e115c0..5cedd0c0d59c 100644 --- a/src/query/config/src/config.rs +++ b/src/query/config/src/config.rs @@ -2940,11 +2940,7 @@ pub struct DiskCacheConfig { #[serde(default, deny_unknown_fields)] pub struct SpillConfig { /// Path of spill to local disk. disable if it's empty. 
- #[clap( - long, - value_name = "VALUE", - default_value = "./.databend/temp/_query_spill" - )] + #[clap(long, value_name = "VALUE", default_value = "")] pub spill_local_disk_path: OsString, #[clap(long, value_name = "VALUE", default_value = "30")] diff --git a/src/query/config/src/inner.rs b/src/query/config/src/inner.rs index fb4a32c8afa2..7f0df3db978a 100644 --- a/src/query/config/src/inner.rs +++ b/src/query/config/src/inner.rs @@ -723,7 +723,7 @@ pub struct SpillConfig { impl Default for SpillConfig { fn default() -> Self { Self { - path: OsString::from("./.databend/temp/_query_spill"), + path: OsString::from(""), reserved_disk_ratio: OrderedFloat(0.3), global_bytes_limit: u64::MAX, } diff --git a/src/query/expression/src/aggregate/aggregate_hashtable.rs b/src/query/expression/src/aggregate/aggregate_hashtable.rs index 00c48e83293e..07d403518e0b 100644 --- a/src/query/expression/src/aggregate/aggregate_hashtable.rs +++ b/src/query/expression/src/aggregate/aggregate_hashtable.rs @@ -49,6 +49,7 @@ pub struct AggregateHashTable { // use for append rows directly during deserialize pub direct_append: bool, pub config: HashTableConfig, + current_radix_bits: u64, entries: Vec, count: usize, @@ -585,6 +586,7 @@ impl AggregateHashTable { .iter() .map(|arena| arena.allocated_bytes()) .sum::() + + self.entries.len() * std::mem::size_of::() } } diff --git a/src/query/expression/src/block.rs b/src/query/expression/src/block.rs index 820d5841d4a3..866f976644f3 100644 --- a/src/query/expression/src/block.rs +++ b/src/query/expression/src/block.rs @@ -240,6 +240,18 @@ impl DataBlock { self.columns().iter().map(|entry| entry.memory_size()).sum() } + pub fn consume_convert_to_full(self) -> Self { + if self + .columns() + .iter() + .all(|entry| entry.value.as_column().is_some()) + { + return self; + } + + self.convert_to_full() + } + pub fn convert_to_full(&self) -> Self { let columns = self .columns() diff --git a/src/query/expression/src/converts/arrow/to.rs b/src/query/expression/src/converts/arrow/to.rs index 98379b1cd1f0..391d2893f596 100644 --- a/src/query/expression/src/converts/arrow/to.rs +++ b/src/query/expression/src/converts/arrow/to.rs @@ -101,7 +101,7 @@ impl DataBlock { let arrow_schema = table_schema_to_arrow_schema(table_schema); let mut arrays = Vec::with_capacity(self.columns().len()); for (entry, arrow_field) in self - .convert_to_full() + .consume_convert_to_full() .columns() .iter() .zip(arrow_schema.fields()) diff --git a/src/query/expression/tests/it/sort.rs b/src/query/expression/tests/it/sort.rs index 9c72d7d6ab28..1d1dbf626844 100644 --- a/src/query/expression/tests/it/sort.rs +++ b/src/query/expression/tests/it/sort.rs @@ -24,6 +24,7 @@ use databend_common_expression::FromData; use databend_common_expression::SortColumnDescription; use crate::common::new_block; +use crate::rand_block_for_all_types; #[test] fn test_block_sort() -> Result<()> { @@ -201,3 +202,52 @@ fn test_block_sort() -> Result<()> { Ok(()) } + +#[test] +fn sort_concat() { + // Sort(Sort A || Sort B) = Sort (A || B) + use databend_common_expression::DataBlock; + use itertools::Itertools; + use rand::seq::SliceRandom; + use rand::Rng; + + let mut rng = rand::thread_rng(); + let num_blocks = 100; + + for _i in 0..num_blocks { + let block_a = rand_block_for_all_types(rng.gen_range(0..100)); + let block_b = rand_block_for_all_types(rng.gen_range(0..100)); + + let mut sort_index: Vec = (0..block_a.num_columns()).collect(); + sort_index.shuffle(&mut rng); + + let sort_desc = sort_index + .iter() + .map(|i| 
SortColumnDescription { + offset: *i, + asc: rng.gen_bool(0.5), + nulls_first: rng.gen_bool(0.5), + is_nullable: rng.gen_bool(0.5), + }) + .collect_vec(); + + let concat_ab_0 = DataBlock::concat(&[block_a.clone(), block_b.clone()]).unwrap(); + + let sort_a = DataBlock::sort(&block_a, &sort_desc, None).unwrap(); + let sort_b = DataBlock::sort(&block_b, &sort_desc, None).unwrap(); + let concat_ab_1 = DataBlock::concat(&[sort_a, sort_b]).unwrap(); + + let block_1 = DataBlock::sort(&concat_ab_0, &sort_desc, None).unwrap(); + let block_2 = DataBlock::sort(&concat_ab_1, &sort_desc, None).unwrap(); + + assert_eq!(block_1.num_columns(), block_2.num_columns()); + assert_eq!(block_1.num_rows(), block_2.num_rows()); + + let columns_1 = block_1.columns(); + let columns_2 = block_2.columns(); + for idx in 0..columns_1.len() { + assert_eq!(columns_1[idx].data_type, columns_2[idx].data_type); + assert_eq!(columns_1[idx].value, columns_2[idx].value); + } + } +} diff --git a/src/query/functions/Cargo.toml b/src/query/functions/Cargo.toml index 16e6f6c6d2e4..615ff7f9d0d8 100644 --- a/src/query/functions/Cargo.toml +++ b/src/query/functions/Cargo.toml @@ -47,7 +47,6 @@ lexical-core = "0.8.5" libm = "0.2.6" match-template = { workspace = true } md-5 = "0.10.5" -multiversion = "0.7.4" naive-cityhash = "0.2.0" num-traits = "0.2.15" once_cell = { workspace = true } diff --git a/src/query/functions/src/aggregates/aggregate_min_max_any.rs b/src/query/functions/src/aggregates/aggregate_min_max_any.rs index 9efdc985f13d..6ab35d792d80 100644 --- a/src/query/functions/src/aggregates/aggregate_min_max_any.rs +++ b/src/query/functions/src/aggregates/aggregate_min_max_any.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use borsh::BorshDeserialize; use borsh::BorshSerialize; +use databend_common_arrow::arrow::bitmap::Bitmap; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::decimal::*; @@ -92,6 +93,36 @@ where Ok(()) } + fn add_batch( + &mut self, + other: T::Column, + validity: Option<&Bitmap>, + function_data: Option<&dyn FunctionData>, + ) -> Result<()> { + let column_len = T::column_len(&other); + if column_len == 0 { + return Ok(()); + } + + let column_iter = T::iter_column(&other); + if let Some(validity) = validity { + if validity.unset_bits() == column_len { + return Ok(()); + } + for (data, valid) in column_iter.zip(validity.iter()) { + if valid { + let _ = self.add(data, function_data); + } + } + } else { + let v = column_iter.reduce(|l, r| if !C::change_if(&l, &r) { l } else { r }); + if let Some(v) = v { + let _ = self.add(v, function_data); + } + } + Ok(()) + } + fn merge(&mut self, rhs: &Self) -> Result<()> { if let Some(v) = &rhs.value { self.add(T::to_scalar_ref(v), None)?; diff --git a/src/query/functions/src/aggregates/aggregate_mode.rs b/src/query/functions/src/aggregates/aggregate_mode.rs new file mode 100644 index 000000000000..7a437ea27f53 --- /dev/null +++ b/src/query/functions/src/aggregates/aggregate_mode.rs @@ -0,0 +1,167 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::hash::Hash; +use std::ops::AddAssign; +use std::sync::Arc; + +use borsh::BorshDeserialize; +use borsh::BorshSerialize; +use databend_common_exception::Result; +use databend_common_expression::types::*; +use databend_common_expression::with_number_mapped_type; +use databend_common_expression::AggregateFunctionRef; +use databend_common_expression::Scalar; + +use super::FunctionData; +use super::UnaryState; +use crate::aggregates::aggregate_function_factory::AggregateFunctionDescription; +use crate::aggregates::assert_unary_arguments; +use crate::aggregates::AggregateUnaryFunction; + +#[derive(BorshSerialize, BorshDeserialize)] +pub struct ModeState +where + T: ValueType, + T::Scalar: Ord + Hash + BorshSerialize + BorshDeserialize, +{ + pub frequency_map: HashMap, +} + +impl Default for ModeState +where + T: ValueType, + T::Scalar: Ord + Hash + BorshSerialize + BorshDeserialize, +{ + fn default() -> Self { + ModeState:: { + frequency_map: HashMap::new(), + } + } +} + +impl UnaryState for ModeState +where + T: ValueType + Sync + Send, + T::Scalar: Ord + Hash + Sync + Send + BorshSerialize + BorshDeserialize, +{ + fn add( + &mut self, + other: T::ScalarRef<'_>, + _function_data: Option<&dyn FunctionData>, + ) -> Result<()> { + let other = T::to_owned_scalar(other); + match self.frequency_map.entry(other) { + Entry::Occupied(o) => *o.into_mut() += 1, + Entry::Vacant(v) => { + v.insert(1); + } + }; + + Ok(()) + } + + fn merge(&mut self, rhs: &Self) -> Result<()> { + for (key, value) in rhs.frequency_map.iter() { + match self.frequency_map.get_mut(key) { + Some(entry) => entry.add_assign(value), + None => { + self.frequency_map.insert(key.clone(), *value); + } + } + } + + Ok(()) + } + + fn merge_result( + &mut self, + builder: &mut T::ColumnBuilder, + _function_data: Option<&dyn FunctionData>, + ) -> Result<()> { + if self.frequency_map.is_empty() { + T::push_default(builder); + } else { + let (key, _) = self + .frequency_map + .iter() + .max_by_key(|&(_, value)| value) + .unwrap(); + T::push_item(builder, T::to_scalar_ref(key)); + } + + Ok(()) + } +} + +pub fn try_create_aggregate_mode_function( + display_name: &str, + params: Vec, + arguments: Vec, +) -> Result { + assert_unary_arguments(display_name, arguments.len())?; + + let data_type = arguments[0].clone(); + with_number_mapped_type!(|NUM| match &data_type { + DataType::Number(NumberDataType::NUM) => { + let func = AggregateUnaryFunction::< + ModeState>, + NumberType, + NumberType, + >::try_create( + display_name, data_type.clone(), params, data_type.clone() + ) + .with_need_drop(true); + Ok(Arc::new(func)) + } + DataType::Decimal(DecimalDataType::Decimal128(_)) => { + let func = AggregateUnaryFunction::< + ModeState, + Decimal128Type, + Decimal128Type, + >::try_create( + display_name, data_type.clone(), params, data_type.clone() + ) + .with_need_drop(true); + Ok(Arc::new(func)) + } + DataType::Decimal(DecimalDataType::Decimal256(_)) => { + let func = AggregateUnaryFunction::< + ModeState, + Decimal256Type, + Decimal256Type, + >::try_create( + display_name, data_type.clone(), params, data_type.clone() + ) + .with_need_drop(true); + Ok(Arc::new(func)) + } + _ => { + let func = AggregateUnaryFunction::, AnyType, AnyType>::try_create( + display_name, + data_type.clone(), + params, + data_type.clone(), + ) + .with_need_drop(true); + Ok(Arc::new(func)) + } + 
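// Illustrative usage of the new `mode` aggregate (editor's sketch, not part of
// this diff): the state above keeps a per-group frequency map and returns the
// most frequent value; when several values share the top count, `max_by_key`
// over a HashMap picks one of them in an unspecified order.
//
//     SELECT mode(d) FROM t;                 -- most frequent value of d
//     SELECT c, mode(d) FROM t GROUP BY c;   -- per-group mode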
}) +} + +pub fn aggregate_mode_function_desc() -> AggregateFunctionDescription { + AggregateFunctionDescription::creator(Box::new(try_create_aggregate_mode_function)) +} diff --git a/src/query/functions/src/aggregates/aggregate_sum.rs b/src/query/functions/src/aggregates/aggregate_sum.rs index 116ba6e46c46..355d8dfa8a41 100644 --- a/src/query/functions/src/aggregates/aggregate_sum.rs +++ b/src/query/functions/src/aggregates/aggregate_sum.rs @@ -15,6 +15,7 @@ use borsh::BorshDeserialize; use borsh::BorshSerialize; use databend_common_arrow::arrow::bitmap::Bitmap; +use databend_common_arrow::arrow::buffer::Buffer; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::decimal::*; @@ -80,21 +81,33 @@ where } } -#[multiversion::multiversion(targets("x86_64+avx", "x86_64+sse"))] -fn sum_batch(other: T::Column) -> N::Scalar +// #[multiversion::multiversion(targets("x86_64+avx", "x86_64+sse"))] +#[inline] +pub fn sum_batch(inner: Buffer, validity: Option<&Bitmap>) -> TSum where - T: ValueType + Sync + Send, - N: ValueType, - T::Scalar: Number + AsPrimitive, - N::Scalar: Number + AsPrimitive + std::ops::AddAssign, - for<'a> T::ScalarRef<'a>: Number + AsPrimitive, + T: Number + AsPrimitive, + TSum: Number + std::ops::AddAssign, { - // use temp variable to hint the compiler to unroll the loop - let mut sum = N::Scalar::default(); - for value in T::iter_column(&other) { - sum += value.as_(); + match validity { + Some(v) if v.unset_bits() > 0 => { + let mut sum = TSum::default(); + inner.iter().zip(v.iter()).for_each(|(t, b)| { + if b { + sum += t.as_(); + } + }); + + sum + } + _ => { + let mut sum = TSum::default(); + inner.iter().for_each(|t| { + sum += t.as_(); + }); + + sum + } } - sum } impl UnaryState for NumberSumState @@ -117,9 +130,12 @@ where fn add_batch( &mut self, other: T::Column, + validity: Option<&Bitmap>, _function_data: Option<&dyn FunctionData>, ) -> Result<()> { - self.value += sum_batch::(other); + let col = T::upcast_column(other); + let buffer = NumberType::::try_downcast_column(&col).unwrap(); + self.value += sum_batch::(buffer, validity); Ok(()) } diff --git a/src/query/functions/src/aggregates/aggregate_unary.rs b/src/query/functions/src/aggregates/aggregate_unary.rs index fa85cffcce0b..5ac0bc5a4d22 100644 --- a/src/query/functions/src/aggregates/aggregate_unary.rs +++ b/src/query/functions/src/aggregates/aggregate_unary.rs @@ -47,10 +47,22 @@ where fn add_batch( &mut self, other: T::Column, + validity: Option<&Bitmap>, function_data: Option<&dyn FunctionData>, ) -> Result<()> { - for value in T::iter_column(&other) { - self.add(value, function_data)?; + match validity { + Some(validity) => { + for (data, valid) in T::iter_column(&other).zip(validity.iter()) { + if valid { + self.add(data, function_data)?; + } + } + } + None => { + for value in T::iter_column(&other) { + self.add(value, function_data)?; + } + } } Ok(()) } @@ -206,18 +218,8 @@ where ) -> Result<()> { let column = T::try_downcast_column(&columns[0]).unwrap(); let state: &mut S = place.get::(); - match validity { - Some(bitmap) if bitmap.unset_bits() > 0 => { - let column_iter = T::iter_column(&column); - for (value, is_valid) in column_iter.zip(bitmap.iter()) { - if is_valid { - state.add(value, self.function_data.as_deref())?; - } - } - Ok(()) - } - _ => state.add_batch(column, self.function_data.as_deref()), - } + + state.add_batch(column, validity, self.function_data.as_deref()) } fn accumulate_row(&self, place: StateAddr, columns: 
InputColumns, row: usize) -> Result<()> { diff --git a/src/query/functions/src/aggregates/aggregator.rs b/src/query/functions/src/aggregates/aggregator.rs index 857db00bc55e..e463ae7f8168 100644 --- a/src/query/functions/src/aggregates/aggregator.rs +++ b/src/query/functions/src/aggregates/aggregator.rs @@ -31,6 +31,7 @@ use super::aggregate_covariance::aggregate_covariance_sample_desc; use super::aggregate_min_max_any::aggregate_any_function_desc; use super::aggregate_min_max_any::aggregate_max_function_desc; use super::aggregate_min_max_any::aggregate_min_function_desc; +use super::aggregate_mode::aggregate_mode_function_desc; use super::aggregate_stddev::aggregate_stddev_pop_function_desc; use super::aggregate_stddev::aggregate_stddev_samp_function_desc; use super::aggregate_window_funnel::aggregate_window_funnel_function_desc; @@ -141,6 +142,8 @@ impl Aggregators { ); factory.register("histogram", aggregate_histogram_function_desc()); + + factory.register("mode", aggregate_mode_function_desc()); } pub fn register_combinator(factory: &mut AggregateFunctionFactory) { diff --git a/src/query/functions/src/aggregates/mod.rs b/src/query/functions/src/aggregates/mod.rs index 071869dba0b3..3092cd526068 100644 --- a/src/query/functions/src/aggregates/mod.rs +++ b/src/query/functions/src/aggregates/mod.rs @@ -33,6 +33,7 @@ mod aggregate_json_array_agg; mod aggregate_json_object_agg; mod aggregate_kurtosis; mod aggregate_min_max_any; +mod aggregate_mode; mod aggregate_null_result; mod aggregate_quantile_cont; mod aggregate_quantile_disc; @@ -64,6 +65,7 @@ pub use aggregate_json_array_agg::*; pub use aggregate_json_object_agg::*; pub use aggregate_kurtosis::*; pub use aggregate_min_max_any::*; +pub use aggregate_mode::*; pub use aggregate_null_result::AggregateNullResultFunction; pub use aggregate_quantile_cont::*; pub use aggregate_quantile_disc::*; diff --git a/src/query/functions/tests/it/aggregates/agg.rs b/src/query/functions/tests/it/aggregates/agg.rs index 7a2a01417925..969e4ecc9348 100644 --- a/src/query/functions/tests/it/aggregates/agg.rs +++ b/src/query/functions/tests/it/aggregates/agg.rs @@ -72,6 +72,7 @@ fn test_agg() { test_agg_histogram(file, eval_aggr); test_agg_json_array_agg(file, eval_aggr); test_agg_json_object_agg(file, eval_aggr); + test_agg_mode(file, eval_aggr); } #[test] @@ -111,6 +112,7 @@ fn test_agg_group_by() { test_agg_group_array_moving_sum(file, eval_aggr); test_agg_json_array_agg(file, eval_aggr); test_agg_json_object_agg(file, eval_aggr); + test_agg_mode(file, simulate_two_groups_group_by); } fn gen_bitmap_data() -> Column { @@ -139,6 +141,7 @@ fn get_example() -> Vec<(&'static str, Column)> { ("a", Int64Type::from_data(vec![4i64, 3, 2, 1])), ("b", UInt64Type::from_data(vec![1u64, 2, 3, 4])), ("c", UInt64Type::from_data(vec![1u64, 2, 1, 3])), + ("d", UInt64Type::from_data(vec![1u64, 1, 1, 1])), ( "x_null", UInt64Type::from_data_with_validity(vec![1u64, 2, 3, 4], vec![ @@ -882,3 +885,10 @@ fn test_agg_json_object_agg(file: &mut impl Write, simulator: impl AggregationSi simulator, ); } + +fn test_agg_mode(file: &mut impl Write, simulator: impl AggregationSimulator) { + run_agg_ast(file, "mode(1)", get_example().as_slice(), simulator); + run_agg_ast(file, "mode(NULL)", get_example().as_slice(), simulator); + run_agg_ast(file, "mode(d)", get_example().as_slice(), simulator); + run_agg_ast(file, "mode(all_null)", get_example().as_slice(), simulator); +} diff --git a/src/query/functions/tests/it/aggregates/testdata/agg.txt 
b/src/query/functions/tests/it/aggregates/testdata/agg.txt index 3d0b4358a0c4..85300eac60a7 100644 --- a/src/query/functions/tests/it/aggregates/testdata/agg.txt +++ b/src/query/functions/tests/it/aggregates/testdata/agg.txt @@ -1492,3 +1492,43 @@ evaluation (internal): +--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +ast: mode(1) +evaluation (internal): ++--------+---------------------------------------------------------------+ +| Column | Data | ++--------+---------------------------------------------------------------+ +| a | Int64([4, 3, 2, 1]) | +| Output | NullableColumn { column: UInt8([1]), validity: [0b_______1] } | ++--------+---------------------------------------------------------------+ + + +ast: mode(NULL) +evaluation (internal): ++--------+---------------------+ +| Column | Data | ++--------+---------------------+ +| a | Int64([4, 3, 2, 1]) | +| Output | Null { len: 1 } | ++--------+---------------------+ + + +ast: mode(d) +evaluation (internal): ++--------+----------------------------------------------------------------+ +| Column | Data | ++--------+----------------------------------------------------------------+ +| d | UInt64([1, 1, 1, 1]) | +| Output | NullableColumn { column: UInt64([1]), validity: [0b_______1] } | ++--------+----------------------------------------------------------------+ + + +ast: mode(all_null) +evaluation (internal): ++----------+-------------------------------------------------------------------------+ +| Column | Data | ++----------+-------------------------------------------------------------------------+ +| all_null | NullableColumn { column: UInt64([1, 2, 3, 4]), validity: [0b____0000] } | +| Output | NullableColumn { column: UInt64([0]), validity: [0b_______0] } | ++----------+-------------------------------------------------------------------------+ + + diff --git a/src/query/functions/tests/it/aggregates/testdata/agg_group_by.txt b/src/query/functions/tests/it/aggregates/testdata/agg_group_by.txt index fbb497ea5784..2a4854afe6c9 100644 --- a/src/query/functions/tests/it/aggregates/testdata/agg_group_by.txt +++ b/src/query/functions/tests/it/aggregates/testdata/agg_group_by.txt @@ -1430,3 +1430,43 @@ evaluation (internal): +--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +ast: mode(1) +evaluation (internal): ++--------+------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------+ +| a | Int64([4, 3, 2, 1]) | +| Output | NullableColumn { column: UInt8([1, 1]), validity: [0b______11] } | ++--------+------------------------------------------------------------------+ + + +ast: mode(NULL) +evaluation (internal): ++--------+---------------------+ +| Column | Data | ++--------+---------------------+ +| a | Int64([4, 3, 2, 1]) | +| Output | Null { len: 2 } | ++--------+---------------------+ + + +ast: mode(d) +evaluation (internal): ++--------+-------------------------------------------------------------------+ +| Column | Data | ++--------+-------------------------------------------------------------------+ +| d | UInt64([1, 1, 1, 1]) | +| Output | NullableColumn { column: UInt64([1, 1]), validity: [0b______11] } | 
++--------+-------------------------------------------------------------------+ + + +ast: mode(all_null) +evaluation (internal): ++----------+-------------------------------------------------------------------------+ +| Column | Data | ++----------+-------------------------------------------------------------------------+ +| all_null | NullableColumn { column: UInt64([1, 2, 3, 4]), validity: [0b____0000] } | +| Output | NullableColumn { column: UInt64([0, 0]), validity: [0b______00] } | ++----------+-------------------------------------------------------------------------+ + + diff --git a/src/query/management/src/cluster/cluster_mgr.rs b/src/query/management/src/cluster/cluster_mgr.rs index d9c7952096cd..0b35309d18c9 100644 --- a/src/query/management/src/cluster/cluster_mgr.rs +++ b/src/query/management/src/cluster/cluster_mgr.rs @@ -30,7 +30,7 @@ use databend_common_meta_types::Operation; use crate::cluster::ClusterApi; -pub static CLUSTER_API_KEY_PREFIX: &str = "__fd_clusters_v3"; +pub static CLUSTER_API_KEY_PREFIX: &str = "__fd_clusters_v4"; pub struct ClusterMgr { metastore: MetaStore, diff --git a/src/query/management/tests/it/cluster.rs b/src/query/management/tests/it/cluster.rs index a7b8ac49712c..8166bc6696bb 100644 --- a/src/query/management/tests/it/cluster.rs +++ b/src/query/management/tests/it/cluster.rs @@ -33,7 +33,7 @@ async fn test_successfully_add_node() -> Result<()> { let node_info = create_test_node_info(); cluster_api.add_node(node_info.clone()).await?; let value = kv_api - .get_kv("__fd_clusters_v3/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") + .get_kv("__fd_clusters_v4/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") .await?; match value { @@ -122,7 +122,7 @@ async fn test_successfully_heartbeat_node() -> Result<()> { cluster_api.add_node(node_info.clone()).await?; let value = kv_api - .get_kv("__fd_clusters_v3/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") + .get_kv("__fd_clusters_v4/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") .await?; let meta = value.unwrap().meta.unwrap(); @@ -133,7 +133,7 @@ async fn test_successfully_heartbeat_node() -> Result<()> { cluster_api.heartbeat(&node_info, MatchSeq::GE(1)).await?; let value = kv_api - .get_kv("__fd_clusters_v3/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") + .get_kv("__fd_clusters_v4/test%2dtenant%2did/test%2dcluster%2did/databend_query/test_node") .await?; assert!(value.unwrap().meta.unwrap().get_expire_at_ms().unwrap() - now_ms >= 59_000); @@ -146,6 +146,7 @@ fn create_test_node_info() -> NodeInfo { secret: "".to_string(), cpu_nums: 0, version: 0, + http_address: "ip3:port".to_string(), flight_address: String::from("ip:port"), discovery_address: "ip2:port".to_string(), binary_version: "binary_version".to_string(), diff --git a/src/query/service/src/clusters/cluster.rs b/src/query/service/src/clusters/cluster.rs index ba9349572d12..92cd2d3b4076 100644 --- a/src/query/service/src/clusters/cluster.rs +++ b/src/query/service/src/clusters/cluster.rs @@ -336,6 +336,10 @@ impl ClusterDiscovery { pub async fn register_to_metastore(self: &Arc, cfg: &InnerConfig) -> Result<()> { let cpus = cfg.query.num_cpus; let mut address = cfg.query.flight_api_address.clone(); + let mut http_address = format!( + "{}:{}", + cfg.query.http_handler_host, cfg.query.http_handler_port + ); let mut discovery_address = match cfg.query.discovery_address.is_empty() { true => format!( "{}:{}", @@ -347,6 +351,7 @@ impl ClusterDiscovery { for (lookup_ip, 
typ) in [ (&mut address, "flight-api-address"), (&mut discovery_address, "discovery-address"), + (&mut http_address, "http-address"), ] { if let Ok(socket_addr) = SocketAddr::from_str(lookup_ip) { let ip_addr = socket_addr.ip(); @@ -371,6 +376,7 @@ impl ClusterDiscovery { self.local_id.clone(), self.local_secret.clone(), cpus, + http_address, address, discovery_address, DATABEND_COMMIT_VERSION.to_string(), diff --git a/src/query/service/src/databases/system/system_database.rs b/src/query/service/src/databases/system/system_database.rs index 0ae6e34ee9dc..93702af95b02 100644 --- a/src/query/service/src/databases/system/system_database.rs +++ b/src/query/service/src/databases/system/system_database.rs @@ -35,6 +35,7 @@ use databend_common_storages_system::ConfigsTable; use databend_common_storages_system::ContributorsTable; use databend_common_storages_system::CreditsTable; use databend_common_storages_system::DatabasesTable; +use databend_common_storages_system::DictionariesTable; use databend_common_storages_system::EnginesTable; use databend_common_storages_system::FullStreamsTable; use databend_common_storages_system::FunctionsTable; @@ -144,6 +145,7 @@ impl SystemDatabase { ViewsTableWithoutHistory::create(sys_db_meta.next_table_id()), TemporaryTablesTable::create(sys_db_meta.next_table_id()), ProceduresTable::create(sys_db_meta.next_table_id()), + DictionariesTable::create(sys_db_meta.next_table_id()), ]; let disable_tables = Self::disable_system_tables(); diff --git a/src/query/service/src/interpreters/access/privilege_access.rs b/src/query/service/src/interpreters/access/privilege_access.rs index 009e2a8154e4..0eaa433bc235 100644 --- a/src/query/service/src/interpreters/access/privilege_access.rs +++ b/src/query/service/src/interpreters/access/privilege_access.rs @@ -61,10 +61,11 @@ enum ObjectId { // some statements like `SELECT 1`, `SHOW USERS`, `SHOW ROLES`, `SHOW TABLES` will be // rewritten to the queries on the system tables, we need to skip the privilege check on // these tables. 
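// Illustrative note (editor's addition, not part of this diff): statements such
// as `SHOW DICTIONARIES` appear to be rewritten into queries on
// `system.dictionaries`, which is why "dictionaries" joins the allow list below
// and why `RewriteKind::ShowDictionaries(_)` is exempted in the match further down.
//
//     SHOW DICTIONARIES;    -- expected to pass without table-level grants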
-const SYSTEM_TABLES_ALLOW_LIST: [&str; 19] = [ +const SYSTEM_TABLES_ALLOW_LIST: [&str; 20] = [ "catalogs", "columns", "databases", + "dictionaries", "tables", "views", "tables_with_history", @@ -709,7 +710,8 @@ impl AccessChecker for PrivilegeAccess { Some(RewriteKind::ShowDatabases) | Some(RewriteKind::ShowEngines) | Some(RewriteKind::ShowFunctions) - | Some(RewriteKind::ShowUserFunctions) => { + | Some(RewriteKind::ShowUserFunctions) + | Some(RewriteKind::ShowDictionaries(_)) => { return Ok(()); } | Some(RewriteKind::ShowTableFunctions) => { diff --git a/src/query/service/src/interpreters/interpreter_copy_into_location.rs b/src/query/service/src/interpreters/interpreter_copy_into_location.rs index 766e13b833c6..c771f4d2eac0 100644 --- a/src/query/service/src/interpreters/interpreter_copy_into_location.rs +++ b/src/query/service/src/interpreters/interpreter_copy_into_location.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use databend_common_ast::ast::CopyIntoLocationOptions; use databend_common_base::runtime::GlobalIORuntime; use databend_common_catalog::plan::StageTableInfo; use databend_common_exception::Result; @@ -86,6 +87,7 @@ impl CopyIntoLocationInterpreter { stage: &StageInfo, path: &str, query: &Plan, + options: &CopyIntoLocationOptions, ) -> Result<(PipelineBuildResult, Vec)> { let (query_interpreter, update_stream_meta_req) = self.build_query(query).await?; let query_physical_plan = query_interpreter.build_physical_plan().await?; @@ -109,6 +111,7 @@ impl CopyIntoLocationInterpreter { duplicated_files_detected: vec![], is_select: false, default_values: None, + copy_into_location_options: options.clone(), }, })); @@ -145,6 +148,7 @@ impl Interpreter for CopyIntoLocationInterpreter { &self.plan.stage, &self.plan.path, &self.plan.from, + &self.plan.options, ) .await?; diff --git a/src/query/service/src/locks/lock_holder.rs b/src/query/service/src/locks/lock_holder.rs index 09053dbe10b4..4ad31aefb7c9 100644 --- a/src/query/service/src/locks/lock_holder.rs +++ b/src/query/service/src/locks/lock_holder.rs @@ -21,18 +21,28 @@ use std::time::Instant; use backoff::backoff::Backoff; use databend_common_base::base::tokio::sync::Notify; use databend_common_base::base::tokio::time::sleep; +use databend_common_base::base::tokio::time::timeout; use databend_common_base::runtime::GlobalIORuntime; use databend_common_base::runtime::TrySpawn; use databend_common_catalog::catalog::Catalog; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_meta_api::kv_pb_api::KVPbApi; use databend_common_meta_app::schema::CreateLockRevReq; use databend_common_meta_app::schema::DeleteLockRevReq; use databend_common_meta_app::schema::ExtendLockRevReq; +use databend_common_meta_app::schema::ListLockRevReq; +use databend_common_meta_app::schema::TableLockIdent; +use databend_common_meta_kvapi::kvapi::Key; +use databend_common_meta_types::protobuf::watch_request::FilterType; +use databend_common_meta_types::protobuf::WatchRequest; +use databend_common_metrics::lock::record_acquired_lock_nums; use databend_common_metrics::lock::record_created_lock_nums; use databend_common_storages_fuse::operations::set_backoff; +use databend_common_users::UserApiProvider; use futures::future::select; use futures::future::Either; +use futures_util::StreamExt; use rand::thread_rng; use rand::Rng; @@ -46,13 +56,120 @@ pub struct LockHolder { impl LockHolder { #[async_backtrace::framed] - pub async fn start( + pub(crate) async fn try_acquire_lock( + self: &Arc, + catalog: Arc, + req: 
CreateLockRevReq, + should_retry: bool, + acquire_timeout: Duration, + ) -> Result { + let start = Instant::now(); + + let ttl = req.ttl; + + let lock_key = req.lock_key.clone(); + let lock_type = lock_key.lock_type().to_string(); + let table_id = lock_key.get_table_id(); + let tenant = lock_key.get_tenant(); + + let revision = self.start(catalog.clone(), req).await?; + + let meta_api = UserApiProvider::instance().get_meta_store_client(); + let list_table_lock_req = ListLockRevReq::new(lock_key.clone()); + + loop { + // List all revisions and check if the current is the minimum. + let mut rev_list = catalog + .list_lock_revisions(list_table_lock_req.clone()) + .await? + .into_iter() + .map(|(x, _)| x) + .collect::>(); + // list_lock_revisions are returned in big-endian order, + // we need to sort them in ascending numeric order. + rev_list.sort(); + let position = rev_list.iter().position(|x| *x == revision).ok_or_else(|| + // If the current is not found in list, it means that the current has been expired. + ErrorCode::TableLockExpired(format!( + "The acquired table lock with revision '{}' maybe expired(elapsed: {:?})", + revision, + start.elapsed(), + )))?; + + if position == 0 { + // The lock is acquired by current session. + let extend_table_lock_req = + ExtendLockRevReq::new(lock_key.clone(), revision, ttl, true); + + catalog.extend_lock_revision(extend_table_lock_req).await?; + // metrics. + record_acquired_lock_nums(lock_type, table_id, 1); + break; + } + + let prev_revision = rev_list[position - 1]; + let elapsed = start.elapsed(); + // if no need retry, return error directly. + if !should_retry || elapsed >= acquire_timeout { + return Err(ErrorCode::TableAlreadyLocked(format!( + "Table is locked by other session(rev: {}, prev: {}, elapsed: {:?})", + revision, + prev_revision, + start.elapsed() + ))); + } + + let watch_delete_ident = TableLockIdent::new(tenant, table_id, prev_revision); + + // Get the previous revision, watch the delete event. + let req = WatchRequest { + key: watch_delete_ident.to_string_key(), + key_end: None, + filter_type: FilterType::Delete.into(), + }; + let mut watch_stream = meta_api.watch(req).await?; + + let lock_meta = meta_api.get_pb(&watch_delete_ident).await?; + if lock_meta.is_none() { + log::warn!( + "Lock revision '{}' already does not exist, skipping", + prev_revision + ); + continue; + } + + // Add a timeout period for watch. + if let Err(_cause) = timeout(acquire_timeout.abs_diff(elapsed), async move { + while let Some(Ok(resp)) = watch_stream.next().await { + if let Some(event) = resp.event { + if event.current.is_none() { + break; + } + } + } + }) + .await + { + return Err(ErrorCode::TableAlreadyLocked(format!( + "Table is locked by other session(rev: {}, prev: {}, elapsed: {:?})", + revision, + prev_revision, + start.elapsed() + ))); + } + } + + Ok(revision) + } + + #[async_backtrace::framed] + async fn start( self: &Arc, - query_id: String, catalog: Arc, req: CreateLockRevReq, ) -> Result { let lock_key = req.lock_key.clone(); + let query_id = req.query_id.clone(); let ttl = req.ttl; let sleep_range = (ttl / 3)..=(ttl * 2 / 3); @@ -61,6 +178,7 @@ impl LockHolder { let revision = res.revision; // metrics. 
record_created_lock_nums(lock_key.lock_type().to_string(), lock_key.get_table_id(), 1); + log::debug!("create table lock success, revision={}", revision); let delete_table_lock_req = DeleteLockRevReq::new(lock_key.clone(), revision); let extend_table_lock_req = ExtendLockRevReq::new(lock_key.clone(), revision, ttl, false); @@ -179,7 +297,10 @@ impl LockHolder { let mut backoff = set_backoff(Some(Duration::from_millis(2)), None, max_retry_elapsed); loop { match catalog.delete_lock_revision(req.clone()).await { - Ok(_) => break, + Ok(_) => { + log::debug!("delete table lock success, revision={}", req.revision); + break; + } Err(e) => match backoff.next_backoff() { Some(duration) => { log::debug!( diff --git a/src/query/service/src/locks/lock_manager.rs b/src/query/service/src/locks/lock_manager.rs index e1b86aa0f1c2..7bd00139c58d 100644 --- a/src/query/service/src/locks/lock_manager.rs +++ b/src/query/service/src/locks/lock_manager.rs @@ -15,35 +15,21 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; -use std::time::Instant; use databend_common_base::base::tokio::sync::mpsc; -use databend_common_base::base::tokio::time::timeout; use databend_common_base::base::GlobalInstance; use databend_common_base::runtime::GlobalIORuntime; use databend_common_base::runtime::TrySpawn; use databend_common_catalog::lock::Lock; use databend_common_catalog::table_context::TableContext; -use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_meta_api::kv_pb_api::KVPbApi; use databend_common_meta_app::schema::CreateLockRevReq; -use databend_common_meta_app::schema::DeleteLockRevReq; -use databend_common_meta_app::schema::ExtendLockRevReq; -use databend_common_meta_app::schema::ListLockRevReq; use databend_common_meta_app::schema::LockKey; use databend_common_meta_app::schema::TableInfo; -use databend_common_meta_app::schema::TableLockIdent; -use databend_common_meta_kvapi::kvapi::Key; -use databend_common_meta_types::protobuf::watch_request::FilterType; -use databend_common_meta_types::protobuf::WatchRequest; use databend_common_metrics::lock::metrics_inc_shutdown_lock_holder_nums; use databend_common_metrics::lock::metrics_inc_start_lock_holder_nums; -use databend_common_metrics::lock::record_acquired_lock_nums; use databend_common_pipeline_core::LockGuard; use databend_common_pipeline_core::UnlockApi; -use databend_common_users::UserApiProvider; -use futures_util::StreamExt; use parking_lot::RwLock; use crate::locks::lock_holder::LockHolder; @@ -97,129 +83,34 @@ impl LockManager { catalog_name: &str, should_retry: bool, ) -> Result>> { - let start = Instant::now(); + let acquire_timeout = Duration::from_secs(ctx.get_settings().get_acquire_lock_timeout()?); - let lock_type = lock_key.lock_type().to_string(); - let table_id = lock_key.get_table_id(); - let tenant = lock_key.get_tenant(); - let expire_secs = ctx.get_settings().get_table_lock_expire_secs()?; - let query_id = ctx.get_id(); + let ttl = Duration::from_secs(ctx.get_settings().get_table_lock_expire_secs()?); let req = CreateLockRevReq::new( - lock_key.clone(), + lock_key, ctx.get_current_user()?.name, // user ctx.get_cluster().local_id.clone(), // node - query_id.clone(), // query_id - Duration::from_secs(expire_secs), + ctx.get_id(), // query_id + ttl, ); let catalog = ctx.get_catalog(catalog_name).await?; let lock_holder = Arc::new(LockHolder::default()); - let revision = lock_holder.start(query_id, catalog.clone(), req).await?; - - self.insert_lock(revision, lock_holder); - 
let guard = LockGuard::new(self.clone(), revision); - - let acquire_lock_timeout = ctx.get_settings().get_acquire_lock_timeout()?; - let duration = Duration::from_secs(acquire_lock_timeout); - let meta_api = UserApiProvider::instance().get_meta_store_client(); - - let list_table_lock_req = ListLockRevReq::new(lock_key.clone()); - - let delete_table_lock_req = DeleteLockRevReq::new(lock_key.clone(), revision); - - loop { - // List all revisions and check if the current is the minimum. - let mut rev_list = catalog - .list_lock_revisions(list_table_lock_req.clone()) - .await? - .into_iter() - .map(|(x, _)| x) - .collect::>(); - // list_lock_revisions are returned in big-endian order, - // we need to sort them in ascending numeric order. - rev_list.sort(); - let position = rev_list.iter().position(|x| *x == revision).ok_or_else(|| - // If the current is not found in list, it means that the current has been expired. - ErrorCode::TableLockExpired(format!( - "the acquired table lock with revision '{}' is not in {:?}, maybe expired(elapsed: {:?})", - revision, - rev_list, - start.elapsed(), - )))?; - - if position == 0 { - // The lock is acquired by current session. - let extend_table_lock_req = ExtendLockRevReq::new( - lock_key.clone(), - revision, - Duration::from_secs(expire_secs), - true, - ); - - catalog.extend_lock_revision(extend_table_lock_req).await?; - // metrics. - record_acquired_lock_nums(lock_type, table_id, 1); - break; + match lock_holder + .try_acquire_lock(catalog, req, should_retry, acquire_timeout) + .await + { + Ok(revision) => { + self.insert_lock(revision, lock_holder); + let guard = LockGuard::new(self.clone(), revision); + Ok(Some(Arc::new(guard))) } - - let elapsed = start.elapsed(); - // if no need retry, return error directly. - if !should_retry || elapsed >= duration { - catalog - .delete_lock_revision(delete_table_lock_req.clone()) - .await?; - return Err(ErrorCode::TableAlreadyLocked(format!( - "table is locked by other session, please retry later(elapsed: {:?})", - elapsed - ))); + Err(err) => { + lock_holder.shutdown(); + Err(err) } - - let watch_delete_ident = TableLockIdent::new(tenant, table_id, rev_list[position - 1]); - - // Get the previous revision, watch the delete event. - let req = WatchRequest { - key: watch_delete_ident.to_string_key(), - key_end: None, - filter_type: FilterType::Delete.into(), - }; - let mut watch_stream = meta_api.watch(req).await?; - - let lock_meta = meta_api.get_pb(&watch_delete_ident).await?; - if lock_meta.is_none() { - log::warn!( - "Lock revision '{}' already does not exist, skipping", - rev_list[position - 1] - ); - continue; - } - - // Add a timeout period for watch. 
- match timeout(duration.abs_diff(elapsed), async move { - while let Some(Ok(resp)) = watch_stream.next().await { - if let Some(event) = resp.event { - if event.current.is_none() { - break; - } - } - } - }) - .await - { - Ok(_) => Ok(()), - Err(_) => { - catalog - .delete_lock_revision(delete_table_lock_req.clone()) - .await?; - Err(ErrorCode::TableAlreadyLocked(format!( - "table is locked by other session, please retry later(elapsed: {:?})", - start.elapsed() - ))) - } - }?; } - - Ok(Some(Arc::new(guard))) } fn insert_lock(&self, revision: u64, lock_holder: Arc) { diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs index 28d6dea1eedd..cbd229d9b7ff 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_aggregate_partial.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::sync::Arc; +use std::time::Instant; use std::vec; use bumpalo::Bump; @@ -111,6 +112,10 @@ pub struct TransformPartialAggregate { hash_table: HashTable, probe_state: ProbeState, params: Arc, + start: Instant, + first_block_start: Option, + processed_bytes: usize, + processed_rows: usize, } impl TransformPartialAggregate { @@ -164,6 +169,10 @@ impl TransformPartialAggregate { hash_table, probe_state: ProbeState::default(), settings: AggregateSettings::try_from(ctx)?, + start: Instant::now(), + first_block_start: None, + processed_bytes: 0, + processed_rows: 0, }, )) } @@ -239,10 +248,16 @@ impl TransformPartialAggregate { .map(|index| index.is_agg) .unwrap_or_default(); - let block = block.convert_to_full(); + let block = block.consume_convert_to_full(); let group_columns = InputColumns::new_block_proxy(&self.params.group_columns, &block); let rows_num = block.num_rows(); + self.processed_bytes += block.memory_size(); + self.processed_rows += rows_num; + if self.first_block_start.is_none() { + self.first_block_start = Some(Instant::now()); + } + { match &mut self.hash_table { HashTable::MovedOut => unreachable!(), @@ -449,6 +464,26 @@ impl AccumulatingTransform for TransformPartialAggrega HashTable::AggregateHashTable(hashtable) => { let partition_count = hashtable.payload.partition_count(); let mut blocks = Vec::with_capacity(partition_count); + + log::info!( + "Aggregated {} to {} rows in {} sec(real: {}). 
({} rows/sec, {}/sec, {})", + self.processed_rows, + hashtable.payload.len(), + self.start.elapsed().as_secs_f64(), + if let Some(t) = &self.first_block_start { + t.elapsed().as_secs_f64() + } else { + self.start.elapsed().as_secs_f64() + }, + convert_number_size( + self.processed_rows as f64 / self.start.elapsed().as_secs_f64() + ), + convert_byte_size( + self.processed_bytes as f64 / self.start.elapsed().as_secs_f64() + ), + convert_byte_size(self.processed_bytes as f64), + ); + for (bucket, payload) in hashtable.payload.payloads.into_iter().enumerate() { if payload.len() != 0 { blocks.push(DataBlock::empty_with_meta( diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs index e4062b84a70d..7878df9e5ef8 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_group_by_partial.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::sync::Arc; +use std::time::Instant; use std::vec; use bumpalo::Bump; @@ -107,6 +108,11 @@ pub struct TransformPartialGroupBy { probe_state: ProbeState, settings: GroupBySettings, params: Arc, + + start: Instant, + first_block_start: Option, + processed_rows: usize, + processed_bytes: usize, } impl TransformPartialGroupBy { @@ -142,6 +148,10 @@ impl TransformPartialGroupBy { probe_state: ProbeState::default(), params, settings: GroupBySettings::try_from(ctx)?, + start: Instant::now(), + first_block_start: None, + processed_bytes: 0, + processed_rows: 0, }, )) } @@ -151,12 +161,19 @@ impl AccumulatingTransform for TransformPartialGroupBy const NAME: &'static str = "TransformPartialGroupBy"; fn transform(&mut self, block: DataBlock) -> Result> { - let block = block.convert_to_full(); + let block = block.consume_convert_to_full(); + + let rows_num = block.num_rows(); + + self.processed_bytes += block.memory_size(); + self.processed_rows += rows_num; + if self.first_block_start.is_none() { + self.first_block_start = Some(Instant::now()); + } + let group_columns = InputColumns::new_block_proxy(&self.params.group_columns, &block); { - let rows_num = block.num_rows(); - match &mut self.hash_table { HashTable::MovedOut => unreachable!(), HashTable::HashTable(cell) => { @@ -305,6 +322,26 @@ impl AccumulatingTransform for TransformPartialGroupBy HashTable::AggregateHashTable(hashtable) => { let partition_count = hashtable.payload.partition_count(); let mut blocks = Vec::with_capacity(partition_count); + + log::info!( + "Aggregated {} to {} rows in {} sec(real: {}). 
({} rows/sec, {}/sec, {})", + self.processed_rows, + hashtable.payload.len(), + self.start.elapsed().as_secs_f64(), + if let Some(t) = &self.first_block_start { + t.elapsed().as_secs_f64() + } else { + self.start.elapsed().as_secs_f64() + }, + convert_number_size( + self.processed_rows as f64 / self.start.elapsed().as_secs_f64() + ), + convert_byte_size( + self.processed_bytes as f64 / self.start.elapsed().as_secs_f64() + ), + convert_byte_size(self.processed_bytes as f64), + ); + for (bucket, payload) in hashtable.payload.payloads.into_iter().enumerate() { if payload.len() != 0 { blocks.push(DataBlock::empty_with_meta( @@ -316,7 +353,6 @@ impl AccumulatingTransform for TransformPartialGroupBy )); } } - blocks } }) diff --git a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_single_key.rs b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_single_key.rs index de05ea7d2e31..1de36a979b1f 100644 --- a/src/query/service/src/pipelines/processors/transforms/aggregator/transform_single_key.rs +++ b/src/query/service/src/pipelines/processors/transforms/aggregator/transform_single_key.rs @@ -15,9 +15,12 @@ use std::alloc::Layout; use std::borrow::BorrowMut; use std::sync::Arc; +use std::time::Instant; use std::vec; use bumpalo::Bump; +use databend_common_base::base::convert_byte_size; +use databend_common_base::base::convert_number_size; use databend_common_catalog::plan::AggIndexMeta; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -47,6 +50,11 @@ pub struct PartialSingleStateAggregator { places: Vec, arg_indices: Vec>, funcs: Vec, + + start: Instant, + first_block_start: Option, + rows: usize, + bytes: usize, } impl PartialSingleStateAggregator { @@ -76,6 +84,10 @@ impl PartialSingleStateAggregator { places, funcs: params.aggregate_functions.clone(), arg_indices: params.aggregate_functions_arguments.clone(), + start: Instant::now(), + first_block_start: None, + rows: 0, + bytes: 0, }) } } @@ -84,13 +96,17 @@ impl AccumulatingTransform for PartialSingleStateAggregator { const NAME: &'static str = "AggregatorPartialTransform"; fn transform(&mut self, block: DataBlock) -> Result> { + if self.first_block_start.is_none() { + self.first_block_start = Some(Instant::now()); + } + let is_agg_index_block = block .get_meta() .and_then(AggIndexMeta::downcast_ref_from) .map(|index| index.is_agg) .unwrap_or_default(); - let block = block.convert_to_full(); + let block = block.consume_convert_to_full(); for (idx, func) in self.funcs.iter().enumerate() { let place = self.places[idx]; @@ -107,6 +123,9 @@ impl AccumulatingTransform for PartialSingleStateAggregator { } } + self.rows += block.num_rows(); + self.bytes += block.memory_size(); + Ok(vec![]) } @@ -137,6 +156,20 @@ impl AccumulatingTransform for PartialSingleStateAggregator { } } + log::info!( + "Aggregated {} to 1 rows in {} sec (real: {}). 
({} rows/sec, {}/sec, {})", + self.rows, + self.start.elapsed().as_secs_f64(), + if let Some(t) = &self.first_block_start { + t.elapsed().as_secs_f64() + } else { + self.start.elapsed().as_secs_f64() + }, + convert_number_size(self.rows as f64 / self.start.elapsed().as_secs_f64()), + convert_byte_size(self.bytes as f64 / self.start.elapsed().as_secs_f64()), + convert_byte_size(self.bytes as _), + ); + Ok(generate_data_block) } } @@ -195,7 +228,7 @@ impl AccumulatingTransform for FinalSingleStateAggregator { fn transform(&mut self, block: DataBlock) -> Result> { if !block.is_empty() { - let block = block.convert_to_full(); + let block = block.consume_convert_to_full(); for (index, _) in self.funcs.iter().enumerate() { let binary_array = block.get_by_offset(index).value.as_column().unwrap(); diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs index fb5f9a649b35..c478a19f5ebf 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/transform_hash_join_probe.rs @@ -352,7 +352,7 @@ impl Processor for TransformHashJoinProbe { { self.probe_hash_table(data_block)?; } else if let Some(data_block) = self.input_data_blocks.pop_front() { - let data_block = data_block.convert_to_full(); + let data_block = data_block.consume_convert_to_full(); self.probe_hash_table(data_block)?; } } diff --git a/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs b/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs index 21371abcec8e..a168b43cacb1 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs @@ -1005,7 +1005,7 @@ where T: Number + ResultTypeOfUnary let num_rows = data.num_rows(); if num_rows != 0 { self.blocks.push_back(WindowBlock { - block: data.convert_to_full(), + block: data.consume_convert_to_full(), builder: ColumnBuilder::with_capacity(&self.func.return_type()?, num_rows), }); } diff --git a/src/query/service/src/servers/mysql/writers/query_result_writer.rs b/src/query/service/src/servers/mysql/writers/query_result_writer.rs index 6beefcb0525e..d0251eefa50a 100644 --- a/src/query/service/src/servers/mysql/writers/query_result_writer.rs +++ b/src/query/service/src/servers/mysql/writers/query_result_writer.rs @@ -242,7 +242,7 @@ impl<'a, W: AsyncWrite + Send + Unpin> DFQueryResultWriter<'a, W> { let mut buf = Vec::::new(); let columns = block - .convert_to_full() + .consume_convert_to_full() .columns() .iter() .map(|column| column.value.clone().into_column().unwrap()) diff --git a/src/query/service/src/sessions/query_ctx.rs b/src/query/service/src/sessions/query_ctx.rs index 5ef082ffdcca..c3987e5bea23 100644 --- a/src/query/service/src/sessions/query_ctx.rs +++ b/src/query/service/src/sessions/query_ctx.rs @@ -1309,6 +1309,7 @@ impl TableContext for QueryContext { duplicated_files_detected: vec![], is_select: true, default_values: None, + copy_into_location_options: Default::default(), }; OrcTable::try_create(info).await } @@ -1325,6 +1326,7 @@ impl TableContext for QueryContext { duplicated_files_detected: vec![], is_select: true, default_values: None, + copy_into_location_options: Default::default(), }; StageTable::try_create(info) } @@ -1359,6 +1361,7 @@ 
impl TableContext for QueryContext { duplicated_files_detected: vec![], is_select: true, default_values: None, + copy_into_location_options: Default::default(), }; StageTable::try_create(info) } diff --git a/src/query/service/src/test_kits/cluster.rs b/src/query/service/src/test_kits/cluster.rs index 450cbe8c2c30..6cfff0852e21 100644 --- a/src/query/service/src/test_kits/cluster.rs +++ b/src/query/service/src/test_kits/cluster.rs @@ -38,6 +38,7 @@ impl ClusterDescriptor { id.into(), "".to_string(), 0, + "".to_string(), addr.into(), "".to_string(), DATABEND_COMMIT_VERSION.to_string(), diff --git a/src/query/service/tests/it/parquet_rs/data.rs b/src/query/service/tests/it/parquet_rs/data.rs index 22e1fa57a535..28d3f8354aef 100644 --- a/src/query/service/tests/it/parquet_rs/data.rs +++ b/src/query/service/tests/it/parquet_rs/data.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::os::unix::fs::PermissionsExt; use std::sync::Arc; use arrow_array::Array; @@ -36,6 +37,7 @@ use chrono::Duration; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; +use tokio::fs::create_dir_all; // Test cases from apache/arrow-datafusion @@ -336,10 +338,13 @@ fn create_data_batch(scenario: Scenario) -> Vec { /// Create a test parquet file with various data types pub async fn make_test_file_rg(scenario: Scenario) -> (NamedTempFile, SchemaRef) { + let dir = std::env::temp_dir().join("parquets_rg"); + create_dir_all(&dir).await.unwrap(); let mut output_file = tempfile::Builder::new() .prefix("parquet_pruning") .suffix(".parquet") - .tempfile() + .permissions(std::fs::Permissions::from_mode(0o666)) + .tempfile_in(dir) .expect("tempfile creation"); let props = WriterProperties::builder() @@ -362,10 +367,13 @@ pub async fn make_test_file_rg(scenario: Scenario) -> (NamedTempFile, SchemaRef) } pub async fn make_test_file_page(scenario: Scenario) -> (NamedTempFile, SchemaRef) { + let dir = std::env::temp_dir().join("parquets_page"); + create_dir_all(&dir).await.unwrap(); let mut output_file = tempfile::Builder::new() .prefix("parquet_page_pruning") .suffix(".parquet") - .tempfile() + .permissions(std::fs::Permissions::from_mode(0o666)) + .tempfile_in(dir) .expect("tempfile creation"); // set row count to 5, should get same result as rowGroup diff --git a/src/query/service/tests/it/parquet_rs/prune_pages.rs b/src/query/service/tests/it/parquet_rs/prune_pages.rs index 464dcab5cf95..41f2f1cb6933 100644 --- a/src/query/service/tests/it/parquet_rs/prune_pages.rs +++ b/src/query/service/tests/it/parquet_rs/prune_pages.rs @@ -29,424 +29,334 @@ use crate::parquet_rs::data::Scenario; use crate::parquet_rs::utils::create_parquet_test_fixture; use crate::parquet_rs::utils::get_data_source_plan; -async fn test(scenario: Scenario, predicate: &str, expected_selection: RowSelection) { - let (file, arrow_schema) = make_test_file_page(scenario).await; - let file_path = file.path().to_string_lossy(); - let sql = format!("select * from 'fs://{file_path}' where {predicate}"); - +async fn test_batch(batches: &[(Scenario, &str, RowSelection)]) { let fixture = create_parquet_test_fixture().await; - let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) - .await + for (scenario, predicate, expected_selection) in batches { + let (file, arrow_schema) = make_test_file_page(*scenario).await; + let file_path = file.path().to_string_lossy(); + let sql = format!("select * from 
'fs://{file_path}' where {predicate}"); + + let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) + .await + .unwrap(); + let metadata = ArrowReaderMetadata::load( + file.as_file(), + ArrowReaderOptions::new() + .with_page_index(true) + .with_skip_arrow_metadata(true), + ) .unwrap(); - let metadata = ArrowReaderMetadata::load( - file.as_file(), - ArrowReaderOptions::new() - .with_page_index(true) - .with_skip_arrow_metadata(true), - ) - .unwrap(); - let parquet_meta = metadata.metadata(); - let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); - let leaf_fields = Arc::new(schema.leaf_fields()); - - let pruner = ParquetRSPruner::try_create( - FunctionContext::default(), - Arc::new(schema), - leaf_fields, - &plan.push_downs, - ParquetReadOptions::default() - .with_prune_row_groups(false) - .with_prune_pages(true), - vec![], - ) - .unwrap(); - - let row_groups = (0..parquet_meta.num_row_groups()).collect::>(); - let selection = pruner - .prune_pages(parquet_meta, &row_groups, None) - .unwrap() + let parquet_meta = metadata.metadata(); + let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); + let leaf_fields = Arc::new(schema.leaf_fields()); + + let pruner = ParquetRSPruner::try_create( + FunctionContext::default(), + Arc::new(schema), + leaf_fields, + &plan.push_downs, + ParquetReadOptions::default() + .with_prune_row_groups(false) + .with_prune_pages(true), + vec![], + ) .unwrap(); - assert_eq!( - expected_selection, selection, - "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", - expected_selection, selection, scenario, predicate - ); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 1 2020-01-01T01:01:01.000000 2020-01-02T01:01:01.000000 -// page-1 1 2020-01-01T01:01:11.000000 2020-01-02T01:01:11.000000 -// page-2 1 2020-01-01T01:11:01.000000 2020-01-02T01:11:01.000000 -// page-3 1 2020-01-11T01:01:01.000000 2020-01-12T01:01:01.000000 -async fn test_timestamp() { - test( - Scenario::Timestamp, - "micros < to_timestamp('2020-01-02 01:01:11Z')", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 1 2020-01-01 2020-01-04 -// page-1 1 2020-01-11 2020-01-14 -// page-2 1 2020-10-27 2020-10-30 -// page-3 1 2029-11-09 2029-11-12 -async fn test_date() { - test( - Scenario::Date, - "date32 < to_date('2020-01-02')", - RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(15)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5 -1 -// page-1 0 -4 0 -// page-2 0 0 4 -// page-3 0 5 9 -async fn test_int32_lt() { - test( - Scenario::Int32, - "i < 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - test( - Scenario::Int32, - "-i > -1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_gt() { - test( - Scenario::Int32, - "i > 8", - RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), - ) - .await; - - test( - Scenario::Int32, - "-i < -8", - RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), - ) - .await; + let row_groups = (0..parquet_meta.num_row_groups()).collect::>(); + let 
selection = pruner + .prune_pages(parquet_meta, &row_groups, None) + .unwrap() + .unwrap(); + + let expected_selection = expected_selection.clone(); + assert_eq!( + expected_selection, selection, + "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", + expected_selection, selection, scenario, predicate + ); + } } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq() { - test( - Scenario::Int32, - "i = 1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun_and_eq() { - test( - Scenario::Int32, - "abs(i) = 1 and i = 1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun() { - test( - Scenario::Int32, - "abs(i) = 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr() { - test( - Scenario::Int32, - "i+1 = 1", - RowSelection::from(vec![ - RowSelector::skip(5), - RowSelector::select(10), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr_subtract() { - test( - Scenario::Int32, - "1-i > 1", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5.0 -1.0 -// page-1 0 -4.0 0.0 -// page-2 0 0.0 4.0 -// page-3 0 5.0 9.0 -async fn test_f64_lt() { - test( - Scenario::Float64, - "f < 1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Float64, - "-f > -1", - RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), - ) - .await; +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +async fn test_basic() { + let test_cases = vec![ + // Timestamp tests + // null count min max + // page-0 1 2020-01-01T01:01:01.000000 2020-01-02T01:01:01.000000 + // page-1 1 2020-01-01T01:01:11.000000 2020-01-02T01:01:11.000000 + // page-2 1 2020-01-01T01:11:01.000000 2020-01-02T01:11:01.000000 + // page-3 1 2020-01-11T01:01:01.000000 2020-01-12T01:01:01.000000 + ( + Scenario::Timestamp, + "micros < to_timestamp('2020-01-02 01:01:11Z')", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + // Date tests + // null count min max + // page-0 1 2020-01-01 2020-01-04 + // page-1 1 2020-01-11 2020-01-14 + // page-2 1 2020-10-27 2020-10-30 + // page-3 1 2029-11-09 2029-11-12 + ( + Scenario::Date, + "date32 < to_date('2020-01-02')", + RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(15)]), + ), + // Int32 tests + // null count min max + // page-0 0 -5 -1 + // page-1 0 -4 0 + // page-2 0 0 4 + // page-3 0 5 9 + ( + Scenario::Int32, + "i < 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + ( + Scenario::Int32, + "-i > -1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Int32, + "i > 8", + RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), + ), + ( + Scenario::Int32, + "-i < 
-8", + RowSelection::from(vec![RowSelector::skip(15), RowSelector::select(5)]), + ), + ( + Scenario::Int32, + "i = 1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "abs(i) = 1 and i = 1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "abs(i) = 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Int32, + "i+1 = 1", + RowSelection::from(vec![ + RowSelector::skip(5), + RowSelector::select(10), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "1-i > 1", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), + ), + // Float64 tests + // null count min max + // page-0 0 -5.0 -1.0 + // page-1 0 -4.0 0.0 + // page-2 0 0.0 4.0 + // page-3 0 5.0 9.0 + ( + Scenario::Float64, + "f < 1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Float64, + "-f > -1", + RowSelection::from(vec![RowSelector::select(15), RowSelector::skip(5)]), + ), + ( + Scenario::Float64, + "abs(f - 1) <= 0.000001 and f >= 0.1", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Float64, + "abs(f-1) <= 0.000001", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Float64, + "f+1 > 1.1", + RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(10)]), + ), + ( + Scenario::Float64, + "1-f > 1", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), + ), + // Int32 in list tests + // null count min max + // page-0 0 -5 -1 + // page-1 0 -4 0 + // page-2 0 0 4 + // page-3 0 5 9 + ( + Scenario::Int32, + "i in (1)", + RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::Int32, + "i in (100)", + RowSelection::from(vec![RowSelector::skip(20)]), + ), + ( + Scenario::Int32, + "i not in (1)", + RowSelection::from(vec![RowSelector::select(20)]), + ), + // Decimal tests + // The data type of decimal_col is decimal(9,2) + // There are three pages each 5 rows: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + ( + Scenario::Decimal, + "decimal_col < 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // compare with the casted decimal value + ( + Scenario::Decimal, + "decimal_col < cast(4.55 as decimal(20,2))", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(38,2) + ( + Scenario::DecimalLargePrecision, + "decimal_col < 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // compare with the casted decimal value + ( + Scenario::DecimalLargePrecision, + "decimal_col < cast(4.55 as decimal(20,2))", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(9,2) + // There are three pages: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + ( + Scenario::Decimal, + "decimal_col = 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::Decimal, + "decimal_col = 4.00", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + // The data type of decimal_col is decimal(38,2) + ( + Scenario::DecimalLargePrecision, + 
"decimal_col = 4", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col = 4.00", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col = 30.00", + RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(5)]), + ), + ]; + + test_batch(&test_cases).await; } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun_and_gt() { - test( - Scenario::Float64, - "abs(f - 1) <= 0.000001 and f >= 0.1", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun() { - test( - Scenario::Float64, - "abs(f-1) <= 0.000001", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr() { - test( - Scenario::Float64, - "f+1 > 1.1", - RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr_subtract() { - test( - Scenario::Float64, - "1-f > 1", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(10)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -// null count min max -// page-0 0 -5 -1 -// page-1 0 -4 0 -// page-2 0 0 4 -// page-3 0 5 9 -async fn test_int32_eq_in_list() { - test( - Scenario::Int32, - "i in (1)", - RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_2() { - test( - Scenario::Int32, - "i in (100)", - RowSelection::from(vec![RowSelector::skip(20)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_negated() { - test( - Scenario::Int32, - "i not in (1)", - RowSelection::from(vec![RowSelector::select(20)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_lt() { - // The data type of decimal_col is decimal(9,2) - // There are three pages each 5 rows: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col < 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - // compare with the casted decimal value - test( - Scenario::Decimal, - "decimal_col < cast(4.55 as decimal(20,2))", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col < 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - // compare with the casted decimal value - test( - Scenario::DecimalLargePrecision, - "decimal_col < cast(4.55 as decimal(20,2))", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_eq() { - // The data type of decimal_col is decimal(9,2) - // There are three pages: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col = 4", - 
RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Decimal, - "decimal_col = 4.00", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col = 4", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col = 4.00", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col = 30.00", - RowSelection::from(vec![RowSelector::skip(10), RowSelector::select(5)]), - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_decimal_in_list() { // The data type of decimal_col is decimal(9,2) // There are three pages: // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col in (4,3,123456789123)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::Decimal, - "decimal_col in (4.00,3.00,11.2345)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4,3,123456789123)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4.00,3.00,11.2345,1)", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; + let cases = vec![ + ( + Scenario::Decimal, + "decimal_col in (4,3,123456789123)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::Decimal, + "decimal_col in (4.00,3.00,11.2345)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4,3,123456789123)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4.00,3.00,11.2345,1)", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ]; + + test_batch(&cases).await; } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_periods_in_column_names() { - // There are three row groups for "service.name", each with 5 rows = 15 rows total - // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], - // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], - // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], - test( - Scenario::PeriodsInColumnNames, - // use double quotes to use column named "service.name" - "\"service.name\" = 'frontend'", - RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "name <> 'HTTP GET / DISPATCH'", - RowSelection::from(vec![ - RowSelector::skip(5), - RowSelector::select(5), - RowSelector::skip(5), - ]), - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", - RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(10)]), - ) - .await; + let test_cases = vec![ + // Tests for periods in 
column names + // There are three row groups for "service.name", each with 5 rows = 15 rows total + // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], + // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], + // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], + ( + Scenario::PeriodsInColumnNames, + // use double quotes to use column named "service.name" + "\"service.name\" = 'frontend'", + RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(5)]), + ), + ( + Scenario::PeriodsInColumnNames, + "name <> 'HTTP GET / DISPATCH'", + RowSelection::from(vec![ + RowSelector::skip(5), + RowSelector::select(5), + RowSelector::skip(5), + ]), + ), + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", + RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(10)]), + ), + ]; + + test_batch(&test_cases).await; } diff --git a/src/query/service/tests/it/parquet_rs/prune_row_groups.rs b/src/query/service/tests/it/parquet_rs/prune_row_groups.rs index a0f18b4c8c61..28edf445e1af 100644 --- a/src/query/service/tests/it/parquet_rs/prune_row_groups.rs +++ b/src/query/service/tests/it/parquet_rs/prune_row_groups.rs @@ -26,262 +26,177 @@ use super::utils::get_data_source_plan; use crate::parquet_rs::utils::create_parquet_test_fixture; /// Enable row groups pruning and test. -async fn test(scenario: Scenario, predicate: &str, expected_rgs: Vec) { - test_impl(scenario, predicate, expected_rgs, true).await +async fn test_batch(args: &[(Scenario, &str, Vec)]) { + test_impl_batch(args, true).await } // Disable row groups pruning and test. -async fn test_without_prune(scenario: Scenario, predicate: &str, expected_rgs: Vec) { - test_impl(scenario, predicate, expected_rgs, false).await +async fn test_batch_without_prune(args: &[(Scenario, &str, Vec)]) { + test_impl_batch(args, false).await } -async fn test_impl(scenario: Scenario, predicate: &str, expected_rgs: Vec, prune: bool) { - let (file, arrow_schema) = make_test_file_rg(scenario).await; - let file_path = file.path().to_string_lossy(); - let sql = format!("select * from 'fs://{file_path}' where {predicate}"); - +async fn test_impl_batch(args: &[(Scenario, &str, Vec)], prune: bool) { let fixture = create_parquet_test_fixture().await; - let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) - .await - .unwrap(); - let parquet_meta = parquet::file::footer::parse_metadata(file.as_file()).unwrap(); - let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); - let leaf_fields = Arc::new(schema.leaf_fields()); - - let pruner = ParquetRSPruner::try_create( - FunctionContext::default(), - Arc::new(schema), - leaf_fields, - &plan.push_downs, - ParquetReadOptions::default() - .with_prune_row_groups(prune) - .with_prune_pages(false), - vec![], - ) - .unwrap(); - - let (rgs, _) = pruner.prune_row_groups(&parquet_meta, None, None).unwrap(); - assert_eq!( - expected_rgs, rgs, - "Expected {:?}, got {:?}. 
Scenario: {:?}, predicate: {}", - expected_rgs, rgs, scenario, predicate - ); -} + for (scenario, predicate, expected_rgs) in args { + let (file, arrow_schema) = make_test_file_rg(*scenario).await; + let file_path = file.path().to_string_lossy(); + let sql = format!("select * from 'fs://{file_path}' where {predicate}"); + + let plan = get_data_source_plan(fixture.new_query_ctx().await.unwrap(), &sql) + .await + .unwrap(); + let parquet_meta = parquet::file::footer::parse_metadata(file.as_file()).unwrap(); + let schema = TableSchema::try_from(arrow_schema.as_ref()).unwrap(); + let leaf_fields = Arc::new(schema.leaf_fields()); + + let pruner = ParquetRSPruner::try_create( + FunctionContext::default(), + Arc::new(schema), + leaf_fields, + &plan.push_downs, + ParquetReadOptions::default() + .with_prune_row_groups(prune) + .with_prune_pages(false), + vec![], + ) + .unwrap(); -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_timestamp() { - test( - Scenario::Timestamp, - "micros < to_timestamp('2020-01-02 01:01:11Z')", - vec![0, 1, 2], - ) - .await; -} + let (rgs, _) = pruner.prune_row_groups(&parquet_meta, None, None).unwrap(); -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_date() { - test(Scenario::Date, "date32 < to_date('2020-01-02')", vec![0]).await; + assert_eq!( + expected_rgs.to_vec(), + rgs, + "Expected {:?}, got {:?}. Scenario: {:?}, predicate: {}", + expected_rgs, + rgs, + scenario, + predicate + ); + } } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_disabled() { - test( + test_batch(&[( Scenario::Timestamp, "micros < to_timestamp('2020-01-02 01:01:11Z')", vec![0, 1, 2], - ) + )]) .await; - - test_without_prune( + test_batch_without_prune(&[( Scenario::Timestamp, "micros < to_timestamp('2020-01-02 01:01:11Z')", vec![0, 1, 2, 3], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_lt() { - test(Scenario::Int32, "i < 1", vec![0, 1, 2]).await; - // result of sql "SELECT * FROM t where i < 1" is same as - // "SELECT * FROM t where -i > -1" - test(Scenario::Int32, " -i > -1", vec![0, 1, 2]).await -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq() { - test(Scenario::Int32, "i = 1", vec![2]).await; -} -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun_and_eq() { - test(Scenario::Int32, "abs(i) = 1 and i = 1", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_scalar_fun() { - test(Scenario::Int32, "abs(i) = 1", vec![0, 1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr() { - test(Scenario::Int32, "i+1 = 1", vec![1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_complex_expr_subtract() { - test(Scenario::Int32, "1-i > 1", vec![0, 1]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_lt() { - test(Scenario::Float64, "f < 1", vec![0, 1, 2]).await; - test(Scenario::Float64, "-f > -1", vec![0, 1, 2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun_and_gt() { - test( - Scenario::Float64, - "abs(f - 1) <= 0.000001 and f >= 0.1", - vec![2], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_scalar_fun() { - 
test(Scenario::Float64, "abs(f-1) <= 0.000001", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr() { - test(Scenario::Float64, "f+1 > 1.1", vec![2, 3]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_f64_complex_expr_subtract() { - test(Scenario::Float64, "1-f > 1", vec![0, 1]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list() { - test(Scenario::Int32, "i in (1)", vec![2]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_2() { - test(Scenario::Int32, "i in (1000)", vec![]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_int32_eq_in_list_negated() { - test(Scenario::Int32, "i not in (1)", vec![0, 1, 2, 3]).await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_lt() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test(Scenario::Decimal, "decimal_col < 4", vec![0, 1]).await; - // compare with the casted decimal value - test( - Scenario::Decimal, - "decimal_col < cast(4.55 as decimal(20,2))", - vec![0, 1], - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test(Scenario::DecimalLargePrecision, "decimal_col < 4", vec![ - 0, 1, - ]) - .await; - // compare with the casted decimal value - test( - Scenario::DecimalLargePrecision, - "decimal_col < cast(4.55 as decimal(20,2))", - vec![0, 1], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_eq() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test(Scenario::Decimal, "decimal_col = 4", vec![0, 1]).await; - test(Scenario::Decimal, "decimal_col = 4.00", vec![0, 1]).await; - - // The data type of decimal_col is decimal(38,2) - test(Scenario::DecimalLargePrecision, "decimal_col = 4", vec![ - 0, 1, - ]) - .await; - test(Scenario::DecimalLargePrecision, "decimal_col = 4.00", vec![ - 0, 1, - ]) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_decimal_in_list() { - // The data type of decimal_col is decimal(9,2) - // There are three row groups: - // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] - test( - Scenario::Decimal, - "decimal_col in (4,3,123456789123)", - vec![0, 1], - ) - .await; - test( - Scenario::Decimal, - "decimal_col in (4.00,3.00,11.2345)", - vec![0, 1], - ) - .await; - - // The data type of decimal_col is decimal(38,2) - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4,3,123456789123)", - vec![0, 1], - ) - .await; - test( - Scenario::DecimalLargePrecision, - "decimal_col in (4.00,3.00,11.2345)", - vec![0, 1], - ) - .await; -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn test_periods_in_column_names() { - // There are three row groups for "service.name", each with 5 rows = 15 rows total - // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], - // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], - // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], - test( - Scenario::PeriodsInColumnNames, + )]) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +async fn test_various_rg_scenarios() { + let test_cases = vec![ + ( + Scenario::Timestamp, + "micros < 
to_timestamp('2020-01-02 01:01:11Z')", + vec![0, 1, 2], + ), + // Date scenario + (Scenario::Date, "date32 < to_date('2020-01-02')", vec![0]), + // Int32 scenarios + (Scenario::Int32, "i < 1", vec![0, 1, 2]), + // result of sql "SELECT * FROM t where i < 1" is same as + // "SELECT * FROM t where -i > -1" + (Scenario::Int32, " -i > -1", vec![0, 1, 2]), + (Scenario::Int32, "i = 1", vec![2]), + (Scenario::Int32, "abs(i) = 1 and i = 1", vec![2]), + (Scenario::Int32, "abs(i) = 1", vec![0, 1, 2]), + (Scenario::Int32, "i+1 = 1", vec![1, 2]), + (Scenario::Int32, "1-i > 1", vec![0, 1]), + (Scenario::Int32, "i in (1)", vec![2]), + (Scenario::Int32, "i in (1000)", vec![]), + (Scenario::Int32, "i not in (1)", vec![0, 1, 2, 3]), + // Float64 scenarios + (Scenario::Float64, "f < 1", vec![0, 1, 2]), + (Scenario::Float64, "-f > -1", vec![0, 1, 2]), + ( + Scenario::Float64, + "abs(f - 1) <= 0.000001 and f >= 0.1", + vec![2], + ), + (Scenario::Float64, "abs(f-1) <= 0.000001", vec![2]), + (Scenario::Float64, "f+1 > 1.1", vec![2, 3]), + (Scenario::Float64, "1-f > 1", vec![0, 1]), + // Decimal scenarios + // The data type of decimal_col is decimal(9,2) + // There are three row groups: + // [1.00, 6.00], [-5.00,6.00], [20.00,60.00] + (Scenario::Decimal, "decimal_col < 4", vec![0, 1]), + // compare with the casted decimal value + ( + Scenario::Decimal, + "decimal_col < cast(4.55 as decimal(20,2))", + vec![0, 1], + ), + (Scenario::Decimal, "decimal_col = 4", vec![0, 1]), + (Scenario::Decimal, "decimal_col = 4.00", vec![0, 1]), + ( + Scenario::Decimal, + "decimal_col in (4,3,123456789123)", + vec![0, 1], + ), + ( + Scenario::Decimal, + "decimal_col in (4.00,3.00,11.2345)", + vec![0, 1], + ), + // DecimalLargePrecision scenarios + // The data type of decimal_col is decimal(38,2) + (Scenario::DecimalLargePrecision, "decimal_col < 4", vec![ + 0, 1, + ]), + ( + Scenario::DecimalLargePrecision, + "decimal_col < cast(4.55 as decimal(20,2))", + vec![0, 1], + ), + (Scenario::DecimalLargePrecision, "decimal_col = 4", vec![ + 0, 1, + ]), + (Scenario::DecimalLargePrecision, "decimal_col = 4.00", vec![ + 0, 1, + ]), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4,3,123456789123)", + vec![0, 1], + ), + ( + Scenario::DecimalLargePrecision, + "decimal_col in (4.00,3.00,11.2345)", + vec![0, 1], + ), + // PeriodsInColumnNames scenarios + // There are three row groups for "service.name", each with 5 rows = 15 rows total + // name = "HTTP GET / DISPATCH", service.name = ['frontend', 'frontend'], + // name = "HTTP PUT / DISPATCH", service.name = ['backend', 'frontend'], + // name = "HTTP GET / DISPATCH", service.name = ['backend', 'backend' ], // use double quotes to use column named "service.name" - "\"service.name\" = 'frontend'", - vec![0, 1], - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "name <> 'HTTP GET / DISPATCH'", - vec![1], - ) - .await; - test( - Scenario::PeriodsInColumnNames, - "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", - vec![0], - ) - .await; + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend'", + vec![0, 1], + ), + ( + Scenario::PeriodsInColumnNames, + "name <> 'HTTP GET / DISPATCH'", + vec![1], + ), + ( + Scenario::PeriodsInColumnNames, + "\"service.name\" = 'frontend' AND name = 'HTTP GET / DISPATCH'", + vec![0], + ), + ]; + + test_batch(&test_cases).await; } diff --git a/src/query/service/tests/it/storages/fuse/operations/internal_column.rs b/src/query/service/tests/it/storages/fuse/operations/internal_column.rs index 5527734d4cb9..a759e15d1eb0 
100644 --- a/src/query/service/tests/it/storages/fuse/operations/internal_column.rs +++ b/src/query/service/tests/it/storages/fuse/operations/internal_column.rs @@ -71,8 +71,8 @@ fn expected_data_block( } fn check_data_block(expected: Vec, blocks: Vec) -> Result<()> { - let expected_data_block = DataBlock::concat(&expected)?.convert_to_full(); - let data_block = DataBlock::concat(&blocks)?.convert_to_full(); + let expected_data_block = DataBlock::concat(&expected)?.consume_convert_to_full(); + let data_block = DataBlock::concat(&blocks)?.consume_convert_to_full(); for (expected_column, column) in expected_data_block .columns() diff --git a/src/query/service/tests/it/storages/testdata/caches_table.txt b/src/query/service/tests/it/storages/testdata/caches_table.txt index 373568235de0..11ca7da3b0c9 100644 --- a/src/query/service/tests/it/storages/testdata/caches_table.txt +++ b/src/query/service/tests/it/storages/testdata/caches_table.txt @@ -9,7 +9,7 @@ DB.Table: 'system'.'caches', Table: caches-table_id:1, ver:0, Engine: SystemCach | 'test-node' | 'memory_cache_compact_segment_info' | 0 | 0 | 1073741824 | 'bytes' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_inverted_index_file' | 0 | 0 | 2147483648 | 'bytes' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_inverted_index_file_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | -| 'test-node' | 'memory_cache_parquet_file_meta' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | +| 'test-node' | 'memory_cache_parquet_meta_data' | 0 | 0 | 3000 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_prune_partitions' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_table_snapshot' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | | 'test-node' | 'memory_cache_table_statistics' | 0 | 0 | 256 | 'count' | 0 | 0 | 0 | diff --git a/src/query/service/tests/it/storages/testdata/columns_table.txt b/src/query/service/tests/it/storages/testdata/columns_table.txt index 614f58070a07..df5879922802 100644 --- a/src/query/service/tests/it/storages/testdata/columns_table.txt +++ b/src/query/service/tests/it/storages/testdata/columns_table.txt @@ -15,6 +15,8 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'arguments' | 'system' | 'procedures' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'arguments' | 'system' | 'user_functions' | 'Variant' | 'VARIANT' | '' | '' | 'NO' | '' | | 'attempt_number' | 'system' | 'task_history' | 'Int32' | 'INT' | '' | '' | 'NO' | '' | +| 'attribute_names' | 'system' | 'dictionaries' | 'Array(String)' | 'ARRAY(STRING)' | '' | '' | 'NO' | '' | +| 'attribute_types' | 'system' | 'dictionaries' | 'Array(String)' | 'ARRAY(STRING)' | '' | '' | 'NO' | '' | | 'auth_type' | 'system' | 'users' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'auto_increment' | 'information_schema' | 'tables' | 'NULL' | 'NULL' | '' | '' | 'NO' | '' | | 'byte_size' | 'system' | 'clustering_history' | 'UInt64' | 'BIGINT UNSIGNED' | '' | '' | 'NO' | '' | @@ -59,6 +61,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'command' | 'system' | 'processes' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'comment' | 'information_schema' | 'statistics' | 'NULL' | 'NULL' | '' | '' | 'NO' | '' | | 'comment' | 'system' | 'columns' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | +| 'comment' | 'system' | 'dictionaries' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'comment' | 'system' | 'notifications' | 'Nullable(String)' | 'VARCHAR' | '' | '' | 'YES' | '' | | 'comment' | 'system' | 'password_policies' | 
'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'comment' | 'system' | 'procedures' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | @@ -82,6 +85,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'create_time' | 'information_schema' | 'tables' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'created_on' | 'system' | 'background_jobs' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'created_on' | 'system' | 'background_tasks' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | +| 'created_on' | 'system' | 'dictionaries' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'created_on' | 'system' | 'indexes' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'created_on' | 'system' | 'locks' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'created_on' | 'system' | 'notification_history' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | @@ -116,6 +120,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'data_write_bytes' | 'system' | 'processes' | 'UInt64' | 'BIGINT UNSIGNED' | '' | '' | 'NO' | '' | | 'database' | 'system' | 'clustering_history' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'database' | 'system' | 'columns' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | +| 'database' | 'system' | 'dictionaries' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'database' | 'system' | 'processes' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'database' | 'system' | 'streams' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'database' | 'system' | 'streams_terse' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | @@ -231,6 +236,8 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'job_type' | 'system' | 'background_jobs' | 'Nullable(String)' | 'VARCHAR' | '' | '' | 'YES' | '' | | 'join_spilled_bytes' | 'system' | 'query_log' | 'UInt64' | 'BIGINT UNSIGNED' | '' | '' | 'NO' | '' | | 'join_spilled_rows' | 'system' | 'query_log' | 'UInt64' | 'BIGINT UNSIGNED' | '' | '' | 'NO' | '' | +| 'key_names' | 'system' | 'dictionaries' | 'Array(String)' | 'ARRAY(STRING)' | '' | '' | 'NO' | '' | +| 'key_types' | 'system' | 'dictionaries' | 'Array(String)' | 'ARRAY(STRING)' | '' | '' | 'NO' | '' | | 'keywords' | 'information_schema' | 'keywords' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'kind' | 'system' | 'metrics' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'labels' | 'system' | 'metrics' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | @@ -266,6 +273,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'name' | 'system' | 'contributors' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'name' | 'system' | 'credits' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'name' | 'system' | 'databases' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | +| 'name' | 'system' | 'dictionaries' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'name' | 'system' | 'functions' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'name' | 'system' | 'indexes' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'name' | 'system' | 'malloc_stats_totals' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | @@ -386,6 +394,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'session_settings' | 'system' | 'query_log' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'size' | 'system' | 'caches' | 'UInt64' | 'BIGINT UNSIGNED' | '' | '' | 'NO' | '' | | 'snapshot_location' | 'system' | 'streams' | 'Nullable(String)' | 'VARCHAR' | '' | '' | 
'YES' | '' | +| 'source' | 'system' | 'dictionaries' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'sql' | 'system' | 'query_cache' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | | 'sql_path' | 'information_schema' | 'schemata' | 'NULL' | 'NULL' | '' | '' | 'NO' | '' | | 'sql_user' | 'system' | 'query_log' | 'String' | 'VARCHAR' | '' | '' | 'NO' | '' | @@ -465,6 +474,7 @@ DB.Table: 'system'.'columns', Table: columns-table_id:1, ver:0, Engine: SystemCo | 'update_on' | 'system' | 'roles' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'update_on' | 'system' | 'users' | 'Nullable(Timestamp)' | 'TIMESTAMP' | '' | '' | 'YES' | '' | | 'updated_on' | 'system' | 'background_tasks' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | +| 'updated_on' | 'system' | 'dictionaries' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | | 'updated_on' | 'system' | 'indexes' | 'Nullable(Timestamp)' | 'TIMESTAMP' | '' | '' | 'YES' | '' | | 'updated_on' | 'system' | 'password_policies' | 'Nullable(Timestamp)' | 'TIMESTAMP' | '' | '' | 'YES' | '' | | 'updated_on' | 'system' | 'streams' | 'Timestamp' | 'TIMESTAMP' | '' | '' | 'NO' | '' | diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 260f72e66e38..88e72e9db513 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -117,7 +117,7 @@ impl DefaultSettings { Ok(Arc::clone(DEFAULT_SETTINGS.get_or_try_init(|| -> Result> { let num_cpus = Self::num_cpus(); let max_memory_usage = Self::max_memory_usage()?; - let recluster_block_size = Self::recluster_block_size()?; + let recluster_block_size = Self::recluster_block_size(max_memory_usage); let default_max_spill_io_requests = Self::spill_io_requests(num_cpus); let default_max_storage_io_requests = Self::storage_io_requests(num_cpus); let data_retention_time_in_days_max = Self::data_retention_time_in_days_max(); @@ -1022,12 +1022,10 @@ impl DefaultSettings { }) } - fn recluster_block_size() -> Result { - let max_memory_usage = Self::max_memory_usage()?; + fn recluster_block_size(max_memory_usage: u64) -> u64 { // The sort merge consumes more than twice as much memory, // so the block size is set relatively conservatively here. - let recluster_block_size = max_memory_usage * 32 / 100; - Ok(recluster_block_size) + std::cmp::min(max_memory_usage * 30 / 100, 80 * 1024 * 1024 * 1024) } /// Converts and validates a setting value based on its key. 
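The settings hunk above drops the fallible recluster_block_size() helper in favor of a pure function of the already-computed memory budget, and it adds an absolute ceiling. A minimal standalone sketch of the new derivation, with illustrative names (the real logic sits inside DefaultSettings; the 30% factor and the 80 GiB cap are taken directly from the hunk):

    fn recluster_block_size(max_memory_usage: u64) -> u64 {
        // Sort-merge can consume more than twice the block size in memory,
        // so take a conservative 30% of the memory budget...
        let conservative = max_memory_usage * 30 / 100;
        // ...and never let the default exceed 80 GiB on very large nodes.
        conservative.min(80 * 1024 * 1024 * 1024)
    }

    fn main() {
        let gib: u64 = 1024 * 1024 * 1024;
        // A 16 GiB budget yields roughly 4.8 GiB, far below the cap.
        assert!(recluster_block_size(16 * gib) < 80 * gib);
        // The cap only bites once the budget exceeds 80 GiB / 0.30, about 266 GiB.
        assert_eq!(recluster_block_size(400 * gib), 80 * gib);
    }

Compared with the removed version (32% of the budget, uncapped), the new default is slightly smaller and bounded, so the recluster block size no longer grows without limit on high-memory nodes.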
diff --git a/src/query/sql/src/executor/physical_plans/physical_table_scan.rs b/src/query/sql/src/executor/physical_plans/physical_table_scan.rs index 6e5d4e1fd19c..7045b592f0ed 100644 --- a/src/query/sql/src/executor/physical_plans/physical_table_scan.rs +++ b/src/query/sql/src/executor/physical_plans/physical_table_scan.rs @@ -17,7 +17,6 @@ use std::collections::BTreeMap; use std::collections::HashSet; use std::sync::Arc; -use databend_common_ast::ast::SampleLevel; use databend_common_catalog::catalog::CatalogManager; use databend_common_catalog::plan::DataSourcePlan; use databend_common_catalog::plan::Filters; @@ -241,23 +240,24 @@ impl PhysicalPlanBuilder { if let Some(sample) = scan.sample && !table.use_own_sample_block() { - match sample.sample_level { - SampleLevel::ROW => {} - SampleLevel::BLOCK => { - let probability = sample.sample_probability(None); - if let Some(probability) = probability { - let original_parts = source.parts.partitions.len(); - let mut sample_parts = Vec::with_capacity(original_parts); - let mut rng = thread_rng(); - let bernoulli = Bernoulli::new(probability).unwrap(); - for part in source.parts.partitions.iter() { - if bernoulli.sample(&mut rng) { - sample_parts.push(part.clone()); - } - } - source.parts.partitions = sample_parts; + if let Some(block_sample_value) = sample.block_level { + if block_sample_value > 100.0 { + return Err(ErrorCode::SyntaxException(format!( + "Sample value should be less than or equal to 100, but got {}", + block_sample_value + ))); + } + let probability = block_sample_value / 100.0; + let original_parts = source.parts.partitions.len(); + let mut sample_parts = Vec::with_capacity(original_parts); + let mut rng = thread_rng(); + let bernoulli = Bernoulli::new(probability).unwrap(); + for part in source.parts.partitions.iter() { + if bernoulli.sample(&mut rng) { + sample_parts.push(part.clone()); } } + source.parts.partitions = sample_parts; } } source.table_index = scan.table_index; diff --git a/src/query/sql/src/planner/binder/bind_table_reference/bind_table.rs b/src/query/sql/src/planner/binder/bind_table_reference/bind_table.rs index 58150460b789..99c6f3a3c750 100644 --- a/src/query/sql/src/planner/binder/bind_table_reference/bind_table.rs +++ b/src/query/sql/src/planner/binder/bind_table_reference/bind_table.rs @@ -13,7 +13,7 @@ // limitations under the License. 
use databend_common_ast::ast::Identifier; -use databend_common_ast::ast::Sample; +use databend_common_ast::ast::SampleConfig; use databend_common_ast::ast::Statement; use databend_common_ast::ast::TableAlias; use databend_common_ast::ast::TemporalClause; @@ -49,7 +49,7 @@ impl Binder { alias: &Option, temporal: &Option, with_options: &Option, - sample: &Option, + sample: &Option, ) -> Result<(SExpr, BindContext)> { let table_identifier = TableIdentifier::new(self, catalog, database, table, alias); let (catalog, database, table_name, table_name_alias) = ( diff --git a/src/query/sql/src/planner/binder/bind_table_reference/bind_table_function.rs b/src/query/sql/src/planner/binder/bind_table_reference/bind_table_function.rs index 98b17a22025e..e48a93b98bd6 100644 --- a/src/query/sql/src/planner/binder/bind_table_reference/bind_table_function.rs +++ b/src/query/sql/src/planner/binder/bind_table_reference/bind_table_function.rs @@ -19,7 +19,7 @@ use databend_common_ast::ast::Expr; use databend_common_ast::ast::FunctionCall as ASTFunctionCall; use databend_common_ast::ast::Identifier; use databend_common_ast::ast::Literal; -use databend_common_ast::ast::Sample; +use databend_common_ast::ast::SampleConfig; use databend_common_ast::ast::SelectStmt; use databend_common_ast::ast::SelectTarget; use databend_common_ast::ast::TableAlias; @@ -62,7 +62,7 @@ impl Binder { params: &[Expr], named_params: &[(Identifier, Expr)], alias: &Option, - sample: &Option, + sample: &Option, ) -> Result<(SExpr, BindContext)> { let func_name = normalize_identifier(name, &self.name_resolution_ctx); diff --git a/src/query/sql/src/planner/binder/binder.rs b/src/query/sql/src/planner/binder/binder.rs index 03b90fd00bb2..353207ad7e58 100644 --- a/src/query/sql/src/planner/binder/binder.rs +++ b/src/query/sql/src/planner/binder/binder.rs @@ -21,6 +21,7 @@ use std::time::Instant; use chrono_tz::Tz; use databend_common_ast::ast::Hint; use databend_common_ast::ast::Identifier; +use databend_common_ast::ast::Settings; use databend_common_ast::ast::Statement; use databend_common_ast::parser::parse_sql; use databend_common_ast::parser::tokenize_sql; @@ -288,7 +289,7 @@ impl<'a> Binder { Statement::CreateDictionary(stmt) => self.bind_create_dictionary(stmt).await?, Statement::DropDictionary(stmt) => self.bind_drop_dictionary(stmt).await?, Statement::ShowCreateDictionary(stmt) => self.bind_show_create_dictionary(stmt).await?, - Statement::ShowDictionaries { show_options: _ } => todo!(), + Statement::ShowDictionaries(stmt) => self.bind_show_dictionaries(bind_context, stmt).await?, // Views Statement::CreateView(stmt) => self.bind_create_view(stmt).await?, Statement::AlterView(stmt) => self.bind_alter_view(stmt).await?, @@ -470,13 +471,15 @@ impl<'a> Binder { Statement::Presign(stmt) => self.bind_presign(bind_context, stmt).await?, - Statement::SetStmt {set_type, identifiers, values } => { + Statement::SetStmt { settings } => { + let Settings { set_type, identifiers, values } = settings; self.bind_set(bind_context, *set_type, identifiers, values) .await? } - Statement::UnSetStmt{unset_type, identifiers } => { - self.bind_unset(bind_context, *unset_type, identifiers) + Statement::UnSetStmt{settings } => { + let Settings { set_type, identifiers, .. } = settings; + self.bind_unset(bind_context, *set_type, identifiers) .await? 
} diff --git a/src/query/sql/src/planner/binder/copy_into_location.rs b/src/query/sql/src/planner/binder/copy_into_location.rs index 88dee9edfcde..54c43cb150cd 100644 --- a/src/query/sql/src/planner/binder/copy_into_location.rs +++ b/src/query/sql/src/planner/binder/copy_into_location.rs @@ -19,7 +19,8 @@ use databend_common_ast::parser::parse_sql; use databend_common_ast::parser::tokenize_sql; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_meta_app::principal::StageInfo; +use databend_common_storage::init_stage_operator; +use opendal::ErrorKind; use crate::binder::copy_into_table::resolve_file_location; use crate::binder::Binder; @@ -34,6 +35,22 @@ impl<'a> Binder { bind_context: &mut BindContext, stmt: &CopyIntoLocationStmt, ) -> Result { + if stmt.options.use_raw_path && !stmt.options.single { + return Err(ErrorCode::InvalidArgument( + "use_raw_path=true can only be set when single=true", + )); + } + if stmt.options.overwrite && (!stmt.options.single || !stmt.options.use_raw_path) { + return Err(ErrorCode::InvalidArgument( + "overwrite=true can only be set when single=true and use_raw_path=true for now", + )); + } + if !stmt.options.include_query_id && !stmt.options.use_raw_path { + return Err(ErrorCode::InvalidArgument( + "include_query_id=false can only be set when use_raw_path=true", + )); + } + let query = match &stmt.src { CopyIntoLocationSource::Table(table) => { let (catalog_name, database_name, table_name) = self @@ -72,36 +89,35 @@ impl<'a> Binder { }?; let (mut stage_info, path) = resolve_file_location(self.ctx.as_ref(), &stmt.dst).await?; - self.apply_copy_into_location_options(stmt, &mut stage_info) - .await?; + + if stmt.options.use_raw_path { + if path.ends_with("/") { + return Err(ErrorCode::BadArguments( + "when use_raw_path is set to true, url path can not end with '/'", + )); + } + let op = init_stage_operator(&stage_info)?; + if !stmt.options.overwrite { + match op.stat(&path).await { + Ok(_) => return Err(ErrorCode::BadArguments("file already exists")), + Err(e) => { + if e.kind() != ErrorKind::NotFound { + return Err(e.into()); + } + } + } + } + } + + if !stmt.file_format.is_empty() { + stage_info.file_format_params = self.try_resolve_file_format(&stmt.file_format).await?; + } Ok(Plan::CopyIntoLocation(CopyIntoLocationPlan { stage: Box::new(stage_info), path, from: Box::new(query), + options: stmt.options.clone(), })) } - - #[async_backtrace::framed] - pub async fn apply_copy_into_location_options( - &mut self, - stmt: &CopyIntoLocationStmt, - stage: &mut StageInfo, - ) -> Result<()> { - if !stmt.file_format.is_empty() { - stage.file_format_params = self.try_resolve_file_format(&stmt.file_format).await?; - } - - // Copy options. - { - // max_file_size. 
- if stmt.max_file_size != 0 { - stage.copy_options.max_file_size = stmt.max_file_size; - } - stage.copy_options.single = stmt.single; - stage.copy_options.detailed_output = stmt.detailed_output; - } - - Ok(()) - } } diff --git a/src/query/sql/src/planner/binder/copy_into_table.rs b/src/query/sql/src/planner/binder/copy_into_table.rs index 00e488daca70..78d7d0837b84 100644 --- a/src/query/sql/src/planner/binder/copy_into_table.rs +++ b/src/query/sql/src/planner/binder/copy_into_table.rs @@ -204,6 +204,7 @@ impl<'a> Binder { duplicated_files_detected: vec![], is_select: false, default_values, + copy_into_location_options: Default::default(), }, values_consts: vec![], required_source_schema: required_values_schema.clone(), @@ -363,6 +364,7 @@ impl<'a> Binder { duplicated_files_detected, is_select: false, default_values: Some(default_values), + copy_into_location_options: Default::default(), }, write_mode, query: None, diff --git a/src/query/sql/src/planner/binder/ddl/dictionary.rs b/src/query/sql/src/planner/binder/ddl/dictionary.rs index 6b54019fd5ee..800b16ef5bff 100644 --- a/src/query/sql/src/planner/binder/ddl/dictionary.rs +++ b/src/query/sql/src/planner/binder/ddl/dictionary.rs @@ -19,6 +19,8 @@ use std::sync::LazyLock; use databend_common_ast::ast::CreateDictionaryStmt; use databend_common_ast::ast::DropDictionaryStmt; use databend_common_ast::ast::ShowCreateDictionaryStmt; +use databend_common_ast::ast::ShowDictionariesStmt; +use databend_common_ast::ast::ShowLimit; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::DataType; @@ -28,12 +30,16 @@ use databend_common_expression::TableDataType; use databend_common_expression::TableSchema; use databend_common_meta_app::schema::DictionaryMeta; use itertools::Itertools; +use log::debug; use crate::plans::CreateDictionaryPlan; use crate::plans::DropDictionaryPlan; use crate::plans::Plan; +use crate::plans::RewriteKind; use crate::plans::ShowCreateDictionaryPlan; +use crate::BindContext; use crate::Binder; +use crate::SelectBuilder; pub const DICT_OPT_KEY_SQL_HOST: &str = "host"; pub const DICT_OPT_KEY_SQL_PORT: &str = "port"; @@ -383,4 +389,50 @@ impl Binder { }, ))) } + + #[async_backtrace::framed] + pub(in crate::planner::binder) async fn bind_show_dictionaries( + &mut self, + bind_context: &mut BindContext, + stmt: &ShowDictionariesStmt, + ) -> Result { + let ShowDictionariesStmt { database, limit } = stmt; + + let mut select_builder = SelectBuilder::from("system.dictionaries"); + + select_builder + .with_column("database AS Database") + .with_column("name AS Dictionary") + .with_column("key_names AS Key_Names") + .with_column("key_types AS key_Types") + .with_column("attribute_names AS Attribute_Names") + .with_column("attribute_types AS Attribute_Types") + .with_column("source AS Source") + .with_column("comment AS Comment"); + + select_builder + .with_order_by("database") + .with_order_by("name"); + + let database = self.check_database_exist(&None, database).await?; + select_builder.with_filter(format!("database = '{}'", database.clone())); + + match limit { + None => (), + Some(ShowLimit::Like { pattern }) => { + select_builder.with_filter(format!("name LIKE '{pattern}'")); + } + Some(ShowLimit::Where { selection }) => { + select_builder.with_filter(format!("({selection})")); + } + }; + let query = select_builder.build(); + debug!("show dictionaries rewrite to: {:?}", query); + self.bind_rewrite_to_query( + bind_context, + query.as_str(), + 
RewriteKind::ShowDictionaries(database.clone()), + ) + .await + } } diff --git a/src/query/sql/src/planner/binder/set.rs b/src/query/sql/src/planner/binder/set.rs index 2896ea96081b..d65d93e1c572 100644 --- a/src/query/sql/src/planner/binder/set.rs +++ b/src/query/sql/src/planner/binder/set.rs @@ -71,6 +71,7 @@ impl Binder { let p = self.clone().bind(&Statement::Query(query.clone())).await?; SetScalarsOrQuery::Query(Box::new(p)) } + SetValues::None => return Err(ErrorCode::SemanticError("set value can not be None")), }; Ok(Plan::Set(Box::new(SetPlan { diff --git a/src/query/sql/src/planner/binder/table.rs b/src/query/sql/src/planner/binder/table.rs index d24a09d9884f..288bb379bc74 100644 --- a/src/query/sql/src/planner/binder/table.rs +++ b/src/query/sql/src/planner/binder/table.rs @@ -21,9 +21,7 @@ use chrono::Utc; use dashmap::DashMap; use databend_common_ast::ast::Identifier; use databend_common_ast::ast::Indirection; -use databend_common_ast::ast::Sample; use databend_common_ast::ast::SampleConfig; -use databend_common_ast::ast::SampleLevel; use databend_common_ast::ast::SelectTarget; use databend_common_ast::ast::SetExpr; use databend_common_ast::ast::SetOperator; @@ -435,7 +433,7 @@ impl Binder { database_name: &str, table_index: IndexType, change_type: Option, - sample: &Option, + sample: &Option, ) -> Result<(SExpr, BindContext)> { let mut bind_context = BindContext::with_parent(Box::new(bind_context.clone())); @@ -489,7 +487,7 @@ impl Binder { columns: columns.into_iter().map(|col| col.index()).collect(), statistics: Arc::new(Statistics::default()), change_type, - sample: table_sample(sample)?, + sample: sample.clone(), ..Default::default() } .into(), @@ -679,16 +677,3 @@ impl Binder { Ok(index_metas) } } - -fn table_sample(sample: &Option) -> Result> { - if let Some(sample) = sample { - if sample.sample_level == SampleLevel::BLOCK { - if let SampleConfig::RowsNum(_) = sample.sample_conf { - return Err(ErrorCode::SyntaxException( - "BLOCK sampling doesn't support fixed rows.".to_string(), - )); - } - } - } - Ok(sample.clone()) -} diff --git a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs index bbf5dd3f96d6..fdb181cda595 100644 --- a/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs +++ b/src/query/sql/src/planner/optimizer/dynamic_sample/filter_selectivity_sample.rs @@ -15,9 +15,8 @@ use std::collections::HashSet; use std::sync::Arc; -use databend_common_ast::ast::Sample; use databend_common_ast::ast::SampleConfig; -use databend_common_ast::ast::SampleLevel; +use databend_common_ast::ast::SampleRowLevel; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; @@ -63,9 +62,9 @@ pub async fn filter_selectivity_sample( let mut new_s_expr = s_expr.clone(); // If the table is too small, we don't need to sample. 
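+        // `SampleConfig` carries independent `row_level` and `block_level` settings
+        // (replacing the old `Sample { sample_level, sample_conf }` pair); selectivity
+        // sampling only needs a fixed number of rows, so `block_level` stays `None` below.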
if sample_size >= 10.0 { - scan.sample = Some(Sample { - sample_level: SampleLevel::ROW, - sample_conf: SampleConfig::RowsNum(sample_size), + scan.sample = Some(SampleConfig { + row_level: Some(SampleRowLevel::RowsNum(sample_size)), + block_level: None, }); let new_child = SExpr::create_leaf(Arc::new(RelOperator::Scan(scan))); new_s_expr = s_expr.replace_children(vec![Arc::new(new_child)]); diff --git a/src/query/sql/src/planner/optimizer/optimizer.rs b/src/query/sql/src/planner/optimizer/optimizer.rs index 75a576847c64..0f21b277fb4c 100644 --- a/src/query/sql/src/planner/optimizer/optimizer.rs +++ b/src/query/sql/src/planner/optimizer/optimizer.rs @@ -238,13 +238,17 @@ pub async fn optimize(mut opt_ctx: OptimizerContext, plan: Plan) -> Result partial, plan: Box::new(Box::pin(optimize(opt_ctx, *plan)).await?), }), - Plan::CopyIntoLocation(CopyIntoLocationPlan { stage, path, from }) => { - Ok(Plan::CopyIntoLocation(CopyIntoLocationPlan { - stage, - path, - from: Box::new(Box::pin(optimize(opt_ctx, *from)).await?), - })) - } + Plan::CopyIntoLocation(CopyIntoLocationPlan { + stage, + path, + from, + options, + }) => Ok(Plan::CopyIntoLocation(CopyIntoLocationPlan { + stage, + path, + from: Box::new(Box::pin(optimize(opt_ctx, *from)).await?), + options, + })), Plan::CopyIntoTable(mut plan) if !plan.no_file_to_copy => { plan.enable_distributed = opt_ctx.enable_distributed_optimization && opt_ctx diff --git a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs index 836cf19aa405..f7c0d28e0d9c 100644 --- a/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs +++ b/src/query/sql/src/planner/optimizer/statistics/collect_statistics.rs @@ -15,7 +15,6 @@ use std::collections::HashMap; use std::sync::Arc; -use databend_common_ast::ast::SampleLevel; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; use databend_common_expression::types::NumberScalar; @@ -106,43 +105,42 @@ impl CollectStatisticsOptimizer { }); let mut s_expr = s_expr.replace_plan(Arc::new(RelOperator::Scan(scan.clone()))); if let Some(sample) = &scan.sample { - match sample.sample_level { - SampleLevel::ROW => { - if let Some(stats) = &table_stats - && let Some(probability) = sample.sample_probability(stats.num_rows) - { - let rand_expr = ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "rand".to_string(), - params: vec![], - arguments: vec![], - }); - let filter = ScalarExpr::FunctionCall(FunctionCall { - span: None, - func_name: "lte".to_string(), - params: vec![], - arguments: vec![ - rand_expr, - ScalarExpr::ConstantExpr(ConstantExpr { - span: None, - value: Scalar::Number(NumberScalar::Float64( - F64::from(probability), - )), - }), - ], - }); - s_expr = SExpr::create_unary( - Arc::new( - Filter { - predicates: vec![filter], - } - .into(), - ), - Arc::new(s_expr), - ); - } + // Only process row-level sampling in optimizer phase. + if let Some(row_level) = &sample.row_level { + if let Some(stats) = &table_stats + && let Some(probability) = + row_level.sample_probability(stats.num_rows)? 
+ { + let rand_expr = ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "rand".to_string(), + params: vec![], + arguments: vec![], + }); + let filter = ScalarExpr::FunctionCall(FunctionCall { + span: None, + func_name: "lte".to_string(), + params: vec![], + arguments: vec![ + rand_expr, + ScalarExpr::ConstantExpr(ConstantExpr { + span: None, + value: Scalar::Number(NumberScalar::Float64(F64::from( + probability, + ))), + }), + ], + }); + s_expr = SExpr::create_unary( + Arc::new( + Filter { + predicates: vec![filter], + } + .into(), + ), + Arc::new(s_expr), + ); } - SampleLevel::BLOCK => {} } } Ok(s_expr) diff --git a/src/query/sql/src/planner/plans/copy_into_location.rs b/src/query/sql/src/planner/plans/copy_into_location.rs index c7655cb256fa..7eac3630c3d1 100644 --- a/src/query/sql/src/planner/plans/copy_into_location.rs +++ b/src/query/sql/src/planner/plans/copy_into_location.rs @@ -15,6 +15,7 @@ use std::fmt::Debug; use std::fmt::Formatter; +use databend_common_ast::ast::CopyIntoLocationOptions; use databend_common_expression::types::DataType; use databend_common_expression::types::NumberDataType; use databend_common_expression::DataField; @@ -29,11 +30,12 @@ pub struct CopyIntoLocationPlan { pub stage: Box, pub path: String, pub from: Box, + pub options: CopyIntoLocationOptions, } impl CopyIntoLocationPlan { pub fn schema(&self) -> DataSchemaRef { - if self.stage.copy_options.detailed_output { + if self.options.detailed_output { DataSchemaRefExt::create(vec![ DataField::new("file_name", DataType::String), DataField::new("file_size", DataType::Number(NumberDataType::UInt64)), diff --git a/src/query/sql/src/planner/plans/plan.rs b/src/query/sql/src/planner/plans/plan.rs index 245fa278e83e..d2df531f5be7 100644 --- a/src/query/sql/src/planner/plans/plan.rs +++ b/src/query/sql/src/planner/plans/plan.rs @@ -390,6 +390,7 @@ pub enum RewriteKind { ShowColumns(String, String, String), ShowTablesStatus, ShowVirtualColumns, + ShowDictionaries(String), ShowStreams(String), diff --git a/src/query/sql/src/planner/plans/scan.rs b/src/query/sql/src/planner/plans/scan.rs index 4c0edd385b22..2e63c6bb308e 100644 --- a/src/query/sql/src/planner/plans/scan.rs +++ b/src/query/sql/src/planner/plans/scan.rs @@ -16,7 +16,7 @@ use std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; -use databend_common_ast::ast::Sample; +use databend_common_ast::ast::SampleConfig; use databend_common_catalog::plan::InvertedIndexInfo; use databend_common_catalog::statistics::BasicColumnStatistics; use databend_common_catalog::table::TableStatistics; @@ -106,7 +106,7 @@ pub struct Scan { pub inverted_index: Option, // Lazy row fetch. 
pub is_lazy_table: bool, - pub sample: Option, + pub sample: Option, pub statistics: Arc, } diff --git a/src/query/storages/common/cache/Cargo.toml b/src/query/storages/common/cache/Cargo.toml index 7cb83f2577ee..773e6eb71585 100644 --- a/src/query/storages/common/cache/Cargo.toml +++ b/src/query/storages/common/cache/Cargo.toml @@ -30,6 +30,7 @@ crossbeam-channel = "0.5.6" hex = "0.4.3" log = { workspace = true } parking_lot = { workspace = true } +parquet = { workspace = true } rayon = "1.9.0" rustix = "0.38.37" siphasher = "0.3.10" diff --git a/src/query/storages/common/cache/src/caches.rs b/src/query/storages/common/cache/src/caches.rs index 8d36618e42d2..250574e5fc7d 100644 --- a/src/query/storages/common/cache/src/caches.rs +++ b/src/query/storages/common/cache/src/caches.rs @@ -14,7 +14,6 @@ use std::sync::Arc; -use databend_common_arrow::parquet::metadata::FileMetaData; use databend_common_cache::MemSized; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; @@ -27,6 +26,7 @@ use databend_storages_common_table_meta::meta::CompactSegmentInfo; use databend_storages_common_table_meta::meta::SegmentInfo; use databend_storages_common_table_meta::meta::TableSnapshot; use databend_storages_common_table_meta::meta::TableSnapshotStatistics; +use parquet::file::metadata::ParquetMetaData; use crate::manager::CacheManager; use crate::CacheAccessor; @@ -50,8 +50,8 @@ pub type BloomIndexMetaCache = InMemoryLruCache; pub type InvertedIndexMetaCache = InMemoryLruCache; pub type InvertedIndexFileCache = InMemoryLruCache; -/// In memory object cache of parquet FileMetaData of external parquet files -pub type FileMetaDataCache = InMemoryLruCache; +/// In memory object cache of parquet FileMetaData of external parquet rs files +pub type ParquetMetaDataCache = InMemoryLruCache; pub type PrunePartitionsCache = InMemoryLruCache<(PartStatistics, Partitions)>; @@ -122,10 +122,10 @@ impl CachedObject for Xor8Filter { } } -impl CachedObject for FileMetaData { - type Cache = FileMetaDataCache; +impl CachedObject for ParquetMetaData { + type Cache = ParquetMetaDataCache; fn cache() -> Option { - CacheManager::instance().get_file_meta_data_cache() + CacheManager::instance().get_parquet_meta_data_cache() } } @@ -234,8 +234,8 @@ impl From for CacheValue { } } -impl From for CacheValue { - fn from(value: FileMetaData) -> Self { +impl From for CacheValue { + fn from(value: ParquetMetaData) -> Self { CacheValue { inner: Arc::new(value), mem_bytes: 0, diff --git a/src/query/storages/common/cache/src/manager.rs b/src/query/storages/common/cache/src/manager.rs index 6553f64f30fd..3bf65686d19d 100644 --- a/src/query/storages/common/cache/src/manager.rs +++ b/src/query/storages/common/cache/src/manager.rs @@ -28,9 +28,9 @@ use crate::caches::BloomIndexMetaCache; use crate::caches::CacheValue; use crate::caches::ColumnArrayCache; use crate::caches::CompactSegmentInfoCache; -use crate::caches::FileMetaDataCache; use crate::caches::InvertedIndexFileCache; use crate::caches::InvertedIndexMetaCache; +use crate::caches::ParquetMetaDataCache; use crate::caches::PrunePartitionsCache; use crate::caches::TableSnapshotCache; use crate::caches::TableSnapshotStatisticCache; @@ -38,7 +38,7 @@ use crate::InMemoryLruCache; use crate::TableDataCache; use crate::TableDataCacheBuilder; -static DEFAULT_FILE_META_DATA_CACHE_ITEMS: usize = 3000; +static DEFAULT_PARQUET_META_DATA_CACHE_ITEMS: usize = 3000; /// Where all the caches reside pub struct CacheManager { @@ -50,7 +50,7 @@ pub struct 
CacheManager { inverted_index_meta_cache: Option, inverted_index_file_cache: Option, prune_partitions_cache: Option, - parquet_file_meta_data_cache: Option, + parquet_meta_data_cache: Option, table_data_cache: Option, in_memory_table_data_cache: Option, block_meta_cache: Option, @@ -122,7 +122,7 @@ impl CacheManager { inverted_index_meta_cache: None, inverted_index_file_cache: None, prune_partitions_cache: None, - parquet_file_meta_data_cache: None, + parquet_meta_data_cache: None, table_statistic_cache: None, table_data_cache, in_memory_table_data_cache, @@ -171,9 +171,9 @@ impl CacheManager { MEMORY_CACHE_PRUNE_PARTITIONS, ); - let parquet_file_meta_data_cache = Self::new_named_items_cache( - DEFAULT_FILE_META_DATA_CACHE_ITEMS, - MEMORY_CACHE_PARQUET_FILE_META, + let parquet_meta_data_cache = Self::new_named_items_cache( + DEFAULT_PARQUET_META_DATA_CACHE_ITEMS, + MEMORY_CACHE_PARQUET_META_DATA, ); let block_meta_cache = Self::new_named_items_cache( @@ -189,11 +189,11 @@ impl CacheManager { inverted_index_meta_cache, inverted_index_file_cache, prune_partitions_cache, - parquet_file_meta_data_cache, table_statistic_cache, table_data_cache, in_memory_table_data_cache, block_meta_cache, + parquet_meta_data_cache, })); } @@ -240,8 +240,8 @@ impl CacheManager { self.prune_partitions_cache.clone() } - pub fn get_file_meta_data_cache(&self) -> Option { - self.parquet_file_meta_data_cache.clone() + pub fn get_parquet_meta_data_cache(&self) -> Option { + self.parquet_meta_data_cache.clone() } pub fn get_table_data_cache(&self) -> Option { @@ -298,7 +298,7 @@ impl CacheManager { } const MEMORY_CACHE_TABLE_DATA: &str = "memory_cache_table_data"; -const MEMORY_CACHE_PARQUET_FILE_META: &str = "memory_cache_parquet_file_meta"; +const MEMORY_CACHE_PARQUET_META_DATA: &str = "memory_cache_parquet_meta_data"; const MEMORY_CACHE_PRUNE_PARTITIONS: &str = "memory_cache_prune_partitions"; const MEMORY_CACHE_INVERTED_INDEX_FILE: &str = "memory_cache_inverted_index_file"; const MEMORY_CACHE_INVERTED_INDEX_FILE_META_DATA: &str = diff --git a/src/query/storages/common/pruner/src/lib.rs b/src/query/storages/common/pruner/src/lib.rs index 04bc0341fc43..0e0f938ce3ec 100644 --- a/src/query/storages/common/pruner/src/lib.rs +++ b/src/query/storages/common/pruner/src/lib.rs @@ -18,6 +18,7 @@ mod block_meta; mod internal_column_pruner; mod limiter_pruner; mod page_pruner; +pub mod partition_prunner; mod range_pruner; mod topn_pruner; diff --git a/src/query/storages/common/pruner/src/partition_prunner.rs b/src/query/storages/common/pruner/src/partition_prunner.rs new file mode 100644 index 000000000000..6211e77d9dc0 --- /dev/null +++ b/src/query/storages/common/pruner/src/partition_prunner.rs @@ -0,0 +1,91 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::sync::Arc; + +use databend_common_exception::Result; +use databend_common_expression::Expr; +use databend_common_expression::FunctionContext; +use databend_common_expression::Scalar; +use databend_common_expression::TableField; +use databend_common_expression::TableSchema; +use databend_storages_common_index::RangeIndex; +use databend_storages_common_table_meta::meta::ColumnStatistics; +use databend_storages_common_table_meta::meta::StatisticsOfColumns; + +pub struct PartitionPruner { + pub filter: Expr, + pub partition_schema: Arc, + + leaf_fields: Vec, + + pub range_filter: RangeIndex, +} + +pub trait FetchPartitionScalars { + fn eval(_item: &T, _partition_fields: &[TableField]) -> Result>; +} + +impl PartitionPruner { + pub fn try_create( + ctx: FunctionContext, + filter: Expr, + partition_schema: Arc, + full_schema: Arc, + ) -> Result { + let range_filter = RangeIndex::try_create( + ctx, + &filter, + full_schema.clone(), + StatisticsOfColumns::default(), + )?; + Ok(PartitionPruner { + filter, + partition_schema, + leaf_fields: full_schema.leaf_fields(), + range_filter, + }) + } + + pub fn prune(&self, partitions: Vec) -> Result> + where F: FetchPartitionScalars { + let filtered_partitions = partitions + .into_iter() + .filter(|p| self.should_keep::(p).unwrap_or(true)) + .collect(); + Ok(filtered_partitions) + } + + pub fn should_keep(&self, partition: &T) -> Result + where F: FetchPartitionScalars { + let scalars = F::eval(partition, &self.partition_schema.fields)?; + let mut stats = HashMap::new(); + + for (index, scalar) in scalars.into_iter().enumerate() { + let null_count = u64::from(scalar.is_null()); + let column_stats = ColumnStatistics::new(scalar.clone(), scalar, null_count, 0, None); + + let mut f = self + .leaf_fields + .iter() + .filter(|f| f.name() == &self.partition_schema.field(index).name); + + if let Some(f) = f.next() { + stats.insert(f.column_id(), column_stats); + } + } + self.range_filter.apply(&stats, |_| false) + } +} diff --git a/src/query/storages/delta/Cargo.toml b/src/query/storages/delta/Cargo.toml index 428816eac2af..bd98a9a61050 100644 --- a/src/query/storages/delta/Cargo.toml +++ b/src/query/storages/delta/Cargo.toml @@ -14,10 +14,12 @@ databend-common-base = { workspace = true } databend-common-catalog = { workspace = true } databend-common-exception = { workspace = true } databend-common-expression = { workspace = true } +databend-common-functions = { workspace = true } databend-common-meta-app = { workspace = true } databend-common-pipeline-core = { workspace = true } databend-common-storage = { workspace = true } databend-common-storages-parquet = { workspace = true } +databend-storages-common-pruner = { workspace = true } databend-storages-common-table-meta = { workspace = true } deltalake = { workspace = true } fastrace = { workspace = true } @@ -30,9 +32,6 @@ tokio = { workspace = true } typetag = "0.2" url = "2.4.1" -[dev-dependencies] -maplit = "1.0.2" - [lints] workspace = true diff --git a/src/query/storages/delta/src/lib.rs b/src/query/storages/delta/src/lib.rs index 6df87495e1a1..dffe44433b0e 100644 --- a/src/query/storages/delta/src/lib.rs +++ b/src/query/storages/delta/src/lib.rs @@ -16,7 +16,6 @@ #![allow(clippy::diverging_sub_expression)] mod partition; -mod partition_columns; mod table; mod table_source; diff --git a/src/query/storages/delta/src/table.rs b/src/query/storages/delta/src/table.rs index 0361a8f72568..2bff4c9b76d2 100644 --- a/src/query/storages/delta/src/table.rs +++ 
b/src/query/storages/delta/src/table.rs @@ -18,9 +18,10 @@ use std::sync::Arc; use arrow_schema::Schema as ArrowSchema; use async_trait::async_trait; use databend_common_catalog::catalog::StorageDescription; +use databend_common_catalog::partition_columns::get_pushdown_without_partition_columns; +use databend_common_catalog::partition_columns::str_to_scalar; use databend_common_catalog::plan::DataSourcePlan; use databend_common_catalog::plan::ParquetReadOptions; -use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; @@ -32,8 +33,10 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::DataSchema; use databend_common_expression::FieldIndex; +use databend_common_expression::Scalar; use databend_common_expression::TableField; use databend_common_expression::TableSchema; +use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::storage::StorageParams; use databend_common_pipeline_core::Pipeline; @@ -42,6 +45,8 @@ use databend_common_storages_parquet::ParquetFilesPart; use databend_common_storages_parquet::ParquetPart; use databend_common_storages_parquet::ParquetRSPruner; use databend_common_storages_parquet::ParquetRSReaderBuilder; +use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars; +use databend_storages_common_pruner::partition_prunner::PartitionPruner; use databend_storages_common_table_meta::table::OPT_KEY_ENGINE_META; use deltalake::kernel::Add; use deltalake::DeltaTableBuilder; @@ -52,8 +57,6 @@ use tokio::sync::OnceCell; use url::Url; use crate::partition::DeltaPartInfo; -use crate::partition_columns::get_partition_values; -use crate::partition_columns::get_pushdown_without_partition_columns; use crate::table_source::DeltaTableSource; pub const DELTA_ENGINE: &str = "DELTA"; @@ -120,12 +123,11 @@ impl DeltaTable { }) } - #[allow(dead_code)] - fn get_partition_fields(&self) -> Result> { + fn get_partition_fields(&self) -> Result> { self.meta .partition_columns .iter() - .map(|name| self.info.meta.schema.field_with_name(name)) + .map(|name| self.info.meta.schema.field_with_name(name).cloned()) .collect() } @@ -261,7 +263,7 @@ impl DeltaTable { output, output_schema.clone(), parquet_reader.clone(), - self.get_partition_fields()?.into_iter().cloned().collect(), + self.get_partition_fields()?, ) }, max_threads.max(1), @@ -272,8 +274,8 @@ impl DeltaTable { #[async_backtrace::framed] async fn do_read_partitions( &self, - _ctx: Arc, - _push_downs: Option, + ctx: Arc, + push_downs: Option, ) -> Result<(PartStatistics, Partitions)> { let table = self.table().await?; @@ -281,14 +283,34 @@ impl DeltaTable { let mut read_bytes = 0; let partition_fields = self.get_partition_fields()?; - let adds = table + let mut adds = table .snapshot() .and_then(|f| f.file_actions()) .map_err(|e| { ErrorCode::ReadTableDataError(format!("Cannot read file_actions: {e:?}")) })?; + + let filter_expression = push_downs.as_ref().and_then(|p| { + p.filters + .as_ref() + .map(|filter| filter.filter.as_expr(&BUILTIN_FUNCTIONS)) + }); + let total_files = adds.len(); + if !partition_fields.is_empty() { + if let Some(expr) = filter_expression { + let partition_pruner = PartitionPruner::try_create( + ctx.get_function_context()?, + expr, + Arc::new(TableSchema::new(partition_fields.clone())), + self.schema(), + )?; + + adds = 
partition_pruner.prune::(adds)?; + } + } + #[derive(serde::Deserialize)] struct Stats { #[serde(rename = "numRecords")] @@ -311,9 +333,8 @@ impl DeltaTable { ).unwrap_or(1); read_rows += num_records as usize; read_bytes += add.size as usize; - let partition_values = get_partition_values(add, &partition_fields[..])?; - Ok(Arc::new( - Box::new(DeltaPartInfo { + let partition_values = get_partition_values(add, &partition_fields)?; + Ok(Arc::new(Box::new(DeltaPartInfo { partition_values, data: ParquetPart::ParquetFiles( ParquetFilesPart { @@ -321,8 +342,7 @@ impl DeltaTable { estimated_uncompressed_size: add.size as u64, // This field is not used here. }, ), - }) as Box - )) + }) as _)) }) .collect::>>()?; @@ -333,6 +353,14 @@ impl DeltaTable { } } +pub struct DeltaToScalar; + +impl FetchPartitionScalars for DeltaToScalar { + fn eval(add: &Add, partition_fields: &[TableField]) -> Result> { + get_partition_values(add, partition_fields) + } +} + #[async_trait] impl Table for DeltaTable { fn as_any(&self) -> &dyn Any { @@ -384,3 +412,20 @@ impl Table for DeltaTable { true } } + +pub fn get_partition_values(add: &Add, fields: &[TableField]) -> Result> { + let mut values = Vec::with_capacity(fields.len()); + for f in fields { + match add.partition_values.get(&f.name) { + Some(Some(v)) => values.push(str_to_scalar(v, &f.data_type().into())?), + Some(None) => values.push(Scalar::Null), + None => { + return Err(ErrorCode::BadArguments(format!( + "partition value for column {} not found", + &f.name + ))); + } + } + } + Ok(values) +} diff --git a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs index a5c7099bbce8..5de6cb538442 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutator/recluster_mutator.rs @@ -198,7 +198,7 @@ impl ReclusterMutator { // Compute memory threshold and maximum number of blocks allowed for reclustering let mem_info = sys_info::mem_info().map_err(ErrorCode::from_std_error)?; let recluster_block_size = self.ctx.get_settings().get_recluster_block_size()? as usize; - let memory_threshold = recluster_block_size.min(mem_info.avail as usize * 1024 * 35 / 100); + let memory_threshold = recluster_block_size.min(mem_info.avail as usize * 1024 * 30 / 100); // specify a rather small value, so that `recluster_block_size` might be tuned to lower value. 
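For the recluster change just above: `sys_info::mem_info()` reports `avail` in KiB, so the cap is now 30% (down from 35%) of available memory in bytes, and the effective threshold is the smaller of that cap and the `recluster_block_size` setting. A quick worked example with illustrative numbers (not taken from the codebase):

```rust
fn main() {
    // Assume sys_info reports 16 GiB of available memory (the value is in KiB).
    let avail_kib: usize = 16 * 1024 * 1024;
    // Assume the recluster_block_size setting is 8 GiB.
    let recluster_block_size: usize = 8 * 1024 * 1024 * 1024;

    // Same formula as the changed line: 30% of available memory, in bytes.
    let cap = avail_kib * 1024 * 30 / 100;
    let memory_threshold = recluster_block_size.min(cap);

    // cap is about 5.15e9 bytes (~4.8 GiB), so the cap wins over the 8 GiB setting here.
    println!("cap = {cap}, memory_threshold = {memory_threshold}");
}
```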
let max_blocks_num = (memory_threshold / self.block_thresholds.max_bytes_per_block).max(2) * self.max_tasks; diff --git a/src/query/storages/fuse/src/pruning/fuse_pruner.rs b/src/query/storages/fuse/src/pruning/fuse_pruner.rs index 22588861f222..089262841036 100644 --- a/src/query/storages/fuse/src/pruning/fuse_pruner.rs +++ b/src/query/storages/fuse/src/pruning/fuse_pruner.rs @@ -19,6 +19,7 @@ use databend_common_base::runtime::Runtime; use databend_common_base::runtime::TrySpawn; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::table_context::TableContext; +use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::RemoteExpr; use databend_common_expression::TableSchemaRef; @@ -360,7 +361,7 @@ impl FusePruner { ); } } else { - let sample_probability = table_sample(&push_down); + let sample_probability = table_sample(&push_down)?; for (location, info) in pruned_segments { let mut block_metas = Self::extract_block_metas(&location.location.0, &info, true)?; @@ -533,13 +534,21 @@ impl FusePruner { } } -fn table_sample(push_down_info: &Option) -> Option { +fn table_sample(push_down_info: &Option) -> Result> { + let mut sample_probability = None; if let Some(sample) = push_down_info .as_ref() .and_then(|info| info.sample.as_ref()) { - sample.sample_probability(None) - } else { - None + if let Some(block_sample_value) = sample.block_level { + if block_sample_value > 100.0 { + return Err(ErrorCode::SyntaxException(format!( + "Sample value should be less than or equal to 100, but got {}", + block_sample_value + ))); + } + sample_probability = Some(block_sample_value / 100.0) + } } + Ok(sample_probability) } diff --git a/src/query/storages/hive/hive/Cargo.toml b/src/query/storages/hive/hive/Cargo.toml index b71e36560838..90ccd787fb23 100644 --- a/src/query/storages/hive/hive/Cargo.toml +++ b/src/query/storages/hive/hive/Cargo.toml @@ -15,7 +15,6 @@ async-backtrace = { workspace = true } async-recursion = "1.1.1" async-trait = { workspace = true } chrono = { workspace = true } -databend-common-arrow = { workspace = true } databend-common-base = { workspace = true } databend-common-catalog = { workspace = true } databend-common-config = { workspace = true } @@ -29,8 +28,8 @@ databend-common-pipeline-core = { workspace = true } databend-common-pipeline-sources = { workspace = true } databend-common-sql = { workspace = true } databend-common-storage = { workspace = true } -databend-storages-common-cache = { workspace = true } -databend-storages-common-index = { workspace = true } +databend-common-storages-parquet = { workspace = true } +databend-storages-common-pruner = { workspace = true } databend-storages-common-table-meta = { workspace = true } fastrace = { workspace = true } faststr = "0.2" @@ -38,6 +37,7 @@ futures = { workspace = true } hive_metastore = "0.1.0" log = { workspace = true } opendal = { workspace = true } +parquet = { workspace = true } recursive = "0.1.1" serde = { workspace = true } typetag = { workspace = true } diff --git a/src/query/storages/hive/hive/src/hive_block_filter.rs b/src/query/storages/hive/hive/src/hive_block_filter.rs deleted file mode 100644 index 9414bf48b695..000000000000 --- a/src/query/storages/hive/hive/src/hive_block_filter.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
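Two sampling paths are touched in this diff: the ROW-level branch earlier rewrites the scan into a `rand() <= probability` filter, while the BLOCK-level value handled in `table_sample` above is validated and turned into a pruning probability. A minimal standalone sketch of that percent-to-probability step (plain Rust, error type simplified to `String`; not the Databend API):

```rust
/// Convert a `SAMPLE BLOCK (<percent>)` value into a block-pruning probability,
/// rejecting values above 100 as the new check in `table_sample` does.
fn block_sample_probability(block_level: Option<f64>) -> Result<Option<f64>, String> {
    match block_level {
        None => Ok(None),
        Some(v) if v > 100.0 => Err(format!(
            "Sample value should be less than or equal to 100, but got {v}"
        )),
        Some(v) => Ok(Some(v / 100.0)),
    }
}

fn main() {
    assert_eq!(block_sample_probability(Some(30.0)), Ok(Some(0.3)));
    assert!(block_sample_probability(Some(150.0)).is_err());
    assert_eq!(block_sample_probability(None), Ok(None));
}
```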
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; - -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use databend_common_arrow::parquet::statistics::BinaryStatistics; -use databend_common_arrow::parquet::statistics::BooleanStatistics; -use databend_common_arrow::parquet::statistics::PrimitiveStatistics; -use databend_common_arrow::parquet::statistics::Statistics; -use databend_common_expression::types::number::F32; -use databend_common_expression::types::number::F64; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::ValueType; -use databend_common_expression::Scalar; -use databend_common_expression::TableDataType; -use databend_common_expression::TableField; -use databend_common_expression::TableSchema; -use databend_storages_common_index::RangeIndex; -use databend_storages_common_table_meta::meta::ColumnStatistics; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; - -use crate::hive_parquet_block_reader::HiveBlockReader; -use crate::hive_table::HIVE_DEFAULT_PARTITION; - -#[derive(Clone)] -pub struct HiveBlockFilter { - range_filter: Option, - projections: Vec, - data_schema: Arc, -} - -impl HiveBlockFilter { - pub fn create( - range_filter: Option, - projections: Vec, - data_schema: Arc, - ) -> Self { - Self { - range_filter, - projections, - data_schema, - } - } - - // true: rowgroup if filtered by predict - pub fn filter( - &self, - row_group: &RowGroupMetaData, - part_columns: HashMap, - ) -> bool { - if let Some(filter) = &self.range_filter { - let mut statistics = StatisticsOfColumns::new(); - for col in self.projections.iter() { - let column_meta = - HiveBlockReader::get_parquet_column_metadata(row_group, col.name()); - if let Ok(meta) = column_meta { - let in_memory_size = meta.uncompressed_size(); - if let Ok(stats) = meta.statistics().transpose() { - // if stats is none, we couldn't make a decision whether the block should be filtered - let stats = match stats { - None => return false, - Some(stats) => stats, - }; - if let Some((max, min, null_count)) = - Self::get_max_min_stats(col.data_type(), &*stats) - { - let col_stats = ColumnStatistics::new( - min, - max, - null_count as u64, - in_memory_size as u64, - None, - ); - if let Some((index, _)) = self.data_schema.column_with_name(col.name()) - { - statistics.insert(index as u32, col_stats); - } - } - } - } - } - - for (p_key, p_value) in part_columns { - if let Some((idx, _)) = self.data_schema.column_with_name(&p_key) { - let mut null_count = 0; - let v = if p_value == HIVE_DEFAULT_PARTITION { - null_count = row_group.num_rows(); - Scalar::Null - } else { - Scalar::String(p_value) - }; - - let col_stats = ColumnStatistics::new(v.clone(), v, null_count as u64, 0, None); - statistics.insert(idx as u32, col_stats); - } - } - - if let Ok(ret) = filter.apply(&statistics, |_| false) { - if !ret { - return true; - } - } - } - false - } - - fn get_max_min_stats( - column_type: 
&TableDataType, - stats: &dyn Statistics, - ) -> Option<(Scalar, Scalar, i64)> { - match column_type { - TableDataType::Number(NumberDataType::UInt8) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u8); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u8); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt16) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u16); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u16); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u32); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u32); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::UInt64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as u64); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as u64); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int8) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as i8); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as i8); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int16) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap() as i16); - let min = NumberType::::upcast_scalar(s.min_value.unwrap() as i16); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Int64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - 
TableDataType::Number(NumberDataType::Float32) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap().into()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap().into()); - Some((max, min, null_count)) - } - } - TableDataType::Number(NumberDataType::Float64) => { - let s = stats - .as_any() - .downcast_ref::>() - .unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = NumberType::::upcast_scalar(s.max_value.unwrap().into()); - let min = NumberType::::upcast_scalar(s.min_value.unwrap().into()); - Some((max, min, null_count)) - } - } - TableDataType::Boolean => { - let s = stats.as_any().downcast_ref::().unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = BooleanType::upcast_scalar(s.max_value.unwrap()); - let min = BooleanType::upcast_scalar(s.min_value.unwrap()); - Some((max, min, null_count)) - } - } - TableDataType::String => { - let s = stats.as_any().downcast_ref::().unwrap(); - if s.null_count.is_none() || s.max_value.is_none() || s.min_value.is_none() { - None - } else { - let null_count = s.null_count.unwrap(); - let max = StringType::upcast_scalar( - String::from_utf8(s.max_value.clone().unwrap()).ok()?, - ); - let min = StringType::upcast_scalar( - String::from_utf8(s.min_value.clone().unwrap()).ok()?, - ); - Some((max, min, null_count)) - } - } - TableDataType::Nullable(inner_ty) => Self::get_max_min_stats(inner_ty.as_ref(), stats), - _ => None, - } - } -} diff --git a/src/query/storages/hive/hive/src/hive_blocks.rs b/src/query/storages/hive/hive/src/hive_blocks.rs deleted file mode 100644 index d21d426bdf86..000000000000 --- a/src/query/storages/hive/hive/src/hive_blocks.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use log::debug; - -use crate::HiveBlockFilter; -use crate::HivePartInfo; - -#[derive(Clone)] -pub struct HiveBlocks { - pub file_meta: Arc, - pub part: HivePartInfo, - pub valid_rowgroups: Vec, - pub current_index: usize, - pub hive_block_filter: Arc, -} - -impl HiveBlocks { - pub fn create( - file_meta: Arc, - part: HivePartInfo, - hive_block_filter: Arc, - ) -> Self { - Self { - file_meta, - part, - valid_rowgroups: vec![], - current_index: 0, - hive_block_filter, - } - } - - // there are some conditions to filter invalid row_groups: - // 1. the rowgroup doesn't belong to the partition - // 2. 
filtered by predict pushdown - pub fn prune(&mut self) -> bool { - let mut pruned_rg_cnt = 0; - for (idx, row_group) in self.file_meta.row_groups.iter().enumerate() { - let start = row_group.columns()[0].byte_range().0; - let mid = start + row_group.compressed_size() as u64 / 2; - if !self.part.range.contains(&mid) { - continue; - } - if self - .hive_block_filter - .filter(row_group, self.part.get_partition_map()) - { - pruned_rg_cnt += 1; - } else { - self.valid_rowgroups.push(idx); - } - } - debug!( - "hive parquet predict pushdown have pruned {} rowgroups", - pruned_rg_cnt - ); - self.has_blocks() - } - - pub fn get_part_info(&self) -> HivePartInfo { - self.part.clone() - } - - pub fn get_current_row_group_meta_data(&self) -> &RowGroupMetaData { - &self.file_meta.row_groups[self.get_current_rowgroup_index()] - } - - pub fn advance(&mut self) { - self.current_index += 1; - } - - pub fn has_blocks(&self) -> bool { - self.current_index < self.valid_rowgroups.len() - } - - fn get_current_rowgroup_index(&self) -> usize { - self.valid_rowgroups[self.current_index] - } -} diff --git a/src/query/storages/hive/hive/src/hive_file_splitter.rs b/src/query/storages/hive/hive/src/hive_file_splitter.rs deleted file mode 100644 index 466d660ec6a8..000000000000 --- a/src/query/storages/hive/hive/src/hive_file_splitter.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::ops::Range; -use std::sync::Arc; - -use databend_common_catalog::plan::PartInfo; - -use crate::HiveFileInfo; -use crate::HivePartInfo; - -#[derive(Clone, Debug)] -pub struct HiveFileSplitter { - min_split_size: u64, -} - -impl HiveFileSplitter { - pub fn create(min_split_size: u64) -> Self { - Self { min_split_size } - } - - pub fn split_length(&self, length: u64) -> Vec> { - let mut num = length / self.min_split_size; - let left = length % self.min_split_size; - if num == 0 || left > self.min_split_size / 3 { - num += 1; - } - - let mut res = vec![]; - for i in 0..num { - let start = i * self.min_split_size; - let end = match i == num - 1 { - true => length + 1, - false => (i + 1) * self.min_split_size, - }; - res.push(start..end); - } - res - } - - fn split_single_file(&self, hive_file_info: HiveFileInfo) -> Vec>> { - let splits = self.split_length(hive_file_info.length); - splits - .into_iter() - .map(|r| { - HivePartInfo::create( - hive_file_info.filename.clone(), - hive_file_info.partition.clone(), - r, - hive_file_info.length, - ) - }) - .collect() - } - - pub fn get_splits(&self, files: Vec) -> Vec>> { - files - .into_iter() - .flat_map(|hive_file| self.split_single_file(hive_file)) - .collect::>>>() - } -} diff --git a/src/query/storages/hive/hive/src/hive_meta_data_reader.rs b/src/query/storages/hive/hive/src/hive_meta_data_reader.rs deleted file mode 100644 index f847e8801378..000000000000 --- a/src/query/storages/hive/hive/src/hive_meta_data_reader.rs +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::read::read_metadata_async; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_storages_common_cache::CacheManager; -use databend_storages_common_cache::InMemoryItemCacheReader; -use databend_storages_common_cache::LoadParams; -use databend_storages_common_cache::Loader; -use opendal::Operator; - -pub struct LoaderWrapper(T); -pub type FileMetaDataReader = InMemoryItemCacheReader>; -pub struct MetaDataReader; - -impl MetaDataReader { - pub fn meta_data_reader(dal: Operator) -> FileMetaDataReader { - FileMetaDataReader::new( - CacheManager::instance().get_file_meta_data_cache(), - LoaderWrapper(dal), - ) - } -} - -#[async_trait::async_trait] -impl Loader for LoaderWrapper { - #[async_backtrace::framed] - async fn load(&self, params: &LoadParams) -> Result { - let size = match params.len_hint { - Some(v) => v, - None => self.0.stat(¶ms.location).await?.content_length(), - }; - let reader = self.0.reader(¶ms.location).await?; - - read_metadata_async(reader, size).await.map_err(|err| { - ErrorCode::Internal(format!( - "read file meta failed, {}, {:?}", - params.location, err - )) - }) - } -} diff --git a/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs b/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs deleted file mode 100644 index c059f2c6165f..000000000000 --- a/src/query/storages/hive/hive/src/hive_parquet_block_reader.rs +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
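The deleted `MetaDataReader` above loaded parquet2 `FileMetaData` through the old `get_file_meta_data_cache`; its replacements go through the renamed `ParquetMetaDataCache`, whose value type is parquet-rs `parquet::file::metadata::ParquetMetaData` (see the `caches.rs` changes earlier in this diff). For orientation, a minimal standalone sketch of producing that type with the `parquet` crate from a local file (the cache and object-store wiring are as shown in the diff, not here):

```rust
use std::fs::File;

use parquet::file::metadata::ParquetMetaData;
use parquet::file::reader::{FileReader, SerializedFileReader};

/// Read the footer metadata of a local parquet file; this is the value type
/// now held by `ParquetMetaDataCache`.
fn read_parquet_metadata(path: &str) -> Result<ParquetMetaData, Box<dyn std::error::Error>> {
    let file = File::open(path)?;
    let reader = SerializedFileReader::new(file)?;
    // `metadata()` returns a reference; clone to obtain an owned value for caching.
    Ok(reader.metadata().clone())
}
```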
- -use std::sync::Arc; - -use databend_common_arrow::arrow::datatypes::Field; -use databend_common_arrow::arrow::datatypes::Schema; -use databend_common_arrow::arrow::io::parquet::read::column_iter_to_arrays; -use databend_common_arrow::arrow::io::parquet::read::ArrayIter; -use databend_common_arrow::arrow::io::parquet::read::RowGroupDeserializer; -use databend_common_arrow::parquet::metadata::ColumnChunkMetaData; -use databend_common_arrow::parquet::metadata::FileMetaData; -use databend_common_arrow::parquet::metadata::RowGroupMetaData; -use databend_common_arrow::parquet::read::BasicDecompressor; -use databend_common_arrow::parquet::read::PageReader; -use databend_common_base::base::tokio::sync::Semaphore; -use databend_common_catalog::plan::Projection; -use databend_common_exception::ErrorCode; -use databend_common_exception::Result; -use databend_common_expression::DataBlock; -use databend_common_expression::DataSchema; -use databend_common_expression::DataSchemaRef; -use databend_common_expression::TableField; -use databend_common_expression::TableSchemaRef; -use databend_storages_common_cache::LoadParams; -use opendal::Operator; - -use crate::hive_partition::HivePartInfo; -use crate::HivePartitionFiller; -use crate::MetaDataReader; - -#[derive(Clone)] -pub struct HiveBlockReader { - operator: Operator, - projection: Vec, - arrow_schema: Arc, - projected_schema: DataSchemaRef, - // have partition columns - output_schema: DataSchemaRef, - hive_partition_filler: Option, - chunk_size: usize, -} - -pub struct DataBlockDeserializer { - deserializer: RowGroupDeserializer, - drained: bool, -} - -impl DataBlockDeserializer { - fn new(deserializer: RowGroupDeserializer) -> Self { - let num_rows = deserializer.num_rows(); - Self { - deserializer, - drained: num_rows == 0, - } - } - - fn next_block( - &mut self, - schema: &DataSchema, - filler: &Option, - part_info: &HivePartInfo, - ) -> Result> { - if self.drained { - return Ok(None); - }; - - let opt = self.deserializer.next().transpose()?; - if let Some(chunk) = opt { - // If the `Vec>` we have passed into the `RowGroupDeserializer` - // is empty, the deserializer will returns an empty chunk as well(since now rows are consumed). - // In this case, mark self as drained. 
- if chunk.is_empty() { - self.drained = true; - } - - let block: DataBlock = DataBlock::from_arrow_chunk(&chunk, schema)?; - - return if let Some(filler) = filler { - let num_rows = self.deserializer.num_rows(); - let filled = filler.fill_data(block, part_info, num_rows)?; - Ok(Some(filled)) - } else { - Ok(Some(block)) - }; - } - - self.drained = true; - Ok(None) - } -} - -impl HiveBlockReader { - pub fn create( - operator: Operator, - schema: TableSchemaRef, - projection: Projection, - partition_keys: &Option>, - chunk_size: usize, - ) -> Result> { - let original_projection = match projection { - Projection::Columns(projection) => projection, - Projection::InnerColumns(b) => { - return Err(ErrorCode::Unimplemented(format!( - "not support inter columns in hive block reader,{:?}", - b - ))); - } - }; - let output_schema = - DataSchemaRef::new(DataSchema::from(&schema.project(&original_projection))); - - let (projection, partition_fields) = filter_hive_partition_from_partition_keys( - schema.clone(), - original_projection, - partition_keys, - ); - - let hive_partition_filler = if !partition_fields.is_empty() { - Some(HivePartitionFiller::create( - schema.clone(), - partition_fields, - )) - } else { - None - }; - - let projected_schema = DataSchemaRef::new(DataSchema::from(&schema.project(&projection))); - let arrow_schema = schema.as_ref().into(); - Ok(Arc::new(HiveBlockReader { - operator, - projection, - projected_schema, - output_schema, - arrow_schema: Arc::new(arrow_schema), - hive_partition_filler, - chunk_size, - })) - } - - fn to_deserialize( - column_meta: &ColumnChunkMetaData, - chunk: Vec, - rows: usize, - field: Field, - chunk_size: usize, - ) -> Result> { - let primitive_type = column_meta.descriptor().descriptor.primitive_type.clone(); - let pages = PageReader::new( - std::io::Cursor::new(chunk), - column_meta, - Arc::new(|_, _| true), - vec![], - usize::MAX, - ); - - let decompressor = BasicDecompressor::new(pages, vec![]); - Ok(column_iter_to_arrays( - vec![decompressor], - vec![&primitive_type], - field, - Some(chunk_size), - rows, - )?) - } - - pub fn get_parquet_column_metadata<'a>( - row_group: &'a RowGroupMetaData, - field_name: &str, - ) -> Result<&'a ColumnChunkMetaData> { - let column_meta: Vec<&ColumnChunkMetaData> = row_group - .columns() - .iter() - .filter(|x| { - x.descriptor().path_in_schema[0].to_lowercase() == field_name.to_lowercase() - }) - .collect(); - if column_meta.is_empty() { - return Err(ErrorCode::ParquetFileInvalid(format!( - "couldn't find column:{} in parquet file", - field_name - ))); - } else if column_meta.len() > 1 { - return Err(ErrorCode::ParquetFileInvalid(format!( - "find multi column:{} in parquet file", - field_name - ))); - } - Ok(column_meta[0]) - } - - #[async_backtrace::framed] - async fn read_column( - op: Operator, - path: String, - offset: u64, - length: u64, - semaphore: Arc, - ) -> Result> { - let handler = databend_common_base::runtime::spawn(async move { - let chunk = op - .read_with(&path) - .range(offset..offset + length) - .await? 
- .to_vec(); - - let _semaphore_permit = semaphore.acquire().await.unwrap(); - Ok(chunk) - }); - - match handler.await { - Ok(Ok(data)) => Ok(data), - Ok(Err(cause)) => Err(cause), - Err(cause) => Err(ErrorCode::TokioError(format!( - "Cannot join future {:?}", - cause - ))), - } - } - - #[async_backtrace::framed] - pub async fn read_meta_data( - &self, - dal: Operator, - filename: &str, - filesize: u64, - ) -> Result> { - let reader = MetaDataReader::meta_data_reader(dal); - - let load_params = LoadParams { - location: filename.to_owned(), - len_hint: Some(filesize), - ver: 0, - put_cache: true, - }; - - reader.read(&load_params).await - } - - #[async_backtrace::framed] - pub async fn read_columns_data( - &self, - row_group: &RowGroupMetaData, - part: &HivePartInfo, - ) -> Result>> { - let mut join_handlers = Vec::with_capacity(self.projection.len()); - - let semaphore = Arc::new(Semaphore::new(10)); - for index in &self.projection { - let field = &self.arrow_schema.fields[*index]; - let column_meta = Self::get_parquet_column_metadata(row_group, &field.name)?; - let (start, len) = column_meta.byte_range(); - - join_handlers.push(Self::read_column( - self.operator.clone(), - part.filename.to_string(), - start, - len, - semaphore.clone(), - )); - } - - futures::future::try_join_all(join_handlers).await - } - - pub fn create_rowgroup_deserializer( - &self, - chunks: Vec>, - row_group: &RowGroupMetaData, - ) -> Result { - if self.projection.len() != chunks.len() { - return Err(ErrorCode::Internal( - "Columns chunk len must be equals projections len.", - )); - } - - let mut columns_array_iter = Vec::with_capacity(self.projection.len()); - - for (index, column_chunk) in chunks.into_iter().enumerate() { - let idx = self.projection[index]; - let field = self.arrow_schema.fields[idx].clone(); - let column_meta = Self::get_parquet_column_metadata(row_group, &field.name)?; - - columns_array_iter.push(Self::to_deserialize( - column_meta, - column_chunk, - row_group.num_rows(), - field, - self.chunk_size, - )?); - } - - let num_row = row_group.num_rows(); - let deserializer = RowGroupDeserializer::new(columns_array_iter, num_row, None); - Ok(DataBlockDeserializer::new(deserializer)) - } - - pub fn create_data_block( - &self, - row_group_iterator: &mut DataBlockDeserializer, - part: &HivePartInfo, - ) -> Result> { - row_group_iterator - .next_block(&self.projected_schema, &self.hive_partition_filler, part) - .map_err(|e| e.add_message(format!(" filename of hive part {}", part.filename))) - } - - pub fn get_all_datablocks( - &self, - mut rowgroup_deserializer: DataBlockDeserializer, - part: &HivePartInfo, - ) -> Result> { - let mut all_blocks = vec![]; - - while let Some(datablock) = self.create_data_block(&mut rowgroup_deserializer, part)? 
{ - all_blocks.push(datablock); - } - - Ok(all_blocks) - } - - pub fn get_output_schema(&self) -> DataSchemaRef { - self.output_schema.clone() - } -} - -pub fn filter_hive_partition_from_partition_keys( - schema: TableSchemaRef, - projections: Vec, - partition_keys: &Option>, -) -> (Vec, Vec) { - match partition_keys { - Some(partition_keys) => { - let mut not_partitions = vec![]; - let mut partition_fields = vec![]; - for i in projections.into_iter() { - let field = schema.field(i); - if !partition_keys.contains(field.name()) { - not_partitions.push(i); - } else { - partition_fields.push(field.clone()); - } - } - (not_partitions, partition_fields) - } - None => (projections, vec![]), - } -} diff --git a/src/query/storages/hive/hive/src/hive_partition.rs b/src/query/storages/hive/hive/src/hive_partition.rs index e9cd1a597d84..b8a615444ca0 100644 --- a/src/query/storages/hive/hive/src/hive_partition.rs +++ b/src/query/storages/hive/hive/src/hive_partition.rs @@ -17,22 +17,20 @@ use std::collections::hash_map::DefaultHasher; use std::collections::HashMap; use std::hash::Hash; use std::hash::Hasher; -use std::ops::Range; use std::sync::Arc; use databend_common_catalog::plan::PartInfo; use databend_common_catalog::plan::PartInfoPtr; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::Scalar; #[derive(serde::Serialize, serde::Deserialize, PartialEq, Eq, Debug, Clone)] pub struct HivePartInfo { // file location, like /usr/hive/warehouse/ssb.db/customer.table/c_region=ASIA/c_nation=CHINA/f00.parquet pub filename: String, // partition values, like 'c_region=ASIA/c_nation=CHINA' - pub partitions: Option, - // only the data in ranges belong to this partition - pub range: Range, + pub partitions: Vec, // file size pub filesize: u64, } @@ -57,24 +55,16 @@ impl PartInfo for HivePartInfo { } impl HivePartInfo { - pub fn create( - filename: String, - partitions: Option, - range: Range, - filesize: u64, - ) -> Arc> { - Arc::new(Box::new(HivePartInfo { + pub fn create(filename: String, partitions: Vec, filesize: u64) -> Self { + HivePartInfo { filename, partitions, - range, filesize, - })) + } } - pub fn get_partition_map(&self) -> HashMap { - self.partitions - .as_ref() - .map_or_else(HashMap::new, |s| parse_hive_partitions(s)) + pub fn into_part_ptr(self) -> PartInfoPtr { + Arc::new(Box::new(self)) } pub fn from_part(info: &PartInfoPtr) -> Result<&HivePartInfo> { @@ -90,7 +80,9 @@ pub fn parse_hive_partitions(partitions: &str) -> HashMap { let parts = partitions.split('/').collect::>(); for part in parts { let kv = part.split('=').collect::>(); - partition_map.insert(kv[0].to_string(), kv[1].to_string()); + if kv.len() == 2 { + partition_map.insert(kv[0].to_string(), kv[1].to_string()); + } } partition_map } diff --git a/src/query/storages/hive/hive/src/hive_partition_filler.rs b/src/query/storages/hive/hive/src/hive_partition_filler.rs index 41e1c51bfacd..dca5045724a7 100644 --- a/src/query/storages/hive/hive/src/hive_partition_filler.rs +++ b/src/query/storages/hive/hive/src/hive_partition_filler.rs @@ -14,14 +14,10 @@ use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::AnyType; -use databend_common_expression::BlockEntry; -use databend_common_expression::DataBlock; +use databend_common_expression::Scalar; use databend_common_expression::TableField; -use databend_common_expression::TableSchemaRef; -use databend_common_expression::Value; -use 
crate::hive_partition::HivePartInfo; +use crate::hive_partition::parse_hive_partitions; use crate::utils::str_field_to_scalar; #[derive(Debug, Clone)] @@ -30,27 +26,20 @@ pub struct HivePartitionFiller { } impl HivePartitionFiller { - pub fn create(_schema: TableSchemaRef, partition_fields: Vec) -> Self { + pub fn create(partition_fields: Vec) -> Self { HivePartitionFiller { partition_fields } } - fn generate_value( - &self, - _num_rows: usize, - value: String, - field: &TableField, - ) -> Result> { - let value = str_field_to_scalar(&value, &field.data_type().into())?; - Ok(Value::Scalar(value)) - } - - fn extract_partition_values(&self, hive_part: &HivePartInfo) -> Result> { - let partition_map = hive_part.get_partition_map(); + pub fn extract_scalars(&self, locations: &str) -> Result> { + let partition_map = parse_hive_partitions(locations); let mut partition_values = vec![]; for field in self.partition_fields.iter() { match partition_map.get(field.name()) { - Some(v) => partition_values.push(v.to_string()), + Some(v) => { + let value = str_field_to_scalar(v.as_str(), &field.data_type().into())?; + partition_values.push(value); + } None => { return Err(ErrorCode::TableInfoError(format!( "couldn't find hive partition info :{}, hive partition maps:{:?}", @@ -62,29 +51,4 @@ impl HivePartitionFiller { } Ok(partition_values) } - - pub fn fill_data( - &self, - data_block: DataBlock, - part: &HivePartInfo, - origin_num_rows: usize, - ) -> Result { - let data_values = self.extract_partition_values(part)?; - - // create column, create datafield - let mut num_rows = data_block.num_rows(); - if num_rows == 0 { - num_rows = origin_num_rows; - } - - let mut columns = data_block.columns().to_vec(); - - for (i, field) in self.partition_fields.iter().enumerate() { - let value = &data_values[i]; - let column = self.generate_value(num_rows, value.clone(), field)?; - columns.push(BlockEntry::new(field.data_type().into(), column)); - } - - Ok(DataBlock::new(columns, num_rows)) - } } diff --git a/src/query/storages/hive/hive/src/hive_partition_pruner.rs b/src/query/storages/hive/hive/src/hive_partition_pruner.rs deleted file mode 100644 index 4b75dfbb868a..000000000000 --- a/src/query/storages/hive/hive/src/hive_partition_pruner.rs +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
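With the reworked filler above, partition values are recovered as `Scalar`s straight from the `k=v` segments of the file location (via `parse_hive_partitions`, which now skips malformed segments instead of indexing past the end). A hedged usage sketch from inside the hive storage crate, assuming a table partitioned by a single string column `c_region` (the schema here is illustrative, not from the codebase):

```rust
use databend_common_exception::Result;
use databend_common_expression::{Scalar, TableDataType, TableField};

use crate::HivePartitionFiller;

/// Illustrative only: recover the partition scalars encoded in a file path.
fn partition_scalars_for(path: &str) -> Result<Vec<Scalar>> {
    let filler = HivePartitionFiller::create(vec![TableField::new(
        "c_region",
        TableDataType::String,
    )]);
    // ".../c_region=ASIA/f00.parquet" yields a single Scalar::String("ASIA");
    // a partition key missing from the path is reported as a TableInfoError.
    filler.extract_scalars(path)
}
```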
- -use std::collections::HashMap; -use std::sync::Arc; -use std::vec; - -use databend_common_catalog::table_context::TableContext; -use databend_common_exception::Result; -use databend_common_expression::Expr; -use databend_common_expression::TableSchema; -use databend_storages_common_index::RangeIndex; -use databend_storages_common_table_meta::meta::ColumnStatistics; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; -use log::debug; - -use crate::utils::str_field_to_scalar; - -pub struct HivePartitionPruner { - pub ctx: Arc, - pub filter: Expr, - // pub partitions: Vec, - pub partition_schema: Arc, - pub full_schema: Arc, -} - -impl HivePartitionPruner { - pub fn create( - ctx: Arc, - filter: Expr, - partition_schema: Arc, - full_schema: Arc, - ) -> Self { - HivePartitionPruner { - ctx, - filter, - partition_schema, - full_schema, - } - } - - pub fn get_column_stats(&self, partitions: &Vec) -> Result> { - let mut data = Vec::with_capacity(partitions.len()); - for partition in partitions { - let mut stats = HashMap::new(); - for (index, singe_value) in partition.split('/').enumerate() { - let kv = singe_value.split('=').collect::>(); - let field = self.partition_schema.fields()[index].clone(); - let scalar = str_field_to_scalar(kv[1], &field.data_type().into())?; - let null_count = u64::from(scalar.is_null()); - let column_stats = - ColumnStatistics::new(scalar.clone(), scalar, null_count, 0, None); - stats.insert(index as u32, column_stats); - } - data.push(stats); - } - - Ok(data) - } - - pub fn prune(&self, partitions: Vec) -> Result> { - let range_filter = RangeIndex::try_create( - self.ctx.get_function_context()?, - &self.filter, - self.full_schema.clone(), - StatisticsOfColumns::default(), - )?; - let column_stats = self.get_column_stats(&partitions)?; - let mut filtered_partitions = vec![]; - for (idx, stats) in column_stats.into_iter().enumerate() { - let block_stats = stats - .iter() - .map(|(k, v)| { - let partition_col_name = self.partition_schema.field(*k as usize).name(); - let index = self.full_schema.index_of(partition_col_name).unwrap(); - - (index as u32, v.clone()) - }) - .collect(); - - if range_filter.apply(&block_stats, |_| false)? { - filtered_partitions.push(partitions[idx].clone()); - } - } - debug!("hive pruned partitions: {:?}", filtered_partitions); - Ok(filtered_partitions) - } -} diff --git a/src/query/storages/hive/hive/src/hive_table.rs b/src/query/storages/hive/hive/src/hive_table.rs index 07d30fb070d6..12fe4e01274b 100644 --- a/src/query/storages/hive/hive/src/hive_table.rs +++ b/src/query/storages/hive/hive/src/hive_table.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
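The Hive-specific `HivePartitionPruner` deleted above is superseded by the shared `PartitionPruner` from `databend-storages-common-pruner`, which is generic over the partition representation: callers supply a `FetchPartitionScalars<T>` impl that turns one partition item into scalars ordered like the partition schema (Delta does this with `DeltaToScalar`, Hive with `HiveFetchPartitionScalars`). A hedged sketch of such an impl for a hypothetical map-backed partition type (`MyPart` and `MyPartToScalar` are illustrative names, not part of the codebase):

```rust
use std::collections::HashMap;

use databend_common_catalog::partition_columns::str_to_scalar;
use databend_common_exception::{ErrorCode, Result};
use databend_common_expression::{Scalar, TableField};
use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars;

/// Hypothetical partition item: partition column name -> raw string value (None = NULL).
pub struct MyPart {
    pub values: HashMap<String, Option<String>>,
}

pub struct MyPartToScalar;

impl FetchPartitionScalars<MyPart> for MyPartToScalar {
    fn eval(item: &MyPart, partition_fields: &[TableField]) -> Result<Vec<Scalar>> {
        partition_fields
            .iter()
            .map(|f| match item.values.get(f.name()) {
                Some(Some(v)) => str_to_scalar(v, &f.data_type().into()),
                Some(None) => Ok(Scalar::Null),
                None => Err(ErrorCode::BadArguments(format!(
                    "partition value for column {} not found",
                    f.name()
                ))),
            })
            .collect()
    }
}
```

A caller then passes the impl as the turbofish parameter to `PartitionPruner::prune`, exactly as the Delta and Hive tables do elsewhere in this diff.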
-use std::collections::HashSet; use std::sync::Arc; use std::time::Instant; use async_recursion::async_recursion; use databend_common_base::base::tokio::sync::Semaphore; use databend_common_catalog::catalog_kind::CATALOG_HIVE; +use databend_common_catalog::partition_columns::get_pushdown_without_partition_columns; use databend_common_catalog::plan::DataSourcePlan; +use databend_common_catalog::plan::ParquetReadOptions; use databend_common_catalog::plan::PartStatistics; use databend_common_catalog::plan::Partitions; use databend_common_catalog::plan::PartitionsShuffleKind; -use databend_common_catalog::plan::Projection; use databend_common_catalog::plan::PushDownInfo; use databend_common_catalog::table::NavigationPoint; use databend_common_catalog::table::Table; @@ -35,10 +35,10 @@ use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; -use databend_common_expression::DataSchemaRefExt; use databend_common_expression::Expr; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableField; use databend_common_expression::TableSchema; -use databend_common_expression::TableSchemaRef; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_meta_app::schema::TableInfo; use databend_common_meta_app::schema::UpdateStreamMetaReq; @@ -46,14 +46,14 @@ use databend_common_meta_app::schema::UpsertTableCopiedFileReq; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::ProcessorPtr; use databend_common_pipeline_core::Pipeline; -use databend_common_pipeline_core::SourcePipeBuilder; use databend_common_pipeline_sources::SyncSource; use databend_common_pipeline_sources::SyncSourcer; use databend_common_storage::init_operator; use databend_common_storage::DataOperator; -use databend_storages_common_index::RangeIndex; +use databend_common_storages_parquet::ParquetRSPruner; +use databend_common_storages_parquet::ParquetRSReaderBuilder; +use databend_storages_common_pruner::partition_prunner::PartitionPruner; use databend_storages_common_table_meta::meta::SnapshotId; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; use databend_storages_common_table_meta::table::ChangeType; use futures::TryStreamExt; use log::info; @@ -63,13 +63,11 @@ use opendal::Metakey; use opendal::Operator; use super::hive_catalog::HiveCatalog; -use super::hive_partition_pruner::HivePartitionPruner; use super::hive_table_options::HiveTableOptions; -use crate::filter_hive_partition_from_partition_keys; -use crate::hive_parquet_block_reader::HiveBlockReader; use crate::hive_table_source::HiveTableSource; -use crate::HiveBlockFilter; -use crate::HiveFileSplitter; +use crate::utils::HiveFetchPartitionScalars; +use crate::HivePartInfo; +use crate::HivePartitionFiller; pub const HIVE_TABLE_ENGINE: &str = "hive"; pub const HIVE_DEFAULT_PARTITION: &str = "__HIVE_DEFAULT_PARTITION__"; @@ -96,283 +94,121 @@ impl HiveTable { }) } - fn get_block_filter( - &self, - ctx: Arc, - push_downs: &Option, - ) -> Result> { - let enable_hive_parquet_predict_pushdown = ctx - .get_settings() - .get_enable_hive_parquet_predict_pushdown()?; - - if enable_hive_parquet_predict_pushdown == 0 { - return Ok(Arc::new(HiveBlockFilter::create( - None, - vec![], - self.table_info.schema(), - ))); - } - - let filter_expression = push_downs.as_ref().and_then(|extra| { - extra - .filters - .as_ref() - .map(|filter| 
filter.filter.as_expr(&BUILTIN_FUNCTIONS)) - }); - - let range_filter = match filter_expression { - Some(expr) => Some(RangeIndex::try_create( - ctx.get_function_context()?, - &expr, - self.table_info.schema(), - StatisticsOfColumns::default(), - )?), - _ => None, - }; - - let projection = self.get_projections(push_downs)?; - let mut projection_fields = vec![]; - let schema = self.table_info.schema(); - for i in projection.into_iter() { - let field = schema.field(i); - projection_fields.push(field.clone()); - } - - Ok(Arc::new(HiveBlockFilter::create( - range_filter, - projection_fields, - self.table_info.schema(), - ))) - } - - fn is_prewhere_column_partition_keys( - &self, - schema: TableSchemaRef, - push_downs: &Option, - ) -> Result { - match push_downs { - None => Ok(false), - Some(p) => match &p.prewhere { - None => Ok(false), - Some(prewhere_info) => match &prewhere_info.prewhere_columns { - Projection::Columns(projections) => { - let partition_keys = &self.table_options.partition_keys; - let (not_partitions, _) = filter_hive_partition_from_partition_keys( - schema, - projections.clone(), - partition_keys, - ); - Ok(not_partitions.is_empty()) - } - Projection::InnerColumns(_) => { - Err(ErrorCode::Unimplemented("not support intercolumns")) - } - }, - }, - } + fn partition_fields(&self) -> Vec { + self.schema() + .fields() + .iter() + .filter(|field| { + self.table_options + .partition_keys + .as_ref() + .map(|ks| ks.contains(&field.name)) + .unwrap_or_default() + }) + .cloned() + .collect() } - #[inline] - pub fn do_read2( + fn no_partition_schema(&self) -> Arc { + let non_partition_fields = self + .schema() + .fields() + .iter() + .filter(|field| { + !self + .table_options + .partition_keys + .as_ref() + .map(|ks| ks.contains(&field.name)) + .unwrap_or_default() + }) + .cloned() + .collect(); + Arc::new(TableSchema::new(non_partition_fields)) + } + + pub fn do_read_data( &self, ctx: Arc, plan: &DataSourcePlan, pipeline: &mut Pipeline, ) -> Result<()> { - let push_downs = &plan.push_downs; - let chunk_size = ctx.get_settings().get_hive_parquet_chunk_size()? as usize; - let parts_len = plan.parts.len(); let max_threads = ctx.get_settings().get_max_threads()? 
as usize; let max_threads = std::cmp::min(parts_len, max_threads); + let table_schema = self.no_partition_schema(); - let mut source_builder = SourcePipeBuilder::create(); - let delay_timer = if self.is_simple_select_query(plan) { - // 0, 0, 200, 200, 400,400 - |x: usize| (x / 2).min(10) * 200 - } else { - |_| 0 - }; + let arrow_schema = table_schema.as_ref().into(); + let leaf_fields = Arc::new(table_schema.leaf_fields()); - let output_schema = Arc::new(DataSchema::from(plan.schema())); + let mut read_options = ParquetReadOptions::default(); - let prewhere_all_partitions = - self.is_prewhere_column_partition_keys(self.table_info.schema(), &plan.push_downs)?; - // create prewhere&remaindata block reader - let prewhere_reader = - self.build_prewhere_reader(plan, chunk_size, prewhere_all_partitions)?; - let remain_reader = self.build_remain_reader(plan, chunk_size, prewhere_all_partitions)?; - let prewhere_filter = - self.build_prewhere_filter_executor(plan, prewhere_reader.get_output_schema())?; - - let hive_block_filter = self.get_block_filter(ctx.clone(), push_downs)?; - - let mut src_fields = prewhere_reader.get_output_schema().fields().clone(); - if let Some(reader) = remain_reader.as_ref() { - let remain_field = reader.get_output_schema().fields().clone(); - src_fields.extend_from_slice(&remain_field); + if !ctx.get_settings().get_enable_parquet_page_index()? { + read_options = read_options.with_prune_pages(false); } - let src_schema = DataSchemaRefExt::create(src_fields); - for index in 0..std::cmp::max(1, max_threads) { - let output = OutputPort::create(); - source_builder.add_source( - output.clone(), - HiveTableSource::create( - ctx.clone(), - self.dal.clone(), - output, - prewhere_reader.clone(), - remain_reader.clone(), - prewhere_filter.clone(), - delay_timer(index), - hive_block_filter.clone(), - src_schema.clone(), - output_schema.clone(), - )?, - ); + if !ctx.get_settings().get_enable_parquet_rowgroup_pruning()? { + read_options = read_options.with_prune_row_groups(false); } - pipeline.add_pipe(source_builder.finalize()); - Ok(()) - } - - // simple select query is the sql likes `select * from xx limit 10` or - // `select * from xx where p_date = '20220201' limit 10` where p_date is a partition column; - // we just need to read a few data from table - fn is_simple_select_query(&self, plan: &DataSourcePlan) -> bool { - // couldn't get groupby order by info - if let Some(PushDownInfo { - filters, - limit: Some(lm), - .. - }) = &plan.push_downs - { - if *lm > 100000 { - return false; - } - - // filter out the partition column related expressions - let partition_keys = self.get_partition_key_sets(); - let columns = filters - .as_ref() - .map(|f| { - let expr = f.filter.as_expr(&BUILTIN_FUNCTIONS); - expr.column_refs().keys().cloned().collect::>() - }) - .unwrap_or_default(); - - if columns.difference(&partition_keys).count() == 0 { - return true; - } + if !ctx.get_settings().get_enable_parquet_prewhere()? 
{ + read_options = read_options.with_do_prewhere(false); } - false - } - fn get_partition_key_sets(&self) -> HashSet { - self.table_options + let pruner = ParquetRSPruner::try_create( + ctx.get_function_context()?, + table_schema.clone(), + leaf_fields, + &plan.push_downs, + read_options, + self.table_options + .partition_keys + .clone() + .unwrap_or_default(), + )?; + + let op = self.dal.clone(); + + let partition_keys = self + .table_options .partition_keys .clone() - .unwrap_or_default() - .into_iter() - .collect() - } - - fn get_projections(&self, push_downs: &Option) -> Result> { - if let Some(PushDownInfo { - projection: Some(prj), - .. - }) = push_downs - { - match prj { - Projection::Columns(indices) => Ok(indices.clone()), - Projection::InnerColumns(_) => Err(ErrorCode::Unimplemented( - "does not support projection inner columns", - )), - } + .unwrap_or_default(); + + let partition_field_indexes: Result> = partition_keys + .iter() + .map(|name| self.schema().index_of(name)) + .collect(); + let partition_field_indexes = partition_field_indexes?; + let push_downs = if let Some(ref p) = plan.push_downs { + Some(get_pushdown_without_partition_columns( + p.clone(), + &partition_field_indexes[..], + )?) } else { - let col_ids = (0..self.table_info.schema().fields().len()).collect::>(); - Ok(col_ids) - } - } + None + }; + let mut builder = + ParquetRSReaderBuilder::create(ctx.clone(), op, table_schema, arrow_schema)? + .with_options(read_options) + .with_push_downs(push_downs.as_ref()) + .with_pruner(Some(pruner)) + .with_partition_columns(partition_keys); - // Build the prewhere reader. - fn build_prewhere_reader( - &self, - plan: &DataSourcePlan, - chunk_size: usize, - prewhere_all_partitions: bool, - ) -> Result> { - match ( - prewhere_all_partitions, - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()), - ) { - (true, _) | (_, None) => { - let projection = PushDownInfo::projection_of_push_downs( - &plan.schema(), - plan.push_downs.as_ref(), - ); - HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - projection, - &self.table_options.partition_keys, - chunk_size, - ) - } - (false, Some(v)) => HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - v.prewhere_columns, - &self.table_options.partition_keys, - chunk_size, - ), - } - } + let parquet_reader = Arc::new(builder.build_full_reader()?); - // Build the prewhere filter executor. - fn build_prewhere_filter_executor( - &self, - plan: &DataSourcePlan, - schema: DataSchemaRef, - ) -> Result>> { - Ok(Arc::new( - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()).map(|v| { - v.filter - .as_expr(&BUILTIN_FUNCTIONS) - .project_column_ref(|name| schema.index_of(name).unwrap()) - }), - )) - } - - // Build the remain reader. 
- fn build_remain_reader( - &self, - plan: &DataSourcePlan, - chunk_size: usize, - prewhere_all_partitions: bool, - ) -> Result>> { - Ok( - match ( - prewhere_all_partitions, - PushDownInfo::prewhere_of_push_downs(plan.push_downs.as_ref()), - ) { - (true, _) | (_, None) => Arc::new(None), - (false, Some(v)) => { - if v.remain_columns.is_empty() { - Arc::new(None) - } else { - let reader = HiveBlockReader::create( - self.dal.clone(), - self.table_info.schema(), - v.remain_columns, - &self.table_options.partition_keys, - chunk_size, - )?; - Arc::new(Some((*reader).clone())) - } - } + let output_schema = Arc::new(DataSchema::from(plan.schema())); + pipeline.add_source( + |output| { + HiveTableSource::create( + ctx.clone(), + output, + output_schema.clone(), + parquet_reader.clone(), + self.partition_fields(), + ) }, + max_threads.max(1), ) } @@ -415,9 +251,14 @@ impl HiveTable { if let Some(expr) = filter_expression { let partition_schemas = self.get_column_schemas(partition_keys.clone())?; - let partition_pruner = - HivePartitionPruner::create(ctx, expr, partition_schemas, self.table_info.schema()); - partition_names = partition_pruner.prune(partition_names)?; + let partition_pruner = PartitionPruner::try_create( + ctx.get_function_context()?, + expr, + partition_schemas, + self.table_info.schema(), + )?; + partition_names = + partition_pruner.prune::(partition_names)?; } trace!( @@ -479,7 +320,7 @@ impl HiveTable { async fn list_files_from_dirs( &self, dirs: Vec<(String, Option)>, - ) -> Result> { + ) -> Result> { let sem = Arc::new(Semaphore::new(60)); let mut tasks = Vec::with_capacity(dirs.len()); @@ -494,12 +335,9 @@ impl HiveTable { } let mut all_files = vec![]; - for (task, partition) in tasks { + for (task, _) in tasks { let files = task.await.unwrap()?; - for mut file in files { - file.add_partition(partition.clone()); - all_files.push(file); - } + all_files.extend_from_slice(&files); } Ok(all_files) @@ -516,11 +354,14 @@ impl HiveTable { let dirs = self.get_query_locations(ctx.clone(), &push_downs).await?; trace!("{} query locations: {:?}", dirs.len(), dirs); - let all_files = self.list_files_from_dirs(dirs).await?; - trace!("{} hive files: {:?}", all_files.len(), all_files); + let dir_len = dirs.len(); + let filler = HivePartitionFiller::create(self.partition_fields()); + let mut partitions = self.list_files_from_dirs(dirs).await?; + for partition in partitions.iter_mut() { + partition.partitions = filler.extract_scalars(&partition.filename)?; + } - let splitter = HiveFileSplitter::create(128 * 1024 * 1024_u64); - let partitions = splitter.get_splits(all_files); + trace!("{} hive files: {:?}", partitions.len(), partitions); info!( "read partition, partition num:{}, elapsed:{:?}", @@ -528,8 +369,26 @@ impl HiveTable { start.elapsed() ); + let estimated_read_rows: f64 = partitions + .iter() + .map(|s| s.filesize as f64 / (self.schema().num_fields() * 8) as f64) + .sum(); + + let read_bytes = partitions.iter().map(|s| s.filesize as usize).sum(); + let stats = PartStatistics::new_estimated( + None, + estimated_read_rows as _, + read_bytes, + partitions.len(), + dir_len, + ); + let partitions = partitions + .into_iter() + .map(HivePartInfo::into_part_ptr) + .collect(); + Ok(( - Default::default(), + stats, Partitions::create(PartitionsShuffleKind::Seq, partitions), )) } @@ -578,7 +437,7 @@ impl Table for HiveTable { pipeline: &mut Pipeline, _put_cache: bool, ) -> Result<()> { - self.do_read2(ctx, plan, pipeline) + self.do_read_data(ctx, plan, pipeline) } fn commit_insertion( @@ -665,27 
+524,6 @@ impl SyncSource for HiveSource { } } -#[derive(Debug)] -pub struct HiveFileInfo { - pub filename: String, - pub length: u64, - pub partition: Option, -} - -impl HiveFileInfo { - pub fn create(filename: String, length: u64) -> Self { - HiveFileInfo { - filename, - length, - partition: None, - } - } - - pub fn add_partition(&mut self, partition: Option) { - self.partition = partition; - } -} - // convert hdfs path format to opendal path formatted // // there are two rules: @@ -731,7 +569,7 @@ async fn list_files_from_dir( operator: Operator, location: String, sem: Arc, -) -> Result> { +) -> Result> { let (files, dirs) = do_list_files_from_dir(operator.clone(), location, sem.clone()).await?; let mut all_files = files; let mut tasks = Vec::with_capacity(dirs.len()); @@ -759,7 +597,7 @@ async fn do_list_files_from_dir( operator: Operator, location: String, sem: Arc, -) -> Result<(Vec, Vec)> { +) -> Result<(Vec, Vec)> { let _a = sem.acquire().await.unwrap(); let mut m = operator .lister_with(&location) @@ -779,9 +617,9 @@ async fn do_list_files_from_dir( match meta.mode() { EntryMode::FILE => { - let filename = path.to_string(); + let location = path.to_string(); let length = meta.content_length(); - all_files.push(HiveFileInfo::create(filename, length)); + all_files.push(HivePartInfo::create(location, vec![], length)); } EntryMode::DIR => { all_dirs.push(path.to_string()); diff --git a/src/query/storages/hive/hive/src/hive_table_source.rs b/src/query/storages/hive/hive/src/hive_table_source.rs index db18d0180cc5..ae961f73e2d2 100644 --- a/src/query/storages/hive/hive/src/hive_table_source.rs +++ b/src/query/storages/hive/hive/src/hive_table_source.rs @@ -14,255 +14,93 @@ use std::any::Any; use std::sync::Arc; -use std::vec; -use databend_common_base::base::tokio::time::sleep; -use databend_common_base::base::tokio::time::Duration; use databend_common_base::base::Progress; use databend_common_base::base::ProgressValues; use databend_common_base::runtime::profile::Profile; use databend_common_base::runtime::profile::ProfileStatisticsName; -use databend_common_catalog::plan::PartInfoPtr; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::filter_helper::FilterHelpers; -use databend_common_expression::types::BooleanType; -use databend_common_expression::types::DataType; +use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; +use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; -use databend_common_expression::Evaluator; -use databend_common_expression::Expr; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableField; use databend_common_expression::Value; -use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; -use log::debug; -use opendal::Operator; +use databend_common_storages_parquet::ParquetFileReader; +use databend_common_storages_parquet::ParquetRSFullReader; +use parquet::arrow::async_reader::ParquetRecordBatchStream; -use crate::hive_parquet_block_reader::DataBlockDeserializer; -use crate::hive_parquet_block_reader::HiveBlockReader; -use crate::HiveBlockFilter; -use crate::HiveBlocks; use crate::HivePartInfo; -struct PreWhereData { - 
data_blocks: Vec, - valids: Vec>, -} - -enum State { - /// Read parquet file meta data - /// IO bound - ReadMeta(Option), - - /// Read prewhere blocks from data groups (without deserialization) - /// IO bound - ReadPrewhereData(HiveBlocks), - - /// Read remain blocks from data groups (without deserialization) - /// IO bound - ReadRemainData(HiveBlocks, PreWhereData), - - /// do prewhere filter on prewhere data, if data are filtered, trans to Generated state with empty datablocks, - /// else trans to ReadRemainData - /// CPU bound - PrewhereFilter(HiveBlocks, DataBlockDeserializer), - - /// Deserialize remain block from the given data groups, concat prewhere and remain data blocks - /// CPU bound - Deserialize(HiveBlocks, DataBlockDeserializer, PreWhereData), - - /// indicates that data blocks are ready, and needs to be consumed - Generated(HiveBlocks, Vec), - Finish, -} +pub type PartitionColumnIndex = usize; pub struct HiveTableSource { - state: State, - ctx: Arc, - dal: Operator, - scan_progress: Arc, - prewhere_block_reader: Arc, - remain_reader: Arc>, - prewhere_filter: Arc>, output: Arc, - delay: usize, - hive_block_filter: Arc, + generated_data: Option, + is_finished: bool, - /// The schema before output. Some fields might be removed when outputting. - source_schema: DataSchemaRef, - /// The final output schema + scan_progress: Arc, + // Used for get partition + ctx: Arc, + + // Used to read parquet file. + parquet_reader: Arc, + + // Used to insert partition_block_entries to data block + // FieldIndex is the index in the output_schema + // PartitionColumnIndex is the index of in partition_fields and partition_block_entries + // order by FieldIndex so we can insert in order + output_partition_columns: Vec<(FieldIndex, PartitionColumnIndex)>, + partition_fields: Vec, + // Used to check schema output_schema: DataSchemaRef, + + // Per partition + stream: Option>, + partition_block_entries: Vec, } impl HiveTableSource { - #[allow(clippy::too_many_arguments)] pub fn create( ctx: Arc, - dal: Operator, output: Arc, - prewhere_block_reader: Arc, - remain_reader: Arc>, - prewhere_filter: Arc>, - delay: usize, - hive_block_filter: Arc, - source_schema: DataSchemaRef, output_schema: DataSchemaRef, + parquet_reader: Arc, + partition_fields: Vec, ) -> Result { + let output_partition_columns = output_schema + .fields() + .iter() + .enumerate() + .filter_map(|(fi, f)| { + partition_fields + .iter() + .position(|p| p.name() == f.name()) + .map(|pi| (fi, pi)) + }) + .collect(); let scan_progress = ctx.get_scan_progress(); Ok(ProcessorPtr::create(Box::new(HiveTableSource { - ctx, - dal, output, - prewhere_block_reader, - remain_reader, - prewhere_filter, - hive_block_filter, scan_progress, - state: State::ReadMeta(None), - delay, - source_schema, + ctx, + parquet_reader, output_schema, + partition_fields, + output_partition_columns, + stream: None, + generated_data: None, + is_finished: false, + partition_block_entries: vec![], }))) } - - fn try_get_partitions(&mut self) { - self.state = self - .ctx - .get_partition() - .map_or(State::Finish, |part_info| State::ReadMeta(Some(part_info))); - } - - fn exec_prewhere_filter( - &self, - filter: &Expr, - data_blocks: &Vec, - ) -> Result<(bool, Vec>)> { - assert_eq!(filter.data_type(), &DataType::Boolean); - - let mut valids = vec![]; - let mut exists = false; - let func_ctx = self.ctx.get_function_context()?; - for datablock in data_blocks { - let evaluator = Evaluator::new(datablock, &func_ctx, &BUILTIN_FUNCTIONS); - let predicates = evaluator - 
.run(filter) - .map_err(|e| e.add_message("eval prewhere filter failed:"))? - .try_downcast::() - .unwrap(); - - // shortcut, if predicates is const boolean (or can be cast to boolean) - if !FilterHelpers::is_all_unset(&predicates) { - exists = true; - } - - valids.push(predicates); - } - - assert_eq!(data_blocks.len(), valids.len()); - - Ok((exists, valids)) - } - - fn do_prewhere_filter( - &mut self, - hive_blocks: HiveBlocks, - rowgroup_deserializer: DataBlockDeserializer, - ) -> Result<()> { - // 1. deserialize chunks to datablocks - let prewhere_datablocks = self - .prewhere_block_reader - .get_all_datablocks(rowgroup_deserializer, &hive_blocks.part)?; - - let progress_values = ProgressValues { - rows: prewhere_datablocks.iter().map(|x| x.num_rows()).sum(), - bytes: prewhere_datablocks.iter().map(|x| x.memory_size()).sum(), - }; - Profile::record_usize_profile(ProfileStatisticsName::ScanBytes, progress_values.bytes); - self.scan_progress.incr(&progress_values); - - if let Some(filter) = self.prewhere_filter.as_ref() { - // 2. do filter - let (exists, valids) = self.exec_prewhere_filter(filter, &prewhere_datablocks)?; - // 3. if all data filter out, try next rowgroup, trans to prewhere data - if !exists { - // all rows in this block are filtered out - // turn to begin the next state cycle. - // Generate a empty block. - self.state = State::Generated(hive_blocks, vec![]); - return Ok(()); - } - // 4. if remain block is non, trans to generated state - if self.remain_reader.is_none() { - let prewhere_datablocks = prewhere_datablocks - .into_iter() - .zip(valids.iter()) - .map(|(datablock, valid)| { - let datablock = DataBlock::filter_boolean_value(datablock, valid).unwrap(); - datablock - .resort(&self.source_schema, &self.output_schema) - .unwrap() - }) - .filter(|x| !x.is_empty()) - .collect(); - - self.state = State::Generated(hive_blocks, prewhere_datablocks); - } else { - // 5. if not all data filter out, and remain block reader is not non, trans to read remain - self.state = State::ReadRemainData(hive_blocks, PreWhereData { - data_blocks: prewhere_datablocks, - valids, - }); - } - } else { - // if no prewhere filter, data should be all fetched in prewhere state - self.state = State::Generated(hive_blocks, prewhere_datablocks); - } - - Ok(()) - } - - fn do_deserialize( - &mut self, - hive_blocks: HiveBlocks, - rowgroup_deserializer: DataBlockDeserializer, - prewhere_data: PreWhereData, - ) -> Result<()> { - let datablocks = if let Some(remain_reader) = self.remain_reader.as_ref() { - // 1. deserialize all remain data block - let remain_datablocks = - remain_reader.get_all_datablocks(rowgroup_deserializer, &hive_blocks.part)?; - // 2. concat prewhere and remain datablock(may be none) - assert_eq!(remain_datablocks.len(), prewhere_data.data_blocks.len()); - - let allblocks = remain_datablocks - .iter() - .zip(prewhere_data.data_blocks.iter()) - .zip(prewhere_data.valids.iter()) - .map(|((r, p), v)| { - // do merge block - assert_eq!(r.num_rows(), p.num_rows()); - let mut a = p.clone(); - for column in r.columns().iter() { - a.add_column(column.clone()); - } - let a = DataBlock::filter_boolean_value(a, v).unwrap(); - a.resort(&self.source_schema, &self.output_schema).unwrap() - }) - .filter(|x| !x.is_empty()) - .collect::>(); - allblocks - } else { - return Err(ErrorCode::Internal("It's a bug. 
No remain reader")); - }; - - // 3 trans to generate state - self.state = State::Generated(hive_blocks, datablocks); - Ok(()) - } } #[async_trait::async_trait] @@ -276,8 +114,9 @@ impl Processor for HiveTableSource { } fn event(&mut self) -> Result { - if matches!(self.state, State::ReadMeta(None)) { - self.try_get_partitions(); + if self.is_finished { + self.output.finish(); + return Ok(Event::Finished); } if self.output.is_finished() { @@ -288,114 +127,108 @@ impl Processor for HiveTableSource { return Ok(Event::NeedConsume); } - if matches!(self.state, State::Generated(_, _)) { - if let State::Generated(mut hive_blocks, mut data_blocks) = - std::mem::replace(&mut self.state, State::Finish) - { - // 1. consume all generated blocks, - if let Some(data_block) = data_blocks.pop() { - self.output.push_data(Ok(data_block)); - // 2. if not all consumed, retain generated state - self.state = State::Generated(hive_blocks, data_blocks); - return Ok(Event::NeedConsume); - } - - // 3. if all consumed, try next rowgroup - hive_blocks.advance(); - match hive_blocks.has_blocks() { - true => { - self.state = State::ReadPrewhereData(hive_blocks); - } - false => { - self.try_get_partitions(); - } - } - } - } - - match self.state { - State::Finish => { - self.output.finish(); - Ok(Event::Finished) - } - State::ReadMeta(_) => Ok(Event::Async), - State::ReadPrewhereData(_) => Ok(Event::Async), - State::ReadRemainData(_, _) => Ok(Event::Async), - State::PrewhereFilter(_, _) => Ok(Event::Sync), - State::Deserialize(_, _, _) => Ok(Event::Sync), - State::Generated(_, _) => Err(ErrorCode::Internal("It's a bug.")), - } - } - - fn process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::PrewhereFilter(hive_blocks, rowgroup_deserializer) => { - self.do_prewhere_filter(hive_blocks, rowgroup_deserializer) - } - State::Deserialize(hive_blocks, rowgroup_deserializer, prewhere_data) => { - self.do_deserialize(hive_blocks, rowgroup_deserializer, prewhere_data) + match self.generated_data.take() { + None => Ok(Event::Async), + Some(data_block) => { + let progress_values = ProgressValues { + rows: data_block.num_rows(), + bytes: data_block.memory_size(), + }; + self.scan_progress.incr(&progress_values); + Profile::record_usize_profile( + ProfileStatisticsName::ScanBytes, + data_block.memory_size(), + ); + self.output.push_data(Ok(data_block)); + Ok(Event::NeedConsume) } - _ => Err(ErrorCode::Internal("It's a bug.")), } } #[async_backtrace::framed] async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Finish) { - State::ReadMeta(Some(part)) => { - if self.delay > 0 { - sleep(Duration::from_millis(self.delay as u64)).await; - debug!("sleep for {}ms", self.delay); - self.delay = 0; - } - let part = HivePartInfo::from_part(&part)?; - let file_meta = self - .prewhere_block_reader - .read_meta_data(self.dal.clone(), &part.filename, part.filesize) - .await?; - let mut hive_blocks = - HiveBlocks::create(file_meta, part.clone(), self.hive_block_filter.clone()); - - match hive_blocks.prune() { - true => { - self.state = State::ReadPrewhereData(hive_blocks); - } - false => { - self.try_get_partitions(); + if let Some(mut stream) = self.stream.take() { + if let Some(block) = self + .parquet_reader + .read_block_from_stream(&mut stream) + .await? 
+ .map(|b| { + let mut columns = b.columns().to_vec(); + for (fi, pi) in self.output_partition_columns.iter() { + columns.insert(*fi, self.partition_block_entries[*pi].clone()); } - } - Ok(()) - } - State::ReadPrewhereData(hive_blocks) => { - let row_group = hive_blocks.get_current_row_group_meta_data(); - let part = hive_blocks.get_part_info(); - let chunks = self - .prewhere_block_reader - .read_columns_data(row_group, &part) - .await?; - let rowgroup_deserializer = self - .prewhere_block_reader - .create_rowgroup_deserializer(chunks, row_group)?; - self.state = State::PrewhereFilter(hive_blocks, rowgroup_deserializer); - Ok(()) + DataBlock::new(columns, b.num_rows()) + }) + .map(|b| check_block_schema(&self.output_schema, b)) + .transpose()? + { + self.generated_data = Some(block); + self.stream = Some(stream); } + // else: + // If `read_block` returns `None`, it means the stream is finished. + // And we should try to build another stream (in next event loop). + } else if let Some(part) = self.ctx.get_partition() { + let part = HivePartInfo::from_part(&part)?; + let partition_fields = self + .partition_fields + .iter() + .cloned() + .zip(part.partitions.iter().cloned()) + .collect::>(); + self.partition_block_entries = partition_fields + .iter() + .map(|(f, v)| BlockEntry::new(f.data_type().into(), Value::Scalar(v.clone()))) + .collect::>(); + let stream = self + .parquet_reader + .prepare_data_stream(&part.filename, part.filesize, Some(&partition_fields)) + .await?; + self.stream = Some(stream); + } else { + self.is_finished = true; + } - State::ReadRemainData(hive_blocks, prewhere_data) => { - let row_group = hive_blocks.get_current_row_group_meta_data(); - let part = hive_blocks.get_part_info(); + Ok(()) + } +} - if let Some(remain_reader) = self.remain_reader.as_ref() { - let chunks = remain_reader.read_columns_data(row_group, &part).await?; - let rowgroup_deserializer = - remain_reader.create_rowgroup_deserializer(chunks, row_group)?; - self.state = - State::Deserialize(hive_blocks, rowgroup_deserializer, prewhere_data); - Ok(()) - } else { - Err(ErrorCode::Internal("It's a bug. No remain reader")) - } - } - _ => Err(ErrorCode::Internal("It's a bug.")), +fn check_block_schema(schema: &DataSchema, mut block: DataBlock) -> Result { + // Check if the schema of the data block is matched with the schema of the table. + if block.num_columns() != schema.num_fields() { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched. Data columns length: {}, schema fields length: {}", + block.num_columns(), + schema.num_fields() + ))); + } + + for (col, field) in block.columns_mut().iter_mut().zip(schema.fields().iter()) { + // If the actual data is nullable, the field must be nullbale. + if col.data_type.is_nullable_or_null() && !field.is_nullable() { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched (col name: {}). Data column is nullable, but schema field is not nullable", + field.name() + ))); + } + // The inner type of the data and field should be the same. + let data_type = col.data_type.remove_nullable(); + let schema_type = field.data_type().remove_nullable(); + if data_type != schema_type { + return Err(ErrorCode::TableSchemaMismatch(format!( + "Data schema mismatched (col name: {}). Data column type is {:?}, but schema field type is {:?}", + field.name(), + col.data_type, + field.data_type() + ))); + } + // If the field is nullable but the actual data is not nullable, + // we should wrap nullable for the data. 
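+ // Widening here is lossless: the column keeps all of its existing values and only gains a fully-valid nullable wrapper, so no nulls are introduced.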
+ if field.is_nullable() && !col.data_type.is_nullable_or_null() { + col.data_type = col.data_type.wrap_nullable(); + col.value = col.value.clone().wrap_nullable(None); } } + + Ok(block) } diff --git a/src/query/storages/hive/hive/src/lib.rs b/src/query/storages/hive/hive/src/lib.rs index 189f90365ea0..9989e1c7bc75 100644 --- a/src/query/storages/hive/hive/src/lib.rs +++ b/src/query/storages/hive/hive/src/lib.rs @@ -17,29 +17,17 @@ #![allow(clippy::diverging_sub_expression)] mod converters; -mod hive_block_filter; -mod hive_blocks; mod hive_catalog; mod hive_database; -mod hive_file_splitter; -mod hive_meta_data_reader; -mod hive_parquet_block_reader; mod hive_partition; mod hive_partition_filler; -mod hive_partition_pruner; mod hive_table; mod hive_table_options; mod hive_table_source; mod utils; -pub use hive_block_filter::HiveBlockFilter; -pub use hive_blocks::HiveBlocks; pub use hive_catalog::HiveCatalog; pub use hive_catalog::HiveCreator; -pub use hive_file_splitter::HiveFileSplitter; -pub use hive_meta_data_reader::MetaDataReader; -pub use hive_parquet_block_reader::filter_hive_partition_from_partition_keys; pub use hive_partition::HivePartInfo; pub use hive_partition_filler::HivePartitionFiller; -pub use hive_table::HiveFileInfo; pub use hive_table::HiveTable; diff --git a/src/query/storages/hive/hive/src/utils.rs b/src/query/storages/hive/hive/src/utils.rs index c176fe165691..8c5eedcaab8f 100644 --- a/src/query/storages/hive/hive/src/utils.rs +++ b/src/query/storages/hive/hive/src/utils.rs @@ -14,13 +14,13 @@ use std::fmt::Debug; -use databend_common_base::base::OrderedFloat; +use databend_common_catalog::partition_columns::str_to_scalar; use databend_common_exception::ErrorCode; use databend_common_exception::Result; -use databend_common_expression::types::number::NumberScalar; use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; use databend_common_expression::Scalar; +use databend_common_expression::TableField; +use databend_storages_common_pruner::partition_prunner::FetchPartitionScalars; use volo_thrift::MaybeException; use crate::hive_table::HIVE_DEFAULT_PARTITION; @@ -34,53 +34,36 @@ pub(crate) fn str_field_to_scalar(value: &str, data_type: &DataType) -> Result Ok(Scalar::String(value.to_string())), - DataType::Number(num_ty) => match num_ty { - NumberDataType::UInt8 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt8(num))) - } - NumberDataType::UInt16 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt16(num))) - } - NumberDataType::UInt32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt32(num))) - } - NumberDataType::UInt64 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::UInt64(num))) - } - NumberDataType::Int8 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int8(num))) - } - NumberDataType::Int16 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int16(num))) - } - NumberDataType::Int32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int32(num))) - } - NumberDataType::Int64 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Int64(num))) - } - NumberDataType::Float32 => { - let num = value.parse::().unwrap(); - Ok(Scalar::Number(NumberScalar::Float32(OrderedFloat(num)))) - } - NumberDataType::Float64 => { - let num = value.parse::().unwrap(); - 
Ok(Scalar::Number(NumberScalar::Float64(OrderedFloat(num)))) + _ => str_to_scalar(value, data_type), + } +} + +pub struct HiveFetchPartitionScalars; + +impl FetchPartitionScalars for HiveFetchPartitionScalars { + fn eval(value: &String, partition_fields: &[TableField]) -> Result> { + let mut res = Vec::new(); + let v = value.split('/'); + let mut idx = 0; + for singe_value in v { + let kv = singe_value.split('=').collect::>(); + if kv.len() == 2 { + let field = &partition_fields[idx]; + let scalar = str_field_to_scalar(kv[1], &field.data_type().into())?; + res.push(scalar); + idx += 1; } - }, - _ => Err(ErrorCode::Unimplemented(format!( - "generate scalar failed, {:?}", - data_type - ))), + } + if res.len() != partition_fields.len() { + Err(ErrorCode::ParquetFileInvalid(format!( + "Partition values mismatch, expect {}, got {} in {}", + partition_fields.len(), + res.len(), + value + ))) + } else { + Ok(res) + } } } diff --git a/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs b/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs deleted file mode 100644 index cf77bcf40614..000000000000 --- a/src/query/storages/hive/hive/tests/it/hive_file_splitter.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use databend_common_storages_hive::HiveFileSplitter; - -#[test] -fn test_splitter() { - let splitter = HiveFileSplitter::create(1024); - assert_eq!(splitter.split_length(1), vec![0..2]); - assert_eq!(splitter.split_length(1024), vec![0..1025]); - assert_eq!(splitter.split_length(1100), vec![0..1101]); - assert_eq!(splitter.split_length(1500), vec![0..1024, 1024..1501]); - assert_eq!(splitter.split_length(2048), vec![0..1024, 1024..2049]); - assert_eq!(splitter.split_length(3000), vec![ - 0..1024, - 1024..2048, - 2048..3001 - ]); -} diff --git a/src/query/storages/hive/hive/tests/it/main.rs b/src/query/storages/hive/hive/tests/it/main.rs deleted file mode 100644 index 78f1d926806f..000000000000 --- a/src/query/storages/hive/hive/tests/it/main.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -mod hive_file_splitter; diff --git a/src/query/storages/parquet/Cargo.toml b/src/query/storages/parquet/Cargo.toml index 2fa779fb9bf4..1d0a96c05a54 100644 --- a/src/query/storages/parquet/Cargo.toml +++ b/src/query/storages/parquet/Cargo.toml @@ -29,6 +29,7 @@ databend-common-metrics = { workspace = true } databend-common-pipeline-core = { workspace = true } databend-common-settings = { workspace = true } databend-common-storage = { workspace = true } +databend-storages-common-cache = { workspace = true } databend-storages-common-pruner = { workspace = true } databend-storages-common-stage = { workspace = true } databend-storages-common-table-meta = { workspace = true } diff --git a/src/query/storages/parquet/src/lib.rs b/src/query/storages/parquet/src/lib.rs index 6807e9508473..a7358ae8485b 100644 --- a/src/query/storages/parquet/src/lib.rs +++ b/src/query/storages/parquet/src/lib.rs @@ -30,15 +30,5 @@ mod utils; pub use parquet_part::ParquetFilesPart; pub use parquet_part::ParquetPart; -pub use parquet_rs::transform_record_batch; -pub use parquet_rs::InMemoryRowGroup; -pub use parquet_rs::ParquetFileReader; -pub use parquet_rs::ParquetRSFullReader; -pub use parquet_rs::ParquetRSPruner; -pub use parquet_rs::ParquetRSReaderBuilder; -pub use parquet_rs::ParquetRSRowGroupPart; -pub use parquet_rs::ParquetRSRowGroupReader; -pub use parquet_rs::ParquetRSTable; -pub use parquet_rs::ParquetSource; -pub use parquet_rs::ParquetTableForCopy; +pub use parquet_rs::*; pub use read_settings::ReadSettings; diff --git a/src/query/storages/parquet/src/parquet_rs/meta.rs b/src/query/storages/parquet/src/parquet_rs/meta.rs index e2780c97a0f6..da9231c1cb48 100644 --- a/src/query/storages/parquet/src/parquet_rs/meta.rs +++ b/src/query/storages/parquet/src/parquet_rs/meta.rs @@ -21,6 +21,10 @@ use databend_common_catalog::plan::FullParquetMeta; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::TableField; +use databend_storages_common_cache::CacheManager; +use databend_storages_common_cache::InMemoryItemCacheReader; +use databend_storages_common_cache::LoadParams; +use databend_storages_common_cache::Loader; use opendal::Operator; use parquet::file::metadata::ParquetMetaData; use parquet::schema::types::SchemaDescPtr; @@ -28,6 +32,21 @@ use parquet::schema::types::SchemaDescriptor; use crate::parquet_rs::statistics::collect_row_group_stats; +pub async fn read_metadata_async_cached( + path: &str, + operator: &Operator, + file_size: Option, +) -> Result> { + let reader = MetaReader::meta_data_reader(operator.clone()); + let load_params = LoadParams { + location: path.to_owned(), + len_hint: file_size, + ver: 0, + put_cache: true, + }; + reader.read(&load_params).await +} + #[async_backtrace::framed] pub async fn read_metas_in_parallel( op: &Operator, @@ -153,15 +172,14 @@ async fn load_and_check_parquet_meta( expect: &SchemaDescriptor, schema_from: &str, ) -> Result> { - let metadata = - databend_common_storage::parquet_rs::read_metadata_async(file, &op, Some(size)).await?; + let metadata = read_metadata_async_cached(file, &op, Some(size)).await?; check_parquet_schema( expect, metadata.file_metadata().schema_descr(), file, schema_from, )?; - Ok(Arc::new(metadata)) + Ok(metadata) } pub async fn read_parquet_metas_batch( @@ -200,10 +218,7 @@ pub async fn read_parquet_metas_batch_for_copy( ) -> Result>> { let mut metas = Vec::with_capacity(file_infos.len()); for (location, size) in file_infos { - let meta = Arc::new( - 
databend_common_storage::parquet_rs::read_metadata_async(&location, &op, Some(size)) - .await?, - ); + let meta = read_metadata_async_cached(&location, &op, Some(size)).await?; if unlikely(meta.file_metadata().num_rows() == 0) { // Don't collect empty files continue; @@ -230,3 +245,33 @@ fn check_memory_usage(max_memory_usage: u64) -> Result<()> { } Ok(()) } + +pub struct LoaderWrapper(T); +pub type ParquetMetaReader = InMemoryItemCacheReader>; + +pub struct MetaReader; +impl MetaReader { + pub fn meta_data_reader(dal: Operator) -> ParquetMetaReader { + ParquetMetaReader::new( + CacheManager::instance().get_parquet_meta_data_cache(), + LoaderWrapper(dal), + ) + } +} + +#[async_trait::async_trait] +impl Loader for LoaderWrapper { + #[async_backtrace::framed] + async fn load(&self, params: &LoadParams) -> Result { + let size = match params.len_hint { + Some(v) => v, + None => self.0.stat(¶ms.location).await?.content_length(), + }; + databend_common_storage::parquet_rs::read_metadata_async( + ¶ms.location, + &self.0, + Some(size), + ) + .await + } +} diff --git a/src/query/storages/parquet/src/parquet_rs/mod.rs b/src/query/storages/parquet/src/parquet_rs/mod.rs index 459024759223..97e5f259c591 100644 --- a/src/query/storages/parquet/src/parquet_rs/mod.rs +++ b/src/query/storages/parquet/src/parquet_rs/mod.rs @@ -24,6 +24,8 @@ mod meta; mod schema; pub use copy_into_table::ParquetTableForCopy; +pub use meta::read_metadata_async_cached; +pub use meta::read_metas_in_parallel; pub use meta::read_metas_in_parallel_for_copy; pub use meta::read_parquet_metas_batch; pub use parquet_reader::transform_record_batch; @@ -36,3 +38,5 @@ pub use parquet_table::ParquetRSTable; pub use partition::ParquetRSRowGroupPart; pub use pruning::ParquetRSPruner; pub use source::ParquetSource; +pub use statistics::collect_row_group_stats; +pub use statistics::collect_single_row_group_stats; diff --git a/src/query/storages/parquet/src/parquet_rs/pruning.rs b/src/query/storages/parquet/src/parquet_rs/pruning.rs index ec00657e303b..284091287e48 100644 --- a/src/query/storages/parquet/src/parquet_rs/pruning.rs +++ b/src/query/storages/parquet/src/parquet_rs/pruning.rs @@ -85,8 +85,6 @@ impl ParquetRSPruner { partition_columns .iter() .position(|c| c.eq_ignore_ascii_case(&name)) - .unwrap(); - None }) }) .collect::>(); diff --git a/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs b/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs index 6d1678a2032e..9fa2d78fefc3 100644 --- a/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs +++ b/src/query/storages/parquet/src/parquet_rs/statistics/mod.rs @@ -19,3 +19,4 @@ mod utils; pub use page::convert_index_to_column_statistics; pub use row_group::collect_row_group_stats; +pub use row_group::collect_single_row_group_stats; diff --git a/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs b/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs index f1bac90e2a35..3199627cf9cd 100644 --- a/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs +++ b/src/query/storages/parquet/src/parquet_rs/statistics/row_group.rs @@ -41,35 +41,41 @@ pub fn collect_row_group_stats( let mut stats = Vec::with_capacity(rgs.len()); for rg in rgs { assert_eq!(rg.num_columns(), leaf_fields.len()); - let mut stats_of_columns = HashMap::with_capacity(rg.columns().len()); + let stats_of_columns = collect_single_row_group_stats(rg, leaf_fields, columns)?; + stats.push(stats_of_columns); + } + Some(stats) +} - // Each row_group_stat is a `HashMap` 
holding key-value pairs. - // The first element of the pair is the offset in the schema, - // and the second element is the statistics of the column (according to the offset) - if let Some(columns) = columns { - for col_idx in columns.iter() { - let column = rg.column(*col_idx); - let field = &leaf_fields[*col_idx]; - let column_stats = column.statistics().unwrap(); - stats_of_columns.insert( - *col_idx as u32, - convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, - ); - } - } else { - for (col_idx, (column, field)) in - rg.columns().iter().zip(leaf_fields.iter()).enumerate() - { - let column_stats = column.statistics().unwrap(); - stats_of_columns.insert( - col_idx as u32, - convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, - ); - } +/// Note the keys of result is not column id but column offset in schema +pub fn collect_single_row_group_stats( + rg: &RowGroupMetaData, + leaf_fields: &[TableField], + columns: Option<&[usize]>, +) -> Option { + let mut stats_of_columns = HashMap::with_capacity(rg.columns().len()); + // Each row_group_stat is a `HashMap` holding key-value pairs. + // The first element of the pair is the offset in the schema, + // and the second element is the statistics of the column (according to the offset) + if let Some(columns) = columns { + for col_idx in columns.iter() { + let column = rg.column(*col_idx); + let field = &leaf_fields[*col_idx]; + let column_stats = column.statistics().unwrap(); + stats_of_columns.insert( + *col_idx as u32, + convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, + ); + } + } else { + for (idx, (column, field)) in rg.columns().iter().zip(leaf_fields.iter()).enumerate() { + let column_stats = column.statistics().unwrap(); + stats_of_columns.insert( + idx as u32, + convert_column_statistics(column_stats, &field.data_type().remove_nullable())?, + ); } - - stats.push(stats_of_columns); } - Some(stats) + Some(stats_of_columns) } diff --git a/src/query/storages/stage/Cargo.toml b/src/query/storages/stage/Cargo.toml index fe7993f4b483..4def88c4ab48 100644 --- a/src/query/storages/stage/Cargo.toml +++ b/src/query/storages/stage/Cargo.toml @@ -44,8 +44,6 @@ serde = { workspace = true } serde_json = { workspace = true } typetag = { workspace = true } -uuid = { workspace = true } - [build-dependencies] databend-common-building = { workspace = true } diff --git a/src/query/storages/stage/src/append/do_append.rs b/src/query/storages/stage/src/append/do_append.rs index 7ac7a30c2a00..5a62301aceaa 100644 --- a/src/query/storages/stage/src/append/do_append.rs +++ b/src/query/storages/stage/src/append/do_append.rs @@ -38,14 +38,14 @@ impl StageTable { let max_threads = settings.get_max_threads()? 
as usize; let op = StageTable::get_op(&self.table_info.stage_info)?; - let uuid = uuid::Uuid::new_v4().to_string(); + let query_id = ctx.get_id(); let group_id = AtomicUsize::new(0); match fmt { FileFormatParams::Parquet(_) => append_data_to_parquet_files( pipeline, self.table_info.clone(), op, - uuid, + query_id, &group_id, mem_limit, max_threads, @@ -55,13 +55,13 @@ impl StageTable { ctx.clone(), self.table_info.clone(), op, - uuid, + query_id, &group_id, mem_limit, max_threads, )?, }; - if !self.table_info.stage_info.copy_options.detailed_output { + if !self.table_info.copy_into_location_options.detailed_output { pipeline.try_resize(1)?; pipeline.add_accumulating_transformer(SumSummaryTransform::default); } diff --git a/src/query/storages/stage/src/append/parquet_file/pipeline.rs b/src/query/storages/stage/src/append/parquet_file/pipeline.rs index c432e495ea42..9069dc3f4e7d 100644 --- a/src/query/storages/stage/src/append/parquet_file/pipeline.rs +++ b/src/query/storages/stage/src/append/parquet_file/pipeline.rs @@ -27,13 +27,13 @@ pub(crate) fn append_data_to_parquet_files( pipeline: &mut Pipeline, table_info: StageTableInfo, op: Operator, - uuid: String, + query_id: String, group_id: &std::sync::atomic::AtomicUsize, mem_limit: usize, max_threads: usize, ) -> Result<()> { - let is_single = table_info.stage_info.copy_options.single; - let max_file_size = table_info.stage_info.copy_options.max_file_size; + let is_single = table_info.copy_into_location_options.single; + let max_file_size = table_info.copy_into_location_options.max_file_size; // when serializing block to parquet, the memory may be doubled let mem_limit = mem_limit / 2; pipeline.try_resize(1)?; @@ -60,7 +60,7 @@ pub(crate) fn append_data_to_parquet_files( output, table_info.clone(), op.clone(), - uuid.clone(), + query_id.clone(), gid, max_file_size, ) diff --git a/src/query/storages/stage/src/append/parquet_file/writer_processor.rs b/src/query/storages/stage/src/append/parquet_file/writer_processor.rs index 8e96840264aa..dc7354d5f511 100644 --- a/src/query/storages/stage/src/append/parquet_file/writer_processor.rs +++ b/src/query/storages/stage/src/append/parquet_file/writer_processor.rs @@ -62,7 +62,7 @@ pub struct ParquetFileWriter { unload_output: UnloadOutput, unload_output_blocks: Option>, - uuid: String, + query_id: String, group_id: usize, batch_id: usize, @@ -100,12 +100,12 @@ impl ParquetFileWriter { output: Arc, table_info: StageTableInfo, data_accessor: Operator, - uuid: String, + query_id: String, group_id: usize, targe_file_size: Option, ) -> Result { let unload_output = - UnloadOutput::create(table_info.stage_info.copy_options.detailed_output); + UnloadOutput::create(table_info.copy_into_location_options.detailed_output); let arrow_schema = Arc::new(table_schema_to_arrow_schema(&table_info.schema)); let writer = create_writer(arrow_schema.clone(), targe_file_size)?; @@ -122,7 +122,7 @@ impl ParquetFileWriter { input_bytes: 0, file_to_write: None, data_accessor, - uuid, + query_id, group_id, batch_id: 0, targe_file_size, @@ -242,7 +242,7 @@ impl Processor for ParquetFileWriter { assert!(self.file_to_write.is_some()); let path = unload_path( &self.table_info, - &self.uuid, + &self.query_id, self.group_id, self.batch_id, None, diff --git a/src/query/storages/stage/src/append/path.rs b/src/query/storages/stage/src/append/path.rs index 91bc5bbb37c7..62870e4693c5 100644 --- a/src/query/storages/stage/src/append/path.rs +++ b/src/query/storages/stage/src/append/path.rs @@ -17,7 +17,7 @@ use 
databend_common_compress::CompressAlgorithm; pub fn unload_path( stage_table_info: &StageTableInfo, - uuid: &str, + query_id: &str, group_id: usize, batch_id: usize, compression: Option, @@ -33,23 +33,31 @@ pub fn unload_path( .unwrap_or_default(); let path = &stage_table_info.files_info.path; - - if path.ends_with("data_") { - format!( - "{}{}_{:0>4}_{:0>8}.{}{}", - path, uuid, group_id, batch_id, format_name, suffix - ) + if stage_table_info.copy_into_location_options.use_raw_path { + path.to_string() } else { - let (path, sep) = if path == "/" { - ("", "") - } else if path.ends_with('/') { - (path.as_str(), "") + let query_id = if stage_table_info.copy_into_location_options.include_query_id { + format!("{query_id}_") } else { - (path.as_str(), "/") + "".to_string() }; - format!( - "{}{}data_{}_{:0>4}_{:0>8}.{}{}", - path, sep, uuid, group_id, batch_id, format_name, suffix - ) + if path.ends_with("data_") { + format!( + "{}{}{:0>4}_{:0>8}.{}{}", + path, query_id, group_id, batch_id, format_name, suffix + ) + } else { + let (path, sep) = if path == "/" { + ("", "") + } else if path.ends_with('/') { + (path.as_str(), "") + } else { + (path.as_str(), "/") + }; + format!( + "{}{}data_{}{:0>4}_{:0>8}.{}{}", + path, sep, query_id, group_id, batch_id, format_name, suffix + ) + } } } diff --git a/src/query/storages/stage/src/append/row_based_file/pipeline.rs b/src/query/storages/stage/src/append/row_based_file/pipeline.rs index 8b98744224f8..b1deb6dd1a3b 100644 --- a/src/query/storages/stage/src/append/row_based_file/pipeline.rs +++ b/src/query/storages/stage/src/append/row_based_file/pipeline.rs @@ -37,13 +37,13 @@ pub(crate) fn append_data_to_row_based_files( ctx: Arc, table_info: StageTableInfo, op: Operator, - uuid: String, + query_id: String, group_id: &std::sync::atomic::AtomicUsize, mem_limit: usize, max_threads: usize, ) -> Result<()> { - let is_single = table_info.stage_info.copy_options.single; - let max_file_size = table_info.stage_info.copy_options.max_file_size; + let is_single = table_info.copy_into_location_options.single; + let max_file_size = table_info.copy_into_location_options.max_file_size; let compression = table_info.stage_info.file_format_params.compression(); // when serializing block to parquet, the memory may be doubled let mem_limit = mem_limit / 2; @@ -101,7 +101,7 @@ pub(crate) fn append_data_to_row_based_files( table_info.clone(), op.clone(), prefix.clone(), - uuid.clone(), + query_id.clone(), gid, compression, ) diff --git a/src/query/storages/stage/src/append/row_based_file/writer_processor.rs b/src/query/storages/stage/src/append/row_based_file/writer_processor.rs index 594bb58bd451..4bb46cce94d6 100644 --- a/src/query/storages/stage/src/append/row_based_file/writer_processor.rs +++ b/src/query/storages/stage/src/append/row_based_file/writer_processor.rs @@ -52,7 +52,7 @@ pub struct RowBasedFileWriter { data_accessor: Operator, prefix: Vec, - uuid: String, + query_id: String, group_id: usize, batch_id: usize, @@ -66,19 +66,19 @@ impl RowBasedFileWriter { table_info: StageTableInfo, data_accessor: Operator, prefix: Vec, - uuid: String, + query_id: String, group_id: usize, compression: Option, ) -> Result { let unload_output = - UnloadOutput::create(table_info.stage_info.copy_options.detailed_output); + UnloadOutput::create(table_info.copy_into_location_options.detailed_output); Ok(ProcessorPtr::create(Box::new(RowBasedFileWriter { table_info, input, input_data: None, data_accessor, prefix, - uuid, + query_id, group_id, batch_id: 0, file_to_write: None, @@ -172,7 
+172,7 @@ impl Processor for RowBasedFileWriter { async fn async_process(&mut self) -> Result<()> { let path = unload_path( &self.table_info, - &self.uuid, + &self.query_id, self.group_id, self.batch_id, self.compression, diff --git a/src/query/storages/system/src/caches_table.rs b/src/query/storages/system/src/caches_table.rs index ead0282a6aef..04c411675dfa 100644 --- a/src/query/storages/system/src/caches_table.rs +++ b/src/query/storages/system/src/caches_table.rs @@ -80,7 +80,7 @@ impl SyncSystemTable for CachesTable { let inverted_index_meta_cache = cache_manager.get_inverted_index_meta_cache(); let inverted_index_file_cache = cache_manager.get_inverted_index_file_cache(); let prune_partitions_cache = cache_manager.get_prune_partitions_cache(); - let file_meta_data_cache = cache_manager.get_file_meta_data_cache(); + let parquet_meta_data_cache = cache_manager.get_parquet_meta_data_cache(); let table_data_cache = cache_manager.get_table_data_cache(); let table_column_array_cache = cache_manager.get_table_data_array_cache(); @@ -121,8 +121,8 @@ impl SyncSystemTable for CachesTable { Self::append_row(&prune_partitions_cache, &local_node, &mut columns); } - if let Some(file_meta_data_cache) = file_meta_data_cache { - Self::append_row(&file_meta_data_cache, &local_node, &mut columns); + if let Some(parquet_meta_data_cache) = parquet_meta_data_cache { + Self::append_row(&parquet_meta_data_cache, &local_node, &mut columns); } if let Some(cache) = table_data_cache { diff --git a/src/query/storages/system/src/dictionaries_table.rs b/src/query/storages/system/src/dictionaries_table.rs new file mode 100644 index 000000000000..9cc0c61fafcd --- /dev/null +++ b/src/query/storages/system/src/dictionaries_table.rs @@ -0,0 +1,200 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
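+ +//! `system.dictionaries` is an async system table: it walks every database in the default catalog, lists each dictionary's key and attribute names/types, and reports its source (with the password option masked), comment, and created/updated timestamps.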
+ +use std::sync::Arc; + +use databend_common_catalog::plan::PushDownInfo; +use databend_common_catalog::table::Table; +use databend_common_catalog::table_context::TableContext; +use databend_common_exception::Result; +use databend_common_expression::types::DataType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::ColumnBuilder; +use databend_common_expression::DataBlock; +use databend_common_expression::FromData; +use databend_common_expression::ScalarRef; +use databend_common_expression::TableDataType; +use databend_common_expression::TableField; +use databend_common_expression::TableSchemaRefExt; +use databend_common_meta_app::schema::ListDictionaryReq; +use databend_common_meta_app::schema::TableIdent; +use databend_common_meta_app::schema::TableInfo; +use databend_common_meta_app::schema::TableMeta; + +use crate::table::AsyncOneBlockSystemTable; +use crate::table::AsyncSystemTable; + +pub struct DictionariesTable { + table_info: TableInfo, +} + +#[async_trait::async_trait] +impl AsyncSystemTable for DictionariesTable { + const NAME: &'static str = "system.dictionaries"; + + fn get_table_info(&self) -> &TableInfo { + &self.table_info + } + + #[async_backtrace::framed] + async fn get_full_data( + &self, + ctx: Arc, + _push_downs: Option, + ) -> Result { + let tenant = ctx.get_tenant(); + + let mut db_names = vec![]; + let mut names = vec![]; + + let mut key_names_builder = + ColumnBuilder::with_capacity(&DataType::Array(Box::new(DataType::String)), 0); + let mut attribute_names_builder = + ColumnBuilder::with_capacity(&DataType::Array(Box::new(DataType::String)), 0); + let mut key_types_builder = + ColumnBuilder::with_capacity(&DataType::Array(Box::new(DataType::String)), 0); + let mut attribute_types_builder = + ColumnBuilder::with_capacity(&DataType::Array(Box::new(DataType::String)), 0); + + let mut sources = vec![]; + let mut comments = vec![]; + let mut created_ons = vec![]; + let mut updated_ons = vec![]; + + let catalog = ctx.get_default_catalog().unwrap(); + let databases = catalog.list_databases(&tenant).await?; + for database in databases { + let db_id = database.get_db_info().database_id.db_id; + let req = ListDictionaryReq { + tenant: tenant.clone(), + db_id, + }; + let dictionaries = catalog.list_dictionaries(req).await?; + for (dict_name, dict_meta) in dictionaries { + db_names.push(database.get_db_name().to_string()); + + names.push(dict_name.clone()); + + let comment = dict_meta.comment; + comments.push(comment); + + let created_on = dict_meta.created_on.timestamp_micros(); + created_ons.push(created_on); + let updated_on = match dict_meta.updated_on { + Some(updated_on) => updated_on.timestamp_micros(), + None => created_on, + }; + updated_ons.push(updated_on); + + let schema = dict_meta.schema; + let fields = &schema.fields; + let primary_column_ids = dict_meta.primary_column_ids; + + let mut key_names = vec![]; + let mut attribute_names = vec![]; + let mut key_types = vec![]; + let mut attribute_types = vec![]; + + for field in fields { + if primary_column_ids.contains(&field.column_id) { + key_names.push(field.name.clone()); + key_types.push(field.data_type.sql_name()); + } else { + attribute_names.push(field.name.clone()); + attribute_types.push(field.data_type.sql_name()); + } + } + let key_names_column = ScalarRef::Array(StringType::from_data(key_names)); + key_names_builder.push(key_names_column); + let attribute_names_column = + 
ScalarRef::Array(StringType::from_data(attribute_names)); + attribute_names_builder.push(attribute_names_column); + let key_types_column = ScalarRef::Array(StringType::from_data(key_types)); + key_types_builder.push(key_types_column); + let attribute_types_column = + ScalarRef::Array(StringType::from_data(attribute_types)); + attribute_types_builder.push(attribute_types_column); + + let dict_source = dict_meta.source; + let mut options = dict_meta.options; + if let Some(password) = options.get_mut("password") { + *password = "[hidden]".to_string(); + } + let options_str: Vec = options + .iter() + .map(|(k, v)| format!("{}={}", k, v)) + .collect(); + let options_joined = options_str.join(" "); + let source = format!("{}({})", dict_source, options_joined); + sources.push(source); + } + } + return Ok(DataBlock::new_from_columns(vec![ + StringType::from_data(db_names), + StringType::from_data(names), + key_names_builder.build(), + key_types_builder.build(), + attribute_names_builder.build(), + attribute_types_builder.build(), + StringType::from_data(sources), + StringType::from_data(comments), + TimestampType::from_data(created_ons), + TimestampType::from_data(updated_ons), + ])); + } +} + +impl DictionariesTable { + pub fn create(table_id: u64) -> Arc { + let schema = TableSchemaRefExt::create(vec![ + TableField::new("database", TableDataType::String), + TableField::new("name", TableDataType::String), + TableField::new( + "key_names", + TableDataType::Array(Box::new(TableDataType::String)), + ), + TableField::new( + "key_types", + TableDataType::Array(Box::new(TableDataType::String)), + ), + TableField::new( + "attribute_names", + TableDataType::Array(Box::new(TableDataType::String)), + ), + TableField::new( + "attribute_types", + TableDataType::Array(Box::new(TableDataType::String)), + ), + TableField::new("source", TableDataType::String), + TableField::new("comment", TableDataType::String), + TableField::new("created_on", TableDataType::Timestamp), + TableField::new("updated_on", TableDataType::Timestamp), + ]); + + let table_info = TableInfo { + desc: "'system'.'dictionaries'".to_string(), + name: "dictionaries".to_string(), + ident: TableIdent::new(table_id, 0), + meta: TableMeta { + schema, + engine: "SystemDictionaries".to_string(), + ..Default::default() + }, + ..Default::default() + }; + + AsyncOneBlockSystemTable::create(DictionariesTable { table_info }) + } +} diff --git a/src/query/storages/system/src/lib.rs b/src/query/storages/system/src/lib.rs index 8cbaee5ac4db..85c14c4045c3 100644 --- a/src/query/storages/system/src/lib.rs +++ b/src/query/storages/system/src/lib.rs @@ -33,6 +33,7 @@ mod configs_table; mod contributors_table; mod credits_table; mod databases_table; +mod dictionaries_table; mod engines_table; mod functions_table; mod indexes_table; @@ -80,6 +81,7 @@ pub use configs_table::ConfigsTable; pub use contributors_table::ContributorsTable; pub use credits_table::CreditsTable; pub use databases_table::DatabasesTable; +pub use dictionaries_table::DictionariesTable; pub use engines_table::EnginesTable; pub use functions_table::FunctionsTable; pub use indexes_table::IndexesTable; diff --git a/src/query/storages/system/src/temp_files_table.rs b/src/query/storages/system/src/temp_files_table.rs index 34ca6ddfa459..6c53b84a73d6 100644 --- a/src/query/storages/system/src/temp_files_table.rs +++ b/src/query/storages/system/src/temp_files_table.rs @@ -119,7 +119,7 @@ impl AsyncSystemTable for TempFilesTable { num_rows, ); - Ok(data_block.convert_to_full()) + 
Ok(data_block.consume_convert_to_full()) } } diff --git a/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py b/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py index 8b9fc86d6b41..4f4462b76178 100644 --- a/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py +++ b/tests/sqllogictests/scripts/prepare_iceberg_tpch_data.py @@ -1,132 +1,159 @@ from pyspark.sql import SparkSession -from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType, DateType, DecimalType +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + DoubleType, + StringType, + DateType, + DecimalType, +) data_path = "tests/sqllogictests/data/tests/suites/0_stateless/13_tpch/data" -spark = SparkSession.builder \ - .appName("CSV to Iceberg REST Catalog") \ - .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \ - .config("spark.sql.catalog.iceberg.type", "rest") \ - .config("spark.sql.catalog.iceberg.uri", "http://127.0.0.1:8181") \ - .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") \ - .config("spark.sql.catalog.iceberg.warehouse", "s3://iceberg-tpch/") \ - .config("spark.sql.catalog.iceberg.s3.access-key-id", "admin") \ - .config("spark.sql.catalog.iceberg.s3.secret-access-key", "password") \ - .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") \ - .config("spark.sql.catalog.iceberg.s3.endpoint", "http://127.0.0.1:9000") \ - .config("spark.sql.catalog.iceberg.client.region", "us-east-1") \ - .config("spark.jars.packages", - "org.apache.iceberg:iceberg-aws-bundle:1.6.1,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1") \ +spark = ( + SparkSession.builder.appName("CSV to Iceberg REST Catalog") + .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") + .config("spark.sql.catalog.iceberg.type", "rest") + .config("spark.sql.catalog.iceberg.uri", "http://127.0.0.1:8181") + .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO") + .config("spark.sql.catalog.iceberg.warehouse", "s3://iceberg-tpch/") + .config("spark.sql.catalog.iceberg.s3.access-key-id", "admin") + .config("spark.sql.catalog.iceberg.s3.secret-access-key", "password") + .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") + .config("spark.sql.catalog.iceberg.s3.endpoint", "http://127.0.0.1:9000") + .config("spark.sql.catalog.iceberg.client.region", "us-east-1") + .config( + "spark.jars.packages", + "org.apache.iceberg:iceberg-aws-bundle:1.6.1,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1", + ) .getOrCreate() +) tables = { "lineitem": ( - StructType([ - StructField("l_orderkey", IntegerType(), True), - StructField("l_partkey", IntegerType(), True), - StructField("l_suppkey", IntegerType(), True), - StructField("l_linenumber", IntegerType(), True), - StructField("l_quantity", DecimalType(15, 2), True), - StructField("l_extendedprice", DecimalType(15, 2), True), - StructField("l_discount", DecimalType(15, 2), True), - StructField("l_tax", DecimalType(15, 2), True), - StructField("l_returnflag", StringType(), True), - StructField("l_linestatus", StringType(), True), - StructField("l_shipdate", DateType(), True), - StructField("l_commitdate", DateType(), True), - StructField("l_receiptdate", DateType(), True), - StructField("l_shipinstruct", StringType(), True), - StructField("l_shipmode", StringType(), True), - StructField("l_comment", StringType(), True) - ]), - f"{data_path}/lineitem.tbl" + StructType( + [ + StructField("l_orderkey", 
IntegerType(), True), + StructField("l_partkey", IntegerType(), True), + StructField("l_suppkey", IntegerType(), True), + StructField("l_linenumber", IntegerType(), True), + StructField("l_quantity", DecimalType(15, 2), True), + StructField("l_extendedprice", DecimalType(15, 2), True), + StructField("l_discount", DecimalType(15, 2), True), + StructField("l_tax", DecimalType(15, 2), True), + StructField("l_returnflag", StringType(), True), + StructField("l_linestatus", StringType(), True), + StructField("l_shipdate", DateType(), True), + StructField("l_commitdate", DateType(), True), + StructField("l_receiptdate", DateType(), True), + StructField("l_shipinstruct", StringType(), True), + StructField("l_shipmode", StringType(), True), + StructField("l_comment", StringType(), True), + ] + ), + f"{data_path}/lineitem.tbl", ), "orders": ( - StructType([ - StructField("o_orderkey", IntegerType(), True), - StructField("o_custkey", IntegerType(), True), - StructField("o_orderstatus", StringType(), True), - StructField("o_totalprice", DecimalType(15, 2), True), - StructField("o_orderdate", DateType(), True), - StructField("o_orderpriority", StringType(), True), - StructField("o_clerk", StringType(), True), - StructField("o_shippriority", IntegerType(), True), - StructField("o_comment", StringType(), True) - ]), - f"{data_path}/orders.tbl" + StructType( + [ + StructField("o_orderkey", IntegerType(), True), + StructField("o_custkey", IntegerType(), True), + StructField("o_orderstatus", StringType(), True), + StructField("o_totalprice", DecimalType(15, 2), True), + StructField("o_orderdate", DateType(), True), + StructField("o_orderpriority", StringType(), True), + StructField("o_clerk", StringType(), True), + StructField("o_shippriority", IntegerType(), True), + StructField("o_comment", StringType(), True), + ] + ), + f"{data_path}/orders.tbl", ), "customer": ( - StructType([ - StructField("c_custkey", IntegerType(), True), - StructField("c_name", StringType(), True), - StructField("c_address", StringType(), True), - StructField("c_nationkey", IntegerType(), True), - StructField("c_phone", StringType(), True), - StructField("c_acctbal", DecimalType(15, 2), True), - StructField("c_mktsegment", StringType(), True), - StructField("c_comment", StringType(), True) - ]), - f"{data_path}/customer.tbl" + StructType( + [ + StructField("c_custkey", IntegerType(), True), + StructField("c_name", StringType(), True), + StructField("c_address", StringType(), True), + StructField("c_nationkey", IntegerType(), True), + StructField("c_phone", StringType(), True), + StructField("c_acctbal", DecimalType(15, 2), True), + StructField("c_mktsegment", StringType(), True), + StructField("c_comment", StringType(), True), + ] + ), + f"{data_path}/customer.tbl", ), "nation": ( - StructType([ - StructField("n_nationkey", IntegerType(), True), - StructField("n_name", StringType(), True), - StructField("n_regionkey", IntegerType(), True), - StructField("n_comment", StringType(), True) - ]), - f"{data_path}/nation.tbl" + StructType( + [ + StructField("n_nationkey", IntegerType(), True), + StructField("n_name", StringType(), True), + StructField("n_regionkey", IntegerType(), True), + StructField("n_comment", StringType(), True), + ] + ), + f"{data_path}/nation.tbl", ), "region": ( - StructType([ - StructField("r_regionkey", IntegerType(), True), - StructField("r_name", StringType(), True), - StructField("r_comment", StringType(), True) - ]), - f"{data_path}/region.tbl" + StructType( + [ + StructField("r_regionkey", IntegerType(), 
True), + StructField("r_name", StringType(), True), + StructField("r_comment", StringType(), True), + ] + ), + f"{data_path}/region.tbl", ), "part": ( - StructType([ - StructField("p_partkey", IntegerType(), True), - StructField("p_name", StringType(), True), - StructField("p_mfgr", StringType(), True), - StructField("p_brand", StringType(), True), - StructField("p_type", StringType(), True), - StructField("p_size", IntegerType(), True), - StructField("p_container", StringType(), True), - StructField("p_retailprice", DecimalType(15, 2), True), - StructField("p_comment", StringType(), True) - ]), - f"{data_path}/part.tbl" + StructType( + [ + StructField("p_partkey", IntegerType(), True), + StructField("p_name", StringType(), True), + StructField("p_mfgr", StringType(), True), + StructField("p_brand", StringType(), True), + StructField("p_type", StringType(), True), + StructField("p_size", IntegerType(), True), + StructField("p_container", StringType(), True), + StructField("p_retailprice", DecimalType(15, 2), True), + StructField("p_comment", StringType(), True), + ] + ), + f"{data_path}/part.tbl", ), "supplier": ( - StructType([ - StructField("s_suppkey", IntegerType(), True), - StructField("s_name", StringType(), True), - StructField("s_address", StringType(), True), - StructField("s_nationkey", IntegerType(), True), - StructField("s_phone", StringType(), True), - StructField("s_acctbal", DecimalType(15, 2), True), - StructField("s_comment", StringType(), True) - ]), - f"{data_path}/supplier.tbl" + StructType( + [ + StructField("s_suppkey", IntegerType(), True), + StructField("s_name", StringType(), True), + StructField("s_address", StringType(), True), + StructField("s_nationkey", IntegerType(), True), + StructField("s_phone", StringType(), True), + StructField("s_acctbal", DecimalType(15, 2), True), + StructField("s_comment", StringType(), True), + ] + ), + f"{data_path}/supplier.tbl", ), "partsupp": ( - StructType([ - StructField("ps_partkey", IntegerType(), True), - StructField("ps_suppkey", IntegerType(), True), - StructField("ps_availqty", IntegerType(), True), - StructField("ps_supplycost", DecimalType(15, 2), True), - StructField("ps_comment", StringType(), True) - ]), - f"{data_path}/partsupp.tbl" - ) + StructType( + [ + StructField("ps_partkey", IntegerType(), True), + StructField("ps_suppkey", IntegerType(), True), + StructField("ps_availqty", IntegerType(), True), + StructField("ps_supplycost", DecimalType(15, 2), True), + StructField("ps_comment", StringType(), True), + ] + ), + f"{data_path}/partsupp.tbl", + ), } for table_name, (schema, file_path) in tables.items(): full_table_name = f"iceberg.tpch.{table_name}" - #spark.sql(f"DROP TABLE IF EXISTS {full_table_name}") + # spark.sql(f"DROP TABLE IF EXISTS {full_table_name}") create_table = f""" CREATE OR REPLACE TABLE {full_table_name} ( diff --git a/tests/sqllogictests/suites/base/06_show/06_0024_show_dictionaries.test b/tests/sqllogictests/suites/base/06_show/06_0024_show_dictionaries.test new file mode 100644 index 000000000000..914bc37fdba2 --- /dev/null +++ b/tests/sqllogictests/suites/base/06_show/06_0024_show_dictionaries.test @@ -0,0 +1,65 @@ +statement ok +CREATE OR REPLACE DICTIONARY d1(c1 int NOT NULL, c2 Varchar NOT NULL) PRIMARY KEY c1 SOURCE(mysql(host='localhost' port='3306' username='root' password='1234' db='db1' table='test_table')) + +statement ok +CREATE OR REPLACE DICTIONARY d2(a int NOT NULL, b int NOT NULL) PRIMARY KEY a SOURCE(mysql(host='localhost' port='3306' username='root' password='1234' db='db1' 
table='test_table')) + +query T +show dictionaries +---- +default d1 ['c1'] ['INT'] ['c2'] ['VARCHAR'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +default d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) + +statement ok +DROP DATABASE IF EXISTS show_dictionary + +statement ok +CREATE DATABASE show_dictionary + +statement ok +use show_dictionary + +statement ok +CREATE OR REPLACE DICTIONARY show_dictionary.d1(c1 VARCHAR NOT NULL, c2 VARCHAR NOT NULL) PRIMARY KEY c1 SOURCE(mysql(host='localhost' port='3306' username='root' password='1234' db='db1' table='test_table')) + +statement ok +CREATE OR REPLACE DICTIONARY show_dictionary.d2(a int NOT NULL, b int NOT NULL) PRIMARY KEY a SOURCE(mysql(host='localhost' port='3306' username='root' password='1234' db='db1' table='test_table')) + +statement ok +CREATE OR REPLACE DICTIONARY show_dictionary.d3(`a` int NOT NULL, b int NOT NULL) PRIMARY KEY a SOURCE(mysql(host='localhost' port='3306' username='root' password='1234' db='db1' table='test_table')) + +query T +show dictionaries from show_dictionary +---- +show_dictionary d1 ['c1'] ['VARCHAR'] ['c2'] ['VARCHAR'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d3 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) + +query T +show dictionaries from show_dictionary like 'd%' +---- +show_dictionary d1 ['c1'] ['VARCHAR'] ['c2'] ['VARCHAR'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d3 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) + +query T +show dictionaries from show_dictionary WHERE name = 'd2' OR 1 = 1 +---- +show_dictionary d1 ['c1'] ['VARCHAR'] ['c2'] ['VARCHAR'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d3 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) + +query T +show dictionaries from show_dictionary WHERE name = 'd2' AND 1 = 1 +---- +show_dictionary d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) + +statement ok +show dictionaries WHERE name='d2' AND 1=0 + +query T +show dictionaries +---- +show_dictionary d1 ['c1'] ['VARCHAR'] ['c2'] ['VARCHAR'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d2 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) +show_dictionary d3 ['a'] ['INT'] ['b'] ['INT'] mysql(db=db1 host=localhost password=[hidden] port=3306 table=test_table username=root) (empty) diff --git a/tests/sqllogictests/suites/mode/standalone/explain/table_sample.test 
b/tests/sqllogictests/suites/mode/standalone/explain/table_sample.test index 840552957b26..5ae0527a2ae7 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/table_sample.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/table_sample.test @@ -34,7 +34,12 @@ statement ok create or replace table t as select number as a from numbers(10000000); query I -select count(distinct a) < 10000000 from t sample block(50); +select count(a) < 10000000 from t sample block(50); +---- +1 + +query I +select count(a) < 10000000 from t sample block(50) row(10); ---- 1 diff --git a/tests/sqllogictests/suites/query/functions/02_0000_function_aggregate_mix.test b/tests/sqllogictests/suites/query/functions/02_0000_function_aggregate_mix.test index 107e51d6ba4f..9c3a5d8a7911 100644 --- a/tests/sqllogictests/suites/query/functions/02_0000_function_aggregate_mix.test +++ b/tests/sqllogictests/suites/query/functions/02_0000_function_aggregate_mix.test @@ -418,5 +418,37 @@ statement ok DROP TABLE d statement ok -DROP DATABASE db1 +create or replace table aggr(k int, v decimal(10,2)); + +query I +select mode(v) from aggr; +---- +NULL + +statement ok +insert into aggr (k, v) values + (1, 10), + (1, 10), + (1, 10), + (2, 20), + (2, 20), + (2, 21), + (3, null); + +query I +select mode(v) from aggr; +---- +10.00 + +query II +select k, mode(v) from aggr group by k order by k; +---- +1 10.00 +2 20.00 +3 NULL + +statement ok +DROP TABLE aggr + +statement ok +DROP DATABASE db1 diff --git a/tests/sqllogictests/suites/stage/unload.test b/tests/sqllogictests/suites/stage/unload.test index 4dd0087604d3..d80230ff92a5 100644 --- a/tests/sqllogictests/suites/stage/unload.test +++ b/tests/sqllogictests/suites/stage/unload.test @@ -81,3 +81,21 @@ select $1, $2 from @unload(file_format => 'tsv'); 1 2 3 4 5 6 + +query +copy into @unload/a_raw_path.csv from (select 1,2) file_format=(type=csv) single=true include_query_id=false use_raw_path=true detailed_output=true overwrite=true; +---- +a_raw_path.csv 4 1 + +query +copy into @unload/a_raw_path.csv from (select 3,4) file_format=(type=csv) single=true include_query_id=false use_raw_path=true detailed_output=true overwrite=true; +---- +a_raw_path.csv 4 1 + +query +select $1, $2 from @unload/a_raw_path.csv (file_format => 'csv'); +---- +3 4 + +statement error 1006.*file already exists +copy into @unload/a_raw_path.csv from (select 3,4) file_format=(type=csv) single=true include_query_id=false use_raw_path=true detailed_output=false overwrite=false; \ No newline at end of file diff --git a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result index ca587b0b1f44..167a7a804c2e 100755 --- a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result +++ b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.result @@ -33,6 +33,9 @@ 34 44 <<<< +>>>> select count() from test_delta where p0 = 10 and p2 = 12; +2 +<<<< >>>> select c5, p4 from test_delta where c1 - p0 = 11 order by c5; 25 24 <<<< diff --git a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh index 91f54f927a16..08cd14f64da2 100755 --- a/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh +++ b/tests/suites/1_stateful/12_delta/11_0000_delta_engine_partitioned.sh @@ -20,6 +20,8 @@ query "select p4 from test_delta where p2 = 12 order by p4;" query "select c1 from test_delta where p4 > 20 order by 
c1;" query "select p4 from test_delta where c1 > 20 order by p4;" +## explain works +query "select count() from test_delta where p0 = 10 and p2 = 12;" query "select c5, p4 from test_delta where c1 - p0 = 11 order by c5;"