diff --git a/Cargo.lock b/Cargo.lock index 2cae8952e576..d262ee8cb5e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4378,6 +4378,7 @@ dependencies = [ "siphasher", "sys-info", "tantivy", + "tantivy-fst", "tantivy-jieba", "thrift", "typetag", @@ -5418,6 +5419,7 @@ dependencies = [ "databend-storages-common-table-meta", "fastrace 0.7.2", "jsonb", + "levenshtein_automata", "log", "match-template", "parquet", @@ -5426,6 +5428,7 @@ dependencies = [ "serde_json", "tantivy", "tantivy-common", + "tantivy-fst", "thiserror", "xorfilter-rs", ] @@ -11386,8 +11389,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3a059efb063b8f425b948e042e6b9bd85edfe60e913630ed727b23e2dfcc558" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "stable_deref_trait", ] @@ -15099,8 +15101,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "aho-corasick", "arc-swap", @@ -15150,8 +15151,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284899c2325d6832203ac6ff5891b297fc5239c3dc754c5bc1977855b23c10df" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "bitpacking 0.9.2", ] @@ -15159,8 +15159,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12722224ffbe346c7fec3275c699e508fd0d4710e629e933d5736ec524a1f44e" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "downcast-rs", "fastdivide", @@ -15175,8 +15174,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8019e3cabcfd20a1380b491e13ff42f57bb38bf97c3d5fa5c07e50816e0621f4" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "async-trait", "byteorder", @@ -15199,8 +15197,7 @@ dependencies = [ [[package]] name = "tantivy-jieba" version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2fe65c125f0d76d06f0f2ce9fbb9287b53f0dafb51a6270d984a840e2f16c1" +source = "git+https://github.com/b41sh/tantivy-jieba?rev=af84361#af843610bc3bea826329af07256598c413f0dd6a" dependencies = [ "jieba-rs", "lazy_static", @@ -15210,8 +15207,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "847434d4af57b32e309f4ab1b4f1707a6c566656264caa427ff4285c4d9d0b82" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "nom", ] @@ -15219,8 +15215,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c69578242e8e9fc989119f522ba5b49a38ac20f576fc778035b96cc94f41f98e" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -15231,8 +15226,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c56d6ff5591fc332739b3ce7035b57995a3ce29a93ffd6012660e0949c956ea8" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "murmurhash32", "rand_distr", @@ -15242,8 +15236,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0dcade25819a89cfe6f17d932c9cedff11989936bf6dd4f336d50392053b04" +source = "git+https://github.com/b41sh/tantivy?rev=37aeac0#37aeac01096a7e480118dbc91e48c8f54d3fea4c" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 76b0ba372dbc..d6e92d5e51f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -310,7 +310,7 @@ serde_with = { version = "3.8.1" } serfig = "0.1.0" sled = { version = "0.34", default-features = false } stream-more = "0.1.3" -tantivy = "0.22.0" +tantivy = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0" } thiserror = { version = "1" } tikv-jemalloc-ctl = { version = "0.5.0", features = ["use_std"] } tokio = { version = "1.35.0", features = ["full"] } diff --git a/src/query/ee/tests/it/inverted_index/index_refresh.rs b/src/query/ee/tests/it/inverted_index/index_refresh.rs index c761b8292b2a..1b547cc0dadf 100644 --- a/src/query/ee/tests/it/inverted_index/index_refresh.rs +++ b/src/query/ee/tests/it/inverted_index/index_refresh.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::collections::BTreeMap; +use std::collections::HashSet; use databend_common_base::base::tokio; use databend_common_catalog::plan::InvertedIndexInfo; @@ -38,6 +39,7 @@ use databend_query::interpreters::RefreshTableIndexInterpreter; use databend_query::test_kits::append_string_sample_data; use databend_query::test_kits::*; use databend_storages_common_cache::LoadParams; +use tantivy::schema::IndexRecordOption; #[tokio::test(flavor = "multi_thread")] async fn test_fuse_do_refresh_inverted_index() -> Result<()> { @@ -144,9 +146,12 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { let field_nums = query_fields.len(); let has_score = true; let need_position = false; + let mut field_ids = HashSet::new(); + field_ids.insert(0); + field_ids.insert(1); + let index_record = IndexRecordOption::WithFreqsAndPositions; - let index_reader = - InvertedIndexReader::try_create(dal.clone(), field_nums, need_position, &index_loc).await?; + let index_reader = InvertedIndexReader::create(dal.clone()); let queries = vec![ ("rust".to_string(), vec![0, 1]), @@ -166,14 +171,24 @@ async fn test_fuse_do_refresh_inverted_index() -> Result<()> { inverted_index_option: None, }; - let (query, tokenizer_manager) = create_inverted_index_query(&inverted_index_info)?; - - let matched_rows = index_reader.clone().do_filter( - has_score, - &query, - tokenizer_manager, - block_meta.row_count, - )?; + let (query, fuzziness, tokenizer_manager) = + create_inverted_index_query(&inverted_index_info)?; + + let matched_rows = index_reader + .clone() + .do_filter( + field_nums, + need_position, + has_score, + query.box_clone(), + &field_ids, + &index_record, + &fuzziness, + tokenizer_manager, + block_meta.row_count as u32, + &index_loc, + ) + .await?; assert!(matched_rows.is_some()); let matched_rows = matched_rows.unwrap(); assert_eq!(matched_rows.len(), ids.len()); diff --git a/src/query/storages/common/index/Cargo.toml b/src/query/storages/common/index/Cargo.toml index 1064768e2cb6..0ca11e3627c3 100644 --- a/src/query/storages/common/index/Cargo.toml +++ b/src/query/storages/common/index/Cargo.toml @@ -25,13 +25,15 @@ databend-common-functions = { workspace = true } databend-storages-common-table-meta = { workspace = true } fastrace = { workspace = true } jsonb = { workspace = true } +levenshtein_automata = "0.2.1" log = { workspace = true } match-template = { workspace = true } parquet = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } -tantivy-common = "0.7.0" +tantivy-common = { git = "https://github.com/b41sh/tantivy", rev = "37aeac0", package = "tantivy-common" } +tantivy-fst = "0.5" thiserror = { workspace = true } xorfilter-rs = { workspace = true, features = ["cbordata"] } diff --git a/src/query/storages/common/index/src/inverted_index.rs b/src/query/storages/common/index/src/inverted_index.rs index e1015cf0685d..97e9a01c53bd 100644 --- a/src/query/storages/common/index/src/inverted_index.rs +++ b/src/query/storages/common/index/src/inverted_index.rs @@ -34,6 +34,9 @@ // IN THE SOFTWARE. use std::collections::BTreeMap; +use std::collections::HashMap; +use std::collections::HashSet; +use std::collections::VecDeque; use std::io; use std::io::BufWriter; use std::io::Cursor; @@ -48,10 +51,17 @@ use std::sync::Arc; use crc32fast::Hasher; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::Scalar; +use databend_common_expression::TableDataType; +use databend_common_expression::TableField; use databend_storages_common_table_meta::meta::testify_version; use databend_storages_common_table_meta::meta::SingleColumnMeta; use databend_storages_common_table_meta::meta::Versioned; +use levenshtein_automata::Distance; +use levenshtein_automata::LevenshteinAutomatonBuilder; +use levenshtein_automata::DFA; use log::warn; +use parquet::format::FileMetaData; use tantivy::directory::error::DeleteError; use tantivy::directory::error::OpenReadError; use tantivy::directory::error::OpenWriteError; @@ -63,9 +73,23 @@ use tantivy::directory::TerminatingWrite; use tantivy::directory::WatchCallback; use tantivy::directory::WatchHandle; use tantivy::directory::WritePtr; +use tantivy::positions::PositionReader; +use tantivy::postings::TermInfo; +use tantivy::query::BooleanQuery; +use tantivy::query::FuzzyTermQuery; +use tantivy::query::Occur; +use tantivy::query::PhraseQuery; +use tantivy::query::Query; +use tantivy::query::QueryClone; +use tantivy::query::TermQuery; use tantivy::Directory; +use tantivy::Term; use tantivy_common::BinarySerializable; +use tantivy_common::HasLen; use tantivy_common::VInt; +use tantivy_fst::Automaton; +use tantivy_fst::IntoStreamer; +use tantivy_fst::Streamer; // tantivy version is used to generate the footer data @@ -119,6 +143,97 @@ impl Footer { } } +fn extract_footer(data: FileSlice) -> Result<(usize, Vec)> { + // The following code is copied from tantivy `CompositeFile::open` function. + // extract field number and offsets of each fields. + let end = data.len(); + let footer_len_data = data.slice_from(end - 4).read_bytes()?; + let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize; + let footer_start = end - 4 - footer_len; + let footer_data = data + .slice(footer_start..footer_start + footer_len) + .read_bytes()?; + let mut footer_buffer = footer_data.as_slice(); + let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize; + + let mut offsets = vec![]; + let mut offset = 0; + for _ in 0..num_fields { + offset += VInt::deserialize(&mut footer_buffer)?.0 as usize; + let _file_addr = FileAddr::deserialize(&mut footer_buffer)?; + offsets.push(offset); + } + offsets.push(footer_start); + + Ok((num_fields, offsets)) +} + +// Extract fsts from term dict file. +pub fn extract_fsts( + data: FileSlice, + fields: &mut Vec, + values: &mut Vec, +) -> Result<()> { + let (num_fields, offsets) = extract_footer(data.clone())?; + + // The following code is copied from tantivy `TermDictionary::open` function. + // extract fst data from field. + for i in 0..num_fields { + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + + let field_slice = data.slice(start_offset..end_offset); + + let (main_slice, footer_len_slice) = field_slice.split_from_end(16); + let mut footer_len_bytes = footer_len_slice.read_bytes()?; + let footer_size = u64::deserialize(&mut footer_len_bytes)?; + + let (fst_file_slice, term_dict_file_slice) = + main_slice.split_from_end(footer_size as usize); + + let fst_field_name = format!("fst-{}", i); + let fst_field = TableField::new(&fst_field_name, TableDataType::Binary); + fields.push(fst_field); + + let fst_bytes = fst_file_slice.read_bytes()?; + values.push(Scalar::Binary(fst_bytes.as_slice().to_vec())); + + let term_dict_field_name = format!("term-{}", i); + let term_dict_field = TableField::new(&term_dict_field_name, TableDataType::Binary); + fields.push(term_dict_field); + + let term_dict_bytes = term_dict_file_slice.read_bytes()?; + values.push(Scalar::Binary(term_dict_bytes.as_slice().to_vec())); + } + + Ok(()) +} + +pub fn extract_component_fields( + name: &str, + data: FileSlice, + fields: &mut Vec, + values: &mut Vec, +) -> Result<()> { + let (num_fields, offsets) = extract_footer(data.clone())?; + + for i in 0..num_fields { + let start_offset = offsets[i]; + let end_offset = offsets[i + 1]; + + let field_slice = data.slice(start_offset..end_offset); + + let fst_field_name = format!("{}-{}", name, i); + let fst_field = TableField::new(&fst_field_name, TableDataType::Binary); + fields.push(fst_field); + + let idx_bytes = field_slice.read_bytes()?; + values.push(Scalar::Binary(idx_bytes.as_slice().to_vec())); + } + + Ok(()) +} + // Build footer for tantivy files. // Footer is used to check whether the data is valid when open a file. pub fn build_tantivy_footer(bytes: &[u8]) -> Result> { @@ -216,11 +331,428 @@ fn build_empty_position_data(field_nums: usize) -> Result { Ok(OwnedBytes::new(buf)) } +struct DfaWrapper(pub DFA); + +impl Automaton for DfaWrapper { + type State = u32; + + fn start(&self) -> Self::State { + self.0.initial_state() + } + + fn is_match(&self, state: &Self::State) -> bool { + match self.0.distance(*state) { + Distance::Exact(_) => true, + Distance::AtLeast(_) => false, + } + } + + fn can_match(&self, state: &u32) -> bool { + *state != levenshtein_automata::SINK_STATE + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + self.0.transition(*state, byte) + } +} + +// Term value contains values associated with a Term +// used to match query and collect matched doc ids. +#[derive(Clone)] +pub struct TermValue { + // term info + pub term_info: TermInfo, + // term matched doc ids + pub doc_ids: Vec, + // term frequences for each doc + pub term_freqs: Vec, + // position reader is used to read positions in doc for phrase query + pub position_reader: Option, +} + +// Check if fst contains terms in query. +// If not, we can skip read other parts of inverted index. +pub fn check_term_fsts_match( + query: Box, + fst_maps: &HashMap>, + fuzziness: &Option, + matched_terms: &mut HashMap, + fuzziness_terms: &mut HashMap>, +) -> bool { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + return true; + } + } + false + } else if let Some(bool_query) = query.downcast_ref::() { + let mut matched_num = 0; + for (occur, sub_query) in bool_query.clauses() { + let matched = check_term_fsts_match( + sub_query.box_clone(), + fst_maps, + fuzziness, + matched_terms, + fuzziness_terms, + ); + if matched { + matched_num += 1; + } + match occur { + Occur::Should => {} + Occur::Must => { + if !matched { + return false; + } + } + Occur::MustNot => {} + } + } + matched_num > 0 + } else if let Some(phrase_query) = query.downcast_ref::() { + // PhraseQuery must match all terms. + let field = phrase_query.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + let mut matched_all = true; + for term in phrase_query.phrase_terms() { + let matched = if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + true + } else { + false + }; + if !matched { + matched_all = false; + break; + } + } + matched_all + } else { + false + } + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + // FuzzyTermQuery match terms by levenshtein distance. + let fuzziness = fuzziness.unwrap(); + + let term = fuzzy_term_query.term(); + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + // build levenshtein automaton + let lev_automaton_builder = LevenshteinAutomatonBuilder::new(fuzziness, true); + let term_str = String::from_utf8_lossy(term.serialized_value_bytes()); + let automaton = DfaWrapper(lev_automaton_builder.build_dfa(&term_str)); + + let mut fuzz_term_values = vec![]; + let mut stream = fst_map.search(automaton).into_stream(); + while let Some((key, idx)) = stream.next() { + let key_str = unsafe { std::str::from_utf8_unchecked(key) }; + let fuzz_term = Term::from_field_text(field, key_str); + matched_terms.insert(fuzz_term.clone(), idx); + fuzz_term_values.push(fuzz_term); + } + let matched = !fuzz_term_values.is_empty(); + fuzziness_terms.insert(term.clone(), fuzz_term_values); + matched + } else { + false + } + } else { + // TODO: handle other Query types + let mut matched = false; + query.query_terms(&mut |term, _| { + let field = term.field(); + let field_id = field.field_id() as usize; + if let Some(fst_map) = fst_maps.get(&field_id) { + if let Some(idx) = fst_map.get(term.serialized_value_bytes()) { + matched_terms.insert(term.clone(), idx); + matched = true; + } + } + }); + + matched + } +} + +// collect matched rows by term value +pub fn collect_matched_rows( + query: Box, + row_count: u32, + fuzziness_terms: &HashMap>, + term_values: &mut HashMap, +) -> Vec { + if let Some(term_query) = query.downcast_ref::() { + let term = term_query.term(); + if let Some(term_value) = term_values.get(term) { + term_value.doc_ids.clone() + } else { + vec![] + } + } else if let Some(bool_query) = query.downcast_ref::() { + let mut should_doc_ids_opt = None; + let mut must_doc_ids_opt = None; + let mut must_not_doc_ids_opt = None; + for (occur, sub_query) in bool_query.clauses() { + let doc_ids = collect_matched_rows( + sub_query.box_clone(), + row_count, + fuzziness_terms, + term_values, + ); + let doc_id_set = HashSet::from_iter(doc_ids.into_iter()); + match occur { + Occur::Should => { + if should_doc_ids_opt.is_none() { + should_doc_ids_opt = Some(doc_id_set); + } else { + let should_doc_ids = should_doc_ids_opt.unwrap(); + should_doc_ids_opt = + Some(should_doc_ids.union(&doc_id_set).copied().collect()) + } + } + Occur::Must => { + if must_doc_ids_opt.is_none() { + must_doc_ids_opt = Some(doc_id_set); + } else { + let must_doc_ids = must_doc_ids_opt.unwrap(); + must_doc_ids_opt = + Some(must_doc_ids.intersection(&doc_id_set).copied().collect()) + } + } + Occur::MustNot => { + if must_not_doc_ids_opt.is_none() { + must_not_doc_ids_opt = Some(doc_id_set); + } else { + let must_not_doc_ids = must_not_doc_ids_opt.unwrap(); + must_not_doc_ids_opt = + Some(must_not_doc_ids.union(&doc_id_set).copied().collect()) + } + } + } + } + + let doc_ids = if let Some(mut should_doc_ids) = should_doc_ids_opt { + if let Some(must_doc_ids) = must_doc_ids_opt { + should_doc_ids = should_doc_ids + .intersection(&must_doc_ids) + .copied() + .collect() + } + if let Some(must_not_doc_ids) = must_not_doc_ids_opt { + should_doc_ids = should_doc_ids + .difference(&must_not_doc_ids) + .copied() + .collect() + } + should_doc_ids + } else if let Some(mut must_doc_ids) = must_doc_ids_opt { + if let Some(must_not_doc_ids) = must_not_doc_ids_opt { + must_doc_ids = must_doc_ids + .difference(&must_not_doc_ids) + .copied() + .collect() + } + must_doc_ids + } else if let Some(must_not_doc_ids) = must_not_doc_ids_opt { + let all_doc_ids = HashSet::from_iter(0..row_count); + let doc_ids = all_doc_ids.difference(&must_not_doc_ids).copied().collect(); + doc_ids + } else { + HashSet::new() + }; + + let mut doc_ids = Vec::from_iter(doc_ids); + doc_ids.sort(); + doc_ids + } else if let Some(phrase_query) = query.downcast_ref::() { + let mut union_doc_ids = HashSet::new(); + let mut intersection_doc_ids_opt = None; + + for term in phrase_query.phrase_terms() { + if let Some(term_value) = term_values.get(&term) { + let doc_id_set = HashSet::from_iter(term_value.doc_ids.clone()); + union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); + if intersection_doc_ids_opt.is_none() { + intersection_doc_ids_opt = Some(doc_id_set); + } else { + let intersection_doc_ids = intersection_doc_ids_opt.unwrap(); + intersection_doc_ids_opt = Some( + intersection_doc_ids + .intersection(&doc_id_set) + .copied() + .collect(), + ); + } + } + } + + let intersection_doc_ids = intersection_doc_ids_opt.unwrap_or_default(); + if intersection_doc_ids.is_empty() { + return vec![]; + } + let mut union_doc_ids = Vec::from_iter(union_doc_ids); + union_doc_ids.sort(); + + // check each docs + let mut matched_doc_ids = vec![]; + for doc_id in union_doc_ids { + if !intersection_doc_ids.contains(&doc_id) { + continue; + } + + let mut term_pos_map = HashMap::new(); + for term in phrase_query.phrase_terms() { + let mut offset = 0; + let mut term_freq = 0; + if let Some(term_value) = term_values.get_mut(&term) { + for i in 0..term_value.doc_ids.len() { + if term_value.doc_ids[i] < doc_id { + offset += term_value.term_freqs[i] as u64; + } else { + term_freq = term_value.term_freqs[i] as usize; + break; + } + } + // collect positions in the docs + if let Some(position_reader) = term_value.position_reader.as_mut() { + let mut pos_output = vec![0; term_freq]; + position_reader.read(offset, &mut pos_output[..]); + for i in 1..pos_output.len() { + pos_output[i] += pos_output[i - 1]; + } + let positions = VecDeque::from_iter(pos_output); + term_pos_map.insert(term.clone(), positions); + } + } + } + + let mut is_first = true; + let mut distance = 0; + let mut matched = true; + let mut last_position = 0; + for (query_position, term) in phrase_query.phrase_terms_with_offsets() { + if let Some(positions) = term_pos_map.get_mut(&term) { + let mut find_position = false; + while let Some(doc_position) = positions.pop_front() { + // skip previous positions. + if doc_position < last_position { + continue; + } + last_position = doc_position; + let doc_distance = doc_position - (query_position as u32); + if is_first { + is_first = false; + distance = doc_distance; + } else { + // distance must same as first term. + if doc_distance != distance { + matched = false; + } + } + find_position = true; + break; + } + if !find_position { + matched = false; + } + } else { + matched = false; + } + if !matched { + break; + } + } + if matched { + matched_doc_ids.push(doc_id); + } + } + matched_doc_ids + } else if let Some(fuzzy_term_query) = query.downcast_ref::() { + let mut fuzz_doc_ids = HashSet::new(); + let term = fuzzy_term_query.term(); + + // collect related terms of the original term. + if let Some(related_terms) = fuzziness_terms.get(term) { + for term in related_terms { + if let Some(term_value) = term_values.get(term) { + let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); + fuzz_doc_ids = fuzz_doc_ids.union(&doc_id_set).copied().collect(); + } + } + let mut doc_ids = Vec::from_iter(fuzz_doc_ids); + doc_ids.sort(); + doc_ids + } else { + vec![] + } + } else { + let mut union_doc_ids = HashSet::new(); + query.query_terms(&mut |term, _| { + if let Some(term_value) = term_values.get(term) { + let doc_id_set: HashSet = HashSet::from_iter(term_value.doc_ids.clone()); + union_doc_ids = union_doc_ids.union(&doc_id_set).copied().collect(); + } + }); + + let mut doc_ids = Vec::from_iter(union_doc_ids); + doc_ids.sort(); + doc_ids + } +} + #[derive(Clone)] pub struct InvertedIndexMeta { pub columns: Vec<(String, SingleColumnMeta)>, } +impl TryFrom for InvertedIndexMeta { + type Error = databend_common_exception::ErrorCode; + + fn try_from(mut meta: FileMetaData) -> std::result::Result { + let rg = meta.row_groups.remove(0); + let mut col_metas = Vec::with_capacity(rg.columns.len()); + for x in &rg.columns { + match &x.meta_data { + Some(chunk_meta) => { + let col_start = + if let Some(dict_page_offset) = chunk_meta.dictionary_page_offset { + dict_page_offset + } else { + chunk_meta.data_page_offset + }; + let col_len = chunk_meta.total_compressed_size; + assert!( + col_start >= 0 && col_len >= 0, + "column start and length should not be negative" + ); + let num_values = chunk_meta.num_values as u64; + let res = SingleColumnMeta { + offset: (col_start + 23) as u64, + len: (col_len - 23) as u64, + num_values, + }; + let column_name = chunk_meta.path_in_schema[0].to_owned(); + col_metas.push((column_name, res)); + } + None => { + panic!( + "expecting chunk meta data while converting ThriftFileMetaData to InvertedIndexMeta" + ) + } + } + } + col_metas.shrink_to_fit(); + Ok(Self { columns: col_metas }) + } +} + #[derive(Clone, Debug)] pub struct InvertedIndexFile { pub name: String, diff --git a/src/query/storages/common/index/src/lib.rs b/src/query/storages/common/index/src/lib.rs index c2a7bd08eeb5..f60a1b16982e 100644 --- a/src/query/storages/common/index/src/lib.rs +++ b/src/query/storages/common/index/src/lib.rs @@ -27,9 +27,14 @@ pub use bloom_index::BloomIndexMeta; pub use bloom_index::FilterEvalResult; pub use index::Index; pub use inverted_index::build_tantivy_footer; +pub use inverted_index::check_term_fsts_match; +pub use inverted_index::collect_matched_rows; +pub use inverted_index::extract_component_fields; +pub use inverted_index::extract_fsts; pub use inverted_index::InvertedIndexDirectory; pub use inverted_index::InvertedIndexFile; pub use inverted_index::InvertedIndexMeta; +pub use inverted_index::TermValue; pub use page_index::PageIndex; pub use range_index::statistics_to_domain; pub use range_index::RangeIndex; diff --git a/src/query/storages/fuse/Cargo.toml b/src/query/storages/fuse/Cargo.toml index 9b9b26a4da33..490441609a06 100644 --- a/src/query/storages/fuse/Cargo.toml +++ b/src/query/storages/fuse/Cargo.toml @@ -64,7 +64,8 @@ sha2 = { workspace = true } siphasher = "0.3.10" sys-info = "0.9" tantivy = { workspace = true } -tantivy-jieba = "0.11.0" +tantivy-fst = "0.5" +tantivy-jieba = { git = "https://github.com/b41sh/tantivy-jieba", rev = "af84361" } thrift = "0.17.0" typetag = { workspace = true } uuid = { workspace = true } diff --git a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs index 40b5ca1928cf..85dd7ddbb132 100644 --- a/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs +++ b/src/query/storages/fuse/src/io/read/bloom/block_filter_reader.rs @@ -99,7 +99,8 @@ async fn load_bloom_filter_by_columns<'a>( for column_name in column_needed { for (idx, (name, column_meta)) in index_column_chunk_metas.iter().enumerate() { if name == column_name { - col_metas.push((idx as ColumnId, (name, column_meta))) + col_metas.push((idx as ColumnId, (name, column_meta))); + break; } } } diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs index df2ffa89cecc..3ccc3205d90c 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_loader.rs @@ -74,7 +74,33 @@ where /// Loads inverted index meta data /// read data from cache, or populate cache items if possible #[fastrace::trace] -async fn load_inverted_index_meta(dal: Operator, path: &str) -> Result> { +pub(crate) async fn load_inverted_index_meta( + dal: Operator, + path: &str, +) -> Result> { + let path_owned = path.to_owned(); + async move { + let reader = MetaReaders::inverted_index_meta_reader(dal); + let version = 1; + + let load_params = LoadParams { + location: path_owned, + len_hint: None, + ver: version, + put_cache: true, + }; + + reader.read(&load_params).await + } + .execute_in_runtime(&GlobalIORuntime::instance()) + .await? +} + +#[fastrace::trace] +pub(crate) async fn load_inverted_v0_index_meta( + dal: Operator, + path: &str, +) -> Result> { let path_owned = path.to_owned(); async move { let reader = MetaReaders::inverted_index_meta_reader(dal); @@ -96,47 +122,44 @@ async fn load_inverted_index_meta(dal: Operator, path: &str) -> Result( - index_path: &'a str, +pub(crate) async fn load_inverted_index_file<'a>( name: &'a str, col_meta: &'a SingleColumnMeta, - need_position: bool, + index_path: &'a str, dal: &'a Operator, ) -> Result> { - // Because the position file is relatively large, reading it will take more time. - // And position data is only used when the query has phrase terms. - // If the query has no phrase terms, we can ignore it and use empty position data instead. - if name == "pos" && !need_position { - let file = InvertedIndexFile::try_create(name.to_owned(), vec![])?; - return Ok(Arc::new(file)); - } - + let start = Instant::now(); let storage_runtime = GlobalIORuntime::instance(); - let file = { - let inverted_index_file_reader = InvertedIndexFileReader::new( + let bytes = { + let column_data_reader = InvertedIndexFileReader::new( index_path.to_owned(), name.to_owned(), col_meta, dal.clone(), ); - async move { inverted_index_file_reader.read().await } + async move { column_data_reader.read().await } } .execute_in_runtime(&storage_runtime) .await??; - Ok(file) + + // Perf. + { + metrics_inc_block_inverted_index_read_milliseconds(start.elapsed().as_millis() as u64); + } + + Ok(bytes) } /// load inverted index directory #[fastrace::trace] pub(crate) async fn load_inverted_index_directory<'a>( dal: Operator, - need_position: bool, field_nums: usize, index_path: &'a str, ) -> Result { let start = Instant::now(); // load inverted index meta, contains the offsets of each files. - let inverted_index_meta = load_inverted_index_meta(dal.clone(), index_path).await?; + let inverted_index_meta = load_inverted_v0_index_meta(dal.clone(), index_path).await?; // load inverted index files, usually including following eight files: // 1. fast file @@ -150,9 +173,7 @@ pub(crate) async fn load_inverted_index_directory<'a>( let futs = inverted_index_meta .columns .iter() - .map(|(name, column_meta)| { - load_inverted_index_file(index_path, name, column_meta, need_position, &dal) - }) + .map(|(name, column_meta)| load_inverted_index_file(name, column_meta, index_path, &dal)) .collect::>(); let files: Vec<_> = try_join_all(futs).await?.into_iter().collect(); @@ -168,7 +189,7 @@ pub(crate) async fn load_inverted_index_directory<'a>( } /// Read the inverted index file data. -pub struct InvertedIndexFileReader { +pub(crate) struct InvertedIndexFileReader { cached_reader: CachedReader, param: LoadParams, } @@ -182,6 +203,7 @@ impl InvertedIndexFileReader { ) -> Self { let cache_key = Self::cache_key_of_column(&index_path, &name); + // let loader = InvertedIndexSliceLoader { let loader = InvertedIndexFileLoader { offset: column_meta.offset, len: column_meta.len, @@ -235,13 +257,13 @@ pub struct InvertedIndexFileLoader { impl Loader for InvertedIndexFileLoader { #[async_backtrace::framed] async fn load(&self, params: &LoadParams) -> Result { - let bytes = self + let buffer = self .operator .read_with(¶ms.location) .range(self.offset..self.offset + self.len) .await?; - InvertedIndexFile::try_create(self.name.clone(), bytes.to_vec()) + InvertedIndexFile::try_create(self.name.clone(), buffer.to_vec()) } fn cache_key(&self, _params: &LoadParams) -> CacheKey { diff --git a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs index 20b8950f27de..084ebd206014 100644 --- a/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs +++ b/src/query/storages/fuse/src/io/read/inverted_index/inverted_index_reader.rs @@ -12,60 +12,145 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashMap; +use std::collections::HashSet; +use std::sync::Arc; use std::time::Instant; use databend_common_exception::Result; use databend_common_expression::types::F32; use databend_common_metrics::storage::metrics_inc_block_inverted_index_search_milliseconds; -use databend_storages_common_index::InvertedIndexDirectory; +use databend_storages_common_index::check_term_fsts_match; +use databend_storages_common_index::collect_matched_rows; +use databend_storages_common_index::TermValue; +use databend_storages_common_table_meta::meta::SingleColumnMeta; +use futures_util::future::try_join_all; use opendal::Operator; use tantivy::collector::DocSetCollector; use tantivy::collector::TopDocs; +use tantivy::directory::FileSlice; +use tantivy::directory::OwnedBytes; +use tantivy::positions::PositionReader; +use tantivy::postings::BlockSegmentPostings; use tantivy::query::Query; +use tantivy::query::QueryClone; +use tantivy::schema::IndexRecordOption; +use tantivy::termdict::TermInfoStore; use tantivy::tokenizer::TokenizerManager; use tantivy::Index; +use tantivy_fst::raw::Fst; use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_directory; +use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_file; +use crate::io::read::inverted_index::inverted_index_loader::load_inverted_index_meta; use crate::io::read::inverted_index::inverted_index_loader::InvertedIndexFileReader; #[derive(Clone)] pub struct InvertedIndexReader { - directory: InvertedIndexDirectory, + dal: Operator, } impl InvertedIndexReader { - pub async fn try_create( - dal: Operator, - field_nums: usize, - need_position: bool, - index_loc: &str, - ) -> Result { - let directory = - load_inverted_index_directory(dal.clone(), need_position, field_nums, index_loc) - .await?; - - Ok(Self { directory }) + pub fn create(dal: Operator) -> Self { + Self { dal } } // Filter the rows and scores in the block that can match the query text, // if there is no row that can match, this block can be pruned. #[allow(clippy::type_complexity)] - pub fn do_filter( + #[allow(clippy::too_many_arguments)] + pub async fn do_filter( self, + field_nums: usize, + need_position: bool, has_score: bool, - query: &dyn Query, + query: Box, + field_ids: &HashSet, + index_record: &IndexRecordOption, + fuzziness: &Option, tokenizer_manager: TokenizerManager, - row_count: u64, + row_count: u32, + index_loc: &str, ) -> Result)>>> { let start = Instant::now(); - let mut index = Index::open(self.directory)?; + + let matched_rows = self + .search( + index_loc, + query, + field_ids, + field_nums, + need_position, + has_score, + index_record, + fuzziness, + tokenizer_manager, + row_count, + ) + .await?; + + // Perf. + { + metrics_inc_block_inverted_index_search_milliseconds(start.elapsed().as_millis() as u64); + } + + Ok(matched_rows) + } + + async fn read_column_data<'a>( + &self, + index_path: &'a str, + name: &str, + field_ids: &HashSet, + inverted_index_meta_map: &HashMap, + ) -> Result> { + let mut col_metas = vec![]; + let mut col_field_map = HashMap::new(); + for field_id in field_ids { + let col_name = format!("{}-{}", name, field_id); + let col_meta = inverted_index_meta_map.get(&col_name).unwrap(); + + col_metas.push((col_name.clone(), col_meta)); + col_field_map.insert(col_name, *field_id); + } + + let futs = col_metas + .iter() + .map(|(name, col_meta)| load_inverted_index_file(name, col_meta, index_path, &self.dal)) + .collect::>(); + + let col_files = try_join_all(futs) + .await? + .into_iter() + .map(|f| { + let field_id = col_field_map.get(&f.name).unwrap(); + (*field_id, f.data.clone()) + }) + .collect::>(); + + Ok(col_files) + } + + async fn search_v0<'a>( + &self, + index_path: &'a str, + query: Box, + field_nums: usize, + has_score: bool, + tokenizer_manager: TokenizerManager, + row_count: u32, + ) -> Result)>>> { + let directory = + load_inverted_index_directory(self.dal.clone(), field_nums, index_path).await?; + + let mut index = Index::open(directory)?; index.set_tokenizers(tokenizer_manager); let reader = index.reader()?; let searcher = reader.searcher(); let matched_rows = if has_score { let collector = TopDocs::with_limit(row_count as usize); - let docs = searcher.search(query, &collector)?; + let docs = searcher.search(&query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for (score, doc_addr) in docs { @@ -76,7 +161,7 @@ impl InvertedIndexReader { matched_rows } else { let collector = DocSetCollector; - let docs = searcher.search(query, &collector)?; + let docs = searcher.search(&query, &collector)?; let mut matched_rows = Vec::with_capacity(docs.len()); for doc_addr in docs { @@ -85,13 +170,221 @@ impl InvertedIndexReader { } matched_rows }; + if !matched_rows.is_empty() { + Ok(Some(matched_rows)) + } else { + Ok(None) + } + } - // Perf. - { - metrics_inc_block_inverted_index_search_milliseconds(start.elapsed().as_millis() as u64); + // read fst first to check whether matched. + #[allow(clippy::too_many_arguments)] + async fn search<'a>( + &self, + index_path: &'a str, + query: Box, + field_ids: &HashSet, + field_nums: usize, + need_position: bool, + has_score: bool, + index_record: &IndexRecordOption, + fuzziness: &Option, + tokenizer_manager: TokenizerManager, + row_count: u32, + ) -> Result)>>> { + // 1. read index meta + let inverted_index_meta = load_inverted_index_meta(self.dal.clone(), index_path).await; + + if inverted_index_meta.is_err() { + return self + .search_v0( + index_path, + query, + field_nums, + has_score, + tokenizer_manager, + row_count, + ) + .await; } - if !matched_rows.is_empty() { + let inverted_index_meta_map = inverted_index_meta + .unwrap() + .columns + .clone() + .into_iter() + .collect::>(); + + if inverted_index_meta_map.contains_key("meta.json") { + return self + .search_v0( + index_path, + query, + field_nums, + has_score, + tokenizer_manager, + row_count, + ) + .await; + } + + // 2. read fst files + let fst_files = self + .read_column_data(index_path, "fst", field_ids, &inverted_index_meta_map) + .await?; + + let mut fst_maps = HashMap::new(); + for (field_id, fst_data) in fst_files.into_iter() { + let fst = Fst::new(fst_data).map_err(|err| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("Fst data is corrupted: {:?}", err), + ) + })?; + let fst_map = tantivy_fst::Map::from(fst); + fst_maps.insert(field_id, fst_map); + } + + // 3. check whether query is matched in the fsts. + let mut matched_terms = HashMap::new(); + let mut fuzziness_terms = HashMap::new(); + let matched = check_term_fsts_match( + query.box_clone(), + &fst_maps, + fuzziness, + &mut matched_terms, + &mut fuzziness_terms, + ); + + if !matched { + return Ok(None); + } + + // 4. read term dict files, and get term info for each terms. + let term_dict_files = self + .read_column_data(index_path, "term", field_ids, &inverted_index_meta_map) + .await?; + + let mut term_dict_maps = HashMap::new(); + for (field_id, term_dict_data) in term_dict_files.into_iter() { + let term_dict_file = FileSlice::new(Arc::new(term_dict_data)); + let term_info_store = TermInfoStore::open(term_dict_file)?; + term_dict_maps.insert(field_id, term_info_store); + } + + let mut term_values = HashMap::new(); + for (term, term_ord) in matched_terms.iter() { + let field = term.field(); + let field_id = field.field_id() as usize; + + let term_dict = term_dict_maps.get(&field_id).unwrap(); + let term_info = term_dict.get(*term_ord); + let term_value = TermValue { + term_info, + doc_ids: vec![], + term_freqs: vec![], + position_reader: None, + }; + term_values.insert(term.clone(), term_value); + } + + // 5. read postings and optional positions. + // collect doc ids, term frequences and optional position readers. + let mut slice_metas = Vec::with_capacity(term_values.len()); + let mut name_map = HashMap::new(); + for (term, term_value) in term_values.iter() { + let field = term.field(); + let field_id = field.field_id() as usize; + + let idx_name = format!("idx-{}", field_id); + let idx_meta = inverted_index_meta_map.get(&idx_name).unwrap(); + + // ignore 8 bytes total_num_tokens_slice + let offset = idx_meta.offset + 8 + (term_value.term_info.postings_range.start as u64); + let len = term_value.term_info.postings_range.len() as u64; + let idx_slice_meta = SingleColumnMeta { + offset, + len, + num_values: 1, + }; + let idx_slice_name = + format!("{}-{}", idx_name, term_value.term_info.postings_range.start); + slice_metas.push((idx_slice_name.clone(), idx_slice_meta)); + name_map.insert(idx_slice_name, term.clone()); + + if need_position { + let pos_name = format!("pos-{}", field_id); + let pos_meta = inverted_index_meta_map.get(&pos_name).unwrap(); + let offset = pos_meta.offset + (term_value.term_info.positions_range.start as u64); + let len = term_value.term_info.positions_range.len() as u64; + let pos_slice_meta = SingleColumnMeta { + offset, + len, + num_values: 1, + }; + let pos_slice_name = format!( + "{}-{}", + pos_name, term_value.term_info.positions_range.start + ); + slice_metas.push((pos_slice_name.clone(), pos_slice_meta)); + name_map.insert(pos_slice_name, term.clone()); + } + } + + let futs = slice_metas + .iter() + .map(|(name, col_meta)| load_inverted_index_file(name, col_meta, index_path, &self.dal)) + .collect::>(); + + let slice_files = try_join_all(futs) + .await? + .into_iter() + .map(|f| (f.name.clone(), f.data.clone())) + .collect::>(); + + for (slice_name, slice_data) in slice_files.into_iter() { + let term = name_map.get(&slice_name).unwrap(); + let term_value = term_values.get_mut(term).unwrap(); + + if slice_name.starts_with("idx") { + let posting_file = FileSlice::new(Arc::new(slice_data)); + let postings = BlockSegmentPostings::open( + term_value.term_info.doc_freq, + posting_file, + *index_record, + *index_record, + )?; + let doc_ids = postings.docs(); + let term_freqs = postings.freqs(); + + term_value.doc_ids = doc_ids.to_vec(); + term_value.term_freqs = term_freqs.to_vec(); + } else if slice_name.starts_with("pos") { + let position_reader = PositionReader::open(slice_data)?; + term_value.position_reader = Some(position_reader); + } + } + + // 6. collect matched rows by term values. + let matched_docs = collect_matched_rows( + query.box_clone(), + row_count, + &fuzziness_terms, + &mut term_values, + ); + + if !matched_docs.is_empty() { + let mut matched_rows = Vec::with_capacity(matched_docs.len()); + if has_score { + // TODO: add score + for doc_id in matched_docs { + matched_rows.push((doc_id as usize, Some(F32::from(1.0)))); + } + } else { + for doc_id in matched_docs { + matched_rows.push((doc_id as usize, None)) + } + } Ok(Some(matched_rows)) } else { Ok(None) diff --git a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs index 9945fffe1795..6b5edb4787d0 100644 --- a/src/query/storages/fuse/src/io/read/meta/meta_readers.rs +++ b/src/query/storages/fuse/src/io/read/meta/meta_readers.rs @@ -158,6 +158,20 @@ impl Loader for LoaderWrapper { impl Loader for LoaderWrapper { #[async_backtrace::framed] async fn load(&self, params: &LoadParams) -> Result { + if params.ver > 0 { + // read the ThriftFileMetaData, omit unnecessary conversions + let meta = read_thrift_file_metadata(self.0.clone(), ¶ms.location, params.len_hint) + .await + .map_err(|err| { + ErrorCode::StorageOther(format!( + "read file meta failed, {}, {:?}", + params.location, err + )) + })?; + + return InvertedIndexMeta::try_from(meta); + } + let operator = &self.0; let meta = operator.stat(¶ms.location).await.map_err(|err| { ErrorCode::StorageOther(format!( diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 78fbfe5ed613..3c01ccc09c9e 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -295,7 +295,15 @@ impl InvertedIndexState { &inverted_index_builder.options, )?; writer.add_block(source_schema, block)?; - let data = writer.finalize()?; + let (index_schema, index_block) = writer.finalize()?; + + let mut data = Vec::with_capacity(DEFAULT_BLOCK_INDEX_BUFFER_SIZE); + let _ = blocks_to_parquet( + &index_schema, + vec![index_block], + &mut data, + TableCompression::None, + )?; let size = data.len() as u64; // Perf. diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 4bfb40180687..678e18acc4f2 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -20,11 +20,16 @@ use std::path::Path; use databend_common_exception::ErrorCode; use databend_common_exception::Result; use databend_common_expression::types::DataType; +use databend_common_expression::BlockEntry; use databend_common_expression::DataBlock; use databend_common_expression::DataSchemaRef; use databend_common_expression::ScalarRef; +use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRef; +use databend_common_expression::Value; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; +use databend_storages_common_index::extract_component_fields; +use databend_storages_common_index::extract_fsts; use tantivy::indexer::UserOperation; use tantivy::schema::Field; use tantivy::schema::IndexRecordOption; @@ -130,15 +135,39 @@ impl InvertedIndexWriter { } #[async_backtrace::framed] - pub fn finalize(mut self) -> Result> { + pub fn finalize(mut self) -> Result<(TableSchema, DataBlock)> { let _ = self.index_writer.run(self.operations); let _ = self.index_writer.commit()?; let index = self.index_writer.index(); - let mut buffer = Vec::with_capacity(DEFAULT_BLOCK_BUFFER_SIZE); - Self::write_index(&mut buffer, index)?; + let mut fields = Vec::new(); + let mut values = Vec::new(); - Ok(buffer) + let segments = index.searchable_segments()?; + let segment = &segments[0]; + + let termdict_file = segment.open_read(SegmentComponent::Terms)?; + extract_fsts(termdict_file, &mut fields, &mut values)?; + + let posting_file = segment.open_read(SegmentComponent::Postings)?; + extract_component_fields("idx", posting_file, &mut fields, &mut values)?; + + let position_file = segment.open_read(SegmentComponent::Positions)?; + extract_component_fields("pos", position_file, &mut fields, &mut values)?; + + let field_norms_file = segment.open_read(SegmentComponent::FieldNorms)?; + extract_component_fields("fieldnorm", field_norms_file, &mut fields, &mut values)?; + + let inverted_index_schema = TableSchema::new(fields); + + let mut index_columns = Vec::with_capacity(values.len()); + for value in values.into_iter() { + let index_value = Value::Scalar(value); + index_columns.push(BlockEntry::new(DataType::Binary, index_value)); + } + let inverted_index_block = DataBlock::new(index_columns, 1); + + Ok((inverted_index_schema, inverted_index_block)) } // The tantivy index data consists of eight files. diff --git a/src/query/storages/fuse/src/operations/inverted_index.rs b/src/query/storages/fuse/src/operations/inverted_index.rs index b46e57d1d267..130b849a9345 100644 --- a/src/query/storages/fuse/src/operations/inverted_index.rs +++ b/src/query/storages/fuse/src/operations/inverted_index.rs @@ -29,6 +29,7 @@ use databend_common_expression::DataBlock; use databend_common_expression::DataSchema; use databend_common_expression::DataSchemaRef; use databend_common_expression::TableSchemaRef; +use databend_common_io::constants::DEFAULT_BLOCK_INDEX_BUFFER_SIZE; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_bytes; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_milliseconds; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_nums; @@ -41,9 +42,11 @@ use databend_common_pipeline_sources::AsyncSource; use databend_common_pipeline_sources::AsyncSourcer; use databend_common_pipeline_transforms::processors::AsyncTransform; use databend_common_pipeline_transforms::processors::TransformPipelineHelper; +use databend_storages_common_blocks::blocks_to_parquet; use databend_storages_common_cache::LoadParams; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::Location; +use databend_storages_common_table_meta::table::TableCompression; use opendal::Operator; use crate::io::write_data; @@ -292,7 +295,15 @@ impl AsyncTransform for InvertedIndexTransform { let mut writer = InvertedIndexWriter::try_create(self.data_schema.clone(), &self.index_options)?; writer.add_block(&self.source_schema, &data_block)?; - let data = writer.finalize()?; + + let (index_schema, index_block) = writer.finalize()?; + let mut data = Vec::with_capacity(DEFAULT_BLOCK_INDEX_BUFFER_SIZE); + let _ = blocks_to_parquet( + &index_schema, + vec![index_block], + &mut data, + TableCompression::None, + )?; let index_size = data.len() as u64; write_data(data, &self.operator, &index_location).await?; diff --git a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs index 5240f851becc..9f58e4233edf 100644 --- a/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs +++ b/src/query/storages/fuse/src/pruning/inverted_index_pruner.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::HashSet; use std::sync::Arc; use databend_common_catalog::plan::InvertedIndexInfo; @@ -20,8 +21,10 @@ use databend_common_exception::Result; use databend_common_expression::types::F32; use opendal::Operator; use tantivy::query::Query; +use tantivy::query::QueryClone; use tantivy::query::QueryParser; use tantivy::schema::Field; +use tantivy::schema::IndexRecordOption; use tantivy::tokenizer::TokenizerManager; use crate::io::create_index_schema; @@ -56,6 +59,9 @@ pub struct InvertedIndexPruner { need_position: bool, tokenizer_manager: TokenizerManager, query: Box, + field_ids: HashSet, + index_record: IndexRecordOption, + fuzziness: Option, } impl InvertedIndexPruner { @@ -65,14 +71,26 @@ impl InvertedIndexPruner { ) -> Result>> { let inverted_index_info = push_down.as_ref().and_then(|p| p.inverted_index.as_ref()); if let Some(inverted_index_info) = inverted_index_info { - let (query, tokenizer_manager) = create_inverted_index_query(inverted_index_info)?; + let (query, fuzziness, tokenizer_manager) = + create_inverted_index_query(inverted_index_info)?; + + let index_record: IndexRecordOption = + match inverted_index_info.index_options.get("index_record") { + Some(v) => serde_json::from_str(v)?, + None => IndexRecordOption::WithFreqsAndPositions, + }; let mut need_position = false; - query.query_terms(&mut |_, pos| { + let mut field_ids = HashSet::new(); + query.query_terms(&mut |term, pos| { + let field = term.field(); + let field_id = field.field_id() as usize; + field_ids.insert(field_id); if pos { need_position = true; } }); + // whether need to generate score internl column let has_score = inverted_index_info.has_score; let field_nums = inverted_index_info.index_schema.num_fields(); @@ -88,6 +106,9 @@ impl InvertedIndexPruner { need_position, tokenizer_manager, query, + field_ids, + index_record, + fuzziness, }))); } Ok(None) @@ -105,20 +126,22 @@ impl InvertedIndexPruner { &self.index_version, ); - let inverted_index_reader = InvertedIndexReader::try_create( - self.dal.clone(), - self.field_nums, - self.need_position, - &index_loc, - ) - .await?; - - let matched_rows = inverted_index_reader.do_filter( - self.has_score, - &self.query, - self.tokenizer_manager.clone(), - row_count, - )?; + let inverted_index_reader = InvertedIndexReader::create(self.dal.clone()); + + let matched_rows = inverted_index_reader + .do_filter( + self.field_nums, + self.need_position, + self.has_score, + self.query.box_clone(), + &self.field_ids, + &self.index_record, + &self.fuzziness, + self.tokenizer_manager.clone(), + row_count as u32, + &index_loc, + ) + .await?; Ok(matched_rows) } @@ -127,7 +150,7 @@ impl InvertedIndexPruner { // create tantivy query for inverted index. pub fn create_inverted_index_query( inverted_index_info: &InvertedIndexInfo, -) -> Result<(Box, TokenizerManager)> { +) -> Result<(Box, Option, TokenizerManager)> { // collect query fields and optional boosts let mut query_fields = Vec::with_capacity(inverted_index_info.query_fields.len()); let mut query_field_boosts = Vec::with_capacity(inverted_index_info.query_fields.len()); @@ -159,11 +182,11 @@ pub fn create_inverted_index_query( let fuzziness = inverted_index_info .inverted_index_option .as_ref() - .and_then(|o| o.fuzziness.as_ref()); + .and_then(|o| o.fuzziness); if let Some(fuzziness) = fuzziness { // Fuzzy query matches rows containing a specific term that is within Levenshtein distance. for field in query_fields { - query_parser.set_field_fuzzy(field, false, *fuzziness, true); + query_parser.set_field_fuzzy(field, false, fuzziness, true); } } let operator = inverted_index_info @@ -189,5 +212,5 @@ pub fn create_inverted_index_query( query_parser.parse_query(&inverted_index_info.query_text)? }; - Ok((query, tokenizer_manager)) + Ok((query, fuzziness, tokenizer_manager)) } diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index 0759f20c1713..c360e6558ff1 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -53,13 +53,13 @@ SELECT id, score(), content FROM t WHERE match(content, 'the') query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') ---- -5 2.4594712 Time flies like an arrow; fruit flies like a banana +5 1.0 Time flies like an arrow; fruit flies like a banana query IFT SELECT id, score(), content FROM t WHERE match(content, 'word') ---- -2 1.5948367 A picture is worth a thousand words -4 1.6550698 Actions speak louder than words +2 1.0 A picture is worth a thousand words +4 1.0 Actions speak louder than words query IFT SELECT id, score(), content FROM t WHERE match(content, 'box') @@ -80,7 +80,7 @@ SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzzine query IFT SELECT id, score(), content FROM t WHERE match(content, 'action works', 'fuzziness=1;operator=AND') ---- -4 2.0 Actions speak louder than words +4 1.0 Actions speak louder than words statement ok INSERT INTO t VALUES @@ -109,43 +109,43 @@ INSERT INTO t VALUES (30, '张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。') query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -24 1.1111465 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 -28 1.2247349 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +24 1.0 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '北京') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '北京') ORDER BY id ---- -30 1.7396812 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 -12 1.9475443 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '北京大学') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '北京大学') ORDER BY id ---- -30 5.2190437 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT SELECT id, score(), content FROM t WHERE match(content, '北京 大', 'fuzziness=1;operator=AND') ORDER BY id ---- -12 2.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -30 2.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +30 1.0 张华考上了北京大学,李萍进了中等技术学校,我在百货公司当售货员,我们都有光明的前途。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '文化博大精深') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '文化博大精深') ORDER BY id ---- -28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '文化 博大精深') ORDER BY id ---- -21 1.1111465 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -24 1.542129 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 -15 2.063777 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 7.61753 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +24 1.0 中国的传统节日蕴含着丰富的文化内涵,是传承和弘扬中华文化的重要途径。 +28 7.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT SELECT id, score(), content FROM t WHERE match(content, '化博') ORDER BY score() @@ -174,28 +174,28 @@ statement ok UPDATE t SET content = '科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。' WHERE id=24 query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -21 1.423108 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 1.5707673 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +21 1.0 中国的古代诗词充满了深邃的意境和独特的韵味,是中华文化的重要组成部分。 +28 1.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 query IFT -SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '科技') ORDER BY id ---- -13 2.1947646 随着科技的发展,人们的生活变得越来越便利。 -24 2.8508463 科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。 +13 1.0 随着科技的发展,人们的生活变得越来越便利。 +24 1.0 科技创新是推动社会进步的重要动力,我们应该积极支持和推动科技创新。 statement ok DELETE FROM t WHERE id=21 query IFT -SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY score() +SELECT id, score(), content FROM t WHERE match(content, '中国') ORDER BY id ---- -12 1.4482267 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 -15 1.5346593 中国的茶文化源远流长,品茶已经成为一种生活方式。 -28 2.002842 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 +12 1.0 北京的故宫是中国古代建筑的瑰宝,吸引了无数游客前来参观。 +15 1.0 中国的茶文化源远流长,品茶已经成为一种生活方式。 +28 2.0 中国的饮食文化博大精深,各地的美食各具特色,让人流连忘返。 # index without optional filters and index rocord is basic statement ok @@ -205,12 +205,12 @@ statement ok REFRESH INVERTED INDEX idx1 ON t query IFT -SELECT id, score(), content FROM t WHERE match(content, 'the') +SELECT id, score(), content FROM t WHERE match(content, 'the') ORDER BY id ---- -1 0.8323383 The quick brown fox jumps over the lazy dog -3 0.9893832 The early bird catches the worm -6 0.8788376 Beauty is in the eye of the beholder -10 0.8788376 An apple a day keeps the doctor away +1 1.0 The quick brown fox jumps over the lazy dog +3 1.0 The early bird catches the worm +6 1.0 Beauty is in the eye of the beholder +10 1.0 An apple a day keeps the doctor away query IFT SELECT id, score(), content FROM t WHERE match(content, 'fly') @@ -252,51 +252,51 @@ INSERT INTO books VALUES (20, 'CockroachDB: The Definitive Guide', 'Guy Harrison, Jesse Seldess, Ben Darnell', 'Get the lowdown on CockroachDB, the distributed SQL database built to handle the demands of today’s data-driven cloud applications. In this hands-on guide, software developers, architects, and DevOps/SRE teams will learn how to use CockroachDB to create applications that scale elastically and provide seamless delivery for end users while remaining indestructible. Teams will also learn how to migrate existing applications to CockroachDB’s performant, cloud-native data architecture.') query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY id ---- -2 8.500097 Python深度学习(第2版) -6 6.7982116 Flask Web开发:基于Python的Web应用开发实战(第2版) -14 5.509352 Building Recommendation Systems in Python and JAX -11 5.263399 OpenAI GPT For Python Developers, 2nd Edition -13 4.4659142 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -12 1.8816761 Developing Apps with GPT-4 and ChatGPT -4 1.5154111 白话深度学习的数学 -3 1.3515654 大模型应用开发极简入门 -7 1.2369337 Apache Pulsar实战 +2 1.0 Python深度学习(第2版) +4 1.0 白话深度学习的数学 +3 1.0 大模型应用开发极简入门 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +7 1.0 Apache Pulsar实战 +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'ChatGPT') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'ChatGPT') ORDER BY id ---- -1 14.471097 这就是ChatGPT -12 10.599274 Developing Apps with GPT-4 and ChatGPT -13 7.9292374 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -3 1.77537 大模型应用开发极简入门 +1 1.0 这就是ChatGPT +3 1.0 大模型应用开发极简入门 +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计') ORDER BY id ---- -9 14.486509 Vue.js设计与实现 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) -7 3.2078874 Apache Pulsar实战 +9 1.0 Vue.js设计与实现 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计 实现') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', '设计 实现') ORDER BY id ---- -9 32.441788 Vue.js设计与实现 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) -7 5.9086094 Apache Pulsar实战 -4 2.3153453 白话深度学习的数学 +4 1.0 白话深度学习的数学 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python') ORDER BY id ---- -2 1.4378065 Python深度学习(第2版) -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +2 1.0 Python深度学习(第2版) +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT SELECT id, score(), title FROM books WHERE query('title:pyth', 'fuzziness=2') ORDER BY id @@ -308,154 +308,155 @@ SELECT id, score(), title FROM books WHERE query('title:pyth', 'fuzziness=2') OR 14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:python OR rust') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python OR rust') ORDER BY id ---- -17 1.8827661 Rust for Rustaceans -16 1.6531605 Rust Atomics and Locks -8 1.5581512 Rust程序设计(第2版) -2 1.4378065 Python深度学习(第2版) -15 1.3975171 Code Like a Pro in Rust -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +2 1.0 Python深度学习(第2版) +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +8 1.0 Rust程序设计(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX +15 1.0 Code Like a Pro in Rust +16 1.0 Rust Atomics and Locks +17 1.0 Rust for Rustaceans query IFT -SELECT id, score(), title FROM books WHERE query('title:python AND rust') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python AND rust') ORDER BY id ---- query IFT -SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY id ---- -9 5.063791 Vue.js设计与实现 -7 2.189928 Apache Pulsar实战 -5 1.7138567 BERT基础教程:Transformer大模型实战 -6 1.2924166 Flask Web开发:基于Python的Web应用开发实战(第2版) +5 1.0 BERT基础教程:Transformer大模型实战 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +7 1.0 Apache Pulsar实战 +9 1.0 Vue.js设计与实现 query IFT -SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY id ---- -16 5.0420737 Rust Atomics and Locks +16 1.0 Rust Atomics and Locks query IFT -SELECT id, score(), title FROM books WHERE query('title:"Python深度学习"') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:"Python深度学习"') ORDER BY id ---- -2 6.005718 Python深度学习(第2版) +2 1.0 Python深度学习(第2版) query IFT -SELECT id, score(), title FROM books WHERE query('title:(+python -学习)') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:(+python -学习)') ORDER BY id ---- -14 1.1018704 Building Recommendation Systems in Python and JAX -11 1.0526798 OpenAI GPT For Python Developers, 2nd Edition -6 0.96639454 Flask Web开发:基于Python的Web应用开发实战(第2版) -13 0.8931828 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:+设计 -实现') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:+设计 -实现') ORDER BY id ---- -10 2.0477252 前端架构设计 -8 1.8123543 Rust程序设计(第2版) +8 1.0 Rust程序设计(第2版) +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:+设计 实现') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:+设计 实现') ORDER BY id ---- -9 5.063791 Vue.js设计与实现 -10 2.0477252 前端架构设计 -8 1.8123543 Rust程序设计(第2版) +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 query IFT -SELECT id, score(), title FROM books WHERE query('title:python^5 description:chatgpt^2.1') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:python^5 description:chatgpt^2.1') ORDER BY id ---- -13 7.890149 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -2 7.1890326 Python深度学习(第2版) -14 5.509352 Building Recommendation Systems in Python and JAX -11 5.263399 OpenAI GPT For Python Developers, 2nd Edition -6 4.8319726 Flask Web开发:基于Python的Web应用开发实战(第2版) -1 4.732555 这就是ChatGPT -12 4.325484 Developing Apps with GPT-4 and ChatGPT -3 3.106897 大模型应用开发极简入门 +1 1.0 这就是ChatGPT +2 1.0 Python深度学习(第2版) +3 1.0 大模型应用开发极简入门 +6 1.0 Flask Web开发:基于Python的Web应用开发实战(第2版) +11 1.0 OpenAI GPT For Python Developers, 2nd Edition +12 1.0 Developing Apps with GPT-4 and ChatGPT +13 1.0 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +14 1.0 Building Recommendation Systems in Python and JAX query IFT -SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC +SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY id ---- -9 25.318954 Vue.js设计与实现 -4 22.395063 白话深度学习的数学 -10 10.238626 前端架构设计 -8 9.061771 Rust程序设计(第2版) - +2 1.0 Python深度学习(第2版) +4 1.0 白话深度学习的数学 +7 1.0 Apache Pulsar实战 +8 1.0 Rust程序设计(第2版) +9 1.0 Vue.js设计与实现 +10 1.0 前端架构设计 # index without optional filters and index rocord is basic -statement ok -CREATE OR REPLACE INVERTED INDEX idx2 ON books(title, author, description) tokenizer = 'chinese' index_record='basic' - -statement ok -REFRESH INVERTED INDEX idx2 ON books - -query IFT -SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY score() DESC ----- -2 8.192706 Python深度学习(第2版) -6 6.235875 Flask Web开发:基于Python的Web应用开发实战(第2版) -14 5.4896193 Building Recommendation Systems in Python and JAX -11 5.2801366 OpenAI GPT For Python Developers, 2nd Edition -13 4.2964296 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT -4 1.5421177 白话深度学习的数学 -3 1.3799851 大模型应用开发极简入门 -12 1.3110648 Developing Apps with GPT-4 and ChatGPT -7 1.2791233 Apache Pulsar实战 - -query IFT -SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC ----- -9 5.027091 Vue.js设计与实现 -7 2.2837715 Apache Pulsar实战 -5 1.7452873 BERT基础教程:Transformer大模型实战 -6 1.2672173 Flask Web开发:基于Python的Web应用开发实战(第2版) +#statement ok +#CREATE OR REPLACE INVERTED INDEX idx2 ON books(title, author, description) tokenizer = 'chinese' index_record='basic' + +#statement ok +#REFRESH INVERTED INDEX idx2 ON books + +#query IFT +#SELECT id, score(), title FROM books WHERE match('title^5, description^1.2', 'python') ORDER BY id DESC +#---- +#2 8.192706 Python深度学习(第2版) +#6 6.235875 Flask Web开发:基于Python的Web应用开发实战(第2版) +#14 5.4896193 Building Recommendation Systems in Python and JAX +#11 5.2801366 OpenAI GPT For Python Developers, 2nd Edition +#13 4.2964296 Learn AI-Assisted Python Programming: With GitHub Copilot and ChatGPT +#4 1.5421177 白话深度学习的数学 +#3 1.3799851 大模型应用开发极简入门 +#12 1.3110648 Developing Apps with GPT-4 and ChatGPT +#7 1.2791233 Apache Pulsar实战 + +#query IFT +#SELECT id, score(), title FROM books WHERE query('title:设计 AND 实现 OR 实战') ORDER BY score() DESC +#---- +#9 5.027091 Vue.js设计与实现 +#7 2.2837715 Apache Pulsar实战 +#5 1.7452873 BERT基础教程:Transformer大模型实战 +#6 1.2672173 Flask Web开发:基于Python的Web应用开发实战(第2版) # basic index record can't search phrase terms -onlyif mysql -statement error 1105 -SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC - -onlyif mysql -statement error 1105 -SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC - -statement ok -CREATE TABLE t1 (id int, body json) - -statement ok -CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese' - -statement ok -INSERT INTO t1 VALUES -(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'), -(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'), -(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'), -(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'), -(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'), -(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'), -(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'), -(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'), -(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'), -(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}') - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.title:energy') ----- -2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology') ----- -3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} -5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} - -query IFT -SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术') ----- -6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} -10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} +#onlyif mysql +#statement error 1105 +#SELECT id, score(), title FROM books WHERE query('title:"Rust Atomics"') ORDER BY score() DESC + +#onlyif mysql +#statement error 1105 +#SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC + +#statement ok +#CREATE TABLE t1 (id int, body json) + +#statement ok +#CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese' + +#statement ok +#INSERT INTO t1 VALUES +#(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'), +#(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'), +#(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'), +#(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'), +#(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'), +#(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'), +#(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'), +#(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'), +#(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'), +#(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}') + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.title:energy') +#---- +#2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"} + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology') +#---- +#3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"} +#5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"} + +#query IFT +#SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术') +#---- +#6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"} +#10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"} statement error 1118 ALTER TABLE t1 DROP COLUMN body @@ -466,17 +467,17 @@ ALTER TABLE books MODIFY COLUMN title int; statement ok ALTER TABLE books MODIFY COLUMN title string not null -query TTT -SELECT name, type, definition FROM system.indexes order by name ----- -idx INVERTED t1(body)tokenizer='chinese' -idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' -idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' - -query III -select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_index', 't1') ----- -10 465 3534 +#query TTT +#SELECT name, type, definition FROM system.indexes order by name +#---- +#idx INVERTED t1(body)tokenizer='chinese' +#idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' +#idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' + +#query III +#select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_index', 't1') +#---- +#10 465 3534 statement ok use default