Refactored FTSClause and TSQuery structures to improve parsing logic and support for complex queries.
sachaarbonel · Jan 14, 2025 · 0e32379
1 parent e87d1fb
commit 0e32379
Show file tree

Hide file tree

Showing 10 changed files with 312 additions and 205 deletions.
diff --git a/README.md b/README.md
@@ -88,12 +88,22 @@ fn main() {
 
 ### Critical for Production (Highest Priority)
 
-#### Real-time Index Management
-- [x] Real-time index updates with ACID compliance
-- [x] Index consistency verification
-- [x] Atomic index operations
-- [x] Index recovery mechanisms
-- [ ] Concurrent index access
+#### Query Analysis & Optimization
+- [ ] Query Analyzer Framework
+  - [ ] Cost-based query planning
+  - [ ] Statistics collection and management
+  - [ ] Index usage analysis
+  - [ ] Join order optimization
+  - [ ] Query rewriting
+- [ ] Query Plan Visualization
+  - [ ] Visual execution plan representation
+  - [ ] Cost breakdown analysis
+  - [ ] Performance bottleneck identification
+- [ ] Statistics Management
+  - [ ] Table statistics (row counts, size)
+  - [ ] Column statistics (cardinality, distribution)
+  - [ ] Index statistics (size, depth, usage)
+  - [ ] Automatic statistics updates
 
 #### Query Processing Essentials
 - [ ] Basic aggregate functions (COUNT, SUM)
@@ -103,21 +113,24 @@ fn main() {
 - [ ] Query timeout mechanism
 
 #### Core Performance Features
-- [ ] Memory-mapped storage
+- [x] Memory-mapped storage
+  - [x] Memory-mapped file handling
+  - [x] Basic persistence
+  - [x] Concurrent access support
   - [ ] Page-level operations
   - [ ] Buffer management
-  - [ ] Memory-mapped file handling
   - [ ] Crash recovery
-- [ ] Basic query optimization
-  - [ ] Statistics-based planning
-  - [ ] Index usage optimization
-  - [ ] Join order optimization
+  - [ ] Dynamic file resizing
+  - [ ] Memory-mapped index support
 - [ ] Index compression
 - [ ] Parallel query execution
 
 #### Monitoring & Diagnostics Essentials
-- [ ] Basic query metrics
-- [ ] Index usage statistics
+- [ ] Query Performance Metrics
+  - [ ] Execution time tracking
+  - [ ] Resource usage monitoring
+  - [ ] Query plan effectiveness
+  - [ ] Index usage statistics
 - [ ] Transaction monitoring
 - [ ] Error logging and tracing
 
@@ -127,9 +140,11 @@ fn main() {
 - [ ] Multi-column indexes
 - [ ] Hash indexes for equality comparisons
 - [ ] Bitmap indexes for low-cardinality columns
-- [ ] Cost-based optimizer
-- [ ] Query plan visualization
 - [ ] Incremental indexing
+- [ ] Index maintenance optimization
+  - [ ] Background index rebuilding
+  - [ ] Index fragmentation analysis
+  - [ ] Automatic index suggestions
 
 #### Additional JOIN Support
 - [ ] RIGHT JOIN
@@ -142,13 +157,9 @@ fn main() {
 #### Advanced Query Processing
 - [ ] Additional aggregate functions (AVG, MIN, MAX)
 - [ ] GROUP BY and HAVING clauses
-
-#### Monitoring & Diagnostics
-- [ ] Index statistics
-- [ ] Query explanation
-- [ ] Performance metrics
-- [ ] Index health checks
-- [ ] Query profiling
+- [ ] Window functions
+- [ ] Common Table Expressions (CTEs)
+- [ ] Subquery optimization
 
 #### Full-text Search Enhancements
 - [ ] Advanced Index Types
@@ -171,6 +182,24 @@ fn main() {
   - [ ] Result ranking with `ts_rank`
   - [ ] Text highlighting with `ts_headline`
 
+- [ ] Faceted Search
+  - [ ] Hierarchical facets
+  - [ ] Dynamic facet counting
+  - [ ] Custom facet ordering
+  - [ ] Multi-value facets
+
+- [ ] Enhanced Scoring & Ranking
+  - [ ] Configurable scoring algorithms
+  - [ ] Score explanation
+  - [ ] Custom boosting factors
+  - [ ] Field-weight customization
+  - [ ] Position-based scoring
+
+- [ ] Search Quality
+  - [ ] Highlighting with snippets
+  - [ ] Relevance tuning tools
+  - [ ] Search quality metrics
+
 #### Vector Search Capabilities
 - [ ] Vector Data Types and Operations
   - [ ] VECTOR(dimensions) data type
@@ -200,63 +229,44 @@ fn main() {
   - [ ] Incremental index updates
   - [ ] Cache-friendly layouts
 
-#### Advanced Text Processing
-- [ ] Multiple analyzer support
-- [ ] Custom token filters
-- [ ] Token position tracking
-- [ ] SIMD-accelerated processing
-- [ ] Phonetic matching
-- [ ] Configurable tokenization pipelines
-
-- [ ] CJK (Chinese, Japanese, Korean) Support
-  - [ ] Character-based tokenization
-  - [ ] N-gram tokenization
-  - [ ] Dictionary-based word segmentation
-  - [ ] Language-specific stop words
-  - [ ] Unicode normalization
-  - [ ] Ideograph handling
-  - [ ] Reading/pronunciation support
-    - [ ] Pinyin for Chinese
-    - [ ] Hiragana/Katakana for Japanese
-    - [ ] Hangul/Hanja for Korean
-  - [ ] Mixed script handling
-  - [ ] CJK-specific scoring adjustments
-  - [ ] Compound word processing
-  - [ ] Character variant normalization
-
-- [ ] Faceted Search
-  - [ ] Hierarchical facets
-  - [ ] Dynamic facet counting
-  - [ ] Custom facet ordering
-  - [ ] Multi-value facets
-
-- [ ] Enhanced Scoring & Ranking
-  - [ ] Configurable scoring algorithms
-  - [ ] Score explanation
-  - [ ] Custom boosting factors
-  - [ ] Field-weight customization
-  - [ ] Position-based scoring
-
-- [ ] Search Quality
-  - [ ] Highlighting with snippets
-  - [ ] Relevance tuning tools
-  - [ ] Search quality metrics
-
 ### Medium Priority
 
+#### Query Plan Management
+- [ ] Plan caching
+- [ ] Adaptive query execution
+- [ ] Runtime statistics collection
+- [ ] Dynamic plan adjustment
+- [ ] Materialized view suggestions
+
 #### Constraint System
 - [ ] UNIQUE constraints
 - [ ] CHECK constraints
 - [ ] NOT NULL constraints
 - [ ] DEFAULT values
 - [ ] Enhanced FOREIGN KEY support with ON DELETE/UPDATE actions
 
-#### Query Features
-- [ ] Subqueries
-- [ ] Common Table Expressions (CTEs)
+#### Advanced Features
 - [ ] Views
 - [ ] Stored procedures
 - [ ] User-defined functions
+- [ ] Triggers
+- [ ] Materialized views
+
+#### CJK (Chinese, Japanese, Korean) Support
+- [ ] Character-based tokenization
+- [ ] N-gram tokenization
+- [ ] Dictionary-based word segmentation
+- [ ] Language-specific stop words
+- [ ] Unicode normalization
+- [ ] Ideograph handling
+- [ ] Reading/pronunciation support
+  - [ ] Pinyin for Chinese
+  - [ ] Hiragana/Katakana for Japanese
+  - [ ] Hangul/Hanja for Korean
+- [ ] Mixed script handling
+- [ ] CJK-specific scoring adjustments
+- [ ] Compound word processing
+- [ ] Character variant normalization
 
 ### Lower Priority
 
@@ -284,13 +294,6 @@ fn main() {
 - [ ] Distributed transactions
 - [ ] Failover support
 
-#### Performance Optimization
-- [ ] Query result cache
-- [ ] Index page cache
-- [ ] Buffer pool management
-- [ ] Connection pooling
-- [ ] Query plan optimization
-
 #### Developer Experience
 - [ ] Command-line interface
 - [ ] Web-based admin interface

diff --git a/src/fts/text_processor.rs b/src/fts/text_processor.rs
@@ -2,6 +2,15 @@ use std::collections::HashSet;
 use serde::{Deserialize, Serialize};
 use crate::sql::clauses::full_text_search::weight::TextWeight;
 
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum QueryOperator {
+    And,
+    Or,
+    Not,
+    Phrase(Vec<Token>),
+    Proximity(Vec<Token>, usize),
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Token {
     pub text: String,
@@ -73,15 +82,6 @@ pub struct ProcessedQuery {
     pub operators: Vec<QueryOperator>,
 }
 
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub enum QueryOperator {
-    And,
-    Or,
-    Not,
-    Phrase(Vec<Token>),
-    Proximity(Vec<Token>, usize),
-}
-
 pub trait TextProcessor: Send + Sync {
     fn process_document(&self, text: &str, language: Option<&str>) -> ProcessedDocument;
     fn process_query(&self, query: &str, language: Option<&str>) -> ProcessedQuery;

diff --git a/src/fts/text_processor_impl.rs b/src/fts/text_processor_impl.rs
@@ -1,13 +1,11 @@
 use std::collections::HashMap;
 use std::fmt;
-use std::any::Any;
 use super::{
     text_processor::{ProcessedQuery, Token, TokenType, TsVector, QueryOperator},
     language::{LanguageProcessor, english::EnglishProcessor, LanguageConfig},
 };
 use serde::{Serialize, Deserialize};
-use crate::sql::clauses::full_text_search::query::{TSQuery, QueryOperator as SqlQueryOperator, QueryType};
-use super::text_processor::QueryOperator as FtsQueryOperator;
+use crate::sql::clauses::full_text_search::{TSQuery, QueryOperator as SqlQueryOperator, QueryType};
 
 #[derive(Serialize, Deserialize)]
 struct SerializedProcessor {

diff --git a/src/sql/clauses/full_text_search/clause.rs b/src/sql/clauses/full_text_search/clause.rs
@@ -1,21 +1,19 @@
+use super::{Language, QueryType, TSQuery};
+use crate::sql::column::Column;
+use crate::sql::clauses::full_text_search::weight::TextWeight;
 use nom::{
     IResult,
     bytes::complete::{tag, tag_no_case, take_until},
-    character::complete::{multispace0, multispace1, anychar},
+    character::complete::{multispace0, anychar},
     sequence::{tuple, delimited},
     combinator::opt,
 };
-
-use crate::sql::column::Column;
 use crate::sql::operators::op::Op;
-use super::{Language, TSQuery, QueryType, TSRanking, weight::TextWeight};
-use crate::fts::text_processor::TsVector;
 
-#[derive(Debug, PartialEq, Clone)]
+#[derive(Debug, Clone, PartialEq)]
 pub struct FTSClause {
     pub column: Column,
     pub query: TSQuery,
-    pub ranking: Option<TSRanking>,
     pub weight: Option<TextWeight>,
 }
 
@@ -31,7 +29,6 @@ impl FTSClause {
         Self {
             column,
             query: TSQuery::new(query_text).with_type(query_type),
-            ranking: None,
             weight: None,
         }
     }
@@ -46,11 +43,6 @@ impl FTSClause {
         self
     }
 
-    pub fn with_ranking(mut self, ranking: TSRanking) -> Self {
-        self.ranking = Some(ranking);
-        self
-    }
-
     pub fn with_weight(mut self, weight: TextWeight) -> Self {
         self.weight = Some(weight);
         self

diff --git a/src/sql/clauses/full_text_search/mod.rs b/src/sql/clauses/full_text_search/mod.rs
@@ -1,10 +1,13 @@
-mod clause;
+pub mod operator;
 pub mod query;
-mod language;
-mod ranking;
+pub mod term;
+pub mod types;
+pub mod clause;
 pub mod weight;
 
+pub use operator::QueryOperator;
+pub use query::{TSQuery, ParsedTSQuery};
+pub use term::ParsedTerm;
+pub use types::{QueryType, Language, ParseError};
 pub use clause::FTSClause;
-pub use query::{TSQuery, QueryType};
-pub use language::Language;
-pub use ranking::TSRanking; 
+pub use weight::TextWeight; 
diff --git a/src/sql/clauses/full_text_search/operator.rs b/src/sql/clauses/full_text_search/operator.rs
@@ -0,0 +1,45 @@
+#[derive(Debug, Clone, PartialEq)]
+pub enum QueryOperator {
+    And,
+    Or,
+    Not,
+}
+
+impl QueryOperator {
+    pub fn from_str(s: &str) -> Option<Self> {
+        match s.to_uppercase().as_str() {
+            "AND" => Some(QueryOperator::And),
+            "OR" => Some(QueryOperator::Or),
+            "NOT" => Some(QueryOperator::Not),
+            _ => None,
+        }
+    }
+
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            QueryOperator::And => "AND",
+            QueryOperator::Or => "OR",
+            QueryOperator::Not => "NOT",
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_operator_from_str() {
+        assert_eq!(QueryOperator::from_str("AND"), Some(QueryOperator::And));
+        assert_eq!(QueryOperator::from_str("OR"), Some(QueryOperator::Or));
+        assert_eq!(QueryOperator::from_str("NOT"), Some(QueryOperator::Not));
+        assert_eq!(QueryOperator::from_str("INVALID"), None);
+    }
+
+    #[test]
+    fn test_operator_as_str() {
+        assert_eq!(QueryOperator::And.as_str(), "AND");
+        assert_eq!(QueryOperator::Or.as_str(), "OR");
+        assert_eq!(QueryOperator::Not.as_str(), "NOT");
+    }
+}