feat: the new parser based on winnow

reubeno · Nov 30, 2024 · cf02d3b · cf02d3b
1 parent 2e9447d
commit cf02d3b
Show file tree

Hide file tree

Showing 20 changed files with 3,604 additions and 15 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/brush-parser/Cargo.toml b/brush-parser/Cargo.toml
@@ -15,11 +15,14 @@ rust-version.workspace = true
 bench = false
 
 [features]
+# Parser tracing is opt-in (`--features debug`). Enabling `winnow/debug` by
+# default would switch on winnow's debug instrumentation for every consumer
+# of this crate, including the benchmarks that compare the two parsers.
+default = []
 fuzz-testing = ["dep:arbitrary"]
+# Forwards to winnow's `debug` feature.
+debug = ["winnow/debug"]
 
 [dependencies]
 arbitrary = { version = "1.4.1", optional = true, features = ["derive"] }
 cached = "0.54.0"
+winnow = { version = "0.6.20", features = ["simd"] }
 indenter = "0.3.3"
 peg = "0.8.4"
 thiserror = "2.0.3"

diff --git a/brush-parser/benches/parser.rs b/brush-parser/benches/parser.rs
@@ -13,15 +13,12 @@ mod unix {
         .unwrap()
     }
 
-    fn parse_sample_script() -> brush_parser::ast::Program {
-        let input = r#"
-            for f in A B C; do
-                echo "${f@L}" >&2
-            done
-    "#;
-
+    fn parse_sample_script(input: &str) -> brush_parser::ast::Program {
         parse_script(input)
     }
+    /// Parses `input` through `brush_parser::parse_program` with default parser options.
+    ///
+    /// Panics (via `unwrap`) when the parser reports an error.
+    fn parse_sample_script2(input: &str) -> brush_parser::ast::Program {
+        let options = brush_parser::ParserOptions::default();
+        brush_parser::parse_program(options, input).unwrap()
+    }
 
     fn benchmark_parsing_script(c: &mut Criterion, script_path: &std::path::Path) {
         let contents = std::fs::read_to_string(script_path).unwrap();
@@ -37,8 +34,13 @@ mod unix {
     }
 
     pub(crate) fn criterion_benchmark(c: &mut Criterion) {
+        let input = r#"
+            for f in A B C; do
+                echo "${f@L}" >&2
+            done
+    "#;
         c.bench_function("parse_sample_script", |b| {
-            b.iter(|| black_box(parse_sample_script()))
+            b.iter(|| black_box(parse_sample_script(input)))
         });
 
         const POSSIBLE_BASH_COMPLETION_SCRIPT_PATH: &str =
@@ -50,6 +52,84 @@ mod unix {
             benchmark_parsing_script(c, &well_known_complicated_script);
         }
     }
+
+    pub(crate) fn compare_parsers(c: &mut Criterion) {
+        // Only the uncached comparison is run; the cached one is disabled here: compare_parsers_cached(c);
+        compare_parsers_uncached(c);
+    }
+
+    /// Builds the sample script with `seq` embedded in the `echo` command, so that two
+    /// calls with different `seq` values never produce the same source text.
+    fn numbered_sample_script(seq: usize) -> String {
+        format!(
+            r#"
+            for f in A B C; do
+                echo {seq} "${{f@L}}" >&2
+            done
+    "#
+        )
+    }
+
+    /// Benchmarks the old and the new parser on inputs that change on every iteration.
+    ///
+    /// Each iteration parses a freshly numbered script so that a previously parsed input
+    /// is never seen again (the intent stated by the original code: "prevent caching").
+    /// The script is built in the setup closure of `iter_batched`; only the parse call
+    /// sits in the measured routine.
+    fn compare_parsers_uncached(c: &mut Criterion) {
+        let mut group = c.benchmark_group("compare_parsers");
+
+        // Each benchmark owns its counter, restarted from zero.
+        let mut seq: usize = 0;
+        group.bench_function("old_parser_uncached", |b| {
+            b.iter_batched(
+                || {
+                    seq += 1;
+                    numbered_sample_script(seq)
+                },
+                |input| black_box(parse_sample_script(input.as_str())),
+                criterion::BatchSize::SmallInput,
+            )
+        });
+
+        let mut seq: usize = 0;
+        group.bench_function("new_parser_uncached", |b| {
+            b.iter_batched(
+                || {
+                    seq += 1;
+                    numbered_sample_script(seq)
+                },
+                // Same call the original inlined here: `parse_program` with default options.
+                |input| black_box(parse_sample_script2(input.as_str())),
+                criterion::BatchSize::SmallInput,
+            )
+        });
+
+        group.finish();
+    }
+    /// Benchmarks both parsers on one fixed script that is parsed over and over.
+    ///
+    /// The old parser goes through `parse_sample_script`; the new one goes through
+    /// `brush_parser::cacheable_parse_program`. Because the input never changes, this is
+    /// presumably where result caching (if any) shows its effect -- confirm against the
+    /// parser implementations.
+    fn compare_parsers_cached(c: &mut Criterion) {
+        let input = r#"
+            for f in A B C; do
+                echo "${f@L}" >&2
+            done
+    "#;
+        let mut group = c.benchmark_group("compare_parsers_cached");
+
+        group.bench_function("old_parser_cached", |b| {
+            b.iter(|| black_box(parse_sample_script(input)))
+        });
+        group.bench_function("new_parser_cached", |b| {
+            b.iter(|| {
+                // A single `black_box` is enough to keep the result alive; the parser is
+                // handed an owned `String`, hence the copy on every iteration.
+                black_box(
+                    brush_parser::cacheable_parse_program(
+                        brush_parser::ParserOptions::default(),
+                        input.to_string(),
+                    )
+                    .unwrap(),
+                )
+            })
+        });
+        group.finish();
+    }
 }
 
 #[cfg(unix)]
@@ -58,8 +138,16 @@ criterion::criterion_group! {
     config = criterion::Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
     targets = unix::criterion_benchmark
 }
+
+// Criterion group for the old-vs-new parser comparison, profiled through pprof with
+// flamegraph output.
+#[cfg(unix)]
+criterion::criterion_group! {
+    name = compare_parsers;
+    config = criterion::Criterion::default().with_profiler(
+        pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)),
+    );
+    targets = unix::compare_parsers
+}
+
 #[cfg(unix)]
-criterion::criterion_main!(benches);
+// Run the pre-existing `benches` group as well; registering only `compare_parsers`
+// would leave that group defined but never executed.
+criterion::criterion_main!(benches, compare_parsers);
 
 #[cfg(not(unix))]
 fn main() -> () {}
diff --git a/brush-parser/src/ast.rs b/brush-parser/src/ast.rs
@@ -42,6 +42,12 @@ pub enum SeparatorOperator {
     Sequence,
 }
 
+impl Default for SeparatorOperator {
+    /// Returns the `Sequence` separator.
+    fn default() -> Self {
+        Self::Sequence
+    }
+}
+
 impl Display for SeparatorOperator {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
@@ -1031,7 +1037,7 @@ impl Display for ExtendedTestExpr {
 }
 
 /// A unary predicate usable in an extended test expression.
-#[derive(Clone, Debug)]
+#[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
 #[cfg_attr(test, derive(PartialEq, Eq))]
 pub enum UnaryPredicate {
@@ -1120,7 +1126,7 @@ impl Display for UnaryPredicate {
 }
 
 /// A binary predicate usable in an extended test expression.
-#[derive(Clone, Debug)]
+#[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
 #[cfg_attr(test, derive(PartialEq, Eq))]
 pub enum BinaryPredicate {

diff --git a/brush-parser/src/lib.rs b/brush-parser/src/lib.rs
@@ -1,5 +1,6 @@
 //! Implements a tokenizer and parsers for POSIX / bash shell syntax.
 
+#![feature(test)] // NOTE(review): `feature(..)` attributes are rejected by stable compilers, so this pins the whole crate to nightly; consider gating it (e.g. behind `cfg_attr(test, ..)`) once it is confirmed where `test` is used.
 #![deny(missing_docs)]
 
 pub mod arithmetic;
@@ -11,8 +12,10 @@ pub mod word;
 
 mod error;
 mod parser;
+mod parser2;
 mod tokenizer;
 
+pub use parser2::{parse_program, cacheable_parse_program};
 pub use error::{ParseError, TestCommandParseError, WordParseError};
 pub use parser::{parse_tokens, Parser, ParserOptions, SourceInfo};
 pub use tokenizer::{tokenize_str, unquote_str, SourcePosition, Token, TokenLocation};
diff --git a/brush-parser/src/parser.rs b/brush-parser/src/parser.rs
@@ -3,7 +3,7 @@ use crate::error;
 use crate::tokenizer::{Token, TokenEndReason, Tokenizer, TokenizerOptions, Tokens};
 
 /// Options used to control the behavior of the parser.
-#[derive(Clone, Eq, Hash, PartialEq)]
+#[derive(Debug, Clone, Eq, Hash, PartialEq)]
 pub struct ParserOptions {
     /// Whether or not to enable extended globbing (a.k.a. `extglob`).
     pub enable_extended_globbing: bool,