From 64ae1e14ac14f421041f844256dd64e24d7215e8 Mon Sep 17 00:00:00 2001 From: Ben Lubas <56943754+benlubas@users.noreply.github.com> Date: Sat, 30 Nov 2024 23:32:01 -0500 Subject: [PATCH] feat: metadata parser (#18) --- src/error.rs | 7 + src/lib.rs | 11 +- src/metadata/mod.rs | 135 +++++++++++++++++ .../rust_norg__metadata__tests__arrays.snap | 40 +++++ ...org__metadata__tests__common_metadata.snap | 45 ++++++ .../rust_norg__metadata__tests__keys.snap | 38 +++++ ...org__metadata__tests__keys_and_values.snap | 41 +++++ src/metadata/stage_1.rs | 142 ++++++++++++++++++ src/stage_2.rs | 2 +- 9 files changed, 455 insertions(+), 6 deletions(-) create mode 100644 src/metadata/mod.rs create mode 100644 src/metadata/snapshots/rust_norg__metadata__tests__arrays.snap create mode 100644 src/metadata/snapshots/rust_norg__metadata__tests__common_metadata.snap create mode 100644 src/metadata/snapshots/rust_norg__metadata__tests__keys.snap create mode 100644 src/metadata/snapshots/rust_norg__metadata__tests__keys_and_values.snap create mode 100644 src/metadata/stage_1.rs diff --git a/src/error.rs b/src/error.rs index 7418348..5ae8c81 100644 --- a/src/error.rs +++ b/src/error.rs @@ -9,6 +9,7 @@ pub enum NorgParseError { Stage2(Vec>), Stage3(Vec>), Stage4(Vec>), + Meta(Simple), } impl From>> for NorgParseError { @@ -34,3 +35,9 @@ impl From>> for NorgParseError { NorgParseError::Stage4(error) } } + +impl From> for NorgParseError { + fn from(error: Simple) -> Self { + NorgParseError::Meta(error) + } +} diff --git a/src/lib.rs b/src/lib.rs index 72b9e7a..218b4f9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,6 +10,7 @@ pub use crate::stage_3::*; pub use crate::stage_4::NorgAST; mod error; +pub mod metadata; mod stage_1; mod stage_2; mod stage_3; @@ -101,11 +102,11 @@ mod tests { * Back to regular heading ", ] - .into_iter() - .map(|example| example.to_string() + "\n") - .map(|str| parse_tree(&str)) - .try_collect() - .unwrap(); + .into_iter() + .map(|example| example.to_string() + "\n") + .map(|str| parse_tree(&str)) + .try_collect() + .unwrap(); assert_yaml_snapshot!(headings_tree_examples); } diff --git a/src/metadata/mod.rs b/src/metadata/mod.rs new file mode 100644 index 0000000..a238481 --- /dev/null +++ b/src/metadata/mod.rs @@ -0,0 +1,135 @@ +use chumsky::Parser; +pub use stage_1::NorgMeta; + +use crate::error::NorgParseError; + +pub mod stage_1; + +/// Parses the given input string to produce an AST for the metadata +pub fn parse_metadata(input: &str) -> Result { + let processed = format!("{{\n{}\n}}\n", input.trim()); + Ok(stage_1::meta_parser().parse(processed)?) +} + +#[cfg(test)] +mod tests { + use insta::assert_yaml_snapshot; + use itertools::Itertools; + + use crate::metadata::parse_metadata; + + #[test] + fn common_metadata() { + let examples: Vec<_> = [ + " + title: Sunday November 17, 2024 + description: We Cooked + authors: benlubas + categories: journal + created: 2024-11-18 + updated: 2024-11-18T17:58:21-0500 + version: 1.1.1 + ", + " + title: Neorg Extras + description: Extra lua code to configure Neorg + authors: benlubas + categories: [ + neorg + nvim + config + ] + tangle: { + languages: { + lua: ~/github/.dotfiles/nvim/lua/benlubas/neorg/extras.lua + } + delimiter: heading + } + created: 2024-05-03T13:36:42-0500 + updated: 2024-10-27T11:12:32-0500 + version: 1.1.1 + ", + ] + .into_iter() + .map(|example| example.to_string() + "\n") + .map(|str| parse_metadata(&str)) + .try_collect() + .unwrap(); + + assert_yaml_snapshot!(examples); + } + + #[test] + fn arrays() { + let examples: Vec<_> = [ + "empty_arr: [] + arr: [ + + ]", + " + categories: [ + one + two + 45 + ]", + " + arr: [ + arrays can contain everything + 5 + -5 + 6.02e27 + nil + { + x: y + a: [ + b + ] + } + [] + [ + hi + hi + ] + ]", + "arr:[]\na2:[\n]x: y", + ] + .into_iter() + .map(|example| example.to_string() + "\n") + .map(|str| parse_metadata(&str)) + .try_collect() + .unwrap(); + + assert_yaml_snapshot!(examples); + } + + #[test] + fn keys_and_values() { + let examples: Vec<_> = [ + "key: value", + "x:y", + "x :y", + "x:5", + "x:-4", + "str:-4b", + "nil:nil", + "nil:", + "still_nil: + x: y", + " + key: value with : in it + key_2: value with: in it + ", + "keys: { + in: + objects: [] + }" + ] + .into_iter() + .map(|example| example.to_string() + "\n") + .map(|str| parse_metadata(&str)) + .try_collect() + .unwrap(); + + assert_yaml_snapshot!(examples); + } +} diff --git a/src/metadata/snapshots/rust_norg__metadata__tests__arrays.snap b/src/metadata/snapshots/rust_norg__metadata__tests__arrays.snap new file mode 100644 index 0000000..3591774 --- /dev/null +++ b/src/metadata/snapshots/rust_norg__metadata__tests__arrays.snap @@ -0,0 +1,40 @@ +--- +source: src/metadata/mod.rs +expression: examples +--- +- Object: + arr: + Array: [] + empty_arr: + Array: [] +- Object: + categories: + Array: + - Str: one + - Str: two + - Num: 45 +- Object: + arr: + Array: + - Str: arrays can contain everything + - Num: 5 + - Num: -5 + - Num: 6020000000000000000000000000 + - Nil + - Object: + a: + Array: + - Str: b + x: + Str: y + - Array: [] + - Array: + - Str: hi + - Str: hi +- Object: + a2: + Array: [] + arr: + Array: [] + x: + Str: y diff --git a/src/metadata/snapshots/rust_norg__metadata__tests__common_metadata.snap b/src/metadata/snapshots/rust_norg__metadata__tests__common_metadata.snap new file mode 100644 index 0000000..a932c75 --- /dev/null +++ b/src/metadata/snapshots/rust_norg__metadata__tests__common_metadata.snap @@ -0,0 +1,45 @@ +--- +source: src/metadata/mod.rs +expression: examples +--- +- Object: + authors: + Str: benlubas + categories: + Str: journal + created: + Str: 2024-11-18 + description: + Str: We Cooked + title: + Str: "Sunday November 17, 2024" + updated: + Str: "2024-11-18T17:58:21-0500" + version: + Str: 1.1.1 +- Object: + authors: + Str: benlubas + categories: + Array: + - Str: neorg + - Str: nvim + - Str: config + created: + Str: "2024-05-03T13:36:42-0500" + description: + Str: Extra lua code to configure Neorg + tangle: + Object: + delimiter: + Str: heading + languages: + Object: + lua: + Str: ~/github/.dotfiles/nvim/lua/benlubas/neorg/extras.lua + title: + Str: Neorg Extras + updated: + Str: "2024-10-27T11:12:32-0500" + version: + Str: 1.1.1 diff --git a/src/metadata/snapshots/rust_norg__metadata__tests__keys.snap b/src/metadata/snapshots/rust_norg__metadata__tests__keys.snap new file mode 100644 index 0000000..7f6390d --- /dev/null +++ b/src/metadata/snapshots/rust_norg__metadata__tests__keys.snap @@ -0,0 +1,38 @@ +--- +source: src/metadata/mod.rs +expression: examples +--- +- Object: + key: + Str: value +- Object: + x: + Str: y +- Object: + x: + Num: 5 +- Object: + x: + Num: -4 +- Object: + str: + Str: "-4b" +- Object: + nil: Nil +- Object: + nil: Nil +- Object: + still_nil: Nil + x: + Str: y +- Object: + key: + Str: "value with : in it" + key_2: + Str: "value with: in it" +- Object: + keys: + Object: + in: Nil + objects: + Array: [] diff --git a/src/metadata/snapshots/rust_norg__metadata__tests__keys_and_values.snap b/src/metadata/snapshots/rust_norg__metadata__tests__keys_and_values.snap new file mode 100644 index 0000000..eed039b --- /dev/null +++ b/src/metadata/snapshots/rust_norg__metadata__tests__keys_and_values.snap @@ -0,0 +1,41 @@ +--- +source: src/metadata/mod.rs +expression: examples +--- +- Object: + key: + Str: value +- Object: + x: + Str: y +- Object: + x: + Str: y +- Object: + x: + Num: 5 +- Object: + x: + Num: -4 +- Object: + str: + Str: "-4b" +- Object: + nil: Nil +- Object: + nil: Nil +- Object: + still_nil: Nil + x: + Str: y +- Object: + key: + Str: "value with : in it" + key_2: + Str: "value with: in it" +- Object: + keys: + Object: + in: Nil + objects: + Array: [] diff --git a/src/metadata/stage_1.rs b/src/metadata/stage_1.rs new file mode 100644 index 0000000..122e7ff --- /dev/null +++ b/src/metadata/stage_1.rs @@ -0,0 +1,142 @@ +use chumsky::prelude::*; +use serde::Serialize; +use std::collections::BTreeMap; +use text::TextParser; + +#[derive(Clone, Debug, Serialize)] +pub enum NorgMeta { + Invalid, + Nil, + Bool(bool), + Str(String), + EmptyKey(String), + Num(f64), + Array(Vec), + Object(BTreeMap), +} + +const SPECIAL: &str = "{}[]:\n"; + +pub fn meta_parser() -> impl Parser> { + recursive(|value| { + let frac = just('.').chain(text::digits(10)); + + let exp = just('e') + .or(just('E')) + .chain(just('+').or(just('-')).or_not()) + .chain::(text::digits(10)); + + let number = just(' ') + .repeated() + .ignore_then(just('-').or_not()) + .chain::(text::int(10)) + .chain::(frac.or_not().flatten()) + .chain::(exp.or_not().flatten()) + .then_ignore(just('\n').rewind()) + .collect::() + .from_str() + .unwrapped() + .labelled("number"); + + let escape = just('\\').ignore_then( + just('\\') + .or(just('/')) + .or(one_of(SPECIAL)) + .or(just('b').to('\x08')) + .or(just('f').to('\x0C')) + .or(just('n').to('\n')) + .or(just('r').to('\r')) + .or(just('t').to('\t')) + .or(just('u').ignore_then( + filter(|c: &char| c.is_ascii_hexdigit()) + .repeated() + .exactly(4) + .collect::() + .validate(|digits, span, emit| { + char::from_u32(u32::from_str_radix(&digits, 16).unwrap()) + .unwrap_or_else(|| { + emit(Simple::custom(span, "invalid unicode character")); + '\u{FFFD}' // unicode replacement character + }) + }), + )), + ); + + let string = none_of("{}[]\n") + .or(escape.clone()) + .repeated() + .at_least(1) + .try_map(|x, span| { + let binding = x.clone().into_iter().collect::(); + let s = binding.trim(); + if s.is_empty() { + Err(Simple::custom( + span, + format!("strings can't be all whitespace, got {x:?}"), + )) + } else { + Ok(s.to_string()) + } + }) + .map(|s| match &s[..] { + "true" => NorgMeta::Bool(true), + "false" => NorgMeta::Bool(false), + "nil" => NorgMeta::Nil, + _ => NorgMeta::Str(s), + }); + + let key = none_of(SPECIAL) + .repeated() + .at_least(1) + .then_ignore(just(':').then(one_of(" \t").repeated())) + .collect::() + .map(|s| s.trim().to_string()) + .labelled("key"); + + let array = value + .clone() + .separated_by(just('\n')) + .allow_trailing() + .padded() + .delimited_by(just('[').padded(), just(']').ignored()) + .map(NorgMeta::Array) + .labelled("array"); + + let empty_array = empty() + .padded() + .delimited_by(just('[').padded(), just(']')) + .to(NorgMeta::Array(vec![])); + + let property = key + .then_ignore(one_of(" \t").repeated()) + .then(value.or(empty().to(NorgMeta::Nil))) + .then_ignore(just('\n').or_not()) + .labelled("property"); + + let object = property + .clone() + .then_ignore(just('\n').or_not()) + .repeated() + .padded() + .collect() + .delimited_by(just('{').padded(), just('}').ignored()) + .map(NorgMeta::Object) + .labelled("object"); + + choice(( + number.map(NorgMeta::Num), + empty_array, + array, + object, + string, + )) + .recover_with(nested_delimiters('{', '}', [('[', ']')], |_| { + NorgMeta::Invalid + })) + .recover_with(nested_delimiters('[', ']', [('{', '}')], |_| { + NorgMeta::Invalid + })) + .recover_with(skip_then_retry_until(['}', ']'])) + }) + .then_ignore(end().padded().recover_with(skip_then_retry_until([]))) +} diff --git a/src/stage_2.rs b/src/stage_2.rs index 09b68ef..05fc41f 100644 --- a/src/stage_2.rs +++ b/src/stage_2.rs @@ -57,7 +57,7 @@ fn tokens_to_paragraph_segment(tokens: Vec) -> ParagraphTokenList { Some(ParagraphSegmentToken::Text(result)) } None => None, - x => { + _x => { unreachable!(); } })