From 4bd471947893bb37f8c31eea5799c9fbafaa1e95 Mon Sep 17 00:00:00 2001
From: Yuya Nishihara
Date: Mon, 25 Nov 2024 15:24:47 +0900
Subject: [PATCH] revset: parse unicode XID_CONTINUE characters as symbol

Tag and bookmark names are usually ASCII, but they occasionally include
Latin or Han characters. This doesn't fix the serialization problem, but
should mitigate #5359.
---
 lib/src/revset.pest      |  5 ++++-
 lib/src/revset_parser.rs | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/lib/src/revset.pest b/lib/src/revset.pest
index caed8d9cde..86ba50a8f7 100644
--- a/lib/src/revset.pest
+++ b/lib/src/revset.pest
@@ -14,7 +14,10 @@
 
 whitespace = _{ " " | "\t" | "\r" | "\n" | "\x0c" }
 
-identifier_part = @{ (ASCII_ALPHANUMERIC | "_" | "/")+ }
+// XID_CONTINUE: https://www.unicode.org/reports/tr31/#Default_Identifier_Syntax
+// +, -, .: often included in tag/bookmark name or version number
+// /: sometimes used as a tag/bookmark namespace separator
+identifier_part = @{ (XID_CONTINUE | "_" | "/")+ }
 identifier = @{
   identifier_part ~ (("." | "-" | "+") ~ identifier_part)*
 }
diff --git a/lib/src/revset_parser.rs b/lib/src/revset_parser.rs
index 7c3b5282d1..4d9e252c7c 100644
--- a/lib/src/revset_parser.rs
+++ b/lib/src/revset_parser.rs
@@ -1144,6 +1144,14 @@ mod tests {
 
     #[test]
     fn test_parse_identifier() {
+        // Integer is a symbol
+        assert_eq!(parse_into_kind("0"), Ok(ExpressionKind::Identifier("0")));
+        // Tag/bookmark name separated by /
+        assert_eq!(
+            parse_into_kind("foo_bar/baz"),
+            Ok(ExpressionKind::Identifier("foo_bar/baz"))
+        );
+
         // Internal '.', '-', and '+' are allowed
         assert_eq!(
             parse_into_kind("foo.bar-v1+7"),
@@ -1178,6 +1186,12 @@ mod tests {
 
         // Parse a parenthesized symbol
         assert_eq!(parse_normalized("(foo)"), parse_normalized("foo"));
+
+        // Non-ASCII tag/bookmark name
+        assert_eq!(
+            parse_into_kind("柔術+jj"),
+            Ok(ExpressionKind::Identifier("柔術+jj"))
+        );
     }
 
     #[test]
@@ -1321,6 +1335,19 @@ mod tests {
             parse_into_kind(r#""main@origin""#),
             Ok(ExpressionKind::String("main@origin".to_owned()))
         );
+
+        // Non-ASCII name
+        assert_eq!(
+            parse_into_kind("柔術@"),
+            Ok(ExpressionKind::AtWorkspace("柔術".to_owned()))
+        );
+        assert_eq!(
+            parse_into_kind("柔@術"),
+            Ok(ExpressionKind::RemoteSymbol {
+                name: "柔".to_owned(),
+                remote: "術".to_owned()
+            })
+        );
     }
 
     #[test]
@@ -1330,6 +1357,9 @@ mod tests {
         assert!(aliases_map.insert("@", "none()").is_err());
         assert!(aliases_map.insert("a@", "none()").is_err());
         assert!(aliases_map.insert("a@b", "none()").is_err());
+        // Non-ASCII character isn't allowed in alias symbol. This rule can be
+        // relaxed if needed.
+        assert!(aliases_map.insert("柔術", "none()").is_err());
     }
 
     #[test]