From 4ae64b5f295ab3c28c74e1ef2eba9abde5e0a37b Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 13:34:53 +1200 Subject: [PATCH 1/8] feat: migrate to a simplified typing method definition api --- benches/transform_benchmark.rs | 24 +++---- examples/long.rs | 4 +- examples/repl.rs | 9 +-- examples/simple.rs | 4 +- examples/telex.rs | 4 +- examples/vni.rs | 4 +- src/lib.rs | 16 +---- src/methods.rs | 106 +++++++++++++++++++++++++++++++ src/telex.rs | 112 --------------------------------- src/vni.rs | 74 ---------------------- tests/telex.rs | 2 +- tests/vni.rs | 2 +- 12 files changed, 133 insertions(+), 228 deletions(-) create mode 100644 src/methods.rs delete mode 100644 src/telex.rs delete mode 100644 src/vni.rs diff --git a/benches/transform_benchmark.rs b/benches/transform_benchmark.rs index 4c91af5..60f6b84 100644 --- a/benches/transform_benchmark.rs +++ b/benches/transform_benchmark.rs @@ -4,42 +4,42 @@ pub fn telex_benchmark(c: &mut Criterion) { c.bench_function("telex vieejt", |b| { b.iter(|| { let mut output = String::new(); - vi::telex::transform_buffer(black_box("vieejt".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("vieejt".chars()), &mut output) }) }); c.bench_function("telex ddaay", |b| { b.iter(|| { let mut output = String::new(); - vi::telex::transform_buffer(black_box("ddaay".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("ddaay".chars()), &mut output) }) }); c.bench_function("telex jjjjjjjjjjjjjj", |b| { b.iter(|| { let mut output = String::new(); - vi::telex::transform_buffer(black_box("jjjjjjjjjjjjjj".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("jjjjjjjjjjjjjj".chars()), &mut output) }) }); c.bench_function("telex jj", |b| { b.iter(|| { let mut output = String::new(); - vi::telex::transform_buffer(black_box("jj".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("jj".chars()), &mut output) }) }); c.bench_function("telex nghienge", |b| { b.iter(|| { let 
mut output = String::new(); - vi::telex::transform_buffer(black_box("nghienge".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("nghienge".chars()), &mut output) }) }); c.bench_function("telex ddaaysf", |b| { b.iter(|| { let mut output = String::new(); - vi::telex::transform_buffer(black_box("ddaaysf".chars()), &mut output) + vi::transform_buffer(&vi::TELEX, black_box("ddaaysf".chars()), &mut output) }) }); } @@ -48,42 +48,42 @@ pub fn vni_benchmark(c: &mut Criterion) { c.bench_function("vni viet65", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("viet65".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("viet65".chars()), &mut output) }) }); c.bench_function("vni day96", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("ddaay".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("ddaay".chars()), &mut output) }) }); c.bench_function("vni 1111111111111111", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("1111111111111111".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("1111111111111111".chars()), &mut output) }) }); c.bench_function("vni 11", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("11".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("11".chars()), &mut output) }) }); c.bench_function("vni nghieng6", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("nghieng6".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("nghieng6".chars()), &mut output) }) }); c.bench_function("vni day9612", |b| { b.iter(|| { let mut output = String::new(); - vi::vni::transform_buffer(black_box("day9612".chars()), &mut output) + vi::transform_buffer(&vi::VNI, black_box("day9612".chars()), &mut output) }) }); } diff --git a/examples/long.rs b/examples/long.rs index 67ab297..3826912 100644 
--- a/examples/long.rs +++ b/examples/long.rs @@ -1,6 +1,6 @@ extern crate vi; -use vi::vni; +use vi::{transform_buffer, VNI}; fn main() { let inputs = "xin chao2 toi6 la2 Hung7, toi6 den961 tu72 Viet65 Nam"; @@ -9,7 +9,7 @@ fn main() { let mut result = String::new(); for word in words { - vni::transform_buffer(word.chars(), &mut result); + transform_buffer(&VNI, word.chars(), &mut result); result.push(' '); } diff --git a/examples/repl.rs b/examples/repl.rs index 3e5d7f4..1e60aef 100644 --- a/examples/repl.rs +++ b/examples/repl.rs @@ -1,7 +1,6 @@ extern crate vi; use rustyline::DefaultEditor; -use vi::{telex, vni}; // A REPL for testing transformation result. fn main() { @@ -16,11 +15,13 @@ fn main() { let mut result = String::new(); for word in input.split_whitespace() { - if method == "telex" { - telex::transform_buffer(word.chars(), &mut result) + let definition = if method == "telex" { + &vi::TELEX } else { - vni::transform_buffer(word.chars(), &mut result) + &vi::VNI }; + + vi::transform_buffer(definition, word.chars(), &mut result); result.push(' '); } diff --git a/examples/simple.rs b/examples/simple.rs index 6c7dcbd..0ae55b9 100644 --- a/examples/simple.rs +++ b/examples/simple.rs @@ -1,13 +1,11 @@ extern crate vi; -use vi::vni; - fn main() { let inputs = vec![vec!['v', 'i', 'e', 't', '5', '6'], vec!['n', 'a', 'm']]; let mut result = String::new(); for input in inputs { - vni::transform_buffer(input.iter().cloned(), &mut result); + vi::transform_buffer(&vi::VNI, input.iter().cloned(), &mut result); result.push(' '); } diff --git a/examples/telex.rs b/examples/telex.rs index 977bc4c..80abbb5 100644 --- a/examples/telex.rs +++ b/examples/telex.rs @@ -1,7 +1,5 @@ extern crate vi; -use vi::telex; - fn main() { let inputs = "hoiwx anh tifnh yeue gioosng nhuw cais cheets nuotos trooi taats car"; @@ -9,7 +7,7 @@ fn main() { let mut result = String::new(); for word in words { - telex::transform_buffer(word.chars(), &mut result); + vi::transform_buffer(&vi::TELEX, 
word.chars(), &mut result); result.push(' '); } diff --git a/examples/vni.rs b/examples/vni.rs index eeb9808..dc69794 100644 --- a/examples/vni.rs +++ b/examples/vni.rs @@ -1,7 +1,5 @@ extern crate vi; -use vi::vni; - fn main() { let inputs = "anh se4 lam2, lam2 ta6t1 ca3 de963 d9uo75c che6t1 thay em"; @@ -9,7 +7,7 @@ fn main() { let mut result = String::new(); for word in words { - vni::transform_buffer(word.chars(), &mut result); + vi::transform_buffer(&vi::VNI, word.chars(), &mut result); result.push(' '); } diff --git a/src/lib.rs b/src/lib.rs index 8f1106a..ead5702 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,11 +14,10 @@ //! # Example //! //! ``` -//! use vi::vni; //! let inputs = vec![vec!['v', 'i', 'e', 't', '5', '6'], vec!['n', 'a', 'm']]; //! let mut result = String::new(); //! for input in inputs { -//! vni::transform_buffer(input.iter().cloned(), &mut result); +//! vi::transform_buffer(&vi::VNI, input.iter().cloned(), &mut result); //! result.push(' '); //! } //! println!("{}", result); // prints "việt nam " @@ -33,20 +32,11 @@ //! - **`w` in telex will insert `ư`:** so `chuw` or `chw` will produce `chư` pub mod editing; pub mod maps; +pub mod methods; pub mod parsing; pub mod processor; -pub mod telex; pub mod util; pub mod validation; -pub mod vni; pub mod word; -/// A result of a buffer transformation. -#[derive(Debug, Clone)] -#[allow(dead_code)] -pub struct TransformResult { - /// Indicates whether a tone mark has been removed after the transformation. - pub tone_mark_removed: bool, - /// Indicates whether a letter modification has been removed after the transformation. 
- pub letter_modification_removed: bool, -} +pub use methods::*; diff --git a/src/methods.rs b/src/methods.rs new file mode 100644 index 0000000..9556503 --- /dev/null +++ b/src/methods.rs @@ -0,0 +1,106 @@ +use phf::{phf_map, Map}; + +use crate::{ + processor::{ + add_tone, modify_letter, remove_tone, LetterModification, ToneMark, Transformation, + }, + validation::is_valid_word, + word::Word, +}; + +#[derive(Debug, PartialEq)] +pub enum Action { + AddTonemark(ToneMark), + ModifyLetter(LetterModification), + RemoveToneMark, +} + +pub type Definition = Map; + +/// A result of a buffer transformation. +#[derive(Debug, Clone)] +pub struct TransformResult { + /// Indicates whether a tone mark has been removed after the transformation. + pub tone_mark_removed: bool, + /// Indicates whether a letter modification has been removed after the transformation. + pub letter_modification_removed: bool, +} + +pub static VNI: Definition = phf_map! { + '1' => &[Action::AddTonemark(ToneMark::Acute)], + '2' => &[Action::AddTonemark(ToneMark::Grave)], + '3' => &[Action::AddTonemark(ToneMark::HookAbove)], + '4' => &[Action::AddTonemark(ToneMark::Tilde)], + '5' => &[Action::AddTonemark(ToneMark::Underdot)], + '6' => &[Action::ModifyLetter(LetterModification::Circumflex)], + '7' => &[Action::ModifyLetter(LetterModification::Horn)], + '8' => &[Action::ModifyLetter(LetterModification::Breve)], + '9' => &[Action::ModifyLetter(LetterModification::Dyet)], + '0' => &[Action::RemoveToneMark], +}; + +/// TODO: Define Telex +pub static TELEX: Definition = phf_map! {}; + +pub fn transform_buffer( + definition: &Definition, + buffer: I, + output: &mut String, +) -> TransformResult +where + I: IntoIterator, +{ + let mut word = Word::empty(); + let mut tone_mark_removed = false; + let mut letter_modification_removed = false; + + for ch in buffer { + let lowercase_ch = ch.to_ascii_lowercase(); + + // If a character is not recognised as a transformation character in definition. Skip it. 
+ if !definition.contains_key(&lowercase_ch) { + word.push(ch); + continue; + } + + let actions = definition.get(&lowercase_ch).unwrap(); + + for action in actions.iter() { + let fallback = format!("{}{}", word, ch); + + let transformation = match action { + Action::AddTonemark(tonemark) => add_tone(&mut word, tonemark), + Action::ModifyLetter(modification) => modify_letter(&mut word, modification), + Action::RemoveToneMark => remove_tone(&mut word), + }; + + if transformation == Transformation::ToneMarkRemoved { + tone_mark_removed = true; + } + + if transformation == Transformation::LetterModificationRemoved { + letter_modification_removed = true; + } + + let action_performed = match transformation { + Transformation::Ignored | Transformation::LetterModificationRemoved => false, + // If tone mark was intentionally removed with z character then it's count as an action. + Transformation::ToneMarkRemoved => *action == Action::RemoveToneMark, + _ => true, + }; + + if !action_performed { + word.push(ch); + } else if !is_valid_word(&word.to_string()) { + word.set(fallback); + } + } + } + + output.push_str(&word.to_string()); + + TransformResult { + tone_mark_removed, + letter_modification_removed, + } +} diff --git a/src/telex.rs b/src/telex.rs deleted file mode 100644 index 07feaa8..0000000 --- a/src/telex.rs +++ /dev/null @@ -1,112 +0,0 @@ -//! The telex method transformation -use crate::processor::{add_tone, modify_letter, remove_tone, Transformation}; -use crate::validation::is_valid_word; -use crate::word::Word; -use crate::TransformResult; - -use super::processor::{LetterModification, ToneMark}; - -/// Transform input buffer containing a single word to vietnamese string output using telex mode. 
-/// -/// # Example -/// ``` -/// use vi::telex::transform_buffer; -/// -/// let mut result = String::new(); -/// transform_buffer("vieetj".chars(), &mut result); -/// assert_eq!(result, "việt".to_owned()); -/// ``` -pub fn transform_buffer(buffer: I, output: &mut String) -> TransformResult -where - I: IntoIterator, -{ - let mut word = Word::empty(); - let mut ư_inserted_previously = false; - let mut tone_mark_removed = false; - let mut letter_modification_removed = false; - - for ch in buffer { - let fallback = format!("{}{}", word, ch); - let ch_lowercase = ch.to_ascii_lowercase(); - - if ch_lowercase != 'w' { - ư_inserted_previously = false; - } - - let transformation = match ch_lowercase { - 's' => add_tone(&mut word, &ToneMark::Acute), - 'f' => add_tone(&mut word, &ToneMark::Grave), - 'r' => add_tone(&mut word, &ToneMark::HookAbove), - 'x' => add_tone(&mut word, &ToneMark::Tilde), - 'j' => add_tone(&mut word, &ToneMark::Underdot), - 'z' => remove_tone(&mut word), - 'a' | 'e' | 'o' if word.vowel.to_ascii_lowercase().contains(ch_lowercase) => { - modify_letter(&mut word, &LetterModification::Circumflex) - } - 'w' if ư_inserted_previously => { - word.replace_last_char(ch); - Transformation::LetterModificationRemoved - } - 'w' => match modify_letter(&mut word, &LetterModification::Horn) { - Transformation::Ignored | Transformation::LetterModificationRemoved => { - match modify_letter(&mut word, &LetterModification::Breve) { - Transformation::Ignored | Transformation::LetterModificationRemoved => { - let transformation = - if word.vowel.is_empty() || word.to_string() == "gi" { - word.push(if ch.is_lowercase() { 'u' } else { 'U' }); - let last_index = word.len() - 1; - word.letter_modifications - .push((last_index, LetterModification::Horn)); - Transformation::LetterModificationAdded - } else { - Transformation::Ignored - }; - ư_inserted_previously = transformation != Transformation::Ignored; - transformation - } - transformation => transformation, - } - } - 
transformation => transformation, - }, - 'd' => modify_letter(&mut word, &LetterModification::Dyet), - _ => Transformation::Ignored, - }; - - if transformation == Transformation::ToneMarkRemoved { - tone_mark_removed = true; - } - - if transformation == Transformation::LetterModificationRemoved { - letter_modification_removed = true; - } - - let initial_ư_removed = Transformation::LetterModificationRemoved == transformation - && ư_inserted_previously - && word.len() == 1; - - let action_performed = match transformation { - Transformation::LetterModificationRemoved if initial_ư_removed => true, - Transformation::Ignored | Transformation::LetterModificationRemoved => false, - // If tone mark was intentionally removed with z character then it's count as an action. - Transformation::ToneMarkRemoved => ch_lowercase == 'z', - _ => true, - }; - - if !action_performed { - word.push(ch); - } else if !initial_ư_removed && !is_valid_word(&word.to_string()) { - word.set(fallback); - } - - if initial_ư_removed { - ư_inserted_previously = false; - } - } - output.push_str(&word.to_string()); - - TransformResult { - tone_mark_removed, - letter_modification_removed, - } -} diff --git a/src/vni.rs b/src/vni.rs deleted file mode 100644 index a187535..0000000 --- a/src/vni.rs +++ /dev/null @@ -1,74 +0,0 @@ -//! The vni method transformation -use crate::{ - processor::{add_tone, modify_letter, remove_tone, Transformation}, - validation::is_valid_word, - word::Word, - TransformResult, -}; - -use super::processor::{LetterModification, ToneMark}; - -/// Transform input buffer containing a single word to vietnamese string output using vni mode. 
-/// -/// # Example -/// ``` -/// use vi::vni::transform_buffer; -/// -/// let mut result = String::new(); -/// transform_buffer("viet65".chars(), &mut result); -/// assert_eq!(result, "việt".to_owned()); -/// ``` -pub fn transform_buffer(buffer: I, output: &mut String) -> TransformResult -where - I: IntoIterator, -{ - let mut word = Word::empty(); - let mut tone_mark_removed = false; - let mut letter_modification_removed = false; - - for ch in buffer { - let fallback = format!("{}{}", word, ch); - - let transformation = match ch { - '1' => add_tone(&mut word, &ToneMark::Acute), - '2' => add_tone(&mut word, &ToneMark::Grave), - '3' => add_tone(&mut word, &ToneMark::HookAbove), - '4' => add_tone(&mut word, &ToneMark::Tilde), - '5' => add_tone(&mut word, &ToneMark::Underdot), - '6' => modify_letter(&mut word, &LetterModification::Circumflex), - '7' => modify_letter(&mut word, &LetterModification::Horn), - '8' => modify_letter(&mut word, &LetterModification::Breve), - '9' => modify_letter(&mut word, &LetterModification::Dyet), - '0' => remove_tone(&mut word), - _ => Transformation::Ignored, - }; - - if transformation == Transformation::ToneMarkRemoved { - tone_mark_removed = true; - } - - if transformation == Transformation::LetterModificationRemoved { - letter_modification_removed = true; - } - - let action_performed = match transformation { - Transformation::Ignored | Transformation::LetterModificationRemoved => false, - // If tone mark was intentionally removed with 0 character then it's count as an action. 
- Transformation::ToneMarkRemoved => ch == '0', - _ => true, - }; - - if !action_performed { - word.push(ch); - } else if !is_valid_word(&word.to_string()) { - word.set(fallback); - } - } - - output.push_str(&word.to_string()); - - TransformResult { - tone_mark_removed, - letter_modification_removed, - } -} diff --git a/tests/telex.rs b/tests/telex.rs index 52a3ad3..a04fc6d 100644 --- a/tests/telex.rs +++ b/tests/telex.rs @@ -3,7 +3,7 @@ mod shared; fn snapshot_transform(lines: &str) -> String { shared::transform_lines(lines, |word| { let mut trasformed_word = String::new(); - vi::telex::transform_buffer(word.chars(), &mut trasformed_word); + vi::transform_buffer(&vi::TELEX, word.chars(), &mut trasformed_word); trasformed_word }) } diff --git a/tests/vni.rs b/tests/vni.rs index eb7ff7e..bb88c23 100644 --- a/tests/vni.rs +++ b/tests/vni.rs @@ -3,7 +3,7 @@ mod shared; fn snapshot_transform(lines: &str) -> String { shared::transform_lines(lines, |word| { let mut trasformed_word = String::new(); - vi::vni::transform_buffer(word.chars(), &mut trasformed_word); + vi::transform_buffer(&vi::VNI, word.chars(), &mut trasformed_word); trasformed_word }) } From 37abb2c0144b97869c9beaeffa31f6c23a016e31 Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 15:01:52 +1200 Subject: [PATCH 2/8] feat: added telex definition --- src/methods.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/src/methods.rs b/src/methods.rs index 9556503..7985b77 100644 --- a/src/methods.rs +++ b/src/methods.rs @@ -12,6 +12,8 @@ use crate::{ pub enum Action { AddTonemark(ToneMark), ModifyLetter(LetterModification), + ModifyLetterOnCharacterFamily(LetterModification, char), + InsertƯ, RemoveToneMark, } @@ -39,8 +41,19 @@ pub static VNI: Definition = phf_map! { '0' => &[Action::RemoveToneMark], }; -/// TODO: Define Telex -pub static TELEX: Definition = phf_map! {}; +pub static TELEX: Definition = phf_map! 
{ + 's' => &[Action::AddTonemark(ToneMark::Acute)], + 'f' => &[Action::AddTonemark(ToneMark::Grave)], + 'r' => &[Action::AddTonemark(ToneMark::HookAbove)], + 'x' => &[Action::AddTonemark(ToneMark::Tilde)], + 'j' => &[Action::AddTonemark(ToneMark::Underdot)], + 'a' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'a')], + 'e' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'e')], + 'o' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'o')], + 'w' => &[Action::ModifyLetter(LetterModification::Horn), Action::ModifyLetter(LetterModification::Breve), Action::InsertƯ], + 'd' => &[Action::ModifyLetter(LetterModification::Dyet)], + 'z' => &[Action::RemoveToneMark], +}; pub fn transform_buffer( definition: &Definition, @@ -63,17 +76,45 @@ where continue; } + let fallback = format!("{}{}", word, ch); let actions = definition.get(&lowercase_ch).unwrap(); - for action in actions.iter() { - let fallback = format!("{}{}", word, ch); + let mut action_iter = actions.iter(); + let mut action = action_iter.next().unwrap(); + loop { let transformation = match action { Action::AddTonemark(tonemark) => add_tone(&mut word, tonemark), Action::ModifyLetter(modification) => modify_letter(&mut word, modification), + Action::ModifyLetterOnCharacterFamily(modification, family_char) + if word.vowel.to_ascii_lowercase().contains(*family_char) => + { + modify_letter(&mut word, modification) + } Action::RemoveToneMark => remove_tone(&mut word), + Action::InsertƯ => { + let transformation = if word.vowel.is_empty() || word.to_string() == "gi" { + word.push(if ch.is_lowercase() { 'u' } else { 'U' }); + let last_index = word.len() - 1; + word.letter_modifications + .push((last_index, LetterModification::Horn)); + Transformation::LetterModificationAdded + } else { + Transformation::Ignored + }; + transformation + } + _ => Transformation::Ignored, }; + // If the transformation cannot be applied, try the next action if 
there's one. + if transformation == Transformation::Ignored { + if let Some(next_action) = action_iter.next() { + action = next_action; + continue; + } + } + if transformation == Transformation::ToneMarkRemoved { tone_mark_removed = true; } @@ -94,6 +135,7 @@ where } else if !is_valid_word(&word.to_string()) { word.set(fallback); } + break; } } From 46986d7302bcbcb64c62284eb1d77dfd36ea15de Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 20:07:52 +1200 Subject: [PATCH 3/8] =?UTF-8?q?bugfix:=20Fixed=20reset=20=C6=B0=20when=20w?= =?UTF-8?q?=20is=20received=20in=20telex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/methods.rs | 22 ++++++++++++++++++++-- testdata/output/telex__simple_telex.snap | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/methods.rs b/src/methods.rs index 7985b77..0ebc388 100644 --- a/src/methods.rs +++ b/src/methods.rs @@ -8,12 +8,13 @@ use crate::{ word::Word, }; -#[derive(Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq)] pub enum Action { AddTonemark(ToneMark), ModifyLetter(LetterModification), ModifyLetterOnCharacterFamily(LetterModification, char), InsertƯ, + ResetInsertedƯ, RemoveToneMark, } @@ -50,7 +51,7 @@ pub static TELEX: Definition = phf_map! 
{ 'a' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'a')], 'e' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'e')], 'o' => &[Action::ModifyLetterOnCharacterFamily(LetterModification::Circumflex, 'o')], - 'w' => &[Action::ModifyLetter(LetterModification::Horn), Action::ModifyLetter(LetterModification::Breve), Action::InsertƯ], + 'w' => &[Action::ResetInsertedƯ, Action::ModifyLetter(LetterModification::Horn), Action::ModifyLetter(LetterModification::Breve), Action::InsertƯ], 'd' => &[Action::ModifyLetter(LetterModification::Dyet)], 'z' => &[Action::RemoveToneMark], }; @@ -67,6 +68,8 @@ where let mut tone_mark_removed = false; let mut letter_modification_removed = false; + let mut last_executed_action = None; + for ch in buffer { let lowercase_ch = ch.to_ascii_lowercase(); @@ -104,6 +107,11 @@ where }; transformation } + Action::ResetInsertedƯ if matches!(last_executed_action, Some(Action::InsertƯ)) => + { + word.replace_last_char(ch); + Transformation::LetterModificationRemoved + } _ => Transformation::Ignored, }; @@ -130,10 +138,20 @@ where _ => true, }; + // If the action is to trigger reset ư insert then we don't need further processing + if *action == Action::ResetInsertedƯ { + last_executed_action = Some(action.clone()); + break; + } + if !action_performed { word.push(ch); + last_executed_action = None; } else if !is_valid_word(&word.to_string()) { word.set(fallback); + last_executed_action = None; + } else { + last_executed_action = Some(action.clone()); } break; } diff --git a/testdata/output/telex__simple_telex.snap b/testdata/output/telex__simple_telex.snap index c6af1e8..b39e2df 100644 --- a/testdata/output/telex__simple_telex.snap +++ b/testdata/output/telex__simple_telex.snap @@ -38,7 +38,7 @@ daasd aa dd chuw -chww +chw ựo chuyện quăng @@ -68,7 +68,7 @@ trường cười hường chửa -chuă +chuảwz ừa w ư From a62d1e1faeb4f2c5978b81cf437144e293cc284b Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 
2024 20:10:39 +1200 Subject: [PATCH 4/8] chore: clippy fix --- src/methods.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/methods.rs b/src/methods.rs index 0ebc388..991350d 100644 --- a/src/methods.rs +++ b/src/methods.rs @@ -96,7 +96,7 @@ where } Action::RemoveToneMark => remove_tone(&mut word), Action::InsertƯ => { - let transformation = if word.vowel.is_empty() || word.to_string() == "gi" { + if word.vowel.is_empty() || word.to_string() == "gi" { word.push(if ch.is_lowercase() { 'u' } else { 'U' }); let last_index = word.len() - 1; word.letter_modifications @@ -104,8 +104,7 @@ where Transformation::LetterModificationAdded } else { Transformation::Ignored - }; - transformation + } } Action::ResetInsertedƯ if matches!(last_executed_action, Some(Action::InsertƯ)) => { From 8fe106b3e2ba95d41b9c8eb2363b44248d1ae14c Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 20:26:45 +1200 Subject: [PATCH 5/8] misc: deprecate telex & vni instead of removing them --- src/lib.rs | 4 ++++ src/telex.rs | 19 +++++++++++++++++++ src/vni.rs | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 src/telex.rs create mode 100644 src/vni.rs diff --git a/src/lib.rs b/src/lib.rs index ead5702..b56a05b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,8 +35,12 @@ pub mod maps; pub mod methods; pub mod parsing; pub mod processor; +#[deprecated(since = "0.7.0")] +pub mod telex; pub mod util; pub mod validation; +#[deprecated(since = "0.7.0")] +pub mod vni; pub mod word; pub use methods::*; diff --git a/src/telex.rs b/src/telex.rs new file mode 100644 index 0000000..1bd44f7 --- /dev/null +++ b/src/telex.rs @@ -0,0 +1,19 @@ +use crate::TransformResult; + +/// Transform input buffer containing a single word to vietnamese string output using telex mode. 
+/// +/// # Example +/// ``` +/// use vi::telex::transform_buffer; +/// +/// let mut result = String::new(); +/// transform_buffer("vieetj".chars(), &mut result); +/// assert_eq!(result, "việt".to_owned()); +/// ``` +#[deprecated(since = "0.7.0", note = "please use `vi::transform_buffer` instead")] +pub fn transform_buffer(buffer: I, output: &mut String) -> TransformResult +where + I: IntoIterator, +{ + crate::transform_buffer(&crate::TELEX, buffer, output) +} diff --git a/src/vni.rs b/src/vni.rs new file mode 100644 index 0000000..fcad959 --- /dev/null +++ b/src/vni.rs @@ -0,0 +1,19 @@ +use crate::TransformResult; + +/// Transform input buffer containing a single word to vietnamese string output using vni mode. +/// +/// # Example +/// ``` +/// use vi::vni::transform_buffer; +/// +/// let mut result = String::new(); +/// transform_buffer("viet65".chars(), &mut result); +/// assert_eq!(result, "việt".to_owned()); +/// ``` +#[deprecated(since = "0.7.0", note = "please use `vi::transform_buffer` instead")] +pub fn transform_buffer(buffer: I, output: &mut String) -> TransformResult +where + I: IntoIterator, +{ + crate::transform_buffer(&crate::VNI, buffer, output) +} From 4774d82035fa7c7d451bccf5becc58994167f87c Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 20:30:06 +1200 Subject: [PATCH 6/8] docs: Added CHANGELOG.md --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..48b2c01 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,17 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Changed + +- `vi::telex` & `vi::vni` are deprecated & will be removed in the next release. 
Users are recommended to use `vi::methods` instead. +- `vi::telex::transform_buffer` & `vi::vni::transform_buffer` are deprecated. Users are recommended to use `vi::transform_buffer` instead. + +### Added + +- `vi::methods` module containing method definition & transforming functions. \ No newline at end of file From 8447fb0369604bf26e19b34bc7d9181fd446f1f6 Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 22:03:14 +1200 Subject: [PATCH 7/8] docs: Updated docs & added custom definition example --- examples/custom_definition.rs | 34 ++++++++++ src/editing.rs | 3 + src/lib.rs | 4 +- src/methods.rs | 116 ++++++++++++++++++++++++++++++++++ src/util.rs | 1 + 5 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 examples/custom_definition.rs diff --git a/examples/custom_definition.rs b/examples/custom_definition.rs new file mode 100644 index 0000000..26d9ee3 --- /dev/null +++ b/examples/custom_definition.rs @@ -0,0 +1,34 @@ +use phf::phf_map; +use vi::{ + processor::{LetterModification, ToneMark}, + Action, Definition, +}; + +// Custom vni method with ư short hand using the z character +const MY_VNI: Definition = phf_map! 
{ + '1' => &[Action::AddTonemark(ToneMark::Acute)], + '2' => &[Action::AddTonemark(ToneMark::Grave)], + '3' => &[Action::AddTonemark(ToneMark::HookAbove)], + '4' => &[Action::AddTonemark(ToneMark::Tilde)], + '5' => &[Action::AddTonemark(ToneMark::Underdot)], + '6' => &[Action::ModifyLetter(LetterModification::Circumflex)], + '7' => &[Action::ModifyLetter(LetterModification::Horn)], + '8' => &[Action::ModifyLetter(LetterModification::Breve)], + '9' => &[Action::ModifyLetter(LetterModification::Dyet)], + 'z' => &[Action::ResetInsertedƯ, Action::InsertƯ], + '0' => &[Action::RemoveToneMark], +}; + +fn main() { + let inputs = "Xin hay4 mo73 toang het61 nhzng4 canh1 cza3 cua3 qua1 khz1 de963 thuyen62 toi6 nzong gio1 lang4 quen6 ra khoi7"; + + let words = inputs.split(' '); + + let mut result = String::new(); + for word in words { + vi::transform_buffer(&MY_VNI, word.chars(), &mut result); + result.push(' '); + } + + println!("{}", result); // prints "Xin hãy mở toang hết những cánh cửa của quá khứ để thuyền tôi nương gió lãng quên ra khơi" +} diff --git a/src/editing.rs b/src/editing.rs index 82015c0..1b33860 100644 --- a/src/editing.rs +++ b/src/editing.rs @@ -1,3 +1,6 @@ +//! Functions used for character editing. +//! +//! These functions work directly with character & string instead of the abstract word struct. use crate::{ maps::{ ACCUTE_MAP, BREVE_MAP, CIRCUMFLEX_MAP, DOT_MAP, DYET_MAP, GRAVE_MAP, HOOK_ABOVE_MAP, diff --git a/src/lib.rs b/src/lib.rs index b56a05b..7c22303 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,8 +28,10 @@ //! VI aims to be as lean as possible, focusing on only the useful features and main use-cases. Therefore, the engine //! implemented these rules by default with no way of configuring them: //! -//! - **Tone mark are placed in the new accent:** hoà instead of hòa +//! - **Tone mark are placed in the new accent:** `hoà` instead of `hòa` //! - **`w` in telex will insert `ư`:** so `chuw` or `chw` will produce `chư` +//! +//! 
Although, should you need to customise any behaviour, you can create your custom typing methods. See: [`methods`]. pub mod editing; pub mod maps; pub mod methods; diff --git a/src/methods.rs b/src/methods.rs index 991350d..8268bdf 100644 --- a/src/methods.rs +++ b/src/methods.rs @@ -1,3 +1,53 @@ +//! The definitions of different typing methods. +//! +//! Normally, for IME developers, you only need these things from this module: +//! - [`transform_buffer`] function to transform your sequence of characters into a word using a typing definition. +//! - [`TELEX`] typing definition that you can use to pass in [`transform_buffer`] to transform character sequence using telex method. +//! - [`VNI`] typing definition that you can use to pass in [`transform_buffer`] to transform character sequence using vni method. +//! +//! ## Example +//! +//! To transform a character sequence using the VNI definition: +//! ``` +//! use vi::methods::transform_buffer; +//! +//! let mut result = String::new(); +//! transform_buffer(&vi::VNI, "viet65".chars(), &mut result); +//! assert_eq!(result, "việt".to_owned()); +//! ``` +//! +//! ## Define your own typing definition +//! +//! `vi-rs` supports some typing methods out of the box such as `telex` and `vni`. However, should users ever need to define their +//! own typing methods, they can use the existing APIs in the module. +//! +//! To define a new typing definition, you need to declare a definition map, which is a [`phf::Map`]: +//! +//! ``` +//! pub static MY_VNI: Definition = phf_map! { +//! '1' => &[Action::AddTonemark(ToneMark::Acute)], +//! '2' => &[Action::AddTonemark(ToneMark::Grave)], +//! '3' => &[Action::AddTonemark(ToneMark::HookAbove)], +//! '4' => &[Action::AddTonemark(ToneMark::Tilde)], +//! '5' => &[Action::AddTonemark(ToneMark::Underdot)], +//! '6' => &[Action::ModifyLetter(LetterModification::Circumflex)], +//! '7' => &[Action::ModifyLetter(LetterModification::Horn)], +//! 
'8' => &[Action::ModifyLetter(LetterModification::Breve)], +//! '9' => &[Action::ModifyLetter(LetterModification::Dyet)], +//! 'z' => &[Action::ResetInsertedƯ, Action::InsertƯ], +//! '0' => &[Action::RemoveToneMark], +//! }; +//! ``` +//! +//! Then you can pass that in [`transform_buffer`] as usual: +//! +//! ``` +//! use vi::methods::transform_buffer; +//! +//! let mut result = String::new(); +//! transform_buffer(MY_VNI, "chza".chars(), &mut result); +//! assert_eq!(result, "chưa".to_owned()); +//! ``` use phf::{phf_map, Map}; use crate::{ @@ -8,16 +58,41 @@ use crate::{ word::Word, }; +/// An action to be listed as part of a typing definition. #[derive(Clone, Debug, PartialEq)] pub enum Action { + /// Add a tonemark AddTonemark(ToneMark), + /// Apply letter modification where possible ModifyLetter(LetterModification), + /// Apply letter modification only if the character family exist. For example, + /// `ModifyLetterOnCharacterFamily(Circumflex, 'a')` will only apply circumflex + /// modification if `a` or any character in the `a` family (`â`, `ă`). ModifyLetterOnCharacterFamily(LetterModification, char), + /// Insert an ư character at the end of the word. InsertƯ, + /// Remove the last ư character inserted at the end of the word. **Note:** this only trigger if the last action is `InsertƯ`. ResetInsertedƯ, + /// Remove the tonemark from the word. RemoveToneMark, } +/// A definition of a typing method. +/// +/// The definition is a [`phf::Map`] with the key as the character that trigger an action and the value, +/// a list of actions that can be triggered by that character. +/// +/// If a character can trigger different actions depending on what is possible, its value will contains multiple Action. For example, +/// +/// ``` +/// pub static TELEX: Definition = phf_map! 
{ +/// 'w' => &[Action::ResetInsertedƯ, Action::ModifyLetter(LetterModification::Horn), Action::ModifyLetter(LetterModification::Breve), Action::InsertƯ], +/// } +/// ``` +/// +/// The definition above specify that `w` can trigger a `ResetInseretedƯ`, or if that doesn't work, a `ModifyLetter(LetterModification::Horn)` action +/// will be executed instead and so on, and so on, \*sniff\*. Note that as soon as one action in the list is applied, the rest of the actions +/// in the list will be ignored. pub type Definition = Map; /// A result of a buffer transformation. @@ -29,6 +104,18 @@ pub struct TransformResult { pub letter_modification_removed: bool, } +/// A definition for the VNI typing method with these configuration: +/// +/// - `1` -> Acute (thêm dấu sắc) +/// - `2` -> Grave (thêm dấu huyền) +/// - `3` -> HookAbove (thêm dấu hỏi) +/// - `4` -> Tilde (thêm dấu ngã) +/// - `5` -> Underdot (thêm dấu nặng) +/// - `6` -> Circumflex (thêm dấu ^) +/// - `7` -> Horn (thêm dấu móc cho ư hoặc ơ) +/// - `8` -> Breve (thêm dấu cho a thành ă) +/// - `9` -> Dyet (thêm dấu gạch cho d thành đ) +/// - `0` -> RemoveToneMark bỏ dấu thanh (sắc, hỏi, ngã, huyền) pub static VNI: Definition = phf_map! { '1' => &[Action::AddTonemark(ToneMark::Acute)], '2' => &[Action::AddTonemark(ToneMark::Grave)], @@ -42,6 +129,24 @@ pub static VNI: Definition = phf_map! 
{ '0' => &[Action::RemoveToneMark], }; +/// A definition for the Telex typing method with these configuration: +/// +/// - `s` -> Acute (thêm dấu sắc) +/// - `f` -> Grave (thêm dấu huyền) +/// - `r` -> HookAbove (thêm dấu hỏi) +/// - `x` -> Tilde (thêm dấu ngã) +/// - `j` -> Underdot (thêm dấu nặng) +/// - `a` -> Circumflex for a (thêm dấu ^ cho chữ a) +/// - `e` -> Circumflex for e (thêm dấu ^ cho chữ e) +/// - `o` -> Circumflex for o (thêm dấu ^ cho chữ o) +/// - `w` -> Horn for ư/ơ or Breve for a (thêm dấu móc cho ư hoặc ơ hoặc thêm dấu cho a thành ă) +/// - `d` -> Dyet (thêm dấu gạch cho d thành đ) +/// - `z` -> RemoveToneMark bỏ dấu thanh (sắc, hỏi, ngã, huyền) +/// +/// **Note:** +/// - By default `w` inserted by itself will be inserted as `ư` in the word. +/// - An `u` followed by a `w` will produce: `ư`, and if you add another `w`, it will result in `uw`. +/// - A `w` will produce `ư`, and if it's followed by a `w`, it will not produce `uw` but will replace `ư` with `w`. pub static TELEX: Definition = phf_map! { 's' => &[Action::AddTonemark(ToneMark::Acute)], 'f' => &[Action::AddTonemark(ToneMark::Grave)], @@ -56,6 +161,17 @@ pub static TELEX: Definition = phf_map! { 'z' => &[Action::RemoveToneMark], }; +/// Transform a buffer of characters using a typing method definition. +/// +/// # Example +/// +/// ``` +/// use vi::methods::transform_buffer; +/// +/// let mut result = String::new(); +/// transform_buffer(&vi::VNI, "viet65".chars(), &mut result); +/// assert_eq!(result, "việt".to_owned()); +/// ``` pub fn transform_buffer( definition: &Definition, buffer: I, diff --git a/src/util.rs b/src/util.rs index 93c29c4..a13e6b6 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,3 +1,4 @@ +//! Useful utility functions that might be helpful for developing a Vietnamese IME. use crate::maps::VOWELS; /// Strip off tone mark & modifications from an input char. 
From 163aa837f24dc5ba1d95b5082e16814930962117 Mon Sep 17 00:00:00 2001 From: ZeroX-DG Date: Mon, 3 Jun 2024 22:12:31 +1200 Subject: [PATCH 8/8] docs: fix broken docs test --- src/methods.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/methods.rs b/src/methods.rs index 8268bdf..a0117a2 100644 --- a/src/methods.rs +++ b/src/methods.rs @@ -24,6 +24,13 @@ //! To define a new typing definition, you need to declare a definition map, which is a [`phf::Map`]: //! //! ``` +//! use phf::phf_map; +//! use vi::{ +//! processor::{LetterModification, ToneMark}, +//! Action, Definition, +//! }; +//! use vi::methods::transform_buffer; +//! //! pub static MY_VNI: Definition = phf_map! { //! '1' => &[Action::AddTonemark(ToneMark::Acute)], //! '2' => &[Action::AddTonemark(ToneMark::Grave)], @@ -37,15 +44,10 @@ //! 'z' => &[Action::ResetInsertedƯ, Action::InsertƯ], //! '0' => &[Action::RemoveToneMark], //! }; -//! ``` -//! -//! Then you can pass that in [`transform_buffer`] as usual: -//! -//! ``` -//! use vi::methods::transform_buffer; //! +//! // Then you can pass that in `transform_buffer` as usual: //! let mut result = String::new(); -//! transform_buffer(MY_VNI, "chza".chars(), &mut result); +//! transform_buffer(&MY_VNI, "chza".chars(), &mut result); //! assert_eq!(result, "chưa".to_owned()); //! ``` use phf::{phf_map, Map}; @@ -85,9 +87,14 @@ pub enum Action { /// If a character can trigger different actions depending on what is possible, its value will contains multiple Action. For example, /// /// ``` +/// use phf::phf_map; +/// use vi::{ +/// processor::{LetterModification, ToneMark}, +/// Action, Definition, +/// }; /// pub static TELEX: Definition = phf_map! 
{ /// 'w' => &[Action::ResetInsertedƯ, Action::ModifyLetter(LetterModification::Horn), Action::ModifyLetter(LetterModification::Breve), Action::InsertƯ], -/// } +/// }; /// ``` /// /// The definition above specify that `w` can trigger a `ResetInseretedƯ`, or if that doesn't work, a `ModifyLetter(LetterModification::Horn)` action