From b91ca5f4c891fba4fdc370d60e88d6d25e7fa546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Tue, 7 Jan 2025 16:50:39 +0100 Subject: [PATCH 1/6] Added support for European and Brazilian Portuguese. --- Cargo.toml | 4 +- src/digit_string.rs | 6 + src/lang/mod.rs | 9 +- src/lang/pt/mod.rs | 441 ++++++++++++++++++++++++++++++++++++++ src/lang/pt/vocabulary.rs | 6 + 5 files changed, 463 insertions(+), 3 deletions(-) create mode 100644 src/lang/pt/mod.rs create mode 100644 src/lang/pt/vocabulary.rs diff --git a/Cargo.toml b/Cargo.toml index daa18a1..1e15a62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,10 @@ [package] name = "text2num" -version = "2.5.2" +version = "2.6.0" authors = ["Allo-Media "] edition = "2021" license = "MIT" -description = "Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation." +description = "Parse and convert numbers written in English, Dutch, Spanish, Portuguese, German, Italian or French into their digit representation." keywords = ["NLP", "words-to-numbers"] categories = ["text-processing"] repository = "https://github.com/allo-media/text2num-rs" diff --git a/src/digit_string.rs b/src/digit_string.rs index 7cc5968..dcb7f4c 100644 --- a/src/digit_string.rs +++ b/src/digit_string.rs @@ -169,10 +169,16 @@ impl DigitString { position > max_pos || self.buffer[max_pos - position] == b'0' } + /// check strict emptiness, that is nothing, not event leading zeroes. 
pub fn is_empty(&self) -> bool { self.buffer.is_empty() && self.leading_zeroes == 0 } + /// check for emptiness or only leading zeroes + pub fn is_null(&self) -> bool { + self.buffer.is_empty() + } + pub fn len(&self) -> usize { self.buffer.len() + self.leading_zeroes } diff --git a/src/lang/mod.rs b/src/lang/mod.rs index 1b1cc0b..4010094 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -28,6 +28,7 @@ mod es; mod fr; mod it; mod nl; +mod pt; use crate::digit_string::DigitString; @@ -39,6 +40,7 @@ pub use es::Spanish; pub use fr::French; pub use it::Italian; pub use nl::Dutch; +pub use pt::Portuguese; pub trait BasicAnnotate { fn text_lowercase(&self) -> &str; @@ -136,6 +138,7 @@ pub enum Language { Italian(Italian), Spanish(Spanish), Dutch(Dutch), + Portuguese(Portuguese), } impl Language { @@ -162,6 +165,10 @@ impl Language { pub fn dutch() -> Self { Language::Dutch(Dutch::default()) } + + pub fn portuguese() -> Self { + Language::Portuguese(Portuguese::default()) + } } macro_rules! delegate { @@ -230,5 +237,5 @@ macro_rules! delegate { } impl LangInterpretor for Language { - delegate!(Dutch, French, English, German, Italian, Spanish); + delegate!(Dutch, French, English, German, Italian, Spanish, Portuguese); } diff --git a/src/lang/pt/mod.rs b/src/lang/pt/mod.rs new file mode 100644 index 0000000..b1a58d2 --- /dev/null +++ b/src/lang/pt/mod.rs @@ -0,0 +1,441 @@ +//! Spanish number interpretor +//! Sources: +//! - https://www.practiceportuguese.com/ +//! - http://www.portugaisfacile.fr/cours-pour-les-debutants/compter-en-portugais-les-nombres/ +//! - https://www.dicio.com.br/como-escrever-numeros-por-extenso/ +//! 
- https://exoportugais.blogspot.com/2012/12/nombres-ordinaux-en-portugais.html + +use bitflags::bitflags; + +use crate::digit_string::DigitString; +use crate::error::Error; + +mod vocabulary; + +use super::{LangInterpretor, MorphologicalMarker}; +use vocabulary::INSIGNIFICANT; + +#[derive(Default)] +pub struct Portuguese {} + +impl Portuguese { + pub fn new() -> Self { + Default::default() + } +} + +bitflags! { + /// word chaining restrictions + struct Restriction: u64 { + const CONJUNCTION = 1; + const ONLY_MULTIPLIERS = 2; + } +} + +/// pseud lemmatizer +fn lemmatize(word: &str) -> &str { + if word.ends_with('a') { + word.trim_end_matches('a') + } else if word.ends_with("as") && word != "duas" { + word.trim_end_matches("as") + } else if word.ends_with('o') && word != "zero" { + word.trim_end_matches('o') + } else if word.ends_with("os") { + word.trim_end_matches("os") + } else { + word + } +} + +impl LangInterpretor for Portuguese { + fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { + let num_marker = self.get_morph_marker(num_func); + if !b.is_empty() && num_marker != b.marker { + return Err(Error::Overlap); + } + let restrictions = Restriction::from_bits_truncate(b.flags); + let only_multipliers = restrictions.contains(Restriction::ONLY_MULTIPLIERS); + let smaller_blocked = only_multipliers + || !restrictions.contains(Restriction::CONJUNCTION) + && num_marker.is_none() + && !b.is_free(4); + let mut next_restrictions = Restriction::empty(); + let status = match lemmatize(num_func) { + "zero" => b.put(b"0"), + "um" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"1"), + "primeir" => b.put(b"1"), + "dois" | "duas" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"2"), + "segund" => b.put(b"2"), + "três" | "tres" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"3"), + "terceir" => b.put(b"3"), + "quatr" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"4"), + "quart" => b.put(b"4"), + "cinc" if b.peek(2) != b"10" && 
!smaller_blocked => b.put(b"5"), + "quint" => b.put(b"5"), + "seis" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"6"), + "sext" => b.put(b"6"), + "sete" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"7"), + "sétim" => b.put(b"7"), + "oit" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"8"), + "oitav" => b.put(b"8"), + "nove" if b.peek(2) != b"10" && !smaller_blocked => b.put(b"9"), + "non" if !smaller_blocked => b.put(b"9"), + "dez" | "décim" if !smaller_blocked => b.put(b"10"), + "onze" if !smaller_blocked => b.put(b"11"), + "doze" if !smaller_blocked => b.put(b"12"), + "treze" if !smaller_blocked => b.put(b"13"), + "catorze" | "quatorze" if !smaller_blocked => b.put(b"14"), + "quinze" if !smaller_blocked => b.put(b"15"), + "dezasseis" | "dezesseis" if !smaller_blocked => b.put(b"16"), + "dezassete" | "dezessete" if !smaller_blocked => b.put(b"17"), + "dezoit" if !smaller_blocked => b.put(b"18"), + "dezanove" | "dezenove" if !smaller_blocked => b.put(b"19"), + "vinte" | "vigésim" if !smaller_blocked => b.put(b"20"), + "trint" | "trigésim" if !smaller_blocked => b.put(b"30"), + "quarent" | "quadragésim" if !smaller_blocked => b.put(b"40"), + "cinquent" | "cinqüent" | "quinquagésim" | "qüinquagésim" if !smaller_blocked => { + b.put(b"50") + } + "sessent" | "sexagésim" if !smaller_blocked => b.put(b"60"), + "setent" | "septuagésim" | "setuagésim" if !smaller_blocked => b.put(b"70"), + "oitent" | "octogésim" if !smaller_blocked => b.put(b"80"), + "novent" | "nonagésim" if !smaller_blocked => b.put(b"90"), + "cem" if !only_multipliers => { + next_restrictions = Restriction::ONLY_MULTIPLIERS; + b.put(b"100") + } + "cent" | "centésim" if !only_multipliers => b.put(b"100"), + "duzent" | "ducentésim" if !only_multipliers => b.put(b"200"), + "trezent" | "trecentésim" if !only_multipliers => b.put(b"300"), + "quatrocent" | "quadringentésim" if !only_multipliers => b.put(b"400"), + "quinhent" | "quingentésim" | "qüingentésim" if !only_multipliers => 
b.put(b"500"), + "seiscent" | "sexcentésim" | "seiscentésim" if !only_multipliers => b.put(b"600"), + "setecent" | "septingentésim" if !only_multipliers => b.put(b"700"), + "oitocent" | "octingentésim" if !only_multipliers => b.put(b"800"), + "novecent" | "noningentésim" | "nongentésim" if !only_multipliers => b.put(b"900"), + "mil" | "milésim" + if b.is_range_free(3, 5) && (only_multipliers || b.peek(3) != b"100") => + { + let peek = b.peek(2); + if peek == b"1" { + Err(Error::Overlap) + } else { + b.shift(3) + } + } + "milhã" | "milhões" | "milionésim" if b.is_range_free(6, 8) => b.shift(6), + "bilhã" | "biliã" | "bilhões" | "biliões" | "bilionésim" => b.shift(9), + "e" if b.len() >= 2 && b.marker.is_none() && !only_multipliers => { + Err(Error::Incomplete) + } + + _ => Err(Error::NaN), + }; + match status { + Ok(()) => { + b.marker = num_marker; + b.flags = next_restrictions.bits(); + } + Err(Error::Incomplete) => { + b.flags = Restriction::CONJUNCTION.bits(); + } + _ => { + b.flags = 0; + } + } + status + } + + fn apply_decimal(&self, decimal_func: &str, b: &mut DigitString) -> Result<(), Error> { + self.apply(decimal_func, b) + } + + fn get_morph_marker(&self, word: &str) -> MorphologicalMarker { + let lemma = lemmatize(word); + let prob_marker = if word.ends_with('a') { + MorphologicalMarker::Ordinal("ª") + } else if word.ends_with("as") { + MorphologicalMarker::Ordinal("ᵃˢ") + } else if word.ends_with('o') { + MorphologicalMarker::Ordinal("º") + } else if word.ends_with("os") { + MorphologicalMarker::Ordinal("ᵒˢ") + } else { + return MorphologicalMarker::None; + }; + match lemma { + "primeir" | "segund" | "terceir" | "quart" | "quint" | "sext" | "sétim" | "oitav" + | "non" => prob_marker, + ord if ord.ends_with("im") => prob_marker, + _ => MorphologicalMarker::None, + } + } + fn is_decimal_sep(&self, word: &str) -> bool { + word == "vírgula" + } + + fn format_and_value(&self, b: &DigitString) -> (String, f64) { + let repr = b.to_string(); + let val = 
repr.parse().unwrap(); + if let MorphologicalMarker::Ordinal(marker) = b.marker { + (format!("{}{}", b.to_string(), marker), val) + } else { + (repr, val) + } + } + + fn format_decimal_and_value(&self, int: &DigitString, dec: &DigitString) -> (String, f64) { + let sint = int.to_string(); + let sdec = dec.to_string(); + let val = format!("{sint}.{sdec}").parse().unwrap(); + (format!("{sint},{sdec}"), val) + } + + fn is_linking(&self, word: &str) -> bool { + INSIGNIFICANT.contains(word) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::word_to_digit::{replace_numbers_in_text, text2digits}; + + macro_rules! assert_text2digits { + ($text:expr, $res:expr) => { + let f = Portuguese {}; + let res = text2digits($text, &f); + dbg!(&res); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), $res) + }; + } + + macro_rules! assert_replace_numbers { + ($text:expr, $res:expr) => { + let f = Portuguese {}; + assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res) + }; + } + + macro_rules! assert_replace_all_numbers { + ($text:expr, $res:expr) => { + let f = Portuguese {}; + assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res) + }; + } + + macro_rules! assert_invalid { + ($text:expr) => { + let f = Portuguese {}; + let res = text2digits($text, &f); + assert!(res.is_err()); + }; + } + + // Most of the test are ported (and corrected) from the python version of text2num. 
+ + #[test] + fn test_apply() { + assert_text2digits!("zero", "0"); + assert_text2digits!("um", "1"); + assert_text2digits!("oito", "8"); + assert_text2digits!("dez", "10"); + assert_text2digits!("onze", "11"); + assert_text2digits!("dezanove", "19"); + assert_text2digits!("vinte", "20"); + assert_text2digits!("vinte e um", "21"); + assert_text2digits!("trinta", "30"); + assert_text2digits!("trinta e um", "31"); + assert_text2digits!("trinta e dois", "32"); + assert_text2digits!("trinta e três", "33"); + assert_text2digits!("trinta e nove", "39"); + assert_text2digits!("noventa e nove", "99"); + assert_text2digits!("cem", "100"); + assert_text2digits!("cento e um", "101"); + assert_text2digits!("duzentos", "200"); + assert_text2digits!("duzentos e um", "201"); + assert_text2digits!("mil", "1000"); + assert_text2digits!("mil e um", "1001"); + assert_text2digits!("dois mil", "2000"); + assert_text2digits!("dois mil e noventa e nove", "2099"); + assert_text2digits!("nove mil novecentos e noventa e nove", "9999"); + assert_text2digits!( + "novecentos e noventa e nove mil novecentos e noventa e nove", + "999999" + ); + assert_text2digits!("cinquenta e três mil e vinte milhões duzentos e quarenta e três mil setecentos e vinte e quatro", "53020243724"); + assert_text2digits!( + "cinquenta e um milhões quinhentos e setenta e oito mil trezentos e dois", + "51578302" + ); + assert_text2digits!("mil trezentos e vinte e cinco", "1325"); + assert_text2digits!("cem mil", "100000"); + assert_text2digits!("mil e duzentos", "1200"); + } + + #[test] + fn test_invalid() { + assert_invalid!("mil mil duzentos"); + assert_invalid!("sessenta quinze"); + assert_invalid!("sessenta cem"); + assert_invalid!("sessenta quatro"); + assert_invalid!("cem e um"); + assert_invalid!("cento mil"); + } + + #[test] + fn test_zeroes() { + assert_text2digits!("zero", "0"); + assert_text2digits!("zero oito", "08"); + assert_text2digits!("zero um", "01"); + assert_text2digits!("zero uma", "01"); + 
assert_text2digits!("zero zero cento e vinte e cinco", "00125"); + assert_invalid!("cinco zero"); + assert_invalid!("cinquenta zero três"); + assert_invalid!("cinquenta e zero três"); + assert_invalid!("cinquenta e zero"); + assert_invalid!("cinquenta e três zero"); + assert_invalid!("dez zero"); + } + + #[test] + fn test_ordinals() { + assert_text2digits!("vigésimo quarto", "24º"); + assert_text2digits!("vigésimo primeiro", "21º"); + assert_text2digits!("centésimo primeiro", "101º"); + assert_text2digits!("décima sexta", "16ª"); + assert_text2digits!("décimas sextas", "16ᵃˢ"); + assert_text2digits!("décimos sextos", "16ᵒˢ"); + } + + #[test] + fn test_replace_numbers_integers() { + assert_replace_numbers!( + "vinte e cinco vacas, doze galinhas e cento e vinte e cinco kg de batatas.", + "25 vacas, 12 galinhas e 125 kg de batatas." + ); + assert_replace_numbers!("mil duzentos e sessenta e seis dólares.", "1266 dólares."); + assert_replace_numbers!("um dois três quatro vinte quinze.", "1 2 3 4 20 15."); + assert_replace_numbers!( + "um, dois, três, quatro, vinte, quinze.", + "1, 2, 3, 4, 20, 15." 
+ ); + assert_replace_numbers!("um dois três quatro trinta e cinco.", "1 2 3 4 35."); + assert_replace_numbers!("vinte e um, trinta e um.", "21, 31."); + assert_replace_numbers!("trinta e quatro ≠ trinta quatro", "34 ≠ 30 4"); + assert_replace_numbers!("cem e dois", "100 e 2"); + } + + #[test] + fn test_replace_numbers_formal() { + assert_replace_numbers!( + "trinta e três nove sessenta zero seis doze vinte e um", + "33 9 60 06 12 21" + ); + assert_replace_numbers!( + "zero nove sessenta zero seis doze vinte e um", + "09 60 06 12 21" + ); + } + + #[test] + fn test_replace_numbers_that_use_conjunction() { + assert_replace_numbers!("sessenta seis", "60 6"); + assert_replace_numbers!("sessenta e seis", "66"); + assert_replace_numbers!("duzentos e quarenta e quatro", "244"); + assert_replace_numbers!("dois mil e vinte", "2020"); + assert_replace_numbers!("mil novecentos e oitenta e quatro", "1984"); + assert_replace_numbers!("mil e novecentos", "1900"); + // assert_replace_numbers!( + // "mil novecentos", + // "1000 900" + // ); + assert_replace_numbers!("dois mil cento e vinte e cinco", "2125"); + assert_replace_numbers!( + "Trezentos e setenta e oito milhões vinte e sete mil trezentos e doze", + "378027312" + ); + } + + #[test] + fn test_replace_numbers_zero() { + assert_replace_numbers!("treze mil zero noventa", "13000 090"); + } + + #[test] + fn test_replace_numbers_decimals() { + assert_replace_numbers!( + "doze vírgula noventa e nove, cento e vinte vírgula zero cinco, um vírgula duzentos e trinta e seis, um vírgula dois três seis.", + "12,99, 120,05, 1,236, 1,2 3 6." + ); + assert_replace_numbers!("vírgula quinze", "vírgula 15"); + assert_replace_numbers!("zero vírgula quinze", "0,15"); + assert_replace_numbers!("zero vírgula cinco", "0,5"); + assert_replace_numbers!("um vírgula um", "1,1"); + assert_replace_numbers!("um vírgula quatrocentos e um", "1,401"); + } + + #[test] + fn test_replace_numbers_article() { + assert_replace_numbers!( + "Um momento por favor! 
trinta e um gatos. Um dois três quatro!", + "Um momento por favor! 31 gatos. 1 2 3 4!" + ); + assert_replace_numbers!("Nem um. Um um. Trinta e um", "Nem um. 1 1. 31"); + } + + #[test] + fn test_replace_numbers_second_as_time_unit_vs_ordinal() { + assert_replace_numbers!( + "Um segundo por favor! Vigésimo segundo é diferente de vinte segundos.", + "Um segundo por favor! 22º é diferente de 20 segundos." + ); + } + + #[test] + fn test_replace_numbers_ordinals() { + assert_replace_numbers!( + "Ordinais: primeiro, quinto, terceiro, vigésima, vigésimo primeiro, centésimo quadragésimo quinto", + "Ordinais: 1º, 5º, 3º, 20ª, 21º, 145º" + ); + assert_replace_numbers!( + "A décima quarta brigada do exército português, juntamento com o nonagésimo sexto regimento britânico, bateu o centésimo vigésimo sétimo regimento de infantaria de Napoleão", + "A 14ª brigada do exército português, juntamento com o 96º regimento britânico, bateu o 127º regimento de infantaria de Napoleão" + ); + } + + #[test] + fn test_brazilian_variants() { + assert_replace_numbers!("catorze", "14"); + assert_replace_numbers!("mil quatrocentos e catorze", "1414"); + assert_replace_numbers!( + "em mil quinhentos e catorze, ela nasceu", + "em 1514, ela nasceu" + ); + assert_replace_numbers!("dezesseis", "16"); + assert_replace_numbers!("mil seiscentos e dezesseis", "1616"); + assert_replace_numbers!( + "tudo aconteceu até mil novecentos e dezesseis", + "tudo aconteceu até 1916" + ); + assert_replace_numbers!("dezessete", "17"); + assert_replace_numbers!("mil setecentos e dezessete", "1717"); + assert_replace_numbers!( + "em dezessete de janeiro de mil novecentos e noventa", + "em 17 de janeiro de 1990" + ); + assert_replace_numbers!("dezenove", "19"); + assert_replace_numbers!("mil novecentos e dezenove", "1919"); + assert_replace_numbers!( + "quanto é dezenove menos três? É dezesseis", + "quanto é 19 menos 3? 
É 16" + ); + assert_replace_numbers!("um milhão quatrocentos e trinta e três", "1000433"); + assert_replace_numbers!( + "dois milhões oitocentos e quarenta e quatro mil trezentos e trinta e três", + "2844333" + ); + assert_text2digits!("cinquenta e três bilhões e vinte milhões duzentos e quarenta e três mil setecentos e vinte e quatro", "53020243724"); + } +} diff --git a/src/lang/pt/vocabulary.rs b/src/lang/pt/vocabulary.rs new file mode 100644 index 0000000..f3fb49b --- /dev/null +++ b/src/lang/pt/vocabulary.rs @@ -0,0 +1,6 @@ +use phf::{phf_set, Set}; + +pub static INSIGNIFICANT: Set<&'static str> = phf_set! { + "eh", "então", "bem", "isso", "outra vez", "e", "uh", "ha", "ah", "hu", "um", "menos", "ok", "sim", "mais", "aí está", + "digo", "ou", "seja", "aquele", "é", "aquilo", "em", "fim", "mais tarde", "mas", "ei", "agora", "hum", "não", "com", "são", "novamente" +}; From c94a38265e08369864db931ae90bd4e6413034d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Tue, 7 Jan 2025 16:53:41 +0100 Subject: [PATCH 2/6] Clippy. 
--- src/lang/de/mod.rs | 4 +++- src/lang/fr/mod.rs | 1 + src/lang/pt/mod.rs | 8 -------- src/tokenizer.rs | 2 +- src/word_to_digit.rs | 2 +- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/lang/de/mod.rs b/src/lang/de/mod.rs index d4ca677..53ffe81 100644 --- a/src/lang/de/mod.rs +++ b/src/lang/de/mod.rs @@ -21,7 +21,7 @@ fn lemmatize(word: &str) -> &str { || word.ends_with("ten") || word.ends_with("tem") { - word.trim_end_matches(&['s', 'n', 'm', 'r']) + word.trim_end_matches(['s', 'n', 'm', 'r']) } else { word } @@ -337,6 +337,7 @@ mod tests { assert_invalid!("zwanzig zweitausend"); assert_invalid!("eine und zwanzig"); assert_invalid!("eins und zwanzig"); + assert_invalid!("neun zwanzig"); } #[test] @@ -379,6 +380,7 @@ mod tests { ); assert_replace_numbers!("Einhundert und Ende", "100 und Ende"); assert_replace_numbers!("Einhundert und und", "100 und und"); + assert_replace_numbers!("neun zwanzig", "9 20"); } #[test] diff --git a/src/lang/fr/mod.rs b/src/lang/fr/mod.rs index 574e00e..dee509d 100644 --- a/src/lang/fr/mod.rs +++ b/src/lang/fr/mod.rs @@ -493,6 +493,7 @@ mod tests { assert_replace_numbers!("cinq cent premiers", "500 premiers"); assert_replace_numbers!("cinq cent premier", "500 premier"); assert_replace_all_numbers!("une seconde", "une seconde"); + assert_replace_numbers!("vingt-cinquième et trentième", "25ème et 30ème"); } #[test] diff --git a/src/lang/pt/mod.rs b/src/lang/pt/mod.rs index b1a58d2..5639f19 100644 --- a/src/lang/pt/mod.rs +++ b/src/lang/pt/mod.rs @@ -218,13 +218,6 @@ mod tests { }; } - macro_rules! assert_replace_all_numbers { - ($text:expr, $res:expr) => { - let f = Portuguese {}; - assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res) - }; - } - macro_rules! 
assert_invalid { ($text:expr) => { let f = Portuguese {}; @@ -237,7 +230,6 @@ mod tests { #[test] fn test_apply() { - assert_text2digits!("zero", "0"); assert_text2digits!("um", "1"); assert_text2digits!("oito", "8"); assert_text2digits!("dez", "10"); diff --git a/src/tokenizer.rs b/src/tokenizer.rs index dfdfda1..ebf14ad 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -79,7 +79,7 @@ impl<'a> Tokenize<'a> { } } -impl<'a> Iterator for Tokenize<'a> { +impl Iterator for Tokenize<'_> { type Item = BasicToken; fn next(&mut self) -> Option { diff --git a/src/word_to_digit.rs b/src/word_to_digit.rs index bc2f924..d228634 100644 --- a/src/word_to_digit.rs +++ b/src/word_to_digit.rs @@ -428,7 +428,7 @@ where } } -impl<'a, L, T, I> Iterator for FindNumbers<'a, L, T, I> +impl Iterator for FindNumbers<'_, L, T, I> where L: LangInterpretor, T: Token, From a08136ba134991150d72bbcc32a7a3826cce8fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Tue, 7 Jan 2025 17:04:10 +0100 Subject: [PATCH 3/6] Typo in comment. --- src/lang/pt/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lang/pt/mod.rs b/src/lang/pt/mod.rs index 5639f19..d1299da 100644 --- a/src/lang/pt/mod.rs +++ b/src/lang/pt/mod.rs @@ -32,7 +32,7 @@ bitflags! 
{ } } -/// pseud lemmatizer +/// pseudo lemmatizer fn lemmatize(word: &str) -> &str { if word.ends_with('a') { word.trim_end_matches('a') From c80fac910bd3a1fd33c970cccb96fbff513d895e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Thu, 9 Jan 2025 16:06:35 +0100 Subject: [PATCH 4/6] LangInterpretor -> LangInterpreter --- src/lang/de/mod.rs | 4 ++-- src/lang/en/mod.rs | 4 ++-- src/lang/es/mod.rs | 4 ++-- src/lang/fr/mod.rs | 4 ++-- src/lang/it/mod.rs | 4 ++-- src/lang/mod.rs | 4 ++-- src/lang/nl/mod.rs | 4 ++-- src/lang/pt/mod.rs | 4 ++-- src/lib.rs | 4 ++-- src/word_to_digit.rs | 24 ++++++++++++------------ 10 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/lang/de/mod.rs b/src/lang/de/mod.rs index 53ffe81..060b371 100644 --- a/src/lang/de/mod.rs +++ b/src/lang/de/mod.rs @@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter; mod vocabulary; -use super::{LangInterpretor, MorphologicalMarker}; +use super::{LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; fn lemmatize(word: &str) -> &str { @@ -68,7 +68,7 @@ impl German { } } -impl LangInterpretor for German { +impl LangInterpreter for German { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { // In German, numbers are compounded to form a group let lemma = lemmatize(num_func); diff --git a/src/lang/en/mod.rs b/src/lang/en/mod.rs index 50c77bb..041e13e 100644 --- a/src/lang/en/mod.rs +++ b/src/lang/en/mod.rs @@ -5,7 +5,7 @@ use crate::error::Error; mod vocabulary; -use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker}; +use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; fn lemmatize(word: &str) -> &str { @@ -26,7 +26,7 @@ impl English { } } -impl LangInterpretor for English { +impl LangInterpreter for English { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { // In English, numbers can be compounded to form a group with "-" if 
num_func.contains('-') { diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs index 3a584ee..622f70f 100644 --- a/src/lang/es/mod.rs +++ b/src/lang/es/mod.rs @@ -4,7 +4,7 @@ use crate::error::Error; mod vocabulary; -use super::{LangInterpretor, MorphologicalMarker}; +use super::{LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; fn lemmatize(word: &str) -> &str { @@ -27,7 +27,7 @@ impl Spanish { } } -impl LangInterpretor for Spanish { +impl LangInterpreter for Spanish { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { let num_marker = self.get_morph_marker(num_func); if !b.is_empty() && num_marker != b.marker && !num_marker.is_fraction() { diff --git a/src/lang/fr/mod.rs b/src/lang/fr/mod.rs index dee509d..4f192ef 100644 --- a/src/lang/fr/mod.rs +++ b/src/lang/fr/mod.rs @@ -8,7 +8,7 @@ use crate::error::Error; mod vocabulary; -use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker}; +use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; fn lemmatize(word: &str) -> &str { @@ -43,7 +43,7 @@ bitflags! 
{ } } -impl LangInterpretor for French { +impl LangInterpreter for French { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { // In French, numbers can be compounded to form a group with "-" if num_func.contains('-') { diff --git a/src/lang/it/mod.rs b/src/lang/it/mod.rs index 8513d5a..c281a59 100644 --- a/src/lang/it/mod.rs +++ b/src/lang/it/mod.rs @@ -6,7 +6,7 @@ use crate::tokenizer::WordSplitter; mod vocabulary; -use super::{LangInterpretor, MorphologicalMarker}; +use super::{LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; pub struct Italian { @@ -76,7 +76,7 @@ impl Italian { } } -impl LangInterpretor for Italian { +impl LangInterpreter for Italian { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { let lemma = lemmatize(num_func); if self.word_splitter.is_splittable(lemma) { diff --git a/src/lang/mod.rs b/src/lang/mod.rs index 4010094..c97d190 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -75,7 +75,7 @@ impl MorphologicalMarker { /// /// All methods must be implemented except the [`exec_group`](Self::exec_group), which comes with a default implementation. /// Self must be stateless. -pub trait LangInterpretor { +pub trait LangInterpreter { /// Interpret the word `num_func`, that may be part of a larger sequence. /// /// `num_func` is interpreted by calling the appropriate methods on `b`. @@ -236,6 +236,6 @@ macro_rules! delegate { }; } -impl LangInterpretor for Language { +impl LangInterpreter for Language { delegate!(Dutch, French, English, German, Italian, Spanish, Portuguese); } diff --git a/src/lang/nl/mod.rs b/src/lang/nl/mod.rs index 7f60761..1b1e6a0 100644 --- a/src/lang/nl/mod.rs +++ b/src/lang/nl/mod.rs @@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter; mod vocabulary; -use super::{LangInterpretor, MorphologicalMarker}; +use super::{LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; bitflags! 
{ @@ -83,7 +83,7 @@ impl Dutch { } } -impl LangInterpretor for Dutch { +impl LangInterpreter for Dutch { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { // In Dutch, numbers are compounded to form a group if self.word_splitter.is_splittable(num_func) { diff --git a/src/lang/pt/mod.rs b/src/lang/pt/mod.rs index d1299da..51cdacf 100644 --- a/src/lang/pt/mod.rs +++ b/src/lang/pt/mod.rs @@ -12,7 +12,7 @@ use crate::error::Error; mod vocabulary; -use super::{LangInterpretor, MorphologicalMarker}; +use super::{LangInterpreter, MorphologicalMarker}; use vocabulary::INSIGNIFICANT; #[derive(Default)] @@ -47,7 +47,7 @@ fn lemmatize(word: &str) -> &str { } } -impl LangInterpretor for Portuguese { +impl LangInterpreter for Portuguese { fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> { let num_marker = self.get_morph_marker(num_func); if !b.is_empty() && num_marker != b.marker { diff --git a/src/lib.rs b/src/lib.rs index a9285e5..8d6b907 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -213,14 +213,14 @@ pub mod lang; mod tokenizer; pub mod word_to_digit; -pub use lang::{BasicAnnotate, LangInterpretor, Language}; +pub use lang::{BasicAnnotate, LangInterpreter, Language}; pub use word_to_digit::{ find_numbers, find_numbers_iter, replace_numbers_in_stream, replace_numbers_in_text, text2digits, Occurence, Replace, Token, }; /// Get an interpreter for the language represented by the `language_code` ISO code. 
-pub fn get_interpretor_for(language_code: &str) -> Option { +pub fn get_interpreter_for(language_code: &str) -> Option { match language_code { "de" => Some(Language::german()), "en" => Some(Language::english()), diff --git a/src/word_to_digit.rs b/src/word_to_digit.rs index d228634..630d124 100644 --- a/src/word_to_digit.rs +++ b/src/word_to_digit.rs @@ -9,17 +9,17 @@ use std::iter::Enumerate; use crate::digit_string::DigitString; use crate::error::Error; -use crate::lang::{BasicAnnotate, LangInterpretor}; +use crate::lang::{BasicAnnotate, LangInterpreter}; use crate::tokenizer::{tokenize, BasicToken}; -struct WordToDigitParser<'a, T: LangInterpretor> { +struct WordToDigitParser<'a, T: LangInterpreter> { int_part: DigitString, dec_part: DigitString, is_dec: bool, lang: &'a T, } -impl<'a, T: LangInterpretor> WordToDigitParser<'a, T> { +impl<'a, T: LangInterpreter> WordToDigitParser<'a, T> { pub fn new(lang: &'a T) -> Self { Self { int_part: DigitString::new(), @@ -77,7 +77,7 @@ impl<'a, T: LangInterpretor> WordToDigitParser<'a, T> { /// Interpret the `text` as a integer number or ordinal, and translate it into digits. /// Return an error if the text couldn't be undestood as a valid number. -pub fn text2digits(text: &str, lang: &T) -> Result { +pub fn text2digits(text: &str, lang: &T) -> Result { match lang.exec_group(text.to_lowercase().split_whitespace()) { Ok(ds) => Ok(lang.format_and_value(&ds).0), Err(err) => Err(err), @@ -324,7 +324,7 @@ impl NumTracker { /// It lazily consumes the token stream. 
pub struct FindNumbers<'a, L, T, I> where - L: LangInterpretor, + L: LangInterpreter, T: Token, I: Iterator, { @@ -338,7 +338,7 @@ where impl<'a, L, T, I> FindNumbers<'a, L, T, I> where - L: LangInterpretor, + L: LangInterpreter, T: Token, I: Iterator, { @@ -430,7 +430,7 @@ where impl Iterator for FindNumbers<'_, L, T, I> where - L: LangInterpretor, + L: LangInterpreter, T: Token, I: Iterator, { @@ -453,7 +453,7 @@ where /// Find spelled numbers (including decimal numbers) in the input token stream. /// Isolated digits strictly under `threshold` are not converted (set to 0.0 to convert everything). -fn track_numbers>( +fn track_numbers>( input: I, lang: &L, threshold: f64, @@ -470,7 +470,7 @@ The `threshold` drives the *lone number* policy: if a number is isolated — tha surrounded by significant non-number words — and lower than `threshold`, then it is ignored. */ -pub fn find_numbers>( +pub fn find_numbers>( input: I, lang: &L, threshold: f64, @@ -492,7 +492,7 @@ pub fn find_numbers_iter( threshold: f64, ) -> FindNumbers<'_, L, T, Enumerate> where - L: LangInterpretor, + L: LangInterpreter, T: Token, I: Iterator, { @@ -503,7 +503,7 @@ where /// Isolated digits strictly under `threshold` are not converted (set to 0.0 to convert everything). pub fn replace_numbers_in_stream<'a, L, T>(mut input: Vec, lang: &L, threshold: f64) -> Vec where - L: LangInterpretor, + L: LangInterpreter, T: Replace + 'a, for<'b> &'b T: Token, { @@ -514,7 +514,7 @@ where /// Find spelled numbers (including decimal) in the `text` and replace them by their digit representation. /// Isolated digits strictly under `threshold` are not converted (set to 0.0 to convert everything). 
-pub fn replace_numbers_in_text(text: &str, lang: &L, threshold: f64) -> String { +pub fn replace_numbers_in_text(text: &str, lang: &L, threshold: f64) -> String { let mut tokens = tokenize(text).collect(); lang.basic_annotate(&mut tokens); let out = replace_numbers_in_stream(tokens, lang, threshold); From 2bc129675bb3d274085350f5368cb3f479665bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Thu, 9 Jan 2025 16:15:00 +0100 Subject: [PATCH 5/6] More typos. --- src/digit_string.rs | 2 +- src/lang/de/mod.rs | 4 ++-- src/lang/en/mod.rs | 2 +- src/lang/es/mod.rs | 2 +- src/lang/fr/mod.rs | 2 +- src/lang/it/mod.rs | 2 +- src/lang/mod.rs | 4 ++-- src/lang/nl/mod.rs | 4 ++-- src/lang/pt/mod.rs | 2 +- src/lib.rs | 4 ++-- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/digit_string.rs b/src/digit_string.rs index dcb7f4c..2716c7e 100644 --- a/src/digit_string.rs +++ b/src/digit_string.rs @@ -169,7 +169,7 @@ impl DigitString { position > max_pos || self.buffer[max_pos - position] == b'0' } - /// check strict emptiness, that is nothing, not event leading zeroes. + /// check strict emptiness, that is nothing, not even leading zeroes. pub fn is_empty(&self) -> bool { self.buffer.is_empty() && self.leading_zeroes == 0 } diff --git a/src/lang/de/mod.rs b/src/lang/de/mod.rs index 060b371..d9d53f3 100644 --- a/src/lang/de/mod.rs +++ b/src/lang/de/mod.rs @@ -1,6 +1,6 @@ -//! German number interpretor +//! German number interpreter //! -//! This interpretor is tolerant and accepts splitted words, that is "ein und zwanzig" is treated like "einundzwanzig", as +//! This interpreter is tolerant and accepts splitted words, that is "ein und zwanzig" is treated like "einundzwanzig", as //! the main application, Speech-to-text recognition, may introduce spurious spaces. 
use bitflags::bitflags; diff --git a/src/lang/en/mod.rs b/src/lang/en/mod.rs index 041e13e..067ab50 100644 --- a/src/lang/en/mod.rs +++ b/src/lang/en/mod.rs @@ -1,4 +1,4 @@ -//! English number interpretor +//! English number interpreter use crate::digit_string::DigitString; use crate::error::Error; diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs index 622f70f..e3a0163 100644 --- a/src/lang/es/mod.rs +++ b/src/lang/es/mod.rs @@ -1,4 +1,4 @@ -//! Spanish number interpretor +//! Spanish number interpreter use crate::digit_string::DigitString; use crate::error::Error; diff --git a/src/lang/fr/mod.rs b/src/lang/fr/mod.rs index 4f192ef..d9314aa 100644 --- a/src/lang/fr/mod.rs +++ b/src/lang/fr/mod.rs @@ -1,4 +1,4 @@ -//! French number interpretor. +//! French number interpreter. //! //! It supports regional variants. use bitflags::bitflags; diff --git a/src/lang/it/mod.rs b/src/lang/it/mod.rs index c281a59..17b30c1 100644 --- a/src/lang/it/mod.rs +++ b/src/lang/it/mod.rs @@ -1,4 +1,4 @@ -//! Italian number interpretor +//! Italian number interpreter use crate::digit_string::DigitString; use crate::error::Error; diff --git a/src/lang/mod.rs b/src/lang/mod.rs index c97d190..33e14e6 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -11,11 +11,11 @@ a subset of the language that is "simple" and consistent enough to be interprete A number expressed in words is then seen as a little program whose interpretation result is either a sequence of digits, if the number is valid, or an error. -The common runtime for all interpretors is the [`DigitString`]. It provided the memory +The common runtime for all interpreters is the [`DigitString`]. It provided the memory and the elementary functions to build a number in base 10 (even if the language to be interpreted counts otherwise). The `DigitString` is responsible for checking the validity of the constructed number at each step (i.e at each method call). 
-The intepretor part, which is specific to each language, is built by implementing the `LangInterpretor` trait, which +The interpreter part, which is specific to each language, is built by implementing the `LangInterpreter` trait, which translate each number word into a sequence of elementary instructions on a `DigitString`. A language is just an empty (stateless) type. Everything is provided by implementating the trait. diff --git a/src/lang/nl/mod.rs b/src/lang/nl/mod.rs index 1b1e6a0..cdae15c 100644 --- a/src/lang/nl/mod.rs +++ b/src/lang/nl/mod.rs @@ -1,6 +1,6 @@ -//! Dutch number interpretor +//! Dutch number interpreter //! -//! This interpretor is tolerant and accepts splitted words, that is "negen en zeventig" is treated like "negenenzeventig", as +//! This interpreter is tolerant and accepts splitted words, that is "negen en zeventig" is treated like "negenenzeventig", as //! the main application, Speech-to-text recognition, may introduce spurious spaces. use bitflags::bitflags; diff --git a/src/lang/pt/mod.rs b/src/lang/pt/mod.rs index 51cdacf..b92879e 100644 --- a/src/lang/pt/mod.rs +++ b/src/lang/pt/mod.rs @@ -1,4 +1,4 @@ -//! Spanish number interpretor +//! Portuguese number interpreter //! Sources: //! - https://www.practiceportuguese.com/ //! - http://www.portugaisfacile.fr/cours-pour-les-debutants/compter-en-portugais-les-nombres/ diff --git a/src/lib.rs b/src/lib.rs index 8d6b907..9d3e2b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,7 @@ you can easily switch languages at runtime. Each builtin language support regional varieties automatically, so you don't need to specify a region. -The language interpretors are stateless so you can reuse and share them. +The language interpreters are stateless so you can reuse and share them. ```rust use text2num::{Language, text2digits}; @@ -53,7 +53,7 @@ match text2digits(utterance, &es) { When run, the above code should print `'ochenta y cinco' means 85 in Spanish` on the standard output.
-If you don't need to dynamically switch languages, you can directly use the appropriate interpretor instead of +If you don't need to dynamically switch languages, you can directly use the appropriate interpreter instead of the `Language` type: ``` From 686ac20c4c8f6603e3be232a4816c5ba7230a5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romuald=20Texier-Marcad=C3=A9?= Date: Fri, 10 Jan 2025 16:46:34 +0100 Subject: [PATCH 6/6] Updated README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1283d21..b4dc9be 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation. +# Parse and convert numbers written in English, Dutch, Spanish, Portuguese (Europe & Brazil), German, Italian or French into their digit representation. This crate provides a library for recognizing, parsing and transcribing into digits (base 10) numbers expressed in natural language.