Skip to content

Commit

Permalink
Add a Sentence struct, replace Vec<Token> with Sentence where possible (bminixhofer#54)
Browse files Browse the repository at this point in the history

* replace Vec<Token> with new Sentence struct where possible (+ with IncompleteSentence for Vec<IncompleteToken>)

* separate match sentence and match graph, reduce dependents on the tokenizer

* fix missing SENT_START special case, debug impls for WordId, PosId

* make MatchSentence private, docs

* use new Span struct for byte and char ranges

* fix PartialOrd impl on Position, get_token_str -> get_token_ranges
  • Loading branch information
bminixhofer authored and drahnr committed Apr 7, 2021
1 parent 05ef51c commit ea63b98
Show file tree
Hide file tree
Showing 23 changed files with 1,235 additions and 628 deletions.
2 changes: 1 addition & 1 deletion build/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub enum Error {
#[error("Failed to validate {1:?} binary for lang {0}")]
ValidationFailed(String, Binary, #[source] nlprule::Error),
#[error(transparent)]
IOError(#[from] io::Error),
IoError(#[from] io::Error),
#[error(transparent)]
ZipError(#[from] ZipError),
#[error("error postprocessing binaries: {0}")]
Expand Down
12 changes: 10 additions & 2 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ fst = "0.4"
fs-err = "2.5"
aho-corasick = "0.7"
half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.2", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"

Expand Down Expand Up @@ -70,7 +70,15 @@ regex-all-test = ["regex-onig", "regex-fancy"]

# needed for the bin test targets and to compile nlprule binaries, you'll usually not need these
bin = ["clap", "env_logger"]
compile = ["regex-syntax", "serde-xml-rs", "xml-rs", "roxmltree", "serde_json", "srx/from_xml", "regex-all-test"]
compile = [
"regex-syntax",
"serde-xml-rs",
"xml-rs",
"roxmltree",
"serde_json",
"srx/from_xml",
"regex-all-test",
]

[[bin]]
name = "compile"
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/bin/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ fn main() {

let tokens = tokenizer.pipe(&opts.text);

println!("Tokens: {:#?}", tokens);
println!("Tokens: {:#?}", tokens.collect::<Vec<_>>());
println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
}
16 changes: 7 additions & 9 deletions nlprule/src/compile/impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ use std::{

use crate::{
rule::{
disambiguation::POSFilter,
disambiguation::PosFilter,
engine::{
composition::{GraphId, Matcher, PosMatcher, TextMatcher},
Engine,
},
id::Category,
DisambiguationRule, MatchGraph, Rule,
DisambiguationRule, Rule,
},
rules::{Rules, RulesLangOptions, RulesOptions},
tokenizer::{
Expand Down Expand Up @@ -167,6 +167,7 @@ impl Tagger {
word_store,
tag_store,
lang_options,
..Default::default()
})
}
}
Expand Down Expand Up @@ -203,8 +204,6 @@ impl MultiwordTagger {

impl TextMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let graph = MatchGraph::default();

// can not cache a matcher that depends on the graph
let set = if matcher.graph_id().is_some() {
None
Expand All @@ -223,7 +222,7 @@ impl TextMatcher {
let set: DefaultHashSet<_> = data
.into_maybe_par_iter()
.filter_map(|(word, id)| {
if matcher.is_match(word.as_str(), &graph, None) {
if matcher.is_match(word.as_str(), None, None) {
Some(*id)
} else {
None
Expand All @@ -249,10 +248,9 @@ impl TextMatcher {
impl PosMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let mut mask = vec![false; info.tagger().tag_store().len()];
let graph = MatchGraph::default();

for (word, id) in info.tagger().tag_store().iter() {
mask[id.0 as usize] = matcher.is_match(word.as_str(), &graph, None);
mask[id.0 as usize] = matcher.is_match(word.as_str(), None, None);
}

PosMatcher { mask }
Expand Down Expand Up @@ -507,9 +505,9 @@ impl chunk::Chunker {
}
}

impl POSFilter {
impl PosFilter {
pub(in crate::compile) fn new(matcher: PosMatcher) -> Self {
POSFilter { matcher }
PosFilter { matcher }
}
}

Expand Down
4 changes: 2 additions & 2 deletions nlprule/src/compile/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ pub enum Error {
#[error("serialization error")]
Serialization(#[from] bincode::Error),
#[error("JSON deserialization error")]
JSON(#[from] serde_json::Error),
Json(#[from] serde_json::Error),
#[error("error loading SRX")]
SRX(#[from] srx::Error),
Srx(#[from] srx::Error),
#[error("language options do not exist for '{lang_code}'")]
LanguageOptionsDoNotExist { lang_code: String },
#[error("regex syntax error: {0}")]
Expand Down
43 changes: 22 additions & 21 deletions nlprule/src/compile/parse_structure.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{ops::Range, sync::Arc};

use super::{structure, Error};
use crate::{tokenizer::tag::Tagger, types::*};
Expand Down Expand Up @@ -644,9 +644,9 @@ fn parse_features(
pattern: &structure::Pattern,
unifications: &Option<Vec<structure::Unification>>,
info: &mut BuildInfo,
) -> Vec<Vec<POSFilter>> {
) -> Vec<Vec<PosFilter>> {
let mut filters = Vec::new();
let mut parse_feature = |id: &str| -> Vec<POSFilter> {
let mut parse_feature = |id: &str| -> Vec<PosFilter> {
let unification = unifications
.as_ref()
.unwrap()
Expand Down Expand Up @@ -812,30 +812,32 @@ impl Rule {
}

let mut texts = Vec::new();
let mut char_length = 0;
let mut suggestion: Option<Suggestion> = None;

for part in &example.parts {
match part {
structure::ExamplePart::Text(text) => {
texts.push(text.as_str());
char_length += text.chars().count();
}
structure::ExamplePart::Marker(marker) => {
let (bytes_before, chars_before) =
texts.iter().fold((0, 0), |acc, text| {
(acc.0 + text.len(), acc.1 + text.chars().count())
});

if suggestion.is_some() {
return Err(Error::Unexpected(
"example must have one or zero markers".into(),
));
}

texts.push(marker.text.as_str());
let length = marker.text.chars().count();

if let Some(correction_text) = &example.correction {
let mut replacements: Vec<_> =
correction_text.split('|').map(|x| x.to_string()).collect();

replacements = if char_length == 0 {
replacements = if chars_before == 0 {
// title case if at start
replacements
.into_iter()
Expand All @@ -847,16 +849,16 @@ impl Rule {
replacements
};

suggestion = Some(Suggestion {
source: "_Test".to_string(),
message: "_Test".to_string(),
start: char_length,
end: char_length + length,
suggestion = Some(Suggestion::new(
"_Test".into(),
"_Test".into(),
Span::new(
bytes_before..bytes_before + marker.text.len(),
chars_before..chars_before + marker.text.chars().count(),
),
replacements,
});
));
}

char_length += marker.text.chars().count();
}
}
}
Expand Down Expand Up @@ -953,17 +955,17 @@ impl owned::WordData {
}
}

fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> POSFilter {
fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter {
match postag_regexp.as_deref() {
Some("yes") => POSFilter::new(PosMatcher::new(
Some("yes") => PosFilter::new(PosMatcher::new(
Matcher::new_regex(
Regex::from_java_regex(&postag, true, true).unwrap(),
false,
true,
),
info,
)),
Some(_) | None => POSFilter::new(PosMatcher::new(
Some(_) | None => PosFilter::new(PosMatcher::new(
Matcher::new_string(either::Left(postag.into()), false, false, true),
info,
)),
Expand Down Expand Up @@ -1221,7 +1223,7 @@ impl DisambiguationRule {
if let Some(examples_structure) = data.examples.as_ref() {
for example in examples_structure {
let mut texts = Vec::new();
let mut char_span: Option<(usize, usize)> = None;
let mut char_span: Option<Range<usize>> = None;
let mut char_length = 0;

for part in &example.parts {
Expand All @@ -1240,8 +1242,7 @@ impl DisambiguationRule {
texts.push(marker.text.as_str());
let length = marker.text.chars().count();

char_span = Some((char_length, char_length + length));

char_span = Some(char_length..char_length + length);
char_length += marker.text.chars().count();
}
}
Expand Down
46 changes: 23 additions & 23 deletions nlprule/src/compile/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,40 +197,40 @@ pub struct Category {
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLString {
pub struct XmlString {
pub text: String,
}

impl std::ops::Deref for XMLString {
impl std::ops::Deref for XmlString {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLString {
fn into(self) -> String {
self.text
impl From<XmlString> for String {
fn from(data: XmlString) -> String {
data.text
}
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLText {
pub text: XMLString,
pub struct XmlText {
pub text: XmlString,
}

impl std::ops::Deref for XMLText {
impl std::ops::Deref for XmlText {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLText {
fn into(self) -> String {
self.text.into()
impl From<XmlText> for String {
fn from(data: XmlText) -> String {
data.text.into()
}
}

Expand All @@ -242,7 +242,7 @@ pub struct Match {
#[serde(rename = "postag_regexp")]
pub postag_regex: Option<String>,
pub postag_replace: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
pub include_skipped: Option<String>,
pub case_conversion: Option<String>,
pub regexp_match: Option<String>,
Expand All @@ -253,7 +253,7 @@ pub struct Match {
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum SuggestionPart {
Match(Match),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
Expand All @@ -268,7 +268,7 @@ pub struct Suggestion {
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum MessagePart {
Suggestion(Suggestion),
Text(XMLString),
Text(XmlString),
Match(Match),
}

Expand All @@ -283,14 +283,14 @@ pub struct Message {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ExampleMarker {
pub text: XMLString,
pub text: XmlString,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum ExamplePart {
Marker(ExampleMarker),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
Expand Down Expand Up @@ -318,15 +318,15 @@ pub struct Exception {
pub negate: Option<String>,
pub negate_pos: Option<String>,
pub scope: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "lowercase")]
#[serde(deny_unknown_fields)]
#[allow(clippy::large_enum_variant)]
pub enum TokenPart {
Text(XMLString),
Text(XmlString),
Exception(Exception),
#[serde(rename = "match")]
Sub(Sub),
Expand Down Expand Up @@ -474,7 +474,7 @@ pub struct Pattern {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Regex {
pub text: XMLString,
pub text: XmlString,
pub case_sensitive: Option<String>,
pub mark: Option<String>,
}
Expand All @@ -494,8 +494,8 @@ pub struct Rule {
pub examples: Vec<Example>,
pub id: Option<String>,
pub name: Option<String>,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
pub default: Option<String>,
pub filter: Option<Filter>,
#[serde(rename = "__unused_unifications")]
Expand All @@ -510,8 +510,8 @@ pub struct RuleGroup {
pub antipatterns: Option<Vec<Pattern>>,
pub default: Option<String>,
pub name: String,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
#[serde(rename = "rule")]
pub rules: Vec<Rule>,
}
Expand Down
Loading

0 comments on commit ea63b98

Please sign in to comment.