Skip to content

Commit

Permalink
Add a Sentence struct, replace Vec<Token> with Sentence where possible (bminixhofer#54)
Browse files Browse the repository at this point in the history

* replace Vec<Token> with new Sentence struct where possible (+ with IncompleteSentence for Vec<IncompleteToken>)

* separate match sentence and match graph, reduce dependents on the tokenizer

* fix missing SENT_START special case, debug impls for WordId, PosId

* make MatchSentence private, docs

* use new Span struct for byte and char ranges

* fix PartialOrd impl on Position, get_token_str -> get_token_ranges
  • Loading branch information
bminixhofer authored and drahnr committed Apr 7, 2021
1 parent 05ef51c commit ea63b98
Show file tree
Hide file tree
Showing 23 changed files with 1,235 additions and 628 deletions.
2 changes: 1 addition & 1 deletion build/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub enum Error {
#[error("Failed to validate {1:?} binary for lang {0}")]
ValidationFailed(String, Binary, #[source] nlprule::Error),
#[error(transparent)]
IOError(#[from] io::Error),
IoError(#[from] io::Error),
#[error(transparent)]
ZipError(#[from] ZipError),
#[error("error postprocessing binaries: {0}")]
Expand Down
12 changes: 10 additions & 2 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ fst = "0.4"
fs-err = "2.5"
aho-corasick = "0.7"
half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.2", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"

Expand Down Expand Up @@ -70,7 +70,15 @@ regex-all-test = ["regex-onig", "regex-fancy"]

# needed for the bin test targets and to compile nlprule binaries, you'll usually not need these
bin = ["clap", "env_logger"]
compile = ["regex-syntax", "serde-xml-rs", "xml-rs", "roxmltree", "serde_json", "srx/from_xml", "regex-all-test"]
compile = [
"regex-syntax",
"serde-xml-rs",
"xml-rs",
"roxmltree",
"serde_json",
"srx/from_xml",
"regex-all-test",
]

[[bin]]
name = "compile"
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/bin/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ fn main() {

let tokens = tokenizer.pipe(&opts.text);

println!("Tokens: {:#?}", tokens);
println!("Tokens: {:#?}", tokens.collect::<Vec<_>>());
println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
}
16 changes: 7 additions & 9 deletions nlprule/src/compile/impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ use std::{

use crate::{
rule::{
disambiguation::POSFilter,
disambiguation::PosFilter,
engine::{
composition::{GraphId, Matcher, PosMatcher, TextMatcher},
Engine,
},
id::Category,
DisambiguationRule, MatchGraph, Rule,
DisambiguationRule, Rule,
},
rules::{Rules, RulesLangOptions, RulesOptions},
tokenizer::{
Expand Down Expand Up @@ -167,6 +167,7 @@ impl Tagger {
word_store,
tag_store,
lang_options,
..Default::default()
})
}
}
Expand Down Expand Up @@ -203,8 +204,6 @@ impl MultiwordTagger {

impl TextMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let graph = MatchGraph::default();

// can not cache a matcher that depends on the graph
let set = if matcher.graph_id().is_some() {
None
Expand All @@ -223,7 +222,7 @@ impl TextMatcher {
let set: DefaultHashSet<_> = data
.into_maybe_par_iter()
.filter_map(|(word, id)| {
if matcher.is_match(word.as_str(), &graph, None) {
if matcher.is_match(word.as_str(), None, None) {
Some(*id)
} else {
None
Expand All @@ -249,10 +248,9 @@ impl TextMatcher {
impl PosMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let mut mask = vec![false; info.tagger().tag_store().len()];
let graph = MatchGraph::default();

for (word, id) in info.tagger().tag_store().iter() {
mask[id.0 as usize] = matcher.is_match(word.as_str(), &graph, None);
mask[id.0 as usize] = matcher.is_match(word.as_str(), None, None);
}

PosMatcher { mask }
Expand Down Expand Up @@ -507,9 +505,9 @@ impl chunk::Chunker {
}
}

impl POSFilter {
impl PosFilter {
pub(in crate::compile) fn new(matcher: PosMatcher) -> Self {
POSFilter { matcher }
PosFilter { matcher }
}
}

Expand Down
4 changes: 2 additions & 2 deletions nlprule/src/compile/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ pub enum Error {
#[error("serialization error")]
Serialization(#[from] bincode::Error),
#[error("JSON deserialization error")]
JSON(#[from] serde_json::Error),
Json(#[from] serde_json::Error),
#[error("error loading SRX")]
SRX(#[from] srx::Error),
Srx(#[from] srx::Error),
#[error("language options do not exist for '{lang_code}'")]
LanguageOptionsDoNotExist { lang_code: String },
#[error("regex syntax error: {0}")]
Expand Down
43 changes: 22 additions & 21 deletions nlprule/src/compile/parse_structure.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{ops::Range, sync::Arc};

use super::{structure, Error};
use crate::{tokenizer::tag::Tagger, types::*};
Expand Down Expand Up @@ -644,9 +644,9 @@ fn parse_features(
pattern: &structure::Pattern,
unifications: &Option<Vec<structure::Unification>>,
info: &mut BuildInfo,
) -> Vec<Vec<POSFilter>> {
) -> Vec<Vec<PosFilter>> {
let mut filters = Vec::new();
let mut parse_feature = |id: &str| -> Vec<POSFilter> {
let mut parse_feature = |id: &str| -> Vec<PosFilter> {
let unification = unifications
.as_ref()
.unwrap()
Expand Down Expand Up @@ -812,30 +812,32 @@ impl Rule {
}

let mut texts = Vec::new();
let mut char_length = 0;
let mut suggestion: Option<Suggestion> = None;

for part in &example.parts {
match part {
structure::ExamplePart::Text(text) => {
texts.push(text.as_str());
char_length += text.chars().count();
}
structure::ExamplePart::Marker(marker) => {
let (bytes_before, chars_before) =
texts.iter().fold((0, 0), |acc, text| {
(acc.0 + text.len(), acc.1 + text.chars().count())
});

if suggestion.is_some() {
return Err(Error::Unexpected(
"example must have one or zero markers".into(),
));
}

texts.push(marker.text.as_str());
let length = marker.text.chars().count();

if let Some(correction_text) = &example.correction {
let mut replacements: Vec<_> =
correction_text.split('|').map(|x| x.to_string()).collect();

replacements = if char_length == 0 {
replacements = if chars_before == 0 {
// title case if at start
replacements
.into_iter()
Expand All @@ -847,16 +849,16 @@ impl Rule {
replacements
};

suggestion = Some(Suggestion {
source: "_Test".to_string(),
message: "_Test".to_string(),
start: char_length,
end: char_length + length,
suggestion = Some(Suggestion::new(
"_Test".into(),
"_Test".into(),
Span::new(
bytes_before..bytes_before + marker.text.len(),
chars_before..chars_before + marker.text.chars().count(),
),
replacements,
});
));
}

char_length += marker.text.chars().count();
}
}
}
Expand Down Expand Up @@ -953,17 +955,17 @@ impl owned::WordData {
}
}

fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> POSFilter {
fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter {
match postag_regexp.as_deref() {
Some("yes") => POSFilter::new(PosMatcher::new(
Some("yes") => PosFilter::new(PosMatcher::new(
Matcher::new_regex(
Regex::from_java_regex(&postag, true, true).unwrap(),
false,
true,
),
info,
)),
Some(_) | None => POSFilter::new(PosMatcher::new(
Some(_) | None => PosFilter::new(PosMatcher::new(
Matcher::new_string(either::Left(postag.into()), false, false, true),
info,
)),
Expand Down Expand Up @@ -1221,7 +1223,7 @@ impl DisambiguationRule {
if let Some(examples_structure) = data.examples.as_ref() {
for example in examples_structure {
let mut texts = Vec::new();
let mut char_span: Option<(usize, usize)> = None;
let mut char_span: Option<Range<usize>> = None;
let mut char_length = 0;

for part in &example.parts {
Expand All @@ -1240,8 +1242,7 @@ impl DisambiguationRule {
texts.push(marker.text.as_str());
let length = marker.text.chars().count();

char_span = Some((char_length, char_length + length));

char_span = Some(char_length..char_length + length);
char_length += marker.text.chars().count();
}
}
Expand Down
46 changes: 23 additions & 23 deletions nlprule/src/compile/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,40 +197,40 @@ pub struct Category {
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLString {
pub struct XmlString {
pub text: String,
}

impl std::ops::Deref for XMLString {
impl std::ops::Deref for XmlString {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLString {
fn into(self) -> String {
self.text
impl From<XmlString> for String {
fn from(data: XmlString) -> String {
data.text
}
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLText {
pub text: XMLString,
pub struct XmlText {
pub text: XmlString,
}

impl std::ops::Deref for XMLText {
impl std::ops::Deref for XmlText {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLText {
fn into(self) -> String {
self.text.into()
impl From<XmlText> for String {
fn from(data: XmlText) -> String {
data.text.into()
}
}

Expand All @@ -242,7 +242,7 @@ pub struct Match {
#[serde(rename = "postag_regexp")]
pub postag_regex: Option<String>,
pub postag_replace: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
pub include_skipped: Option<String>,
pub case_conversion: Option<String>,
pub regexp_match: Option<String>,
Expand All @@ -253,7 +253,7 @@ pub struct Match {
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum SuggestionPart {
Match(Match),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
Expand All @@ -268,7 +268,7 @@ pub struct Suggestion {
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum MessagePart {
Suggestion(Suggestion),
Text(XMLString),
Text(XmlString),
Match(Match),
}

Expand All @@ -283,14 +283,14 @@ pub struct Message {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ExampleMarker {
pub text: XMLString,
pub text: XmlString,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum ExamplePart {
Marker(ExampleMarker),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
Expand Down Expand Up @@ -318,15 +318,15 @@ pub struct Exception {
pub negate: Option<String>,
pub negate_pos: Option<String>,
pub scope: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "lowercase")]
#[serde(deny_unknown_fields)]
#[allow(clippy::large_enum_variant)]
pub enum TokenPart {
Text(XMLString),
Text(XmlString),
Exception(Exception),
#[serde(rename = "match")]
Sub(Sub),
Expand Down Expand Up @@ -474,7 +474,7 @@ pub struct Pattern {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Regex {
pub text: XMLString,
pub text: XmlString,
pub case_sensitive: Option<String>,
pub mark: Option<String>,
}
Expand All @@ -494,8 +494,8 @@ pub struct Rule {
pub examples: Vec<Example>,
pub id: Option<String>,
pub name: Option<String>,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
pub default: Option<String>,
pub filter: Option<Filter>,
#[serde(rename = "__unused_unifications")]
Expand All @@ -510,8 +510,8 @@ pub struct RuleGroup {
pub antipatterns: Option<Vec<Pattern>>,
pub default: Option<String>,
pub name: String,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
#[serde(rename = "rule")]
pub rules: Vec<Rule>,
}
Expand Down
Loading

0 comments on commit ea63b98

Please sign in to comment.