Add a Sentence struct, replace Vec<Token> with Sentence where possible #54

Merged
11 commits merged on Mar 30, 2021
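
Before the per-file changes, a minimal caller-side sketch of the API shift this PR makes: `pipe` appears to return an iterator (hence the added `collect` in run.rs below), presumably yielding the new `Sentence` values instead of a `Vec<Token>` per sentence. Everything outside the `pipe`/`suggest` calls is an illustrative assumption, not taken from this diff.

```rust
// Usage sketch only: the binary paths, example text, and constructor calls are
// assumptions based on the crate's public API, not part of this PR's diff.
use nlprule::{Rules, Tokenizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tokenizer = Tokenizer::new("en_tokenizer.bin")?;
    let rules = Rules::new("en_rules.bin")?;

    let text = "She was not been here since Monday.";

    // `pipe` now yields one item per sentence; loop over it or collect it,
    // as the updated run.rs does below.
    for sentence in tokenizer.pipe(text) {
        println!("{:#?}", sentence);
    }

    println!("Suggestions: {:#?}", rules.suggest(text, &tokenizer));
    Ok(())
}
```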
build/src/lib.rs: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ pub enum Error {
#[error("Failed to validate {1:?} binary for lang {0}")]
ValidationFailed(String, Binary, #[source] nlprule::Error),
#[error(transparent)]
IOError(#[from] io::Error),
IoError(#[from] io::Error),
#[error(transparent)]
ZipError(#[from] ZipError),
#[error("error postprocessing binaries: {0}")]
nlprule/Cargo.toml: 10 additions & 2 deletions
Expand Up @@ -27,7 +27,7 @@ fst = "0.4"
fs-err = "2.5"
aho-corasick = "0.7"
half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.2", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"

@@ -70,7 +70,15 @@ regex-all-test = ["regex-onig", "regex-fancy"]

# needed for the bin test targets and to compile nlprule binaries, you'll usually not need these
bin = ["clap", "env_logger"]
compile = ["regex-syntax", "serde-xml-rs", "xml-rs", "roxmltree", "serde_json", "srx/from_xml", "regex-all-test"]
compile = [
"regex-syntax",
"serde-xml-rs",
"xml-rs",
"roxmltree",
"serde_json",
"srx/from_xml",
"regex-all-test",
]

[[bin]]
name = "compile"
nlprule/src/bin/run.rs: 1 addition & 1 deletion
@@ -23,6 +23,6 @@ fn main() {

let tokens = tokenizer.pipe(&opts.text);

println!("Tokens: {:#?}", tokens);
println!("Tokens: {:#?}", tokens.collect::<Vec<_>>());
println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer));
}
nlprule/src/compile/impls.rs: 7 additions & 9 deletions
@@ -12,13 +12,13 @@ use std::{

use crate::{
rule::{
disambiguation::POSFilter,
disambiguation::PosFilter,
engine::{
composition::{GraphId, Matcher, PosMatcher, TextMatcher},
Engine,
},
id::Category,
DisambiguationRule, MatchGraph, Rule,
DisambiguationRule, Rule,
},
rules::{Rules, RulesLangOptions, RulesOptions},
tokenizer::{
@@ -167,6 +167,7 @@ impl Tagger {
word_store,
tag_store,
lang_options,
..Default::default()
})
}
}
@@ -203,8 +204,6 @@ impl MultiwordTagger {

impl TextMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let graph = MatchGraph::default();

// can not cache a matcher that depends on the graph
let set = if matcher.graph_id().is_some() {
None
@@ -223,7 +222,7 @@
let set: DefaultHashSet<_> = data
.into_maybe_par_iter()
.filter_map(|(word, id)| {
if matcher.is_match(word.as_str(), &graph, None) {
if matcher.is_match(word.as_str(), None, None) {
Some(*id)
} else {
None
@@ -249,10 +248,9 @@
impl PosMatcher {
pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self {
let mut mask = vec![false; info.tagger().tag_store().len()];
let graph = MatchGraph::default();

for (word, id) in info.tagger().tag_store().iter() {
mask[id.0 as usize] = matcher.is_match(word.as_str(), &graph, None);
mask[id.0 as usize] = matcher.is_match(word.as_str(), None, None);
}

PosMatcher { mask }
@@ -507,9 +505,9 @@ impl chunk::Chunker {
}
}

impl POSFilter {
impl PosFilter {
pub(in crate::compile) fn new(matcher: PosMatcher) -> Self {
POSFilter { matcher }
PosFilter { matcher }
}
}

nlprule/src/compile/mod.rs: 2 additions & 2 deletions
@@ -66,9 +66,9 @@ pub enum Error {
#[error("serialization error")]
Serialization(#[from] bincode::Error),
#[error("JSON deserialization error")]
JSON(#[from] serde_json::Error),
Json(#[from] serde_json::Error),
#[error("error loading SRX")]
SRX(#[from] srx::Error),
Srx(#[from] srx::Error),
#[error("language options do not exist for '{lang_code}'")]
LanguageOptionsDoNotExist { lang_code: String },
#[error("regex syntax error: {0}")]
nlprule/src/compile/parse_structure.rs: 22 additions & 21 deletions
@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::{ops::Range, sync::Arc};

use super::{structure, Error};
use crate::{tokenizer::tag::Tagger, types::*};
@@ -644,9 +644,9 @@ fn parse_features(
pattern: &structure::Pattern,
unifications: &Option<Vec<structure::Unification>>,
info: &mut BuildInfo,
) -> Vec<Vec<POSFilter>> {
) -> Vec<Vec<PosFilter>> {
let mut filters = Vec::new();
let mut parse_feature = |id: &str| -> Vec<POSFilter> {
let mut parse_feature = |id: &str| -> Vec<PosFilter> {
let unification = unifications
.as_ref()
.unwrap()
@@ -812,30 +812,32 @@ impl Rule {
}

let mut texts = Vec::new();
let mut char_length = 0;
let mut suggestion: Option<Suggestion> = None;

for part in &example.parts {
match part {
structure::ExamplePart::Text(text) => {
texts.push(text.as_str());
char_length += text.chars().count();
}
structure::ExamplePart::Marker(marker) => {
let (bytes_before, chars_before) =
texts.iter().fold((0, 0), |acc, text| {
(acc.0 + text.len(), acc.1 + text.chars().count())
});

if suggestion.is_some() {
return Err(Error::Unexpected(
"example must have one or zero markers".into(),
));
}

texts.push(marker.text.as_str());
let length = marker.text.chars().count();

if let Some(correction_text) = &example.correction {
let mut replacements: Vec<_> =
correction_text.split('|').map(|x| x.to_string()).collect();

replacements = if char_length == 0 {
replacements = if chars_before == 0 {
// title case if at start
replacements
.into_iter()
@@ -847,16 +849,16 @@
replacements
};

suggestion = Some(Suggestion {
source: "_Test".to_string(),
message: "_Test".to_string(),
start: char_length,
end: char_length + length,
suggestion = Some(Suggestion::new(
"_Test".into(),
"_Test".into(),
Span::new(
bytes_before..bytes_before + marker.text.len(),
chars_before..chars_before + marker.text.chars().count(),
),
replacements,
});
));
}

char_length += marker.text.chars().count();
}
}
}
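
Side note on the hunk above: the `Suggestion` is now built with a `Span` carrying both a byte range and a char range, computed by folding over the texts seen so far. A standalone sketch of that bookkeeping, with arbitrary strings and without constructing `Span` itself:

```rust
// Standalone illustration of the byte-vs-char offsets folded up above.
// For ASCII the two ranges coincide; with multi-byte characters they differ.
fn main() {
    let texts = ["naïve ", "approach "];
    let marker = "text";

    let (bytes_before, chars_before) = texts
        .iter()
        .fold((0, 0), |acc, text| {
            (acc.0 + text.len(), acc.1 + text.chars().count())
        });

    let byte_span = bytes_before..bytes_before + marker.len();
    let char_span = chars_before..chars_before + marker.chars().count();

    // "naïve " is 7 bytes but 6 chars, so the two spans start at different offsets.
    assert_eq!(byte_span, 16..20);
    assert_eq!(char_span, 15..19);
    println!("bytes: {:?}, chars: {:?}", byte_span, char_span);
}
```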
@@ -953,17 +955,17 @@ impl owned::WordData {
}
}

fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> POSFilter {
fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter {
match postag_regexp.as_deref() {
Some("yes") => POSFilter::new(PosMatcher::new(
Some("yes") => PosFilter::new(PosMatcher::new(
Matcher::new_regex(
Regex::from_java_regex(&postag, true, true).unwrap(),
false,
true,
),
info,
)),
Some(_) | None => POSFilter::new(PosMatcher::new(
Some(_) | None => PosFilter::new(PosMatcher::new(
Matcher::new_string(either::Left(postag.into()), false, false, true),
info,
)),
@@ -1221,7 +1223,7 @@ impl DisambiguationRule {
if let Some(examples_structure) = data.examples.as_ref() {
for example in examples_structure {
let mut texts = Vec::new();
let mut char_span: Option<(usize, usize)> = None;
let mut char_span: Option<Range<usize>> = None;
let mut char_length = 0;

for part in &example.parts {
@@ -1240,8 +1242,7 @@
texts.push(marker.text.as_str());
let length = marker.text.chars().count();

char_span = Some((char_length, char_length + length));

char_span = Some(char_length..char_length + length);
char_length += marker.text.chars().count();
}
}
nlprule/src/compile/structure.rs: 23 additions & 23 deletions
@@ -197,40 +197,40 @@ pub struct Category {
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLString {
pub struct XmlString {
pub text: String,
}

impl std::ops::Deref for XMLString {
impl std::ops::Deref for XmlString {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLString {
fn into(self) -> String {
self.text
impl From<XmlString> for String {
fn from(data: XmlString) -> String {
data.text
}
}

#[derive(Debug, Clone, Deserialize)]
pub struct XMLText {
pub text: XMLString,
pub struct XmlText {
pub text: XmlString,
}

impl std::ops::Deref for XMLText {
impl std::ops::Deref for XmlText {
type Target = String;

fn deref(&self) -> &Self::Target {
&self.text
}
}

impl std::convert::Into<String> for XMLText {
fn into(self) -> String {
self.text.into()
impl From<XmlText> for String {
fn from(data: XmlText) -> String {
data.text.into()
}
}
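
On the change just above from `impl Into<String>` for `XMLString`/`XMLText` to `impl From<...> for String`: implementing `From` also provides the `Into` direction through the standard library's blanket impl, so existing `.into()` call sites keep compiling. A standalone sketch with a stand-in type (not a type from this crate):

```rust
// Stand-in type purely for illustration; not part of nlprule.
struct Wrapper {
    text: String,
}

// Implementing From gives the Into direction for free via
// `impl<T, U> Into<U> for T where U: From<T>` in std.
impl From<Wrapper> for String {
    fn from(w: Wrapper) -> String {
        w.text
    }
}

fn main() {
    let a: String = String::from(Wrapper { text: "from".into() });
    let b: String = Wrapper { text: "into".into() }.into();
    println!("{} {}", a, b);
}
```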

@@ -242,7 +242,7 @@ pub struct Match {
#[serde(rename = "postag_regexp")]
pub postag_regex: Option<String>,
pub postag_replace: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
pub include_skipped: Option<String>,
pub case_conversion: Option<String>,
pub regexp_match: Option<String>,
@@ -253,7 +253,7 @@
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum SuggestionPart {
Match(Match),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
@@ -268,7 +268,7 @@ pub struct Suggestion {
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum MessagePart {
Suggestion(Suggestion),
Text(XMLString),
Text(XmlString),
Match(Match),
}

@@ -283,14 +283,14 @@ pub struct Message {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ExampleMarker {
pub text: XMLString,
pub text: XmlString,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields, rename_all = "lowercase")]
pub enum ExamplePart {
Marker(ExampleMarker),
Text(XMLString),
Text(XmlString),
}

#[derive(Debug, Clone, Deserialize)]
@@ -318,15 +318,15 @@ pub struct Exception {
pub negate: Option<String>,
pub negate_pos: Option<String>,
pub scope: Option<String>,
pub text: Option<XMLString>,
pub text: Option<XmlString>,
}

#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "lowercase")]
#[serde(deny_unknown_fields)]
#[allow(clippy::large_enum_variant)]
pub enum TokenPart {
Text(XMLString),
Text(XmlString),
Exception(Exception),
#[serde(rename = "match")]
Sub(Sub),
@@ -474,7 +474,7 @@ pub struct Pattern {
#[derive(Debug, Clone, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Regex {
pub text: XMLString,
pub text: XmlString,
pub case_sensitive: Option<String>,
pub mark: Option<String>,
}
@@ -494,8 +494,8 @@ pub struct Rule {
pub examples: Vec<Example>,
pub id: Option<String>,
pub name: Option<String>,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
pub default: Option<String>,
pub filter: Option<Filter>,
#[serde(rename = "__unused_unifications")]
@@ -510,8 +510,8 @@ pub struct RuleGroup {
pub antipatterns: Option<Vec<Pattern>>,
pub default: Option<String>,
pub name: String,
pub short: Option<XMLText>,
pub url: Option<XMLText>,
pub short: Option<XmlText>,
pub url: Option<XmlText>,
#[serde(rename = "rule")]
pub rules: Vec<Rule>,
}