diff --git a/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala b/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala
index a040e7f8..4ffaff9b 100644
--- a/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala
+++ b/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala
@@ -11,11 +11,9 @@ import org.apache.lucene.search.{
 }
 import org.apache.lucene.store.{ Directory, FSDirectory }
 import org.apache.lucene.index.{ DirectoryReader, Term }
-import org.apache.lucene.queryparser.classic.QueryParser
-import com.typesafe.config.{ Config, ConfigValueFactory }
+import com.typesafe.config.Config
 import ai.lum.common.ConfigFactory
 import ai.lum.common.ConfigUtils._
-import ai.lum.common.StringUtils._
 import ai.lum.odinson.DataGatherer.VerboseLevels
 import ai.lum.odinson.DataGatherer.VerboseLevels.Verbosity
 import ai.lum.odinson.compiler.QueryCompiler
@@ -23,10 +21,7 @@ import ai.lum.odinson.lucene._
 import ai.lum.odinson.lucene.search._
 import ai.lum.odinson.state.{ MockState, State }
 import ai.lum.odinson.digraph.Vocabulary
-import ai.lum.odinson.metadata.MetadataCompiler
 import ai.lum.odinson.utils.MostRecentlyUsed
-import ai.lum.odinson.utils.exceptions.OdinsonException
-import org.apache.lucene.queryparser.xml.builders.BooleanQueryBuilder
 
 import scala.collection.mutable.ArrayBuffer
 
@@ -256,38 +251,88 @@ class ExtractorEngine private (
   }
 
   // Access methods
+  // NB: there are so many overloads because compileRuleFile is itself overloaded (String and
+  // File), so we can't use default arguments.
 
   def compileRuleString(rules: String): Seq[Extractor] = {
-    compileRuleString(rules, Map.empty[String, String])
+    ruleReader.compileRuleString(rules)
   }
 
   def compileRuleString(rules: String, variables: Map[String, String]): Seq[Extractor] = {
     ruleReader.compileRuleString(rules, variables)
   }
 
+  def compileRuleString(rules: String, metadataFilter: Query): Seq[Extractor] = {
+    ruleReader.compileRuleString(rules, metadataFilter)
+  }
+
+  def compileRuleString(
+    rules: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[Query]
+  ): Seq[Extractor] = {
+    ruleReader.compileRuleString(rules, variables, metadataFilterOpt)
+  }
+
   def compileRuleFile(ruleFile: File): Seq[Extractor] = {
-    compileRuleFile(ruleFile, Map.empty[String, String])
+    ruleReader.compileRuleFile(ruleFile)
   }
 
   def compileRuleFile(ruleFile: File, variables: Map[String, String]): Seq[Extractor] = {
     ruleReader.compileRuleFile(ruleFile, variables)
   }
 
+  def compileRuleFile(ruleFile: File, metadataFilter: Query): Seq[Extractor] = {
+    ruleReader.compileRuleFile(ruleFile, metadataFilter)
+  }
+
+  def compileRuleFile(
+    ruleFile: File,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[Query]
+  ): Seq[Extractor] = {
+    ruleReader.compileRuleFile(ruleFile, variables, metadataFilterOpt)
+  }
+
   def compileRuleFile(rulePath: String): Seq[Extractor] = {
-    compileRuleFile(rulePath, Map.empty[String, String])
+    ruleReader.compileRuleFile(rulePath)
   }
 
   def compileRuleFile(rulePath: String, variables: Map[String, String]): Seq[Extractor] = {
     ruleReader.compileRuleFile(rulePath, variables)
   }
 
+  def compileRuleFile(rulePath: String, metadataFilter: Query): Seq[Extractor] = {
+    ruleReader.compileRuleFile(rulePath, metadataFilter)
+  }
+
+  def compileRuleFile(
+    rulePath: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[Query]
+  ): Seq[Extractor] = {
+    ruleReader.compileRuleFile(rulePath, variables, metadataFilterOpt)
+  }
+
   def compileRuleResource(rulePath: String): Seq[Extractor] = {
-    compileRuleResource(rulePath, Map.empty[String, String])
+    ruleReader.compileRuleResource(rulePath)
   }
 
   def compileRuleResource(rulePath: String, variables: Map[String, String]): Seq[Extractor] = {
     ruleReader.compileRuleResource(rulePath, variables)
   }
 
+  def compileRuleResource(rulePath: String, metadataFilter: Query): Seq[Extractor] = {
+    ruleReader.compileRuleResource(rulePath, metadataFilter)
+  }
+
+  def compileRuleResource(
+    rulePath: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[Query]
+  ): Seq[Extractor] = {
+    ruleReader.compileRuleResource(rulePath, variables, metadataFilterOpt)
+  }
+
   private def extract(
     extractor: Extractor,
     numSentences: Int,
diff --git a/core/src/main/scala/ai/lum/odinson/RuleReader.scala b/core/src/main/scala/ai/lum/odinson/RuleReader.scala
index f77b5663..8a96cc40 100644
--- a/core/src/main/scala/ai/lum/odinson/RuleReader.scala
+++ b/core/src/main/scala/ai/lum/odinson/RuleReader.scala
@@ -6,10 +6,12 @@ import java.util.{ Collection, Map => JMap }
 import ai.lum.common.TryWithResources.using
 
 import scala.collection.JavaConverters._
+import org.apache.lucene.search.{ Query => LuceneQuery }
 import org.yaml.snakeyaml.Yaml
 import org.yaml.snakeyaml.constructor.Constructor
 import ai.lum.odinson.compiler.QueryCompiler
 import ai.lum.odinson.lucene.search.OdinsonQuery
+import ai.lum.odinson.metadata.MetadataCompiler
 import ai.lum.odinson.utils.exceptions.OdinsonException
 import ai.lum.odinson.utils.{ RuleSources, SituatedStream, VariableSubstitutor }
 
@@ -19,7 +21,8 @@ import ai.lum.odinson.utils.{ RuleSources, SituatedStream, VariableSubstitutor }
  */
 case class RuleFile(
   rules: Seq[Rule],
-  variables: Map[String, String]
+  variables: Map[String, String],
+  metadataFilter: Option[LuceneQuery]
 )
 
 /** A Rule represents a single rule parsed from a yaml file.
@@ -48,7 +51,7 @@ class RuleReader(val compiler: QueryCompiler) {
 
   /** gets a rule stream and returns a sequence of extractors ready to be used */
   def compileRuleStream(input: SituatedStream): Seq[Extractor] = {
-    compileRuleStream(input, Map.empty[String, String])
+    compileRuleStream(input, Map.empty[String, String], None)
   }
 
   /** Gets a rule stream as well as a map of variables.
@@ -56,7 +59,27 @@ class RuleReader(val compiler: QueryCompiler) {
    * The variables passed as an argument will override the variables declared in the file.
    */
   def compileRuleStream(input: SituatedStream, variables: Map[String, String]): Seq[Extractor] = {
-    val ruleFiles = parseRuleFile(input, variables)
+    compileRuleStream(input, variables, None)
+  }
+
+  /** Gets a rule stream as well as a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The metadata filter will be combined (AND) with any filters declared in the file.
+    */
+  def compileRuleStream(input: SituatedStream, metadataFilter: LuceneQuery): Seq[Extractor] = {
+    compileRuleStream(input, Map.empty, Some(metadataFilter))
+  }
+
+  /** Gets a rule stream, variables, and a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The variables passed as an argument will override the variables declared in the file.
+    */
+  def compileRuleStream(
+    input: SituatedStream,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
+    val ruleFiles = parseRuleFile(input, variables, metadataFilterOpt)
     mkExtractorsFromRuleFiles(ruleFiles, variables)
   }
 
@@ -65,7 +88,7 @@ class RuleReader(val compiler: QueryCompiler) {
    * @return extractors
    */
   def compileRuleFile(input: String): Seq[Extractor] = {
-    compileRuleFile(input, Map.empty[String, String])
+    compileRuleFile(input, Map.empty[String, String], None)
   }
 
   /** Gets the path to a rule file as well as a map of variables.
@@ -73,59 +96,144 @@ class RuleReader(val compiler: QueryCompiler) {
    * The variables passed as an argument will override the variables declared in the file.
    */
   def compileRuleFile(input: String, variables: Map[String, String]): Seq[Extractor] = {
-    compileRuleFile(new File(input), variables)
+    compileRuleFile(new File(input), variables, None)
+  }
+
+  /** Gets the path to a rule file as well as a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The metadata filter will be combined (AND) with any filters declared in the file.
+    */
+  def compileRuleFile(input: String, metadataFilter: LuceneQuery): Seq[Extractor] = {
+    compileRuleFile(new File(input), Map.empty[String, String], Some(metadataFilter))
+  }
+
+  /** Gets the path to a rule file, a map of variables, and a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The variables passed as an argument will override the variables declared in the file.
+    */
+  def compileRuleFile(
+    input: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
+    compileRuleFile(new File(input), variables, metadataFilterOpt)
   }
 
   /** gets a rule File object and returns a sequence of extractors ready to be used */
   def compileRuleFile(input: File): Seq[Extractor] = {
-    compileRuleFile(input, Map.empty[String, String])
+    compileRuleFile(input, Map.empty[String, String], None)
   }
 
-  /** Gets a rule File object as well as a map of variables.
+  /** Gets a rule File object and a map of variables.
    * Returns a sequence of extractors ready to be used.
    * The variables passed as an argument will override the variables declared in the file.
    */
   def compileRuleFile(input: File, variables: Map[String, String]): Seq[Extractor] = {
-    compileRuleStream(SituatedStream.fromFile(input.getCanonicalPath), variables)
+    compileRuleStream(SituatedStream.fromFile(input.getCanonicalPath), variables, None)
+  }
+
+  /** Gets a rule File object and a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The metadata filter will be combined (AND) with any filters declared in the file.
+    */
+  def compileRuleFile(input: File, metadataFilter: LuceneQuery): Seq[Extractor] = {
+    compileRuleStream(
+      SituatedStream.fromFile(input.getCanonicalPath),
+      Map.empty,
+      Some(metadataFilter)
+    )
+  }
+
+  /** Gets a rule File object, a map of variables, and a metadata filter.
+    * Returns a sequence of extractors ready to be used.
+    * The variables passed as an argument will override the variables declared in the file.
+    */
+  def compileRuleFile(
+    input: File,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
+    compileRuleStream(SituatedStream.fromFile(input.getCanonicalPath), variables, metadataFilterOpt)
   }
 
   /** Gets the path to a rule file in the jar resources as well as a map of variables.
    * Returns a sequence of extractors ready to be used
    */
   def compileRuleResource(rulePath: String): Seq[Extractor] = {
-    compileRuleResource(rulePath, Map.empty[String, String])
+    compileRuleResource(rulePath, Map.empty[String, String], None)
   }
 
-  /** Gets the path to a rule file in the jar resources as well as a map of variables.
+  /** Gets the path to a rule file in the jar resources and a map of variables.
    * Returns a sequence of extractors ready to be used
    */
   def compileRuleResource(rulePath: String, variables: Map[String, String]): Seq[Extractor] = {
-    compileRuleStream(SituatedStream.fromResource(rulePath), variables)
+    compileRuleStream(SituatedStream.fromResource(rulePath), variables, None)
+  }
+
+  /** Gets the path to a rule file in the jar resources and a metadata filter.
+    * Returns a sequence of extractors ready to be used
+    */
+  def compileRuleResource(rulePath: String, metadataFilter: LuceneQuery): Seq[Extractor] = {
+    compileRuleStream(SituatedStream.fromResource(rulePath), Map.empty, Some(metadataFilter))
+  }
+
+  /** Gets the path to a rule file in the jar resources, a map of variables, and a metadata filter.
+    * Returns a sequence of extractors ready to be used
+    */
+  def compileRuleResource(
+    rulePath: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
+    compileRuleStream(SituatedStream.fromResource(rulePath), variables, metadataFilterOpt)
   }
 
   /** Gets the actual rules content as a string.
    * Returns a sequence of extractors ready to be used
    */
   def compileRuleString(rules: String): Seq[Extractor] = {
-    compileRuleString(rules, Map.empty[String, String])
+    compileRuleString(rules, Map.empty[String, String], None)
   }
 
   /** Gets the actual rules content as a string.
    * Returns a sequence of extractors ready to be used
    */
   def compileRuleString(rules: String, variables: Map[String, String]): Seq[Extractor] = {
-    compileRuleStream(SituatedStream.fromString(rules), variables)
+    compileRuleStream(SituatedStream.fromString(rules), variables, None)
+  }
+
+  /** Gets the actual rules content as a string.
+    * Returns a sequence of extractors ready to be used
+    */
+  def compileRuleString(rules: String, metadataFilter: LuceneQuery): Seq[Extractor] = {
+    compileRuleStream(SituatedStream.fromString(rules), Map.empty, Some(metadataFilter))
+  }
+
+  /** Gets the actual rules content as a string.
+    * Returns a sequence of extractors ready to be used
+    */
+  def compileRuleString(
+    rules: String,
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
+    compileRuleStream(SituatedStream.fromString(rules), variables, metadataFilterOpt)
   }
 
   /** Parses the content of the rule file and returns a RuleFile object
    * that contains the parsed rules and the variables declared in the file.
    * Note that variable replacement hasn't happened yet.
    */
-  def parseRuleFile(input: SituatedStream, parentVars: Map[String, String]): Seq[RuleFile] = {
+  def parseRuleFile(
+    input: SituatedStream,
+    parentVars: Map[String, String],
+    metadataFilterOptIn: Option[LuceneQuery]
+  ): Seq[RuleFile] = {
     val master = yamlContents(input)
     // Parent vars passed in case we need to resolve variables in import paths
     val localVariables = mkVariables(master, input, parentVars) ++ parentVars
-    mkRules(master, input, localVariables)
+    val metadataFilter = mkMetadataFilter(master, metadataFilterOptIn)
+    mkRules(master, input, localVariables, metadataFilter)
   }
 
   def mkExtractorsFromRuleFiles(
@@ -137,7 +245,7 @@ class RuleReader(val compiler: QueryCompiler) {
 
   /** gets a RuleFile and returns a sequence of extractors */
   def mkExtractors(f: RuleFile): Seq[Extractor] = {
-    mkExtractors(f.rules, f.variables)
+    mkExtractors(f.rules, f.variables, f.metadataFilter)
   }
 
   /** Gets a RuleFile and a variable map and returns a sequence of extractors.
@@ -146,23 +254,31 @@ class RuleReader(val compiler: QueryCompiler) {
   def mkExtractors(f: RuleFile, variables: Map[String, String]): Seq[Extractor] = {
     // The order in which the variable maps are concatenated is important.
     // The variables provided should override the variables in the RuleFile.
-    mkExtractors(f.rules, f.variables ++ variables)
+    mkExtractors(f.rules, f.variables ++ variables, f.metadataFilter)
   }
 
   /** gets a sequence of rules and returns a sequence of extractors */
   def mkExtractors(rules: Seq[Rule]): Seq[Extractor] = {
-    mkExtractors(rules, Map.empty[String, String])
+    mkExtractors(rules, Map.empty[String, String], None)
   }
 
   /** Gets a sequence of rules as well as a variable map
    * and returns a sequence of extractors ready to be used.
    */
-  def mkExtractors(rules: Seq[Rule], variables: Map[String, String]): Seq[Extractor] = {
+  def mkExtractors(
+    rules: Seq[Rule],
+    variables: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Seq[Extractor] = {
     val varsub = new VariableSubstitutor(variables)
-    for (rule <- rules) yield mkExtractor(rule, varsub)
+    for (rule <- rules) yield mkExtractor(rule, varsub, metadataFilterOpt)
   }
 
-  private def mkExtractor(rule: Rule, varsub: VariableSubstitutor): Extractor = {
+  private def mkExtractor(
+    rule: Rule,
+    varsub: VariableSubstitutor,
+    metadataFilterOpt: Option[LuceneQuery]
+  ): Extractor = {
     // any field in the rule may contain variables,
     // so we need to pass them through the variable substitutor
     val name = varsub(rule.name)
@@ -176,8 +292,47 @@ class RuleReader(val compiler: QueryCompiler) {
       case "event" => compiler.compileEventQuery(pattern)
       case t => throw new OdinsonException(s"invalid rule type '$t'")
     }
-    // return an extractor
-    Extractor(name, label, priority, query)
+    // add the metadata filter if applicable, and return an extractor
+    if (metadataFilterOpt.isEmpty) {
+      Extractor(name, label, priority, query)
+    } else {
+      Extractor(name, label, priority, compiler.mkQuery(query, metadataFilterOpt.get))
+    }
+  }
+
+  private def mkMetadataFilter(
+    data: Map[String, Any],
+    parentMetadataFilter: Option[LuceneQuery]
+  ): Option[LuceneQuery] = {
+    val localFilter = data.get("metadataFilters").flatMap(parseFilter)
+    joinFilters(localFilter, parentMetadataFilter)
+  }
+
+  private def parseFilter(data: Any): Option[LuceneQuery] = {
+    data match {
+      case pattern: String => Some(MetadataCompiler.mkQuery(pattern))
+      case filters: Collection[_] =>
+        val allFilters = filters.asScala.toSeq.flatMap(parseFilter)
+        joinFilters(allFilters)
+      case _ => ???
+    }
+  }
+
+  def joinFilters(q1: Option[LuceneQuery], q2: Option[LuceneQuery]): Option[LuceneQuery] = {
+    if (q1.isEmpty) q2
+    else if (q2.isEmpty) q1
+    else {
+      // both defined
+      joinFilters(Seq(q1.get, q2.get))
+    }
+  }
+
+  def joinFilters(queries: Seq[LuceneQuery]): Option[LuceneQuery] = {
+    queries match {
+      case Seq() => None
+      case Seq(oneQuery) => Some(oneQuery)
+      case several => Some(MetadataCompiler.combineAnd(several))
+    }
   }
 
   // parentVars passed in case we need to resolve variables in import paths
@@ -237,13 +392,14 @@ class RuleReader(val compiler: QueryCompiler) {
   private def mkRules(
     data: Map[String, Any],
     source: SituatedStream,
-    vars: Map[String, String]
+    vars: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
   ): Seq[RuleFile] = {
     data.get("rules") match {
       case None => Seq.empty
       case Some(rules: Collection[_]) =>
         rules.asScala.toSeq.flatMap { r =>
-          makeOrImportRules(r, source, vars)
+          makeOrImportRules(r, source, vars, metadataFilterOpt)
         }
       case _ => throw new OdinsonException("invalid rules data")
     }
@@ -252,7 +408,8 @@ class RuleReader(val compiler: QueryCompiler) {
   def makeOrImportRules(
     data: Any,
     source: SituatedStream,
-    parentVars: Map[String, String]
+    parentVars: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
   ): Seq[RuleFile] = {
     data match {
       case ruleJMap: JMap[_, _] =>
@@ -263,10 +420,12 @@ class RuleReader(val compiler: QueryCompiler) {
           // import rules from a file and return them
           // Parent vars passed in case we need to resolve variables in import paths
           val importVars = mkVariables(rulesData, source, parentVars)
-          importRules(rulesData, source, parentVars ++ importVars)
+          // resolve the metadata filters, combining with AND
+          val importFilters = mkMetadataFilter(rulesData, metadataFilterOpt)
+          importRules(rulesData, source, parentVars ++ importVars, importFilters)
         } else {
           // Otherwise, process the data as individual rules
-          Seq(RuleFile(Seq(mkRule(data)), parentVars))
+          Seq(RuleFile(Seq(mkRule(data)), parentVars, metadataFilterOpt))
         }
       case _ => ???
     }
@@ -275,14 +434,15 @@ class RuleReader(val compiler: QueryCompiler) {
   private def importRules(
     data: Map[String, Any],
     source: SituatedStream,
-    importVars: Map[String, String]
+    importVars: Map[String, String],
+    metadataFilterOpt: Option[LuceneQuery]
   ): Seq[RuleFile] = {
     // get the current working directory, with ending separator
     val relativePath = data("import").toString
     // handle substitutions in path name
     val resolved = new VariableSubstitutor(importVars).apply(relativePath)
     val importStream = source.relativePathStream(resolved)
-    parseRuleFile(importStream, importVars)
+    parseRuleFile(importStream, importVars, metadataFilterOpt)
   }
 
   private def mkRule(data: Any): Rule = {
diff --git a/core/src/main/scala/ai/lum/odinson/metadata/MetadataCompiler.scala b/core/src/main/scala/ai/lum/odinson/metadata/MetadataCompiler.scala
index e479f4ae..8221ac04 100644
--- a/core/src/main/scala/ai/lum/odinson/metadata/MetadataCompiler.scala
+++ b/core/src/main/scala/ai/lum/odinson/metadata/MetadataCompiler.scala
@@ -25,6 +25,14 @@ object MetadataCompiler {
     compile(expression, isNested = false)
   }
 
+  def combineAnd(queries: Seq[Query]): Query = {
+    val builder = new BooleanQuery.Builder
+    queries foreach { query =>
+      builder.add(query, BooleanClause.Occur.MUST)
+    }
+    builder.build()
+  }
+
   def compile(expr: Ast.BoolExpression, isNested: Boolean): Query = {
     expr match {
       case Ast.OrExpression(clauses) =>
diff --git a/core/src/test/resources/testMetadataImports/imported.yml b/core/src/test/resources/testMetadataImports/imported.yml
new file mode 100644
index 00000000..f7530bd3
--- /dev/null
+++ b/core/src/test/resources/testMetadataImports/imported.yml
@@ -0,0 +1,14 @@
+metadataFilters:
+  - doctype == 'article'
+
+vars:
+  chunk: "[chunk=B-NP][chunk=I-NP]*"
+
+rules:
+  - name: testrule
+    type: event
+    label: Test
+    pattern: |
+      trigger = [lemma=eat]
+      subject: ^NP = >nsubj ${chunk}
+      object: ^NP = >dobj ${chunk}
\ No newline at end of file
diff --git a/core/src/test/resources/testMetadataImports/master.yml b/core/src/test/resources/testMetadataImports/master.yml
new file mode 100644
index 00000000..5f23aa72
--- /dev/null
+++ b/core/src/test/resources/testMetadataImports/master.yml
@@ -0,0 +1,5 @@
+metadataFilters:
+  - (date(1999, 01, 01) < pubdate < date(2012, 01, 01))
+
+rules:
+  - import: imported.yml
\ No newline at end of file
diff --git a/core/src/test/resources/testMetadataImports/master2.yml b/core/src/test/resources/testMetadataImports/master2.yml
new file mode 100644
index 00000000..b0cfb626
--- /dev/null
+++ b/core/src/test/resources/testMetadataImports/master2.yml
@@ -0,0 +1,5 @@
+
+rules:
+  - import: imported.yml
+    metadataFilters:
+      - (date(1999, 01, 01) < pubdate < date(2012, 01, 01))
\ No newline at end of file
diff --git a/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala b/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala
index f776c5c3..a017b136 100644
--- a/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala
+++ b/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala
@@ -313,6 +313,54 @@ class TestEvents extends OdinsonTest {
 
   }
 
+  it should "match a prev found mention as a trigger" in {
+
+    val rules =
+      """
+        |rules:
+        |  - name: bears-rule
+        |    label: Bear
+        |    type: event
+        |    priority: 1
+        |    pattern: |
+        |      trigger = bears
+        |      bearType = >amod []
+        |
+        |  - name: eating-rule
+        |    label: Consumption
+        |    type: event
+        |    priority: 2
+        |    pattern: |
+        |      trigger = @Bear
+        |      verb = nsubj ${chunk}
+        |      object: ^NP = >dobj ${chunk}
+      """.stripMargin
+    val extractors = ee.ruleReader.compileRuleString(rules)
+    val mentions = getMentionsWithLabel(ee.extractMentions(extractors).toSeq, "Test")
+    mentions should have size (3)
+
+  }
+
+  "Metadata" should "work when in grammars as a list" in {
+    ee.clearState()
+    val rules = """
+      |metadataFilters:
+      |  - doctype == 'article'
+      |  - (date(1999, 01, 01) < pubdate < date(2012, 01, 01))
+      |
+      |vars:
+      |  chunk: "[chunk=B-NP][chunk=I-NP]*"
+      |
+      |rules:
+      |  - name: testrule
+      |    type: event
+      |    label: Test
+      |    pattern: |
+      |      trigger = [lemma=eat]
+      |      subject: ^NP = >nsubj ${chunk}
+      |      object: ^NP = >dobj ${chunk}
+    """.stripMargin
+    val extractors = ee.ruleReader.compileRuleString(rules)
+    val mentions = getMentionsWithLabel(ee.extractMentions(extractors).toSeq, "Test")
+    mentions should have size (2)
+
+  }
+
+  "Metadata" should "combine properly when files are imported" in {
+    ee.clearState()
+    val masterPath = "/testMetadataImports/master.yml"
+    val extractors = ee.compileRuleResource(masterPath)
+    val mentions = getMentionsWithLabel(ee.extractMentions(extractors).toSeq, "Test")
+    mentions should have size (2)
+
+  }
+
+  "Metadata" should "combine properly when files are imported with filter" in {
+    ee.clearState()
+    val masterPath = "/testMetadataImports/master2.yml"
+    val extractors = ee.compileRuleResource(masterPath)
+    val mentions = getMentionsWithLabel(ee.extractMentions(extractors).toSeq, "Test")
+    mentions should have size (2)
+
+  }
 }
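
Usage sketch (not part of the diff above): the snippet shows how the new metadata-filter overloads might be called from client code. The engine setup, the grammar string, and the filter expression are illustrative assumptions, not code taken from this change; only compileRuleString(rules, filter), MetadataCompiler.mkQuery, and extractMentions come from the PR itself.

import org.apache.lucene.search.Query
import ai.lum.odinson.ExtractorEngine
import ai.lum.odinson.metadata.MetadataCompiler

object MetadataFilterExample extends App {

  // Assumed setup: an Odinson index already exists and is reachable through the default config.
  val ee = ExtractorEngine.fromConfig()

  // Compile a metadata filter programmatically; the same compiler backs the
  // `metadataFilters` entries declared inside a grammar file.
  val filter: Query = MetadataCompiler.mkQuery("doctype == 'article'")

  // Hypothetical grammar. A filter passed alongside it is ANDed with any
  // metadataFilters declared in the grammar itself (see RuleReader.joinFilters).
  val grammar =
    """
      |rules:
      |  - name: example-rule
      |    type: basic
      |    label: Example
      |    pattern: |
      |      [lemma=eat]
    """.stripMargin

  val extractors = ee.compileRuleString(grammar, filter)
  val mentions = ee.extractMentions(extractors).toSeq
  println(s"found ${mentions.size} mentions in filtered documents")
}

The same AND-combination applies across imports: a filter declared in an importing grammar propagates into the imported grammar and is joined with the filters declared there, which is what the master.yml and master2.yml test resources above exercise.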