From ae5267203f50c4ac03a69259871e47b854efdfc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Wed, 11 Dec 2019 11:00:37 +0100
Subject: [PATCH] Corpus: add title attribute selection dropdown

Move the title heuristic out of Corpus.titles and into the Corpus widget.

The widget gets a "Title variable" combo box that lists the string
variables of the loaded corpus. When a file is opened the widget preselects
a variable: one already marked with the `title` attribute, else one named
title, heading or filename (compared case-insensitively), else one chosen
by uniqueness and average text length. Before the corpus is sent, the
selected variable is marked with `attributes["title"] = True` and the mark
is removed from all other variables.

Corpus.titles now only honours variables explicitly marked as title.
---
 orangecontrib/text/corpus.py                  |  10 +-
 orangecontrib/text/tests/test_corpus.py       |   7 -
 orangecontrib/text/widgets/owcorpus.py        |  90 +++++++++-
 .../text/widgets/tests/test_owcorpus.py       | 180 ++++++++++++++++++
 4 files changed, 269 insertions(+), 18 deletions(-)
 create mode 100644 orangecontrib/text/widgets/tests/test_owcorpus.py

diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 058b165fc..0600170e6 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -197,15 +197,7 @@ def titles(self):
         """ Returns a list of titles. """
         attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
                  if attr.attributes.get('title', False)]
-        # Alternatively, use heuristics
-        if not attrs:
-            for var in sorted(chain(self.domain.metas, self.domain.variables),
-                              key=lambda var: var.name,
-                              reverse=True):  # reverse so that title < heading < filename
-                if var.name.lower() in ('title', 'heading', 'h1', 'filename') \
-                        and not var.attributes.get('hidden', False):  # skip BoW features
-                    attrs = [var]
-                    break
+
         if attrs:
             return self.documents_from_features(attrs)
         else:
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index 0d98c62a7..edf6d05c3 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -163,13 +163,6 @@ def test_titles(self):
         for title in titles:
             self.assertIn('Document ', title)
 
-        # inferred title from heuristics
-        expected = list(map(str, range(len(c))))
-        c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
-                    None, None, np.c_[expected])
-        titles = c2.titles
-        self.assertEqual(titles, expected)
-
         # title feature set
         c.domain[0].attributes['title'] = True
         titles = c.titles
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 74b45c6a8..288c2d28e 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -1,9 +1,10 @@
 import os
+import numpy as np
 
-from Orange.data import Table
+from Orange.data import Table, StringVariable, Variable
 from Orange.data.io import FileFormat
 from Orange.widgets import gui
-from Orange.widgets.utils.itemmodels import VariableListModel
+from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
 from Orange.widgets.data.owselectcolumns import VariablesListItemView
 from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
 from Orange.widgets.widget import OWWidget, Msg, Input, Output
@@ -46,6 +47,7 @@ class Outputs:
         "andersen.tab",
     ])
     used_attrs = ContextSetting([])
+    title_variable = ContextSetting("")
 
     class Error(OWWidget.Error):
         read_file = Msg("Can't read file {} ({})")
@@ -73,6 +75,15 @@ def __init__(self):
         self.info_label = gui.label(ibox, self, "")
         self.update_info()
 
+        # dropdown to select title variable
+        self.title_model = DomainModel(
+            valid_types=(StringVariable,), placeholder="(no title)")
+        gui.comboBox(
+            self.controlArea, self, "title_variable",
+            box="Title variable", model=self.title_model,
+            callback=self.update_feature_selection
+        )
+
         # Used Text Features
         fbox = gui.widgetBox(self.controlArea, orientation=0)
         ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
@@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
             return
 
         self.update_info()
+        self._setup_title_dropdown()
         self.used_attrs = list(self.corpus.text_features)
         if not self.corpus.text_features:
             self.Error.corpus_without_text_features()
@@ -149,6 +161,66 @@ def open_file(self, path=None, data=None):
             [f for f in self.corpus.domain.metas
              if f.is_string and f not in self.used_attrs_model])
 
+    def _setup_title_dropdown(self):
+        """
+        Fill the title combo box and preselect a title variable.
+
+        A meta variable already marked with the ``title`` attribute is kept.
+        Otherwise prefer a variable named title, heading or filename, then
+        the most unique variable whose average text length is at most 30,
+        then the variable with the shortest average text length below 100.
+        """
+        self.title_model.set_domain(self.corpus.domain)
+
+        # if title variable is already marked in a dataset set it as a title
+        # variable
+        title_var = list(filter(
+            lambda x: x.attributes.get("title", False),
+            self.corpus.domain.metas))
+        if title_var:
+            self.title_variable = title_var[0]
+            return
+
+        # if no variable is marked as title, use a heuristic to select one
+        # otypes lets vectorize handle empty columns (it cannot infer the
+        # output type when there is no first element to call len on)
+        v_len = np.vectorize(len, otypes=[int])
+        first_selection = (None, 0)  # value, uniqueness
+        second_selection = (None, 100)  # value, avg text length
+
+        variables = [v for v in self.title_model
+                     if v is not None and isinstance(v, Variable)]
+
+        for variable in sorted(
+                variables, key=lambda var: var.name.lower(), reverse=True):
+            # if there is a title, heading, or filename attribute in corpus
+            # the heuristic selects it - in order title > heading > filename;
+            # hence the reverse, case-insensitive sort by name
+            if str(variable).lower() in ('title', 'heading', 'filename'):
+                first_selection = (variable, 0)
+                break
+
+            # otherwise uniqueness and length counts
+            column_values = self.corpus.get_column_view(variable)[0]
+            average_text_length = v_len(column_values).mean()
+            uniqueness = len(np.unique(column_values))
+
+            # if the variable is short enough to be a title select one with
+            # the highest number of unique values
+            if uniqueness > first_selection[1] and average_text_length <= 30:
+                first_selection = (variable, uniqueness)
+            # else select the variable with shortest average text that is
+            # shorter than 100 (if all longer than 100 leave empty)
+            elif average_text_length < second_selection[1]:
+                second_selection = (variable, average_text_length)
+
+        if first_selection[0] is not None:
+            self.title_variable = first_selection[0]
+        elif second_selection[0] is not None:
+            self.title_variable = second_selection[0]
+        else:
+            self.title_variable = None
+
     def update_info(self):
         def describe(corpus):
             dom = corpus.domain
@@ -194,6 +266,7 @@ def remove_duplicates(l):
         if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
             self.Error.no_text_features_used()
 
+        self._set_title_attribute()
         # prevent sending "empty" corpora
         dom = self.corpus.domain
         empty = not (dom.variables or dom.metas) \
@@ -201,6 +274,19 @@ def remove_duplicates(l):
             or not self.corpus.text_features
         self.Outputs.corpus.send(self.corpus if not empty else None)
 
+    def _set_title_attribute(self):
+        """
+        Mark the selected title variable with ``attributes["title"] = True``
+        and remove the mark from every other variable of the corpus domain.
+        """
+        # remove all title attributes
+        for a in self.corpus.domain.variables + self.corpus.domain.metas:
+            a.attributes.pop("title", None)
+
+        if self.title_variable and self.title_variable in self.corpus.domain:
+            self.corpus.domain[
+                self.title_variable].attributes["title"] = True
+
     def send_report(self):
         def describe(features):
             if len(features):
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
new file mode 100644
index 000000000..678e8fd3c
--- /dev/null
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -0,0 +1,180 @@
+import numpy as np
+from Orange.data import Table, Domain, StringVariable
+from Orange.widgets.tests.base import WidgetTest
+
+from orangecontrib.text import Corpus
+from orangecontrib.text.widgets.owcorpus import OWCorpus
+
+
+class TestOWCorpus(WidgetTest):
+    def setUp(self):
+        self.widget = self.create_widget(OWCorpus)
+
+    def check_output(self, sel_title):
+        """
+        This function checks whether the `sel_title` variable is marked as a
+        title in the output
+        """
+        output = self.get_output(self.widget.Outputs.corpus)
+        for attr in output.domain.variables + output.domain.metas:
+            if str(attr) == sel_title:
+                # sel_title attribute must be marked as a title
+                self.assertTrue(attr.attributes.get("title", False))
+            else:
+                # others must not be marked as a title
+                self.assertFalse(attr.attributes.get("title", False))
+
+    def test_title_combo(self):
+        # default corpus dataset
+        self.assertEqual(self.widget.corpus.name, "book-excerpts")
+
+        options = self.widget.title_model[:]
+        self.assertIn(self.widget.corpus.domain["Text"], options)
+        # for this dataset no title variable is selected
+        self.assertEqual(None, self.widget.title_variable)
+        self.check_output(None)
+
+    def test_title_already_in_dataset(self):
+        """
+        This dataset already has the title attribute so the title option
+        is set to this attribute by default
+        """
+        # dataset with a variable already marked as title
+        data = Corpus.from_file("election-tweets-2016")
+        self.send_signal(self.widget.Inputs.data, data)
+
+        self.assertEqual(data.domain["Content"], self.widget.title_variable)
+        self.check_output("Content")
+
+    def test_title_selection_strategy_title_heading(self):
+        """
+        When there is a title, heading, or filename attribute, select this one
+        as a default title.
+        """
+        data = Table(
+            Domain([], metas=[StringVariable("title"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["title"], self.widget.title_variable)
+        self.check_output("title")
+
+        data = Table(
+            Domain([], metas=[StringVariable("Title"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        # when title and heading present first select title
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Title"),
+                StringVariable("Heading"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Heading"),
+                StringVariable("Title"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Heading"),
+                StringVariable("Filename"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Heading"], self.widget.title_variable)
+        self.check_output("Heading")
+
+        # priority must not depend on the capitalization of the names
+        data = Table(
+            Domain([], metas=[
+                StringVariable("heading"),
+                StringVariable("Title"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+    def test_title_selection_strategy(self):
+        """
+        With this test we test whether the selection strategy for a title
+        attribute works correctly
+        """
+        # select the most unique
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b")]),
+            np.empty((3, 0)),
+            metas=[["a" * 10, "a" * 10],
+                   ["a" * 10, "b" * 10],
+                   ["a" * 10, "c" * 10]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["b"], self.widget.title_variable)
+        self.check_output("b")
+
+        # select the most unique that is also short enough; attribute a is not
+        # suitable since its values are too long, and c is more unique than b
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 10, "a" * 10],
+                   ["b" * 100, "a" * 10, "b" * 10],
+                   ["c" * 100, "a" * 10, "b" * 10]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["c"], self.widget.title_variable)
+        self.check_output("c")
+
+        # when no variable is short enough we just select the shortest
+        # attribute
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["c"], self.widget.title_variable)
+        self.check_output("c")
+