From ae5267203f50c4ac03a69259871e47b854efdfc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?=
Date: Wed, 11 Dec 2019 11:00:37 +0100
Subject: [PATCH] Corpus: add title attribute selection dropdown
---
orangecontrib/text/corpus.py | 10 +-
orangecontrib/text/tests/test_corpus.py | 7 -
orangecontrib/text/widgets/owcorpus.py | 76 +++++++-
.../text/widgets/tests/test_owcorpus.py | 165 ++++++++++++++++++
4 files changed, 240 insertions(+), 18 deletions(-)
create mode 100644 orangecontrib/text/widgets/tests/test_owcorpus.py
diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 058b165fc..0600170e6 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -197,15 +197,7 @@ def titles(self):
""" Returns a list of titles. """
attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
if attr.attributes.get('title', False)]
- # Alternatively, use heuristics
- if not attrs:
- for var in sorted(chain(self.domain.metas, self.domain.variables),
- key=lambda var: var.name,
- reverse=True): # reverse so that title < heading < filename
- if var.name.lower() in ('title', 'heading', 'h1', 'filename') \
- and not var.attributes.get('hidden', False): # skip BoW features
- attrs = [var]
- break
+
if attrs:
return self.documents_from_features(attrs)
else:
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index 0d98c62a7..edf6d05c3 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -163,13 +163,6 @@ def test_titles(self):
for title in titles:
self.assertIn('Document ', title)
- # inferred title from heuristics
- expected = list(map(str, range(len(c))))
- c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
- None, None, np.c_[expected])
- titles = c2.titles
- self.assertEqual(titles, expected)
-
# title feature set
c.domain[0].attributes['title'] = True
titles = c.titles
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 74b45c6a8..288c2d28e 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -1,9 +1,10 @@
import os
+import numpy as np
-from Orange.data import Table
+from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
-from Orange.widgets.utils.itemmodels import VariableListModel
+from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
from Orange.widgets.data.owselectcolumns import VariablesListItemView
from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
from Orange.widgets.widget import OWWidget, Msg, Input, Output
@@ -46,6 +47,7 @@ class Outputs:
"andersen.tab",
])
used_attrs = ContextSetting([])
+ title_variable = ContextSetting("")
class Error(OWWidget.Error):
read_file = Msg("Can't read file {} ({})")
@@ -73,6 +75,15 @@ def __init__(self):
self.info_label = gui.label(ibox, self, "")
self.update_info()
+ # dropdown to select title variable
+ self.title_model = DomainModel(
+ valid_types=(StringVariable,), placeholder="(no title)")
+ gui.comboBox(
+ self.controlArea, self, "title_variable",
+ box="Title variable", model=self.title_model,
+ callback=self.update_feature_selection
+ )
+
# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
@@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
return
self.update_info()
+ self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
self.Error.corpus_without_text_features()
@@ -149,6 +161,56 @@ def open_file(self, path=None, data=None):
[f for f in self.corpus.domain.metas
if f.is_string and f not in self.used_attrs_model])
+ def _setup_title_dropdown(self):
+ self.title_model.set_domain(self.corpus.domain)
+
+ # if a variable is already marked as title in the dataset, use it
+ # as the title variable
+ title_var = list(filter(
+ lambda x: x.attributes.get("title", False),
+ self.corpus.domain.metas))
+ if title_var:
+ self.title_variable = title_var[0]
+ return
+
+ # if no variable is marked as title, use a heuristic to select one
+ v_len = np.vectorize(len)
+ first_selection = (None, 0) # value, uniqueness
+ second_selection = (None, 100) # value, avg text length
+
+ variables = [v for v in self.title_model
+ if v is not None and isinstance(v, Variable)]
+
+ for variable in sorted(
+ variables, key=lambda var: var.name, reverse=True):
+ # if the corpus has a title, heading, or filename attribute the
+ # heuristic selects it, preferring title > heading > filename;
+ # hence the reverse sort by name (NOTE: sort is case-sensitive)
+ if str(variable).lower() in ('title', 'heading', 'filename'):
+ first_selection = (variable, 0)
+ break
+
+ # otherwise uniqueness and length counts
+ column_values = self.corpus.get_column_view(variable)[0]
+ average_text_length = v_len(column_values).mean()
+ uniqueness = len(np.unique(column_values))
+
+ # if the variable is short enough to be a title select one with
+ # the highest number of unique values
+ if uniqueness > first_selection[1] and average_text_length <= 30:
+ first_selection = (variable, uniqueness)
+ # else select the variable with shortest average text that is
+ # shorter than 100 (if all longer than 100 leave empty)
+ elif average_text_length < second_selection[1]:
+ second_selection = (variable, average_text_length)
+
+ if first_selection[0] is not None:
+ self.title_variable = first_selection[0]
+ elif second_selection[0] is not None:
+ self.title_variable = second_selection[0]
+ else:
+ self.title_variable = None
+
def update_info(self):
def describe(corpus):
dom = corpus.domain
@@ -194,6 +256,7 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()
+ self._set_title_attribute()
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
@@ -201,6 +264,15 @@ def remove_duplicates(l):
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)
+ def _set_title_attribute(self):
+ # remove all title attributes
+ for a in self.corpus.domain.variables + self.corpus.domain.metas:
+ a.attributes.pop("title", None)
+
+ if self.title_variable and self.title_variable in self.corpus.domain:
+ self.corpus.domain[
+ self.title_variable].attributes["title"] = True
+
def send_report(self):
def describe(features):
if len(features):
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
new file mode 100644
index 000000000..678e8fd3c
--- /dev/null
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -0,0 +1,165 @@
+import numpy as np
+from Orange.data import Table, Domain, StringVariable
+from Orange.widgets.tests.base import WidgetTest
+
+from orangecontrib.text import Corpus
+from orangecontrib.text.widgets.owcorpus import OWCorpus
+
+
+class TestOWCorpus(WidgetTest):
+ def setUp(self):
+ self.widget = self.create_widget(OWCorpus)
+
+ def check_output(self, sel_title):
+ """
+ Check that the variable named `sel_title` is the only one marked as
+ a title in the output
+ """
+ output = self.get_output(self.widget.Outputs.corpus)
+ for attr in output.domain.variables + output.domain.metas:
+ if str(attr) == sel_title:
+ # sel_title attribute must be marked as a title
+ self.assertTrue(attr.attributes.get("title", False))
+ else:
+ # others must not be marked as a title
+ self.assertFalse(attr.attributes.get("title", False))
+
+ def test_title_combo(self):
+ # default corpus dataset
+ self.assertEqual(self.widget.corpus.name, "book-excerpts")
+
+ options = self.widget.title_model[:]
+ self.assertIn(self.widget.corpus.domain["Text"], options)
+ # for this dataset no title variable is selected
+ self.assertEqual(None, self.widget.title_variable)
+ self.check_output(None)
+
+ def test_title_already_in_dataset(self):
+ """
+ This dataset already has the title attribute so the title option
+ is set to this attribute by default
+ """
+ # dataset in which a variable is already marked as title
+ data = Corpus.from_file("election-tweets-2016")
+ self.send_signal(self.widget.Inputs.data, data)
+
+ self.assertEqual(data.domain["Content"], self.widget.title_variable)
+ self.check_output("Content")
+
+ def test_title_selection_strategy_title_heading(self):
+ """
+ When there is a title, heading, or filename attribute, select it
+ as the default title.
+ """
+ data = Table(
+ Domain([], metas=[StringVariable("title"), StringVariable("b"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["title"], self.widget.title_variable)
+ self.check_output("title")
+
+ data = Table(
+ Domain([], metas=[StringVariable("Title"), StringVariable("b"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["Title"], self.widget.title_variable)
+ self.check_output("Title")
+
+ # when title and heading present first select title
+ data = Table(
+ Domain([], metas=[
+ StringVariable("Title"),
+ StringVariable("Heading"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["Title"], self.widget.title_variable)
+ self.check_output("Title")
+
+ data = Table(
+ Domain([], metas=[
+ StringVariable("Heading"),
+ StringVariable("Title"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["Title"], self.widget.title_variable)
+ self.check_output("Title")
+
+ data = Table(
+ Domain([], metas=[
+ StringVariable("Heading"),
+ StringVariable("Filename"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["Heading"], self.widget.title_variable)
+ self.check_output("Heading")
+
+ def test_title_selection_strategy(self):
+ """
+ With this test we test whether the selection strategy for a title
+ attribute works correctly
+ """
+ # select the most unique
+ data = Table(
+ Domain([], metas=[StringVariable("a"), StringVariable("b")]),
+ np.empty((3, 0)),
+ metas=[["a" * 10, "a" * 10],
+ ["a" * 10, "b" * 10],
+ ["a" * 10, "c" * 10]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["b"], self.widget.title_variable)
+ self.check_output("b")
+
+ # select the most unique variable that is also short enough; a is not
+ # suitable since its values are too long, and c is more unique than b
+ data = Table(
+ Domain([], metas=[StringVariable("a"), StringVariable("b"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 10, "a" * 10],
+ ["b" * 100, "a" * 10, "b" * 10],
+ ["c" * 100, "a" * 10, "b" * 10]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["c"], self.widget.title_variable)
+ self.check_output("c")
+
+ # when no variable is short enough we just select the shortest
+ # attribute
+ data = Table(
+ Domain([], metas=[StringVariable("a"), StringVariable("b"),
+ StringVariable("c")]),
+ np.empty((3, 0)),
+ metas=[["a" * 100, "a" * 40, "a" * 40],
+ ["b" * 100, "a" * 40, "b" * 30],
+ ["c" * 100, "a" * 40, "b" * 40]]
+ )
+ self.send_signal(self.widget.Inputs.data, data)
+ self.assertEqual(data.domain["c"], self.widget.title_variable)
+ self.check_output("c")
+