From ae5267203f50c4ac03a69259871e47b854efdfc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Primo=C5=BE=20Godec?= <p.godec9@gmail.com>
Date: Wed, 11 Dec 2019 11:00:37 +0100
Subject: [PATCH] Corpus: add title attribute selection dropdown

---
 orangecontrib/text/corpus.py                  |  10 +-
 orangecontrib/text/tests/test_corpus.py       |   7 -
 orangecontrib/text/widgets/owcorpus.py        |  76 +++++++-
 .../text/widgets/tests/test_owcorpus.py       | 165 ++++++++++++++++++
 4 files changed, 240 insertions(+), 18 deletions(-)
 create mode 100644 orangecontrib/text/widgets/tests/test_owcorpus.py

diff --git a/orangecontrib/text/corpus.py b/orangecontrib/text/corpus.py
index 058b165fc..0600170e6 100644
--- a/orangecontrib/text/corpus.py
+++ b/orangecontrib/text/corpus.py
@@ -197,15 +197,7 @@ def titles(self):
         """ Returns a list of titles. """
         attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
                  if attr.attributes.get('title', False)]
-        # Alternatively, use heuristics
-        if not attrs:
-            for var in sorted(chain(self.domain.metas, self.domain.variables),
-                              key=lambda var: var.name,
-                              reverse=True):  # reverse so that title < heading < filename
-                if var.name.lower() in ('title', 'heading', 'h1', 'filename') \
-                        and not var.attributes.get('hidden', False):    # skip BoW features
-                    attrs = [var]
-                    break
+
         if attrs:
             return self.documents_from_features(attrs)
         else:
diff --git a/orangecontrib/text/tests/test_corpus.py b/orangecontrib/text/tests/test_corpus.py
index 0d98c62a7..edf6d05c3 100644
--- a/orangecontrib/text/tests/test_corpus.py
+++ b/orangecontrib/text/tests/test_corpus.py
@@ -163,13 +163,6 @@ def test_titles(self):
         for title in titles:
             self.assertIn('Document ', title)
 
-        # inferred title from heuristics
-        expected = list(map(str, range(len(c))))
-        c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
-                    None, None, np.c_[expected])
-        titles = c2.titles
-        self.assertEqual(titles, expected)
-
         # title feature set
         c.domain[0].attributes['title'] = True
         titles = c.titles
diff --git a/orangecontrib/text/widgets/owcorpus.py b/orangecontrib/text/widgets/owcorpus.py
index 74b45c6a8..288c2d28e 100644
--- a/orangecontrib/text/widgets/owcorpus.py
+++ b/orangecontrib/text/widgets/owcorpus.py
@@ -1,9 +1,10 @@
 import os
+import numpy as np
 
-from Orange.data import Table
+from Orange.data import Table, StringVariable, Variable
 from Orange.data.io import FileFormat
 from Orange.widgets import gui
-from Orange.widgets.utils.itemmodels import VariableListModel
+from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
 from Orange.widgets.data.owselectcolumns import VariablesListItemView
 from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
 from Orange.widgets.widget import OWWidget, Msg, Input, Output
@@ -46,6 +47,7 @@ class Outputs:
         "andersen.tab",
     ])
     used_attrs = ContextSetting([])
+    title_variable = ContextSetting("")
 
     class Error(OWWidget.Error):
         read_file = Msg("Can't read file {} ({})")
@@ -73,6 +75,15 @@ def __init__(self):
         self.info_label = gui.label(ibox, self, "")
         self.update_info()
 
+        # dropdown to select title variable
+        self.title_model = DomainModel(
+            valid_types=(StringVariable,), placeholder="(no title)")
+        gui.comboBox(
+            self.controlArea, self, "title_variable",
+            box="Title variable", model=self.title_model,
+            callback=self.update_feature_selection
+        )
+
         # Used Text Features
         fbox = gui.widgetBox(self.controlArea, orientation=0)
         ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
@@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
             return
 
         self.update_info()
+        self._setup_title_dropdown()
         self.used_attrs = list(self.corpus.text_features)
         if not self.corpus.text_features:
             self.Error.corpus_without_text_features()
@@ -149,6 +161,56 @@ def open_file(self, path=None, data=None):
             [f for f in self.corpus.domain.metas
              if f.is_string and f not in self.used_attrs_model])
 
+    def _setup_title_dropdown(self):
+        """Fill the title combo from the corpus domain and pick a default.
+
+        A meta already flagged with the "title" attribute wins. Otherwise
+        a variable named title/heading/filename is used, then the short
+        (average length <= 30) variable with the most unique values, then
+        the shortest one averaging under 100 characters, else no title.
+        """
+        self.title_model.set_domain(self.corpus.domain)
+
+        # a variable already marked as the title in the dataset is kept
+        title_var = [var for var in self.corpus.domain.metas
+                     if var.attributes.get("title", False)]
+        if title_var:
+            self.title_variable = title_var[0]
+            return
+
+        # no marked title: fall back to the heuristic
+        v_len = np.vectorize(len)
+        first_selection = (None, 0)  # value, uniqueness
+        second_selection = (None, 100)  # value, avg text length
+
+        variables = [v for v in self.title_model
+                     if v is not None and isinstance(v, Variable)]
+
+        # Reverse order of lower-cased names yields title > heading >
+        # filename; lower-casing keeps "Title" ahead of "heading", which a
+        # case-sensitive sort would not.
+        for variable in sorted(
+                variables, key=lambda var: var.name.lower(), reverse=True):
+            if str(variable).lower() in ('title', 'heading', 'filename'):
+                first_selection = (variable, 0)
+                break
+
+            # otherwise uniqueness and length count
+            column_values = self.corpus.get_column_view(variable)[0]
+            average_text_length = v_len(column_values).mean()
+            uniqueness = len(np.unique(column_values))
+
+            # short enough for a title: keep the one with most unique values
+            if uniqueness > first_selection[1] and average_text_length <= 30:
+                first_selection = (variable, uniqueness)
+            # else keep the shortest one averaging under 100 characters
+            elif average_text_length < second_selection[1]:
+                second_selection = (variable, average_text_length)
+
+        self.title_variable = (
+            first_selection[0] if first_selection[0] is not None
+            else second_selection[0])
+
     def update_info(self):
         def describe(corpus):
             dom = corpus.domain
@@ -194,6 +256,7 @@ def remove_duplicates(l):
             if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
                 self.Error.no_text_features_used()
 
+            self._set_title_attribute()
             # prevent sending "empty" corpora
             dom = self.corpus.domain
             empty = not (dom.variables or dom.metas) \
@@ -201,6 +264,15 @@ def remove_duplicates(l):
                 or not self.corpus.text_features
             self.Outputs.corpus.send(self.corpus if not empty else None)
 
+    def _set_title_attribute(self):
+        """Flag only the selected title variable with "title" = True."""
+        domain = self.corpus.domain
+        for var in domain.variables + domain.metas:
+            var.attributes.pop("title", None)  # clear stale title flags
+        selected = self.title_variable
+        if selected and selected in domain:
+            domain[selected].attributes["title"] = True
+
     def send_report(self):
         def describe(features):
             if len(features):
diff --git a/orangecontrib/text/widgets/tests/test_owcorpus.py b/orangecontrib/text/widgets/tests/test_owcorpus.py
new file mode 100644
index 000000000..678e8fd3c
--- /dev/null
+++ b/orangecontrib/text/widgets/tests/test_owcorpus.py
@@ -0,0 +1,165 @@
+import numpy as np
+from Orange.data import Table, Domain, StringVariable
+from Orange.widgets.tests.base import WidgetTest
+
+from orangecontrib.text import Corpus
+from orangecontrib.text.widgets.owcorpus import OWCorpus
+
+
+class TestOWCorpus(WidgetTest):
+    def setUp(self):
+        self.widget = self.create_widget(OWCorpus)
+
+    def check_output(self, sel_title):
+        """
+        Check that the variable named `sel_title` is the only one marked
+        as a title in the output (none may be marked when it is None).
+        """
+        output = self.get_output(self.widget.Outputs.corpus)
+        for attr in output.domain.variables + output.domain.metas:
+            if str(attr) == sel_title:
+                # sel_title attribute must be marked as a title
+                self.assertTrue(attr.attributes.get("title", False))
+            else:
+                # others must not be marked as a title
+                self.assertFalse(attr.attributes.get("title", False))
+
+    def test_title_combo(self):
+        # default corpus dataset
+        self.assertEqual(self.widget.corpus.name, "book-excerpts")
+
+        options = self.widget.title_model[:]
+        self.assertIn(self.widget.corpus.domain["Text"], options)
+        # for this dataset no title variable is selected
+        self.assertEqual(None, self.widget.title_variable)
+        self.check_output(None)
+
+    def test_title_already_in_dataset(self):
+        """
+        This dataset already has a title attribute, so the title option
+        is set to that attribute by default
+        """
+        # corpus that already carries a title attribute
+        data = Corpus.from_file("election-tweets-2016")
+        self.send_signal(self.widget.Inputs.data, data)
+
+        self.assertEqual(data.domain["Content"], self.widget.title_variable)
+        self.check_output("Content")
+
+    def test_title_selection_strategy_title_heading(self):
+        """
+        When there is a title, heading, or filename attribute, select it
+        as the default title.
+        """
+        data = Table(
+            Domain([], metas=[StringVariable("title"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["title"], self.widget.title_variable)
+        self.check_output("title")
+
+        data = Table(
+            Domain([], metas=[StringVariable("Title"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        # when both title and heading are present, title is selected first
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Title"),
+                StringVariable("Heading"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Heading"),
+                StringVariable("Title"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Title"], self.widget.title_variable)
+        self.check_output("Title")
+
+        data = Table(
+            Domain([], metas=[
+                StringVariable("Heading"),
+                StringVariable("Filename"),
+                StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["Heading"], self.widget.title_variable)
+        self.check_output("Heading")
+
+    def test_title_selection_strategy(self):
+        """
+        Test that the selection strategy for the title attribute
+        works correctly
+        """
+        # select the most unique
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b")]),
+            np.empty((3, 0)),
+            metas=[["a" * 10, "a" * 10],
+                   ["a" * 10, "b" * 10],
+                   ["a" * 10, "c" * 10]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["b"], self.widget.title_variable)
+        self.check_output("b")
+
+        # select the most unique variable that is also short enough; a is not
+        # suitable since its values are too long, and c is more unique than b
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 10, "a" * 10],
+                   ["b" * 100, "a" * 10, "b" * 10],
+                   ["c" * 100, "a" * 10, "b" * 10]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["c"], self.widget.title_variable)
+        self.check_output("c")
+
+        # when no variable is short enough we just select the shortest
+        # attribute
+        data = Table(
+            Domain([], metas=[StringVariable("a"), StringVariable("b"),
+                              StringVariable("c")]),
+            np.empty((3, 0)),
+            metas=[["a" * 100, "a" * 40, "a" * 40],
+                   ["b" * 100, "a" * 40, "b" * 30],
+                   ["c" * 100, "a" * 40, "b" * 40]]
+        )
+        self.send_signal(self.widget.Inputs.data, data)
+        self.assertEqual(data.domain["c"], self.widget.title_variable)
+        self.check_output("c")
+