Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Corpus: add title attribute selection dropdown #481

Merged
merged 1 commit into from
Dec 17, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions orangecontrib/text/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,7 @@ def titles(self):
""" Returns a list of titles. """
attrs = [attr for attr in chain(self.domain.variables, self.domain.metas)
if attr.attributes.get('title', False)]
# Alternatively, use heuristics
if not attrs:
for var in sorted(chain(self.domain.metas, self.domain.variables),
key=lambda var: var.name,
reverse=True): # reverse so that title < heading < filename
if var.name.lower() in ('title', 'heading', 'h1', 'filename') \
and not var.attributes.get('hidden', False): # skip BoW features
attrs = [var]
break

if attrs:
return self.documents_from_features(attrs)
else:
Expand Down
7 changes: 0 additions & 7 deletions orangecontrib/text/tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,13 +163,6 @@ def test_titles(self):
for title in titles:
self.assertIn('Document ', title)

# inferred title from heuristics
expected = list(map(str, range(len(c))))
c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
None, None, np.c_[expected])
titles = c2.titles
self.assertEqual(titles, expected)

# title feature set
c.domain[0].attributes['title'] = True
titles = c.titles
Expand Down
76 changes: 74 additions & 2 deletions orangecontrib/text/widgets/owcorpus.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import numpy as np

from Orange.data import Table
from Orange.data import Table, StringVariable, Variable
from Orange.data.io import FileFormat
from Orange.widgets import gui
from Orange.widgets.utils.itemmodels import VariableListModel
from Orange.widgets.utils.itemmodels import VariableListModel, DomainModel
from Orange.widgets.data.owselectcolumns import VariablesListItemView
from Orange.widgets.settings import Setting, ContextSetting, PerfectDomainContextHandler
from Orange.widgets.widget import OWWidget, Msg, Input, Output
Expand Down Expand Up @@ -46,6 +47,7 @@ class Outputs:
"andersen.tab",
])
used_attrs = ContextSetting([])
title_variable = ContextSetting("")

class Error(OWWidget.Error):
read_file = Msg("Can't read file {} ({})")
Expand Down Expand Up @@ -73,6 +75,15 @@ def __init__(self):
self.info_label = gui.label(ibox, self, "")
self.update_info()

# dropdown to select title variable
self.title_model = DomainModel(
valid_types=(StringVariable,), placeholder="(no title)")
gui.comboBox(
self.controlArea, self, "title_variable",
box="Title variable", model=self.title_model,
callback=self.update_feature_selection
)

# Used Text Features
fbox = gui.widgetBox(self.controlArea, orientation=0)
ubox = gui.widgetBox(fbox, "Used text features", addSpace=False)
Expand Down Expand Up @@ -138,6 +149,7 @@ def open_file(self, path=None, data=None):
return

self.update_info()
self._setup_title_dropdown()
self.used_attrs = list(self.corpus.text_features)
if not self.corpus.text_features:
self.Error.corpus_without_text_features()
Expand All @@ -149,6 +161,56 @@ def open_file(self, path=None, data=None):
[f for f in self.corpus.domain.metas
if f.is_string and f not in self.used_attrs_model])

def _setup_title_dropdown(self):
    """
    Fill the title dropdown from the corpus domain and pre-select a title.

    The title variable is chosen in this order:

    1. a meta variable that already carries a truthy ``title`` attribute;
    2. a variable named title, heading or filename (compared
       case-insensitively, preferred in that order);
    3. among variables whose average text length is at most 30
       characters, the one with the most unique values;
    4. otherwise the variable with the shortest average text length,
       provided that length is below 100 characters;
    5. otherwise no title variable (``self.title_variable = None``).
    """
    self.title_model.set_domain(self.corpus.domain)

    # a variable already marked as title in the dataset always wins
    marked = [var for var in self.corpus.domain.metas
              if var.attributes.get("title", False)]
    if marked:
        self.title_variable = marked[0]
        return

    # the model may contain entries that are not variables (e.g. the
    # placeholder), so keep only real variables
    variables = [v for v in self.title_model
                 if v is not None and isinstance(v, Variable)]
    # reverse name order decides ties in the heuristics below
    candidates = sorted(
        variables, key=lambda var: var.name, reverse=True)

    # Name-based choice. Names are lower-cased and checked against an
    # explicit priority list; relying on a case-sensitive sort for the
    # priority would let e.g. "heading" win over "Title".
    by_lowered_name = {}
    for var in candidates:
        by_lowered_name.setdefault(str(var).lower(), var)
    for preferred in ('title', 'heading', 'filename'):
        if preferred in by_lowered_name:
            self.title_variable = by_lowered_name[preferred]
            return

    # Text statistics are undefined without documents and np.vectorize
    # raises on size-0 input, so leave the title unset in that case.
    if len(self.corpus) == 0:
        self.title_variable = None
        return

    v_len = np.vectorize(len)
    most_unique = (None, 0)  # variable, number of unique values
    shortest = (None, 100)  # variable, average text length

    for var in candidates:
        column_values = self.corpus.get_column_view(var)[0]
        average_text_length = v_len(column_values).mean()
        uniqueness = len(np.unique(column_values))

        # if the variable is short enough to be a title prefer the one
        # with the highest number of unique values
        if uniqueness > most_unique[1] and average_text_length <= 30:
            most_unique = (var, uniqueness)
        # else remember the variable with the shortest average text that
        # is shorter than 100 (if all are longer nothing is remembered)
        elif average_text_length < shortest[1]:
            shortest = (var, average_text_length)

    if most_unique[0] is not None:
        self.title_variable = most_unique[0]
    else:
        # shortest[0] stays None when no variable qualified
        self.title_variable = shortest[0]

def update_info(self):
def describe(corpus):
dom = corpus.domain
Expand Down Expand Up @@ -194,13 +256,23 @@ def remove_duplicates(l):
if len(self.unused_attrs_model) > 0 and not self.corpus.text_features:
self.Error.no_text_features_used()

self._set_title_attribute()
# prevent sending "empty" corpora
dom = self.corpus.domain
empty = not (dom.variables or dom.metas) \
or len(self.corpus) == 0 \
or not self.corpus.text_features
self.Outputs.corpus.send(self.corpus if not empty else None)

def _set_title_attribute(self):
# remove all title attributes
for a in self.corpus.domain.variables + self.corpus.domain.metas:
a.attributes.pop("title", None)

if self.title_variable and self.title_variable in self.corpus.domain:
self.corpus.domain[
self.title_variable].attributes["title"] = True

def send_report(self):
def describe(features):
if len(features):
Expand Down
165 changes: 165 additions & 0 deletions orangecontrib/text/widgets/tests/test_owcorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import numpy as np
from Orange.data import Table, Domain, StringVariable
from Orange.widgets.tests.base import WidgetTest

from orangecontrib.text import Corpus
from orangecontrib.text.widgets.owcorpus import OWCorpus


class TestOWCorpus(WidgetTest):
    """Tests for the title variable selection of the Corpus widget."""

    def setUp(self):
        self.widget = self.create_widget(OWCorpus)

    @staticmethod
    def _meta_table(names, rows):
        """
        Build a table without features whose metas are string variables
        called `names` and hold the texts given in `rows`.
        """
        domain = Domain([], metas=[StringVariable(name) for name in names])
        return Table(domain, np.empty((len(rows), 0)), metas=rows)

    @staticmethod
    def _long_text_rows():
        """
        Three rows in which every column has an average text length
        above 30 characters; the last column is the shortest on average.
        """
        return [["a" * 100, "a" * 40, "a" * 40],
                ["b" * 100, "a" * 40, "b" * 30],
                ["c" * 100, "a" * 40, "b" * 40]]

    def _send_and_check(self, data, expected):
        """
        Send `data` to the widget and verify that the variable named
        `expected` is selected as the title and marked in the output.
        """
        self.send_signal(self.widget.Inputs.data, data)
        self.assertEqual(data.domain[expected], self.widget.title_variable)
        self.check_output(expected)

    def check_output(self, sel_title):
        """
        Check that in the output only the variable named `sel_title` is
        marked with a true `title` attribute.
        """
        output = self.get_output(self.widget.Outputs.corpus)
        for attr in output.domain.variables + output.domain.metas:
            is_title = attr.attributes.get("title", False)
            if str(attr) == sel_title:
                # the selected variable must be marked as a title
                self.assertTrue(is_title)
            else:
                # all the others must not be marked as a title
                self.assertFalse(is_title)

    def test_title_combo(self):
        # the widget loads the default corpus dataset
        corpus = self.widget.corpus
        self.assertEqual(corpus.name, "book-excerpts")

        options = self.widget.title_model[:]
        self.assertIn(corpus.domain["Text"], options)
        # no title variable is selected for this dataset
        self.assertEqual(None, self.widget.title_variable)
        self.check_output(None)

    def test_title_already_in_dataset(self):
        """
        This dataset already has a title attribute, so the title option
        is set to that attribute by default.
        """
        data = Corpus.from_file("election-tweets-2016")
        self._send_and_check(data, "Content")

    def test_title_selection_strategy_title_heading(self):
        """
        When there is a title, heading or filename attribute, it is
        selected as the default title.
        """
        cases = (
            # (variable names, expected title variable)
            (("title", "b", "c"), "title"),
            (("Title", "b", "c"), "Title"),
            # when title and heading are both present, title is selected
            (("Title", "Heading", "c"), "Title"),
            (("Heading", "Title", "c"), "Title"),
            (("Heading", "Filename", "c"), "Heading"),
        )
        for names, expected in cases:
            data = self._meta_table(names, self._long_text_rows())
            self._send_and_check(data, expected)

    def test_title_selection_strategy(self):
        """
        Test that the selection strategy for the title attribute works
        correctly when no variable is recognised by its name.
        """
        # select the most unique variable
        data = self._meta_table(
            ("a", "b"),
            [["a" * 10, "a" * 10],
             ["a" * 10, "b" * 10],
             ["a" * 10, "c" * 10]])
        self._send_and_check(data, "b")

        # select the most unique variable that is also short enough: here
        # a is not suitable since its texts are too long, and c is more
        # unique than b
        data = self._meta_table(
            ("a", "b", "c"),
            [["a" * 100, "a" * 10, "a" * 10],
             ["b" * 100, "a" * 10, "b" * 10],
             ["c" * 100, "a" * 10, "b" * 10]])
        self._send_and_check(data, "c")

        # when no variable is short enough the shortest one is selected
        data = self._meta_table(("a", "b", "c"), self._long_text_rows())
        self._send_and_check(data, "c")