Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Topic modelling - Reorder methods by relevance #1020

Merged
merged 1 commit into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion orangecontrib/text/widgets/owtopicmodeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,14 @@ class Outputs:
want_main_area = True

methods = [
(LsiWidget, 'lsi'),
(LdaWidget, 'lda'),
(LsiWidget, 'lsi'),
(HdpWidget, 'hdp'),
(NmfWidget, 'nmf')
]

# Settings
settings_version = 2
autocommit = settings.Setting(True)
method_index = settings.Setting(0)

Expand Down Expand Up @@ -266,6 +267,8 @@ def on_done(self, corpus):
if self.model.name == "Latent Dirichlet Allocation":
bound = self.model.model.log_perplexity(infer_ngrams_corpus(corpus))
self.perplexity = "{:.5f}".format(np.exp2(-bound))
else:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that perplexity didn't reset to "n/a" after the method change.

self.perplexity = "n/a"
# for small corpora it is slower to use more processes
# there is no good estimation when multiprocessing is helpful, but it is
# definitely not helpful for corpora smaller than 100
Expand Down Expand Up @@ -299,6 +302,15 @@ def send_topic_by_id(self, topic_id=None):
self.Outputs.selected_topic.send(
self.model.get_topics_table_by_id(topic_id))

@classmethod
def migrate_settings(cls, settings, version=0):
    """Migrate stored widget settings to the current settings version.

    In version 2 the first two methods (LSI and LDA) swapped positions, so
    a ``method_index`` of 0 or 1 saved by an earlier version is swapped to
    keep pointing at the method the workflow was saved with. Any other
    index is left untouched.

    Args:
        settings: Dictionary of stored settings; modified in place.
        version: Settings version the dictionary was saved with. ``None``
            is treated as older than version 2.
    """
    # Check for None explicitly: `None < 2` would raise TypeError.
    is_legacy = version is None or version < 2
    if is_legacy and "method_index" in settings:
        swapped = {0: 1, 1: 0}
        method_idx = settings["method_index"]
        settings["method_index"] = swapped.get(method_idx, method_idx)


class TopicViewerTreeWidgetItem(QTreeWidgetItem):
def __init__(self, topic_id, words, weights, parent,
Expand Down
37 changes: 30 additions & 7 deletions orangecontrib/text/widgets/tests/test_owtopicmodeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def test_data(self):
self.assertIsNone(output)

def test_saved_selection(self):
self.widget.method_index = 1
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()

Expand Down Expand Up @@ -58,17 +57,41 @@ def test_topic_evaluation(self):
self.send_signal(self.widget.Inputs.corpus, self.corpus)
self.wait_until_finished()

# test LSI
self.assertEqual(self.widget.perplexity, "n/a")
self.assertNotEqual(self.widget.coherence, "n/a")

# test LDA, which is the only one with log perplexity
self.assertNotEqual(self.widget.perplexity, "n/a")
self.assertTrue(self.widget.coherence)

# test LSI
self.widget.method_index = 1
self.widget.commit.now()
self.wait_until_finished()
self.assertEqual(self.widget.perplexity, "n/a")
self.assertNotEqual(self.widget.coherence, "n/a")

self.assertNotEqual(self.widget.perplexity, "n/a")
self.assertTrue(self.widget.coherence)
def test_migrate_settings_transform(self):
    """Settings saved before version 2 load the method they were saved with."""
    # Each case: (index stored by version 1, index expected after
    # migration, expected model name). LSI and LDA swapped places in
    # version 2; HDP and NMF kept their positions.
    cases = (
        (0, 1, "Latent Semantic Indexing"),
        (1, 0, "Latent Dirichlet Allocation"),
        (2, 2, "Hierarchical Dirichlet Process"),
        (3, 3, "Negative Matrix Factorization"),
    )
    for stored_index, migrated_index, model_name in cases:
        stored = {"__version__": 1, "method_index": stored_index}
        widget = self.create_widget(OWTopicModeling, stored_settings=stored)
        self.assertEqual(migrated_index, widget.method_index)
        self.assertEqual(model_name, widget.model.name)


if __name__ == "__main__":
Expand Down