From 134976d8df5b49f1a6b8d7175a653881469b594f Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Thu, 4 Nov 2021 19:31:01 +0100
Subject: [PATCH] BoW: use training weights on test data
---
.../text/tests/test_bowvectorizer.py | 96 +++++++++++++++++++
.../text/vectorization/bagofwords.py | 4 +-
2 files changed, 98 insertions(+), 2 deletions(-)
diff --git a/orangecontrib/text/tests/test_bowvectorizer.py b/orangecontrib/text/tests/test_bowvectorizer.py
index 0de51e7fe..0f1efe6ae 100644
--- a/orangecontrib/text/tests/test_bowvectorizer.py
+++ b/orangecontrib/text/tests/test_bowvectorizer.py
@@ -1,6 +1,7 @@
import unittest
import numpy as np
+from Orange.data import Domain, StringVariable
from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
@@ -135,6 +136,101 @@ def tests_duplicated_names(self):
# human
self.assertIn("human", [v.name for v in out.domain.attributes[1:]])
+ def test_compute_values_same_tfidf_regardless_num_documents(self):
+ """
+ When computing TF-IDF from compute values, the result should be the same
+ regardless of the length of the new corpus - IDF weighting should consider
+ only counts from the original (training) corpus.
+ """
+ corpus = Corpus.from_file('deerwester')
+ train_corpus = corpus[:5]
+ test_corpus = corpus[5:]
+ vect = BowVectorizer(wglobal=BowVectorizer.IDF)
+
+ bow = vect.transform(train_corpus)
+ computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
+ computed2 = Corpus.from_table(bow.domain, test_corpus)
+
+ self.assertEqual(computed1.domain, computed2.domain)
+ self.assertEqual(bow.domain, computed2.domain)
+ self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)
+
+ # fmt: off
+ domain = Domain([], metas=[StringVariable("text")])
+ small_corpus_train = Corpus(
+ domain,
+ np.empty((4, 0)),
+ metas=np.array([
+ ["this is a nice day day"],
+ ["the day is nice"],
+ ["i love a beautiful day"],
+ ["this apple is mine"]
+ ])
+ )
+ terms = [
+ "this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
+ "apple", "mine"
+ ]
+ train_counts = np.array([
+ [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+ [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
+ [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
+ [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
+ ])
+ small_corpus_test = Corpus(
+ domain,
+ np.empty((3, 0)),
+ metas=np.array([
+ ["this is a nice day day"],
+ ["day nice summer mine"],
+ ["apple is cool"],
+ ])
+ )
+ test_counts = np.array([
+ [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+ [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
+ [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
+ ])
+ # fmt: on
+
+ def assert_bow_same(self, corpus, values, terms):
+ self.assertSetEqual(set(terms), set(a.name for a in corpus.domain.attributes))
+ for i, a in enumerate(terms):
+ self.assertListEqual(
+ corpus.get_column_view(a)[0].tolist(),
+ values[:, i].tolist(),
+ f"BOW differ for term {a}",
+ )
+
+ def test_count_correctness(self):
+ """Test if computed counts are correct for train and test dataset"""
+ bow = BowVectorizer().transform(self.small_corpus_train)
+ self.assert_bow_same(bow, self.train_counts, self.terms)
+
+ # computed from compute_values - result contains only terms from train dataset
+ bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+ self.assert_bow_same(bow_test, self.test_counts, self.terms)
+
+ def test_tfidf_correctness(self):
+ """
+ Test if computed tf-idf values are correct for the train and test datasets.
+ When computing tf-idf on the test dataset (from compute values), the
+ weights (idf) must be computed based on counts from the training dataset.
+ """
+ bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
+ self.small_corpus_train
+ )
+
+ document_appearance = (self.train_counts != 0).sum(0)
+ n = len(self.train_counts)
+ idfs_train = self.train_counts * np.log(n / document_appearance)
+ self.assert_bow_same(bow, idfs_train, self.terms)
+
+ bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+ # weights computed based on numbers from training dataset
+ idfs_test = self.test_counts * np.log(n / document_appearance)
+ self.assert_bow_same(bow_test, idfs_test, self.terms)
+
if __name__ == "__main__":
unittest.main()
diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index a9ae061c2..972f021ad 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -72,11 +72,11 @@ def _transform(self, corpus, source_dict=None):
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
- model = models.TfidfModel(temp_corpus, normalize=False,
+ model = models.TfidfModel(dictionary=dic, normalize=False,
wlocal=self.wlocals[self.wlocal],
wglobal=self.wglobals[self.wglobal])
- X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
+ X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
norm = self.norms[self.norm]
if norm:
X = norm(X)