From 134976d8df5b49f1a6b8d7175a653881469b594f Mon Sep 17 00:00:00 2001
From: Primoz Godec <p.godec9@gmail.com>
Date: Thu, 4 Nov 2021 19:31:01 +0100
Subject: [PATCH] BoW: use training weights on test data

---
 .../text/tests/test_bowvectorizer.py          | 96 +++++++++++++++++++
 .../text/vectorization/bagofwords.py          |  4 +-
 2 files changed, 98 insertions(+), 2 deletions(-)

diff --git a/orangecontrib/text/tests/test_bowvectorizer.py b/orangecontrib/text/tests/test_bowvectorizer.py
index 0de51e7fe..0f1efe6ae 100644
--- a/orangecontrib/text/tests/test_bowvectorizer.py
+++ b/orangecontrib/text/tests/test_bowvectorizer.py
@@ -1,6 +1,7 @@
 import unittest
 
 import numpy as np
+from Orange.data import Domain, StringVariable
 
 from orangecontrib.text import preprocess
 from orangecontrib.text.corpus import Corpus
@@ -135,6 +136,101 @@ def tests_duplicated_names(self):
         # human
         self.assertIn("human", [v.name for v in out.domain.attributes[1:]])
 
+    def test_compute_values_same_tfidf_regardless_num_documents(self):
+        """
+        When computing TF-IDF from compute values, TF-IDF should give the same
+        results regardless of the length of the new corpus - IDF weighting should
+        consider only counts from the original (training) corpus.
+        """
+        corpus = Corpus.from_file('deerwester')
+        train_corpus = corpus[:5]
+        test_corpus = corpus[5:]
+        vect = BowVectorizer(wglobal=BowVectorizer.IDF)
+
+        bow = vect.transform(train_corpus)
+        computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
+        computed2 = Corpus.from_table(bow.domain, test_corpus)
+
+        self.assertEqual(computed1.domain, computed2.domain)
+        self.assertEqual(bow.domain, computed2.domain)
+        self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)
+
+    # fmt: off
+    domain = Domain([], metas=[StringVariable("text")])
+    small_corpus_train = Corpus(
+        domain,
+        np.empty((4, 0)),
+        metas=np.array([
+            ["this is a nice day day"],
+            ["the day is nice"],
+            ["i love a beautiful day"],
+            ["this apple is mine"]
+        ])
+    )
+    terms = [
+        "this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
+        "apple", "mine"
+    ]
+    train_counts = np.array([
+        [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
+        [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
+        [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
+    ])
+    small_corpus_test = Corpus(
+        domain,
+        np.empty((3, 0)),
+        metas=np.array([
+            ["this is a nice day day"],
+            ["day nice summer mine"],
+            ["apple is cool"],
+        ])
+    )
+    test_counts = np.array([
+        [1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
+        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
+    ])
+    # fmt: on
+
+    def assert_bow_same(self, corpus, values, terms):
+        self.assertSetEqual(set(terms), set(a.name for a in corpus.domain.attributes))
+        for i, a in enumerate(terms):
+            self.assertListEqual(
+                corpus.get_column_view(a)[0].tolist(),
+                values[:, i].tolist(),
+                f"BOW differ for term {a}",
+            )
+
+    def test_count_correctness(self):
+        """Test if computed counts are correct for train and test dataset"""
+        bow = BowVectorizer().transform(self.small_corpus_train)
+        self.assert_bow_same(bow, self.train_counts, self.terms)
+
+        # computed from compute_values - result contains only terms from train dataset
+        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+        self.assert_bow_same(bow_test, self.test_counts, self.terms)
+
+    def test_tfidf_correctness(self):
+        """
+        Test if computed tf-idfs are correct for train and test dataset.
+        When computing tf-idf on the test dataset (from compute values),
+        weights (idf) must be computed based on numbers from the training dataset.
+        """
+        bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
+            self.small_corpus_train
+        )
+
+        document_appearance = (self.train_counts != 0).sum(0)
+        n = len(self.train_counts)
+        idfs_train = self.train_counts * np.log(n / document_appearance)
+        self.assert_bow_same(bow, idfs_train, self.terms)
+
+        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
+        # weights computed based on numbers from training dataset
+        idfs_test = self.test_counts * np.log(n / document_appearance)
+        self.assert_bow_same(bow_test, idfs_test, self.terms)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/orangecontrib/text/vectorization/bagofwords.py b/orangecontrib/text/vectorization/bagofwords.py
index a9ae061c2..972f021ad 100644
--- a/orangecontrib/text/vectorization/bagofwords.py
+++ b/orangecontrib/text/vectorization/bagofwords.py
@@ -72,11 +72,11 @@ def _transform(self, corpus, source_dict=None):
         temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
         dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
         temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
-        model = models.TfidfModel(temp_corpus, normalize=False,
+        model = models.TfidfModel(dictionary=dic, normalize=False,
                                   wlocal=self.wlocals[self.wlocal],
                                   wglobal=self.wglobals[self.wglobal])
 
-        X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
+        X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
         norm = self.norms[self.norm]
         if norm:
             X = norm(X)