Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BoW: fix vectorisation to use only original weights for IDF on new data #745

Merged
merged 1 commit into from
Nov 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions orangecontrib/text/tests/test_bowvectorizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import unittest

import numpy as np
from Orange.data import Domain, StringVariable

from orangecontrib.text import preprocess
from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -135,6 +136,101 @@ def tests_duplicated_names(self):
# human
self.assertIn("human", [v.name for v in out.domain.attributes[1:]])

def test_compute_values_same_tfidf_regardless_num_documents(self):
    """
    TF-IDF obtained through compute values must not depend on the number
    of documents in the new corpus: the IDF weighting has to rely only on
    the counts gathered from the original corpus.
    """
    full_corpus = Corpus.from_file('deerwester')
    fitted_part, unseen_part = full_corpus[:5], full_corpus[5:]

    vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
    bow = vectorizer.transform(fitted_part)

    # the same unseen documents, once without and once with the first one
    without_first = Corpus.from_table(bow.domain, unseen_part[1:])
    with_first = Corpus.from_table(bow.domain, unseen_part)

    self.assertEqual(without_first.domain, with_first.domain)
    self.assertEqual(bow.domain, with_first.domain)

    # rows shared by both results must hold exactly the same values
    differing = without_first.X != with_first.X[1:]
    self.assertEqual(differing.nnz, 0)

# Hand-built fixtures that the count and tf-idf tests read through `self`.
# fmt: off
# Documents are stored in a single string meta named "text"; there are no attributes.
domain = Domain([], metas=[StringVariable("text")])
# Four training documents; X is an empty (4, 0) array.
small_corpus_train = Corpus(
domain,
np.empty((4, 0)),
metas=np.array([
["this is a nice day day"],
["the day is nice"],
["i love a beautiful day"],
["this apple is mine"]
])
)
# Distinct words of the training documents, in order of first appearance.
# Column t of the count matrices below corresponds to terms[t].
terms = [
"this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
"apple", "mine"
]
# train_counts[d, t] is the number of occurrences of terms[t] in training document d.
train_counts = np.array([
[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
])
# Three test documents; "summer" and "cool" do not occur in the training documents.
small_corpus_test = Corpus(
domain,
np.empty((3, 0)),
metas=np.array([
["this is a nice day day"],
["day nice summer mine"],
["apple is cool"],
])
)
# test_counts[d, t] is the number of occurrences of terms[t] in test document d;
# words missing from `terms` have no column.
test_counts = np.array([
[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
])
# fmt: on

def assert_bow_same(self, corpus, values, terms):
    """
    Assert that the bag-of-words columns of `corpus` match `values`.

    Parameters
    ----------
    corpus
        Object exposing `domain.attributes` (items with a `name`) and
        `get_column_view(name)`, whose first returned element is the column.
    values
        2-D array; column ``i`` holds the expected values for ``terms[i]``.
    terms
        Expected attribute names, in the column order of `values`.

    Raises
    ------
    AssertionError
        If the attribute names differ from `terms` or any column differs
        from the expected values beyond floating-point round-off.
    """
    self.assertSetEqual(
        set(terms), {attr.name for attr in corpus.domain.attributes}
    )
    for i, term in enumerate(terms):
        # Weighted values (e.g. tf-idf) are floats that may be computed along
        # a different code path than the expectation, so compare with a tight
        # tolerance instead of exact list equality.
        np.testing.assert_allclose(
            corpus.get_column_view(term)[0],
            values[:, i],
            rtol=1e-9,
            atol=1e-12,
            err_msg=f"BOW differ for term {term}",
        )

def test_count_correctness(self):
    """Check that raw term counts are right for the train and the test corpus."""
    vectorized_train = BowVectorizer().transform(self.small_corpus_train)
    self.assert_bow_same(vectorized_train, self.train_counts, self.terms)

    # going through compute_values keeps only the terms of the train dataset
    train_domain = vectorized_train.domain
    vectorized_test = Corpus.from_table(train_domain, self.small_corpus_test)
    self.assert_bow_same(vectorized_test, self.test_counts, self.terms)

def test_tfidf_correctness(self):
    """
    Test if computed tf-idfs are correct for train and test dataset.
    When tf-idf is computed on the test dataset (from compute values),
    the idf weights must be based on numbers from the training dataset.
    """
    vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
    bow = vectorizer.transform(self.small_corpus_train)

    # number of training documents in which each term occurs
    document_appearance = (self.train_counts != 0).sum(axis=0)
    n = self.train_counts.shape[0]
    idf_weights = np.log(n / document_appearance)

    expected_train = self.train_counts * idf_weights
    self.assert_bow_same(bow, expected_train, self.terms)

    bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
    # the test documents reuse the weights derived from the training dataset
    expected_test = self.test_counts * idf_weights
    self.assert_bow_same(bow_test, expected_test, self.terms)


# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
unittest.main()
4 changes: 2 additions & 2 deletions orangecontrib/text/vectorization/bagofwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ def _transform(self, corpus, source_dict=None):
temp_corpus = list(corpus.ngrams_iterator(' ', include_postags=True))
dic = corpora.Dictionary(temp_corpus, prune_at=None) if not source_dict else source_dict
temp_corpus = [dic.doc2bow(doc) for doc in temp_corpus]
model = models.TfidfModel(temp_corpus, normalize=False,
model = models.TfidfModel(dictionary=dic, normalize=False,
wlocal=self.wlocals[self.wlocal],
wglobal=self.wglobals[self.wglobal])

X = matutils.corpus2csc(model[temp_corpus], dtype=np.float, num_terms=len(dic)).T
X = matutils.corpus2csc(model[temp_corpus], dtype=float, num_terms=len(dic)).T
norm = self.norms[self.norm]
if norm:
X = norm(X)
Expand Down