Commit 7de9c65

Merge pull request #990 from PrimozGodec/fix-corpus-info

[FIX] Corpus - remove dictionary and fix wrong types count on subsampled corpus

markotoplak authored Aug 25, 2023
2 parents 4da0aab + 587c49a commit 7de9c65

Showing 9 changed files with 182 additions and 99 deletions.
90 changes: 44 additions & 46 deletions orangecontrib/text/corpus.py
@@ -1,5 +1,4 @@
import os
import warnings
from collections import Counter, defaultdict
from copy import copy, deepcopy
from numbers import Integral
@@ -9,9 +8,6 @@

import nltk
import numpy as np
import scipy.sparse as sp
from gensim import corpora

from Orange.data import (
Variable,
ContinuousVariable,
@@ -23,17 +19,12 @@
)
from Orange.preprocess.transformation import Identity
from Orange.data.util import get_unique_names
from gensim import corpora
from orangewidget.utils.signals import summarize, PartialSummary
import scipy.sparse as sp

from orangecontrib.text.language import ISO2LANG

try:
from orangewidget.utils.signals import summarize, PartialSummary
# import to check if Table summary is available - if summarize_by_name does
# not exist Orange (3.28) does not support automated summaries
from Orange.widgets.utils.state_summary import summarize_by_name
except ImportError:
summarize, PartialSummary = None, None


def get_sample_corpora_dir():
path = os.path.dirname(__file__)
@@ -88,7 +79,6 @@ def _setup_corpus(self, text_features: List[Variable] = None) -> None:
"""
self.text_features = [] # list of text features for mining
self._tokens = None
self._dictionary = None
self.ngram_range = (1, 1)
self._pos_tags = None
from orangecontrib.text.preprocess import PreprocessorList
@@ -397,8 +387,13 @@ def store_tokens(self, tokens, dictionary=None):
Args:
tokens (list): List of lists containing tokens.
"""
if dictionary is not None:
warn(
"dictionary argument is deprecated and doesn't have effect."
"It will be removed in future orange3-text 1.15.",
FutureWarning,
)
self._tokens = np.array(tokens, dtype=object)
self._dictionary = dictionary or corpora.Dictionary(self.tokens)
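
The dictionary argument is now accepted only for backward compatibility: it is ignored, and passing it raises a FutureWarning. A minimal sketch of the new behaviour, assuming orange3-text with this patch and its bundled book-excerpts sample corpus:

    import warnings
    from orangecontrib.text import Corpus

    corpus = Corpus.from_file("book-excerpts")
    tokens = [doc.split() for doc in corpus.documents]
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        corpus.store_tokens(tokens, dictionary=object())  # deprecated argument
    assert any(issubclass(w.category, FutureWarning) for w in caught)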

@property
def tokens(self):
@@ -407,7 +402,7 @@ def tokens(self):
present, run default preprocessor and return tokens.
"""
if self._tokens is None:
return self._base_tokens()[0]
return self._base_tokens()
return self._tokens

def has_tokens(self):
@@ -419,19 +414,17 @@ def _base_tokens(self):
BASE_TOKENIZER, PreprocessorList

# don't use anything that requires NLTK data to assure async download
base_preprocessors = PreprocessorList([BASE_TRANSFORMER,
BASE_TOKENIZER])
base_preprocessors = PreprocessorList([BASE_TRANSFORMER, BASE_TOKENIZER])
corpus = base_preprocessors(self)
return corpus.tokens, corpus.dictionary
return corpus.tokens

@property
def dictionary(self):
"""
corpora.Dictionary: A token to id mapper.
"""
if self._dictionary is None:
return self._base_tokens()[1]
return self._dictionary
warn(
"dictionary is deprecated and will be removed in Orange3-text 1.15",
FutureWarning,
)
return corpora.Dictionary(self.tokens)
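
With the cached _dictionary gone, the deprecated property rebuilds a gensim Dictionary from the current tokens on every access, so it can no longer report counts inherited from a parent corpus. Illustratively (a sketch continuing from a loaded corpus, not part of the patch):

    from gensim import corpora

    d = corpora.Dictionary(corpus.tokens)  # what the property now returns
    assert len(d) == corpus.count_unique_tokens()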

@property
def pos_tags(self):
@@ -468,6 +461,16 @@ def ngrams_iterator(self, join_with=NGRAMS_SEPARATOR, include_postags=False):
for n in range(self.ngram_range[0], self.ngram_range[1]+1))))
for doc in data)

def count_tokens(self) -> int:
"""Count number of all (non-unique) tokens in the corpus"""
return sum(map(len, self.tokens))

def count_unique_tokens(self) -> int:
"""Count number of all (unique) tokens in the corpus"""
# it seems to be fast enough even datasets very large dataset, so I
# would avoid caching to prevetnt potential problems connected to that
return len({tk for lst in self.tokens for tk in lst})
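
These two methods replace the dictionary-based counts used in the summary below; because they are computed from the tokens of the corpus at hand, a subsampled corpus now reports its own type count instead of its parent's. A usage sketch under the same assumptions as above:

    corpus = Corpus.from_file("book-excerpts")
    sub = corpus[:10]                 # subsample keeps its preprocessing
    print(sub.count_tokens())         # all (non-unique) tokens in the 10 documents
    print(sub.count_unique_tokens())  # distinct types in those 10 documents only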

@property
def ngrams(self):
"""generator: Ngram representations of documents."""
@@ -476,10 +479,9 @@ def ngrams(self):
def copy(self):
"""Return a copy of the table."""
c = super().copy()
# since tokens and dictionary are considered immutable copies are not needed
c._setup_corpus(text_features=copy(self.text_features))
# since tokens are considered immutable copies are not needed
c._tokens = self._tokens
c._dictionary = self._dictionary
c.ngram_range = self.ngram_range
c.pos_tags = self.pos_tags
c.name = self.name
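
copy() shares the token array with the original rather than duplicating it, which is safe exactly because tokens are treated as immutable. Illustratively (private attributes used only for the demonstration):

    c = corpus.copy()
    assert c._tokens is corpus._tokens  # shared reference, not a deep copy
    assert c.ngram_range == corpus.ngram_range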
@@ -640,7 +642,6 @@ def retain_preprocessing(orig, new, key=...):
new.pos_tags = orig.pos_tags
else:
raise TypeError('Indexing by type {} not supported.'.format(type(key)))
new._dictionary = orig._dictionary

if isinstance(new, Corpus):
# _find_identical_feature returns None when the feature is not found
@@ -665,23 +666,20 @@
new._infer_text_features()


if summarize:
# summarize is not available in older versions of orange-widget-base
# skip if not available
@summarize.register(Corpus)
def summarize_corpus(corpus: Corpus) -> PartialSummary:
"""
Provides automated input and output summaries for Corpus
"""
table_summary = summarize.dispatch(Table)(corpus)
extras = (
(
f"<br/><nobr>Tokens: {sum(map(len, corpus.tokens))}, "
f"Types: {len(corpus.dictionary)}</nobr>"
)
if corpus.has_tokens()
else "<br/><nobr>Corpus is not preprocessed</nobr>"
)
language = ISO2LANG[corpus.language] if corpus.language else "not set"
extras += f"<br/><nobr>Language: {language}</nobr>"
return PartialSummary(table_summary.summary, table_summary.details + extras)
@summarize.register(Corpus)
def summarize_corpus(corpus: Corpus) -> PartialSummary:
"""
Provides automated input and output summaries for Corpus
"""
table_summary = summarize.dispatch(Table)(corpus)
extras = (
(
f"<br/><nobr>Tokens: {corpus.count_tokens()}, "
f"Types: {corpus.count_unique_tokens()}</nobr>"
)
if corpus.has_tokens()
else "<br/><nobr>Corpus is not preprocessed</nobr>"
)
language = ISO2LANG[corpus.language] if corpus.language else "not set"
extras += f"<br/><nobr>Language: {language}</nobr>"
return PartialSummary(table_summary.summary, table_summary.details + extras)
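
summarize here is the single-dispatch registry from orange-widget-base: the Corpus summarizer first fetches the plain Table summary via summarize.dispatch(Table), then appends the token, type, and language lines. A self-contained sketch of the same dispatch pattern with stand-in types (names and numbers are hypothetical, not the Orange API):

    from functools import singledispatch

    class Table: ...
    class Corpus(Table): ...

    @singledispatch
    def summarize(obj):
        raise NotImplementedError

    @summarize.register(Table)
    def _(table):
        return "10 instances"

    @summarize.register(Corpus)
    def _(corpus):
        base = summarize.dispatch(Table)(corpus)  # reuse the parent class's summary
        return base + ", Tokens: 1500, Types: 300"

    print(summarize(Corpus()))  # 10 instances, Tokens: 1500, Types: 300
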
15 changes: 4 additions & 11 deletions orangecontrib/text/preprocess/filter.py
@@ -26,8 +26,7 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
corpus = super().__call__(corpus, wrap_callback(callback, end=0.2))
return self._filter_tokens(corpus, wrap_callback(callback, start=0.2))

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
callback(0, "Filtering...")
filtered_tokens = []
filtered_tags = []
@@ -37,10 +36,7 @@ def __call__(self, corpus: Corpus, callback: Callable,
if corpus.pos_tags is not None:
filtered_tags.append(list(compress(corpus.pos_tags[i],
filter_map)))
if dictionary is None:
corpus.store_tokens(filtered_tokens)
else:
corpus.store_tokens(filtered_tokens, dictionary)
corpus.store_tokens(filtered_tokens)
if filtered_tags:
corpus.pos_tags = np.array(filtered_tags, dtype=object)
return corpus
@@ -178,11 +174,8 @@ def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
def _fit(self, corpus: Corpus):
raise NotImplemented

def _filter_tokens(self, corpus: Corpus, callback: Callable,
dictionary=None) -> Corpus:
corpus = super()._filter_tokens(corpus, callback,
dictionary=self._dictionary)
return corpus
def _filter_tokens(self, corpus: Corpus, callback: Callable) -> Corpus:
return super()._filter_tokens(corpus, callback)

def _check(self, token):
assert self._lexicon is not None
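
The fit-then-filter flow above: __call__ fits the lexicon on the corpus, then _filter_tokens keeps, per document, only the tokens for which _check passes; the dictionary plumbing is gone because nothing consumes it anymore. A toy version of the same pattern (stand-in class, not the add-on's API):

    from itertools import compress

    class ToyLexiconFilter:
        """Toy lexicon filter: fit on the documents, then keep known tokens."""
        def __init__(self):
            self._lexicon = None

        def _fit(self, docs):
            self._lexicon = {t for doc in docs for t in doc if len(t) > 3}

        def _check(self, token):
            return token in self._lexicon

        def __call__(self, docs):
            self._fit(docs)
            return [list(compress(doc, [self._check(t) for t in doc]))
                    for doc in docs]

    print(ToyLexiconFilter()([["the", "corpus", "is", "small"]]))
    # -> [['corpus', 'small']]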