Skip to content

Commit

Permalink
do not include words that never occur in the corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
rbroc committed Mar 13, 2024
1 parent a133446 commit 7247618
Showing 1 changed file with 6 additions and 5 deletions.
11 changes: 6 additions & 5 deletions turftopic/models/keynmf.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,11 +125,12 @@ def extract_keywords(
  embedding = embeddings[i].reshape(1, -1)
  if self.keyword_scope == 'document':
  mask = terms > 0
- if not np.any(mask):
- keywords.append(dict())
- continue
  else:
- mask = np.ones(shape=terms.shape, dtype=bool)
+ tot_freq = document_term_matrix.sum(axis=0)
+ mask = tot_freq != 0
+ if not np.any(mask):
+ keywords.append(dict())
+ continue
  important_terms = np.squeeze(np.asarray(mask))
  word_embeddings = self.vocab_embeddings[important_terms]
  sim = cosine_similarity(embedding, word_embeddings)
Expand Down Expand Up @@ -284,7 +285,7 @@ def prepare_topic_data(
  except (NotFittedError, AttributeError):
  doc_topic_matrix = self.nmf_.fit_transform(dtm)
  self.components_ = self.nmf_.components_
- console.log("Model fiting done.")
+ console.log("Model fitting done.")
  res: TopicData = {
  "corpus": corpus,
  "document_term_matrix": dtm,
Expand Down

0 comments on commit 7247618

Please sign in to comment.