minimize_vectors_file.py
#!/usr/bin/env python
# Copyright (c) 2021 Kemal Kurniawan
from itertools import chain
import os

from gensim.models.keyedvectors import KeyedVectors
from sacred import Experiment
from sacred.observers import MongoObserver
from sacred.utils import apply_backspaces_and_linefeeds
from text2array import Vocab

from ingredients.corpus import ing as corpus_ing, read_samples

ex = Experiment("xduft-minimize-vectors-file-testrun", ingredients=[corpus_ing])
ex.captured_out_filter = apply_backspaces_and_linefeeds

# Set up the MongoDB observer if the connection settings are provided
mongo_url = os.getenv("SACRED_MONGO_URL")
db_name = os.getenv("SACRED_DB_NAME")
if None not in (mongo_url, db_name):
    ex.observers.append(MongoObserver.create(url=mongo_url, db_name=db_name))


@ex.config
def default():
    # path to vectors file in word2vec format
    vectors_path = "wiki.en.vec"
    # write minimized vectors to this file path
    output_path = "wiki.min.en.vec"


@ex.automain
def minimize(_log, vectors_path="wiki.en.vec", output_path="wiki.min.en.vec"):
    """Minimize the given vectors file to contain only words in the given corpus."""
    samples = {wh: list(read_samples(which=wh)) for wh in ["train", "test"]}
    try:
        samples["dev"] = list(read_samples(which="dev"))
    except FileNotFoundError:
        pass  # skip if the dev split does not exist

    # Build the vocabulary from every available corpus split
    vocab = Vocab.from_samples(chain(*samples.values()))

    kv = KeyedVectors.load_word2vec_format(vectors_path)

    _log.info("Creating new, minimized word vectors")
    min_kv = KeyedVectors(kv.vector_size)
    for w in kv.vocab:
        if w in vocab["words"]:
            min_kv[w] = kv[w]

    _log.info("Saving the new word vectors to %s", output_path)
    min_kv.save_word2vec_format(output_path)
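

# Example invocation (a sketch, assuming Sacred's standard `with key=value` CLI and
# that the corpus ingredient reads its file paths from its own configuration; the
# MongoDB URL and database name below are placeholder values, not part of this repo):
#
#   SACRED_MONGO_URL=mongodb://localhost:27017 SACRED_DB_NAME=sacred \
#       ./minimize_vectors_file.py with vectors_path=wiki.en.vec output_path=wiki.min.en.vec
#
# If the two environment variables are unset, the run proceeds without a MongoDB observer.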