embedder.py
"""Text embedder.
This module implements Embedder class, that uses pretrained DistilBERT
to get text embeddings.
"""
import logging
import numpy as np
import torch
import transformers as ppb
from typing import List


def get_text_reduced(filename: str, maxlen: int = -1) -> str:
    """Read a file, join its non-empty lines into one string and
    optionally truncate the result to the first `maxlen` words."""
    with open(filename, "r") as f:
        lines = [line.rstrip() for line in f]
    text = " ".join(x for x in lines if x != "")
    if maxlen > 0:
        text = " ".join(text.split()[:maxlen])
    return text


class Embedder:
    """Computes dense text embeddings with a pretrained DistilBERT model.

    Attributes:
        device: Torch device (GPU if available, otherwise CPU) used for inference.
        tokenizer: Pretrained DistilBERT tokenizer.
        model: Pretrained DistilBERT model.
    """

    def __init__(self):
        """Initialize the Embedder by loading the tokenizer and model weights."""
        # Silence noisy tokenizer warnings about long input sequences.
        logging.getLogger("transformers.tokenization_utils").setLevel(
            logging.ERROR
        )
        model_class = ppb.DistilBertModel
        tokenizer_class = ppb.DistilBertTokenizer
        pretrained_weights = "distilbert-base-uncased"
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.model = model_class.from_pretrained(pretrained_weights).to(
            self.device
        )

    def embed(self, texts: List[str]) -> np.ndarray:
        """Get dense embeddings for a collection of texts.

        Args:
            texts: List of texts, each stripped of newlines and squashed
                into a single string (see ``get_text_reduced``).

        Returns:
            Numpy array of shape (N, 768) with one embedding per text.
        """
        # Tokenize each text, adding the [CLS]/[SEP] special tokens.
        tokenized = [
            self.tokenizer.encode(x, add_special_tokens=True) for x in texts
        ]
        # Pad every sequence to the length of the longest one.
        max_len = max(len(x) for x in tokenized)
        padded = np.array([x + [0] * (max_len - len(x)) for x in tokenized])
        mask = np.where(padded != 0, 1, 0)  # attention mask: 0 on padding
        # Feed the batch to DistilBERT.
        input_ids = torch.tensor(padded).to(self.device)
        mask = torch.tensor(mask).to(self.device)
        with torch.no_grad():
            last_hidden_states = self.model(input_ids, attention_mask=mask)
        # Pool by summing the last hidden states over the token dimension.
        # TODO: this sums all token positions (including padding); consider
        # using the [CLS] embedding (last_hidden_states[0][:, 0]) instead.
        embedding = np.sum(last_hidden_states[0].cpu().numpy(), axis=1)
        return embedding
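

# --- Usage sketch (illustrative, not part of the original module) ---
# The block below shows how the class and helper above are expected to be
# used: build an Embedder, embed a couple of strings, and check the (N, 768)
# output shape. "article.txt" is a hypothetical path used only for illustration.
if __name__ == "__main__":
    embedder = Embedder()
    embeddings = embedder.embed(["first example text", "second example text"])
    print(embeddings.shape)  # expected: (2, 768)

    # File-based variant via the helper above:
    # text = get_text_reduced("article.txt", maxlen=256)
    # print(embedder.embed([text]).shape)  # expected: (1, 768)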