Skip to content

Commit

Permalink
✨ Support for individuals in EL models
Browse files Browse the repository at this point in the history
  • Loading branch information
ferzcam committed Nov 21, 2024
1 parent 9a56e7b commit 726cea6
Show file tree
Hide file tree
Showing 25 changed files with 550 additions and 396 deletions.
9 changes: 0 additions & 9 deletions README.rst

This file was deleted.

2 changes: 1 addition & 1 deletion mowl/base_models/elmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ def from_pretrained(self, model):
if not isinstance(model, str):
raise TypeError

self.module.load_state_dict(th.load(model))
self.module.load_state_dict(th.load(model, weights_only=True))
#self._kge_method = kge_method


Expand Down
2 changes: 1 addition & 1 deletion mowl/datasets/gci.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def push_to_device(self, data):
for gci in data:
class_ = self.class_index_dict[gci.class_]
individual = self.individual_index_dict[gci.individual]
pretensor.append([class_, individual])
pretensor.append([individual, class_])
tensor = th.tensor(pretensor).to(self.device)
return tensor

Expand Down
6 changes: 3 additions & 3 deletions mowl/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from mowl.models.elembeddings.examples.model_ppi import ELEmPPI
from mowl.models.elembeddings.examples.model_gda import ELEmGDA

from mowl.models.elboxembeddings.model import ELBoxEmbeddings
from mowl.models.elboxembeddings.examples.model_ppi import ELBoxPPI
from mowl.models.elboxembeddings.examples.model_gda import ELBoxGDA
from mowl.models.elbe.model import ELBoxEmbeddings, ELBE
from mowl.models.elbe.examples.model_ppi import ELBEPPI
from mowl.models.elbe.examples.model_gda import ELBEGDA

from mowl.models.boxsquaredel.model import BoxSquaredEL

Expand Down
109 changes: 94 additions & 15 deletions mowl/models/boxsquaredel/model.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@

from mowl.nn import BoxSquaredELModule
from mowl.base_models.elmodel import EmbeddingELModel

from mowl.nn import BoxSquaredELModule
from tqdm import trange, tqdm
import torch as th
from torch import nn
import numpy as np
import logging

# Module-level logger with a StreamHandler attached at import time so that
# training progress is visible even if the host application never configures
# logging.
# NOTE(review): adding handlers and setting levels at import time in a library
# can produce duplicated log lines when the application configures logging
# itself — consider leaving handler setup to the caller.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
logger.addHandler(handler)
logger.setLevel(logging.INFO)


class BoxSquaredEL(EmbeddingELModel):
"""
Implementation based on [peng2020]_.
Implementation based on [jackermeier2023]_.
"""

def __init__(self,
Expand All @@ -27,7 +32,6 @@ def __init__(self,
):
super().__init__(dataset, embed_dim, batch_size, extended=True, model_filepath=model_filepath)


self.margin = margin
self.reg_norm = reg_norm
self.delta = delta
Expand All @@ -44,31 +48,106 @@ def init_module(self):
self.module = BoxSquaredELModule(
len(self.class_index_dict),
len(self.object_property_index_dict),
len(self.individual_index_dict),
embed_dim=self.embed_dim,
gamma=self.margin,
delta=self.delta,
reg_factor=self.reg_factor

).to(self.device)

def train(self):
raise NotImplementedError
def train(self, epochs=None, validate_every=1):
    """Default training loop for Box²EL.

    Sums the module loss over every non-empty GCI dataset, adds negative
    samples (tail corruption) for ``gci2`` and ``object_property_assertion``,
    adds the module's regularization term, and optimizes with Adam. The
    checkpoint with the best validation loss (GCI2) is saved to
    ``self.model_filepath``.

    :param epochs: number of epochs to run; defaults to ``self.epochs``.
    :param validate_every: run validation every this many epochs.
    """
    logger.warning('You are using the default training method. If you want to use a customized training method (e.g., different negative sampling, etc.), please reimplement the train method in a subclass.')

    points_per_dataset = {k: len(v) for k, v in self.training_datasets.items()}
    string = "Training datasets: \n"
    for k, v in points_per_dataset.items():
        string += f"\t{k}: {v}\n"

    logger.info(string)

    optimizer = th.optim.Adam(self.module.parameters(), lr=self.learning_rate)
    best_loss = float('inf')

    all_classes_ids = list(self.class_index_dict.values())
    all_inds_ids = list(self.individual_index_dict.values())

    if epochs is None:
        epochs = self.epochs

    def corrupt_tail(pos_data, candidate_ids):
        # Negative sampling: replace the tail (last column) with ids drawn
        # uniformly with replacement from candidate_ids.
        idxs_for_negs = np.random.choice(candidate_ids, size=len(pos_data), replace=True)
        rand_index = th.tensor(idxs_for_negs).to(self.device)
        return th.cat([pos_data[:, :2], rand_index.unsqueeze(1)], dim=1)

    for epoch in trange(epochs):
        self.module.train()

        train_loss = 0
        loss = 0

        for gci_name, gci_dataset in self.training_datasets.items():
            if len(gci_dataset) == 0:
                continue

            data = gci_dataset[:]  # slice once; reused for negative sampling
            loss += th.mean(self.module(data, gci_name))

            # Tails are classes for GCI2 axioms and individuals for
            # object-property assertions.
            if gci_name == "gci2":
                neg_data = corrupt_tail(data, all_classes_ids)
                loss += th.mean(self.module(neg_data, gci_name, neg=True))
            elif gci_name == "object_property_assertion":
                neg_data = corrupt_tail(data, all_inds_ids)
                loss += th.mean(self.module(neg_data, gci_name, neg=True))

        # Box regularization term, added once per epoch.
        # NOTE(review): the source's indentation is ambiguous here — confirm
        # this is per-epoch rather than per-GCI-dataset.
        loss += self.module.regularization_loss()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.detach().item()

        loss = 0

        if (epoch + 1) % validate_every == 0:
            if self.dataset.validation is not None:
                self.module.eval()
                with th.no_grad():
                    gci2_data = self.validation_datasets["gci2"][:]
                    valid_loss = th.mean(self.module(gci2_data, "gci2")).detach().item()

                # Keep only the checkpoint with the best validation loss.
                if valid_loss < best_loss:
                    best_loss = valid_loss
                    th.save(self.module.state_dict(), self.model_filepath)
                print(f'Epoch {epoch+1}: Train loss: {train_loss} Valid loss: {valid_loss}')
            else:
                print(f'Epoch {epoch+1}: Train loss: {train_loss}')

def eval_method(self, data):
    """Scoring function used by evaluators.

    Delegates to the module's GCI2 loss (lower is better); this commit
    replaces the previous ``gci2_score`` call, consistent with ELBE.

    :param data: tensor of (head, relation, tail) index triples.
    """
    return self.module.gci2_loss(data)

def get_embeddings(self):
    """Load the best checkpoint and return the learned embeddings.

    :returns: a 3-tuple ``(class_embeds, rel_embeds, ind_embeds)`` mapping
        names to numpy vectors; ``ind_embeds`` is ``None`` when the module
        has no individual embedding table.
    """
    self.init_module()

    print('Load the best model', self.model_filepath)
    self.load_best_model()

    ent_embeds = {
        k: v for k, v in zip(self.class_index_dict.keys(),
                             self.module.class_embed.weight.cpu().detach().numpy())}
    rel_embeds = {
        k: v for k, v in zip(self.object_property_index_dict.keys(),
                             self.module.rel_embed.weight.cpu().detach().numpy())}
    if self.module.ind_embed is not None:
        ind_embeds = {
            k: v for k, v in zip(self.individual_index_dict.keys(),
                                 self.module.ind_embed.weight.cpu().detach().numpy())}
    else:
        ind_embeds = None
    return ent_embeds, rel_embeds, ind_embeds

def load_best_model(self):
self.init_module()
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from mowl.models import ELBoxEmbeddings
from mowl.models import ELBE

from mowl.projection.factory import projector_factory
from mowl.projection.edge import Edge
Expand All @@ -12,7 +12,7 @@
from torch import nn


class ELBoxGDA(ELBoxEmbeddings):
class ELBEGDA(ELBE):
"""
Example of ELBE for gene-disease association prediction.
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from mowl.models import ELBoxEmbeddings
from mowl.models import ELBE
from mowl.projection.factory import projector_factory
from mowl.projection.edge import Edge
import math
Expand All @@ -13,7 +13,7 @@
from torch import nn


class ELBoxPPI(ELBoxEmbeddings):
class ELBEPPI(ELBE):
"""
Example of ELBE for protein-protein interaction prediction.
"""
Expand Down
156 changes: 156 additions & 0 deletions mowl/models/elbe/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from mowl.base_models.elmodel import EmbeddingELModel
from mowl.nn import ELBEModule
from tqdm import trange, tqdm
import torch as th
import numpy as np
from deprecated.sphinx import deprecated
import logging

# Module-level logger with a StreamHandler attached at import time so that
# training progress is visible even if the host application never configures
# logging.
# NOTE(review): adding handlers and setting levels at import time in a library
# can produce duplicated log lines when the application configures logging
# itself — consider leaving handler setup to the caller.
logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
logger.addHandler(handler)
logger.setLevel(logging.INFO)

class ELBE(EmbeddingELModel):
    """ELBox Embeddings (ELBE) model for EL ontologies.

    Implementation based on [peng2020]_. Positive axiom scores are regressed
    toward 0 and corrupted (negative) scores toward 1 under an MSE criterion.
    """

    def __init__(self,
                 dataset,
                 embed_dim=50,
                 margin=0,
                 reg_norm=1,
                 learning_rate=0.001,
                 epochs=1000,
                 batch_size=4096 * 8,
                 model_filepath=None,
                 device='cpu'
                 ):
        """
        :param dataset: mOWL dataset providing the EL axioms.
        :param embed_dim: dimensionality of the embeddings.
        :param margin: margin passed to the ELBE module.
        :param reg_norm: regularization norm (stored for use by subclasses).
        :param learning_rate: Adam learning rate.
        :param epochs: default number of training epochs.
        :param batch_size: mini-batch size handed to EmbeddingELModel.
        :param model_filepath: path where the best checkpoint is saved.
        :param device: torch device string, e.g. 'cpu' or 'cuda'.
        """
        super().__init__(dataset, embed_dim, batch_size, extended=True, model_filepath=model_filepath)

        self.margin = margin
        self.reg_norm = reg_norm
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.device = device
        self._loaded = False
        # NOTE(review): `extended` is passed as True to super().__init__ but
        # overwritten to False here — confirm this is intentional.
        self.extended = False
        self.init_module()

    def init_module(self):
        """(Re)create the underlying ELBEModule on the configured device."""
        self.module = ELBEModule(
            len(self.class_index_dict),
            len(self.object_property_index_dict),
            len(self.individual_index_dict),
            embed_dim=self.embed_dim,
            margin=self.margin
        ).to(self.device)

    def _corrupt_tail(self, pos_data, candidate_ids):
        """Negative sampling: replace the tail (last column) of ``pos_data``
        with ids drawn uniformly with replacement from ``candidate_ids``."""
        idxs_for_negs = np.random.choice(candidate_ids, size=len(pos_data), replace=True)
        rand_index = th.tensor(idxs_for_negs).to(self.device)
        return th.cat([pos_data[:, :2], rand_index.unsqueeze(1)], dim=1)

    def train(self, epochs=None, validate_every=1):
        """Default training loop.

        For every non-empty GCI dataset the positive score is regressed to 0;
        for ``gci2`` and ``object_property_assertion`` a corrupted-tail
        negative batch is regressed to 1. The checkpoint with the best
        validation loss (GCI2) is saved to ``self.model_filepath``.

        :param epochs: number of epochs to run; defaults to ``self.epochs``.
        :param validate_every: run validation every this many epochs.
        """
        logger.warning('You are using the default training method. If you want to use a customized training method (e.g., different negative sampling, etc.), please reimplement the train method in a subclass.')

        points_per_dataset = {k: len(v) for k, v in self.training_datasets.items()}
        string = "Training datasets: \n"
        for k, v in points_per_dataset.items():
            string += f"\t{k}: {v}\n"

        logger.info(string)

        optimizer = th.optim.Adam(self.module.parameters(), lr=self.learning_rate)
        criterion = th.nn.MSELoss()
        best_loss = float('inf')

        all_classes_ids = list(self.class_index_dict.values())
        all_inds_ids = list(self.individual_index_dict.values())

        if epochs is None:
            epochs = self.epochs

        for epoch in trange(epochs):
            self.module.train()

            train_loss = 0
            loss = 0

            for gci_name, gci_dataset in self.training_datasets.items():
                if len(gci_dataset) == 0:
                    continue

                data = gci_dataset[:]  # slice once; reused for negative sampling
                scores = th.mean(self.module(data, gci_name))
                loss += criterion(scores, th.zeros_like(scores, requires_grad=False))

                # Tails are classes for GCI2 axioms and individuals for
                # object-property assertions.
                if gci_name == "gci2":
                    neg_data = self._corrupt_tail(data, all_classes_ids)
                elif gci_name == "object_property_assertion":
                    neg_data = self._corrupt_tail(data, all_inds_ids)
                else:
                    neg_data = None

                if neg_data is not None:
                    neg_scores = th.mean(self.module(neg_data, gci_name, neg=True))
                    loss += criterion(neg_scores, th.ones_like(neg_scores, requires_grad=False))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().item()

            loss = 0

            if (epoch + 1) % validate_every == 0:
                if self.dataset.validation is not None:
                    self.module.eval()
                    with th.no_grad():
                        gci2_data = self.validation_datasets["gci2"][:]
                        valid_loss = th.mean(self.module(gci2_data, "gci2")).detach().item()

                    # Keep only the checkpoint with the best validation loss.
                    if valid_loss < best_loss:
                        best_loss = valid_loss
                        th.save(self.module.state_dict(), self.model_filepath)
                    print(f'Epoch {epoch+1}: Train loss: {train_loss} Valid loss: {valid_loss}')
                else:
                    print(f'Epoch {epoch+1}: Train loss: {train_loss}')

    def eval_method(self, data):
        """Scoring function used by evaluators: GCI2 loss (lower is better).

        :param data: tensor of (head, relation, tail) index triples.
        """
        return self.module.gci2_loss(data)

    def get_embeddings(self):
        """Load the best checkpoint and return the learned embeddings.

        :returns: a 3-tuple ``(class_embeds, rel_embeds, ind_embeds)`` mapping
            names to numpy vectors; ``ind_embeds`` is ``None`` when the module
            has no individual embedding table.
        """
        self.init_module()

        print('Load the best model', self.model_filepath)
        self.load_best_model()

        ent_embeds = {
            k: v for k, v in zip(self.class_index_dict.keys(),
                                 self.module.class_embed.weight.cpu().detach().numpy())}
        rel_embeds = {
            k: v for k, v in zip(self.object_property_index_dict.keys(),
                                 self.module.rel_embed.weight.cpu().detach().numpy())}
        if self.module.ind_embed is not None:
            ind_embeds = {
                k: v for k, v in zip(self.individual_index_dict.keys(),
                                     self.module.ind_embed.weight.cpu().detach().numpy())}
        else:
            ind_embeds = None
        return ent_embeds, rel_embeds, ind_embeds

    def load_best_model(self):
        """Reinitialize the module and load the best saved checkpoint."""
        self.init_module()
        # weights_only=True for consistency with EmbeddingELModel.from_pretrained
        # in this same commit; avoids unpickling arbitrary objects.
        self.module.load_state_dict(th.load(self.model_filepath, weights_only=True))
        self.module.eval()


# Backward-compatible alias kept so existing imports of ELBoxEmbeddings
# continue to work after the rename to ELBE.
@deprecated(version='1.0.2', reason="Use ELBE instead.")
class ELBoxEmbeddings(ELBE):
    """Deprecated alias of :class:`ELBE`."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

Loading

0 comments on commit 726cea6

Please sign in to comment.