From 52544d110cd1c4efb0b32084261b7a87887ad20b Mon Sep 17 00:00:00 2001
From: jpwchang
Date: Fri, 17 May 2024 11:20:00 -0400
Subject: [PATCH] WIP CRAFT implementation

---
 convokit/convokitConfig.py            |   5 +
 convokit/forecaster/CRAFT/CRAFTNN.py  | 241 ---------
 convokit/forecaster/CRAFT/__init__.py |   5 +-
 .../CRAFT/{CRAFTUtil.py => data.py}   | 142 ++---
 convokit/forecaster/CRAFT/model.py    | 208 ++++++++
 convokit/forecaster/CRAFT/runners.py  | 244 +++++++++
 convokit/forecaster/CRAFTModel.py     | 497 ++----
 7 files changed, 545 insertions(+), 797 deletions(-)
 delete mode 100644 convokit/forecaster/CRAFT/CRAFTNN.py
 rename convokit/forecaster/CRAFT/{CRAFTUtil.py => data.py} (67%)
 create mode 100644 convokit/forecaster/CRAFT/model.py
 create mode 100644 convokit/forecaster/CRAFT/runners.py

diff --git a/convokit/convokitConfig.py b/convokit/convokitConfig.py
index 37f3ac37..aed12f06 100644
--- a/convokit/convokitConfig.py
+++ b/convokit/convokitConfig.py
@@ -7,6 +7,7 @@
     "# Default Backend Parameters\n"
     "db_host: localhost:27017\n"
     "data_directory: ~/.convokit/saved-corpora\n"
+    "model_directory: ~/.convokit/saved-models\n"
     "default_backend: mem"
 )
 
@@ -51,6 +52,10 @@ def db_host(self):
     def data_directory(self):
         return self.config_contents.get("data_directory", "~/.convokit/saved-corpora")
 
+    @property
+    def model_directory(self):
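+        """Directory for locally cached model weights; defaults to ~/.convokit/saved-models (mirrors data_directory)."""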
+        return self.config_contents.get("model_directory", "~/.convokit/saved-models")
+
     @property
     def default_backend(self):
         return self._get_config_from_env_or_file("default_backend", "mem")
diff --git a/convokit/forecaster/CRAFT/CRAFTNN.py b/convokit/forecaster/CRAFT/CRAFTNN.py
deleted file mode 100644
index 71fa7a3f..00000000
--- a/convokit/forecaster/CRAFT/CRAFTNN.py
+++ /dev/null
@@ -1,241 +0,0 @@
-try:
-    import torch
-except (ModuleNotFoundError, ImportError) as e:
-    raise ModuleNotFoundError(
-        "torch is not currently installed. Run 'pip install convokit[craft]' if you would like to use the CRAFT model."
-    )
-
-from torch import nn
-import os
-from urllib.request import urlretrieve
-from .CRAFTUtil import CONSTANTS
-
-
-class EncoderRNN(nn.Module):
-    """
-    This module represents the utterance encoder component of CRAFT,
-    responsible for creating vector representations of utterances
-    """
-
-    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
-        super(EncoderRNN, self).__init__()
-        self.n_layers = n_layers
-        self.hidden_size = hidden_size
-        self.embedding = embedding
-
-        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
-        # because our input size is a word embedding with number of features == hidden_size
-        self.gru = nn.GRU(
-            hidden_size,
-            hidden_size,
-            n_layers,
-            dropout=(0 if n_layers == 1 else dropout),
-            bidirectional=True,
-        )
-
-    def forward(self, input_seq, input_lengths, hidden=None):
-        # Convert word indexes to embeddings
-        embedded = self.embedding(input_seq)
-        # Pack padded batch of sequences for RNN module
-        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
-        # Forward pass through GRU
-        outputs, hidden = self.gru(packed, hidden)
-        # Unpack padding
-        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
-        # Sum bidirectional GRU outputs
-        outputs = outputs[:, :, : self.hidden_size] + outputs[:, :, self.hidden_size :]
-        # Return output and final hidden state
-        return outputs, hidden
-
-
-class ContextEncoderRNN(nn.Module):
-    """This module represents the context encoder component of CRAFT, responsible for creating an order-sensitive vector representation of conversation context"""
-
-    def __init__(self, hidden_size, n_layers=1, dropout=0):
-        super(ContextEncoderRNN, self).__init__()
-        self.n_layers = n_layers
-        self.hidden_size = hidden_size
-
-        # only unidirectional GRU for context encoding
-        self.gru = nn.GRU(
-            hidden_size,
-            hidden_size,
-            n_layers,
-            dropout=(0 if n_layers == 1 else dropout),
-            bidirectional=False,
-        )
-
-    def forward(self, input_seq, input_lengths, hidden=None):
-        # Pack padded batch of sequences for RNN module
-        packed = torch.nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths)
-        # Forward pass through GRU
-        outputs, hidden = self.gru(packed, hidden)
-        # Unpack padding
-        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
-        # return output and final hidden state
-        return outputs, hidden
-
-
-class SingleTargetClf(nn.Module):
-    """This module represents the CRAFT classifier head, which takes the context encoding and uses it to make a forecast"""
-
-    def __init__(self, hidden_size, dropout=0.1):
-        super(SingleTargetClf, self).__init__()
-
-        self.hidden_size = hidden_size
-
-        # initialize classifier
-        self.layer1 = nn.Linear(hidden_size, hidden_size)
-        self.layer1_act = nn.LeakyReLU()
-        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)
-        self.layer2_act = nn.LeakyReLU()
-        self.clf = nn.Linear(hidden_size // 2, 1)
-        self.dropout = nn.Dropout(p=dropout)
-
-    def forward(self, encoder_outputs, encoder_input_lengths):
-        # from stackoverflow (https://stackoverflow.com/questions/50856936/taking-the-last-state-from-bilstm-bigru-in-pytorch)
-        # First we unsqueeze seqlengths two times so it has the same number of
-        # of dimensions as output_forward
-        # (batch_size) -> (1, batch_size, 1)
-        lengths = encoder_input_lengths.unsqueeze(0).unsqueeze(2)
-        # Then we expand it accordingly
-        # (1, batch_size, 1) -> (1, batch_size, hidden_size)
-        lengths = lengths.expand((1, -1, encoder_outputs.size(2)))
-
-        # take only the last state of the encoder for each batch
-        last_outputs = torch.gather(encoder_outputs, 0, lengths - 1).squeeze(dim=0)
-        # forward pass through hidden layers
-        layer1_out = self.layer1_act(self.layer1(self.dropout(last_outputs)))
-        layer2_out = self.layer2_act(self.layer2(self.dropout(layer1_out)))
-        # compute and return logits
-        logits = self.clf(self.dropout(layer2_out)).squeeze(dim=1)
-        return logits
-
-
-class Predictor(nn.Module):
-    """This helper module encapsulates the CRAFT pipeline, defining the logic of passing an input through each consecutive sub-module."""
-
-    def __init__(self, encoder, context_encoder, classifier):
-        super(Predictor, self).__init__()
-        self.encoder = encoder
-        self.context_encoder = context_encoder
-        self.classifier = classifier
-
-    def forward(
-        self,
-        input_batch,
-        dialog_lengths,
-        dialog_lengths_list,
-        utt_lengths,
-        batch_indices,
-        dialog_indices,
-        batch_size,
-        max_length,
-    ):
-        # Forward input through encoder model
-        _, utt_encoder_hidden = self.encoder(input_batch, utt_lengths)
-
-        # Convert utterance encoder final states to batched dialogs for use by context encoder
-        context_encoder_input = makeContextEncoderInput(
-            utt_encoder_hidden, dialog_lengths_list, batch_size, batch_indices, dialog_indices
-        )
-
-        # Forward pass through context encoder
-        context_encoder_outputs, context_encoder_hidden = self.context_encoder(
-            context_encoder_input, dialog_lengths
-        )
-
-        # Forward pass through classifier to get prediction logits
-        logits = self.classifier(context_encoder_outputs, dialog_lengths)
-
-        # Apply sigmoid activation
-        predictions = torch.sigmoid(logits)
-        return predictions
-
-
-def makeContextEncoderInput(
-    utt_encoder_hidden, dialog_lengths, batch_size, batch_indices, dialog_indices
-):
-    """The utterance encoder takes in utterances in combined batches, with no knowledge of which ones go where in which conversation.
-    Its output is therefore also unordered. We correct this by using the information computed during tensor conversion to regroup
-    the utterance vectors into their proper conversational order."""
-    # first, sum the forward and backward encoder states
-    utt_encoder_summed = utt_encoder_hidden[-2, :, :] + utt_encoder_hidden[-1, :, :]
-    # we now have hidden state of shape [utterance_batch_size, hidden_size]
-    # split it into a list of [hidden_size,] x utterance_batch_size
-    last_states = [t.squeeze() for t in utt_encoder_summed.split(1, dim=0)]
-
-    # create a placeholder list of tensors to group the states by source dialog
-    states_dialog_batched = [[None for _ in range(dialog_lengths[i])] for i in range(batch_size)]
-
-    # group the states by source dialog
-    for hidden_state, batch_idx, dialog_idx in zip(last_states, batch_indices, dialog_indices):
-        states_dialog_batched[batch_idx][dialog_idx] = hidden_state
-
-    # stack each dialog into a tensor of shape [dialog_length, hidden_size]
-    states_dialog_batched = [torch.stack(d) for d in states_dialog_batched]
-
-    # finally, condense all the dialog tensors into a single zero-padded tensor
-    # of shape [max_dialog_length, batch_size, hidden_size]
-    return torch.nn.utils.rnn.pad_sequence(states_dialog_batched)
-
-
-def initialize_model(
-    custom_model_path,
-    voc,
-    device,
-    device_type: str,
-    hidden_size,
-    encoder_n_layers,
-    dropout,
-    context_encoder_n_layers,
-):
-    print("Loading saved parameters...")
-    if custom_model_path is None:
-        if not os.path.isfile("model.tar"):
-            print("\tDownloading trained CRAFT...")
-            urlretrieve(CONSTANTS["MODEL_URL"], "model.tar")
-            print("\t...Done!")
-        custom_model_path = "model.tar"
-    # If running in a non-GPU environment, you need to tell PyTorch to convert the parameters to CPU tensor format.
-    # To do so, replace the previous line with the following:
-    if device_type == "cpu":
-        checkpoint = torch.load(custom_model_path, map_location=torch.device("cpu"))
-    elif device_type == "cuda":
-        checkpoint = torch.load(custom_model_path)
-    encoder_sd = checkpoint["en"]
-    context_sd = checkpoint["ctx"]
-    if "atk_clf" in checkpoint:
-        attack_clf_sd = checkpoint["atk_clf"]
-
-    embedding_sd = checkpoint["embedding"]
-    voc.__dict__ = checkpoint["voc_dict"]
-
-    print("Building encoders, decoder, and classifier...")
-    # Initialize word embeddings
-    embedding = nn.Embedding(voc.num_words, hidden_size)
-    embedding.load_state_dict(embedding_sd)
-    # Initialize utterance and context encoders
-    encoder: EncoderRNN = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
-    context_encoder: ContextEncoderRNN = ContextEncoderRNN(
-        hidden_size, context_encoder_n_layers, dropout
-    )
-    encoder.load_state_dict(encoder_sd)
-    context_encoder.load_state_dict(context_sd)
-    # Initialize classifier
-    attack_clf: SingleTargetClf = SingleTargetClf(hidden_size, dropout)
-    if "atk_clf" in checkpoint:
-        attack_clf.load_state_dict(attack_clf_sd)
-    # Use appropriate device
-    encoder = encoder.to(device)
-    context_encoder = context_encoder.to(device)
-    attack_clf = attack_clf.to(device)
-    print("Models built and ready to go!")
-
-    # Set dropout layers to eval mode
-    encoder.eval()
-    context_encoder.eval()
-    attack_clf.eval()
-
-    # Initialize the pipeline
-    return Predictor(encoder, context_encoder, attack_clf)
diff --git a/convokit/forecaster/CRAFT/__init__.py b/convokit/forecaster/CRAFT/__init__.py
index ce3c1739..d79367ea 100644
--- a/convokit/forecaster/CRAFT/__init__.py
+++ b/convokit/forecaster/CRAFT/__init__.py
@@ -1,2 +1,3 @@
-from .CRAFTUtil import *
-from .CRAFTNN import *
+from .data import *
+from .model import *
+from .runners import *
diff --git a/convokit/forecaster/CRAFT/CRAFTUtil.py b/convokit/forecaster/CRAFT/data.py
similarity index 67%
rename from convokit/forecaster/CRAFT/CRAFTUtil.py
rename to convokit/forecaster/CRAFT/data.py
index 28dd2d6c..7e0e77cc 100644
--- a/convokit/forecaster/CRAFT/CRAFTUtil.py
+++ b/convokit/forecaster/CRAFT/data.py
@@ -3,6 +3,7 @@
 import nltk
 import random
 import itertools
+import json
 
 try:
     import torch
@@ -13,47 +14,23 @@
 
 from typing import List, Tuple
 
-CONSTANTS = {
-    "PAD_token": 0,
-    "SOS_token": 1,
-    "EOS_token": 2,
-    "UNK_token": 3,
-    "WORD2INDEX_URL": "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/word2index.json",
-    "INDEX2WORD_URL": "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/index2word.json",
-    "MODEL_URL": "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/craft_full.tar",
-}
-
 # Default word tokens
 PAD_token = 0  # Used for padding short sentences
 SOS_token = 1  # Start-of-sentence token
 EOS_token = 2  # End-of-sentence token
 UNK_token = 3  # Unknown word token
 
+# maximum utterance length in tokens; referenced by processContext below and by
+# the training/inference runners (matches DEFAULT_CONFIG["max_length"] in CRAFTModel.py)
+MAX_LENGTH = 80
+
-# model download paths
-WORD2INDEX_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/word2index.json"
-INDEX2WORD_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/index2word.json"
-MODEL_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/craft_full.tar"
-
-
 class Voc:
-    """A class for representing the vocabulary used by a CRAFT model"""
-
     def __init__(self, name, word2index=None, index2word=None):
         self.name = name
-        self.trimmed = (
-            False if not word2index else True
-        )  # if a precomputed vocab is specified assume the speaker wants to use it as-is
+        self.trimmed = False if not word2index else True  # if a precomputed vocab is specified assume the user wants to use it as-is
         self.word2index = word2index if word2index else {"UNK": UNK_token}
         self.word2count = {}
-        self.index2word = (
-            index2word
-            if index2word
-            else {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
-        )
+        self.index2word = index2word if index2word else {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
         self.num_words = 4 if not index2word else len(index2word)  # Count SOS, EOS, PAD, UNK
 
     def addSentence(self, sentence):
-        for word in sentence.split(" "):
+        for word in sentence.split(' '):
             self.addWord(word)
 
     def addWord(self, word):
@@ -77,89 +54,93 @@ def trim(self, min_count):
             if v >= min_count:
                 keep_words.append(k)
 
-        print(
-            "keep_words {} / {} = {:.4f}".format(
-                len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
-            )
-        )
+        print('keep_words {} / {} = {:.4f}'.format(
+            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
+        ))
 
         # Reinitialize dictionaries
         self.word2index = {"UNK": UNK_token}
         self.word2count = {}
         self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
-        self.num_words = 4  # Count default tokens
+        self.num_words = 4 # Count default tokens
 
         for word in keep_words:
            self.addWord(word)
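+
+# Example (sketch) of building a small vocabulary from scratch; CRAFT normally
+# uses a precomputed vocabulary via loadPrecomputedVoc below:
+#   voc = Voc("demo")
+#   voc.addSentence("hello world")
+#   voc.num_words  # -> 6 (PAD/SOS/EOS/UNK plus "hello" and "world")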
 
-
-# Create a Voc object from precomputed data structures
-def loadPrecomputedVoc(corpus_name, word2index_url, index2word_url):
-    # load the word-to-index lookup map
-    r = requests.get(word2index_url)
-    word2index = r.json()
-    # load the index-to-word lookup map
-    r = requests.get(index2word_url)
-    index2word = r.json()
-    return Voc(corpus_name, word2index, index2word)
-
-
-# Helper functions for preprocessing and tokenizing text
-
-
 # Turn a Unicode string to plain ASCII, thanks to
 # https://stackoverflow.com/a/518232/2809427
 def unicodeToAscii(s):
-    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
-
+    return ''.join(
+        c for c in unicodedata.normalize('NFD', s)
+        if unicodedata.category(c) != 'Mn'
+    )
 
 # Tokenize the string using NLTK
-def craft_tokenize(voc, text):
-    tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r"\w+|[^\w\s]")
+def tokenize(voc, text):
+    tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+|[^\w\s]')
     # simplify the problem space by considering only ASCII data
     cleaned_text = unicodeToAscii(text.lower())
 
     # if the resulting string is empty, nothing else to do
     if not cleaned_text.strip():
         return []
-
+    
     tokens = tokenizer.tokenize(cleaned_text)
+    
+    # replace out-of-vocabulary tokens
     for i in range(len(tokens)):
         if tokens[i] not in voc.word2index:
             tokens[i] = "UNK"
-    return tokens
+    return tokens
 
-# Helper functions for turning dialog and text sequences into tensors, and manipulating those tensors
+# Create a Voc object from precomputed data structures
+def loadPrecomputedVoc(corpus_name, word2index_path, index2word_path):
+    with open(word2index_path) as fp:
+        word2index = json.load(fp)
+    with open(index2word_path) as fp:
+        index2word = json.load(fp)
+    return Voc(corpus_name, word2index, index2word)
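+
+# Example (sketch, assuming word2index.json and index2word.json have already
+# been downloaded locally, e.g. from the ConvoKit model zoo):
+#   voc = loadPrecomputedVoc("wikiconv", "word2index.json", "index2word.json")
+#   tokenize(voc, "Hello, world!")  # -> ["hello", ",", "world", "!"] (OOV tokens become "UNK")
+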
+# Given a context utterance list from Forecaster, preprocess each utterance's text by tokenizing and truncating.
+# Returns the processed dialog entry where text has been replaced with a list of
+# tokens, each no longer than MAX_LENGTH - 1 (to leave space for the EOS token)
+def processContext(voc, context, is_attack):
+    processed = []
+    for utterance in context:
+        # since the iterative nature of Forecaster may lead us to see the same utterance
+        # multiple times, we'll cache the tokenized form of the utterance as metadata
+        # and look it up if it already exists
+        if "craft_tokens" not in utterance.meta:
+            utterance.meta["craft_tokens"] = tokenize(voc, utterance.text)
+        tokens = utterance.meta["craft_tokens"]
+        if len(tokens) >= MAX_LENGTH:
+            tokens = tokens[:(MAX_LENGTH-1)]
+        processed.append({"tokens": tokens, "is_attack": is_attack, "id": utterance.id})
+    return processed
 
 def indexesFromSentence(voc, sentence):
     return [voc.word2index[word] for word in sentence] + [EOS_token]
 
-
 def zeroPadding(l, fillvalue=PAD_token):
     return list(itertools.zip_longest(*l, fillvalue=fillvalue))
 
-
-def binaryMatrix(l):
+def binaryMatrix(l, value=PAD_token):
     m = []
     for i, seq in enumerate(l):
         m.append([])
         for token in seq:
             if token == PAD_token:
-                m[i].append(0)
+                m[i].append(False)
             else:
-                m[i].append(1)
+                m[i].append(True)
     return m
 
-
 # Takes a batch of dialogs (lists of lists of tokens) and converts it into a
 # batch of utterances (lists of tokens) sorted by length, while keeping track of
 # the information needed to reconstruct the original batch of dialogs
 def dialogBatch2UtteranceBatch(dialog_batch):
-    utt_tuples = (
-        []
-    )  # will store tuples of (utterance, original position in batch, original position in dialog)
+    utt_tuples = []  # will store tuples of (utterance, original position in batch, original position in dialog)
     for batch_idx in range(len(dialog_batch)):
         dialog = dialog_batch[batch_idx]
         for dialog_idx in range(len(dialog)):
@@ -173,7 +154,6 @@ def dialogBatch2UtteranceBatch(dialog_batch):
     dialog_indices = [u[2] for u in utt_tuples]
     return utt_batch, batch_indices, dialog_indices
 
-
 # Returns padded input sequence tensor and lengths
 def inputVar(l, voc):
     indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
@@ -182,20 +162,18 @@
     padVar = torch.LongTensor(padList)
     return padVar, lengths
 
-
 # Returns padded target sequence tensor, padding mask, and max target length
 def outputVar(l, voc):
     indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
     max_target_len = max([len(indexes) for indexes in indexes_batch])
     padList = zeroPadding(indexes_batch)
     mask = binaryMatrix(padList)
-    mask = torch.ByteTensor(mask)
+    mask = torch.BoolTensor(mask)
     padVar = torch.LongTensor(padList)
     return padVar, mask, max_target_len
 
-
 # Returns all items for a given batch of pairs
-def batch2TrainData(voc, pair_batch: List[Tuple], already_sorted=False):
+def batch2TrainData(voc, pair_batch, already_sorted=False):
     if not already_sorted:
         pair_batch.sort(key=lambda x: len(x[0]), reverse=True)
     input_batch, output_batch, label_batch, id_batch = [], [], [], []
@@ -203,28 +181,13 @@
         input_batch.append(pair[0])
         output_batch.append(pair[1])
         label_batch.append(pair[2])
-        if len(pair) > 3:
-            id_batch.append(pair[3])
-        else:
-            id_batch.append(None)
+        id_batch.append(pair[3])
     dialog_lengths = torch.tensor([len(x) for x in input_batch])
     input_utterances, batch_indices, dialog_indices = dialogBatch2UtteranceBatch(input_batch)
     inp, utt_lengths = inputVar(input_utterances, voc)
     output, mask, max_target_len = outputVar(output_batch, voc)
     label_batch = torch.FloatTensor(label_batch) if label_batch[0] is not None else None
-    return (
-        inp,
-        dialog_lengths,
-        utt_lengths,
-        batch_indices,
-        dialog_indices,
-        label_batch,
-        id_batch,
-        output,
-        mask,
-        max_target_len,
-    )
-
+    return inp, dialog_lengths, utt_lengths, batch_indices, dialog_indices, label_batch, id_batch, output, mask, max_target_len
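+
+# Example (sketch) of consuming the iterator below: it cycles through the data
+# indefinitely, so callers pull a fixed number of batches with next(), e.g.:
+#   it = batchIterator(voc, dataset, batch_size)
+#   batch_tensors, batch_dialogs, batch_labels, true_batch_size = next(it)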
 
 def batchIterator(voc, source_data, batch_size, shuffle=True):
     cur_idx = 0
@@ -235,14 +198,15 @@
             cur_idx = 0
             if shuffle:
                 random.shuffle(source_data)
-        batch = source_data[cur_idx : (cur_idx + batch_size)]
+        batch = source_data[cur_idx:(cur_idx+batch_size)]
         # the true batch size may be smaller than the given batch size if there is not enough data left
         true_batch_size = len(batch)
         # ensure that the dialogs in this batch are sorted by length, as expected by the padding module
         batch.sort(key=lambda x: len(x[0]), reverse=True)
         # for analysis purposes, get the source dialogs and labels associated with this batch
         batch_dialogs = [x[0] for x in batch]
+        batch_labels = [x[2] for x in batch]
         # convert batch to tensors
         batch_tensors = batch2TrainData(voc, batch, already_sorted=True)
-        yield (batch_tensors, batch_dialogs, true_batch_size)
-        cur_idx += batch_size
+        yield (batch_tensors, batch_dialogs, batch_labels, true_batch_size)
+        cur_idx += batch_size
\ No newline at end of file
diff --git a/convokit/forecaster/CRAFT/model.py b/convokit/forecaster/CRAFT/model.py
new file mode 100644
index 00000000..b9975642
--- /dev/null
+++ b/convokit/forecaster/CRAFT/model.py
@@ -0,0 +1,208 @@
+try:
+    import torch
+except (ModuleNotFoundError, ImportError) as e:
+    raise ModuleNotFoundError(
+        "torch is not currently installed. Run 'pip install convokit[craft]' if you would like to use the CRAFT model."
+    )
+
+from torch import nn
+import torch.nn.functional as F
+
+class EncoderRNN(nn.Module):
+    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
+        super(EncoderRNN, self).__init__()
+        self.n_layers = n_layers
+        self.hidden_size = hidden_size
+        self.embedding = embedding
+
+        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
+        # because our input size is a word embedding with number of features == hidden_size
+        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
+                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
+
+    def forward(self, input_seq, input_lengths, hidden=None):
+        # Convert word indexes to embeddings
+        embedded = self.embedding(input_seq)
+        # Pack padded batch of sequences for RNN module
+        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
+        # Forward pass through GRU
+        outputs, hidden = self.gru(packed, hidden)
+        # Unpack padding
+        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
+        # Sum bidirectional GRU outputs
+        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
+        # Return output and final hidden state
+        return outputs, hidden
+
+class ContextEncoderRNN(nn.Module):
+    def __init__(self, hidden_size, n_layers=1, dropout=0):
+        super(ContextEncoderRNN, self).__init__()
+        self.n_layers = n_layers
+        self.hidden_size = hidden_size
+
+        # only unidirectional GRU for context encoding
+        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
+                          dropout=(0 if n_layers == 1 else dropout), bidirectional=False)
+
+    def forward(self, input_seq, input_lengths, hidden=None):
+        # Pack padded batch of sequences for RNN module
+        packed = torch.nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths.cpu())
+        # Forward pass through GRU
+        outputs, hidden = self.gru(packed, hidden)
+        # Unpack padding
+        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
+        # return output and final hidden state
+        return outputs, hidden
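+
+# Shape notes (inferred from usage in runners.py): EncoderRNN consumes a padded
+# utterance batch of shape (max_utt_len, utt_batch_size); its final hidden state
+# has shape (2 * n_layers, utt_batch_size, hidden_size). The forward and backward
+# halves of the top layer are summed by makeContextEncoderInput before being fed,
+# one conversation per column, to ContextEncoderRNN.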
+
+# Luong attention layer
+class Attn(torch.nn.Module):
+    def __init__(self, method, hidden_size):
+        super(Attn, self).__init__()
+        self.method = method
+        if self.method not in ['dot', 'general', 'concat']:
+            raise ValueError(self.method, "is not an appropriate attention method.")
+        self.hidden_size = hidden_size
+        if self.method == 'general':
+            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
+        elif self.method == 'concat':
+            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
+            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))
+
+    def dot_score(self, hidden, encoder_output):
+        return torch.sum(hidden * encoder_output, dim=2)
+
+    def general_score(self, hidden, encoder_output):
+        energy = self.attn(encoder_output)
+        return torch.sum(hidden * energy, dim=2)
+
+    def concat_score(self, hidden, encoder_output):
+        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
+        return torch.sum(self.v * energy, dim=2)
+
+    def forward(self, hidden, encoder_outputs):
+        # Calculate the attention weights (energies) based on the given method
+        if self.method == 'general':
+            attn_energies = self.general_score(hidden, encoder_outputs)
+        elif self.method == 'concat':
+            attn_energies = self.concat_score(hidden, encoder_outputs)
+        elif self.method == 'dot':
+            attn_energies = self.dot_score(hidden, encoder_outputs)
+
+        # Transpose max_length and batch_size dimensions
+        attn_energies = attn_energies.t()
+
+        # Return the softmax normalized probability scores (with added dimension)
+        return F.softmax(attn_energies, dim=1).unsqueeze(1)
+
+class LuongAttnDecoderRNN(nn.Module):
+    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
+        super(LuongAttnDecoderRNN, self).__init__()
+
+        # Keep for reference
+        self.attn_model = attn_model
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.n_layers = n_layers
+        self.dropout = dropout
+
+        # Define layers
+        self.embedding = embedding
+        self.embedding_dropout = nn.Dropout(dropout)
+        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
+        self.concat = nn.Linear(hidden_size * 2, hidden_size)
+        self.out = nn.Linear(hidden_size, output_size)
+
+        self.attn = Attn(attn_model, hidden_size)
+
+    def forward(self, input_step, last_hidden, encoder_outputs):
+        # Note: we run this one step (word) at a time
+        # Get embedding of current input word
+        embedded = self.embedding(input_step)
+        embedded = self.embedding_dropout(embedded)
+        # Forward through unidirectional GRU
+        rnn_output, hidden = self.gru(embedded, last_hidden)
+        # Calculate attention weights from the current GRU output
+        attn_weights = self.attn(rnn_output, encoder_outputs)
+        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
+        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
+        # Concatenate weighted context vector and GRU output using Luong eq. 5
+        rnn_output = rnn_output.squeeze(0)
+        context = context.squeeze(1)
+        concat_input = torch.cat((rnn_output, context), 1)
+        concat_output = torch.tanh(self.concat(concat_input))
+        # Predict next word using Luong eq. 6
+        output = self.out(concat_output)
+        output = F.softmax(output, dim=1)
+        # Return output and final hidden state
+        return output, hidden
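+
+# Usage note (assumed): the decoder runs one step (token) at a time, with
+# input_step a (1, batch_size) tensor of token indices and encoder_outputs
+# taken from EncoderRNN; it belongs to the generative pre-training stage and
+# is not used at forecast time, where the classifier heads below stand in.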
+
+class AttnSingleTargetClf(nn.Module):
+    def __init__(self, hidden_size, dropout=0.1):
+        super(AttnSingleTargetClf, self).__init__()
+
+        self.hidden_size = hidden_size
+
+        # initialize attention
+        self.attn = nn.Linear(hidden_size, 1)
+
+        # initialize classifier
+        self.layer1 = nn.Linear(hidden_size, hidden_size)
+        self.layer1_act = nn.LeakyReLU()
+        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)
+        self.layer2_act = nn.LeakyReLU()
+        self.clf = nn.Linear(hidden_size // 2, 1)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, encoder_outputs):
+        # compute attention weights
+        self.attn_weights = self.attn(encoder_outputs).squeeze().transpose(0,1)
+        # softmax normalize weights
+        self.attn_weights = F.softmax(self.attn_weights, dim=1).unsqueeze(1)
+        # transpose context encoder outputs so we can apply batch matrix multiply
+        encoder_outputs_transp = encoder_outputs.transpose(0,1)
+        # compute weighted context vector
+        context_vec = torch.bmm(self.attn_weights, encoder_outputs_transp).squeeze()
+        # forward pass through hidden layers
+        layer1_out = self.layer1_act(self.layer1(self.dropout(context_vec)))
+        layer2_out = self.layer2_act(self.layer2(self.dropout(layer1_out)))
+        # compute and return logits
+        logits = self.clf(self.dropout(layer2_out)).squeeze()
+        return logits
+
+class SingleTargetClf(nn.Module):
+    """
+    Single-target classifier head with no attention layer (predicts only from
+    the last state vector of the RNN)
+    """
+    def __init__(self, hidden_size, dropout=0.1):
+        super(SingleTargetClf, self).__init__()
+
+        self.hidden_size = hidden_size
+
+        # initialize classifier
+        self.layer1 = nn.Linear(hidden_size, hidden_size)
+        self.layer1_act = nn.LeakyReLU()
+        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)
+        self.layer2_act = nn.LeakyReLU()
+        self.clf = nn.Linear(hidden_size // 2, 1)
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, encoder_outputs, encoder_input_lengths):
+        # from stackoverflow (https://stackoverflow.com/questions/50856936/taking-the-last-state-from-bilstm-bigru-in-pytorch)
+        # First we unsqueeze seqlengths two times so it has the same number of
+        # of dimensions as output_forward
+        # (batch_size) -> (1, batch_size, 1)
+        lengths = encoder_input_lengths.unsqueeze(0).unsqueeze(2)
+        # Then we expand it accordingly
+        # (1, batch_size, 1) -> (1, batch_size, hidden_size)
+        lengths = lengths.expand((1, -1, encoder_outputs.size(2)))
+
+        # take only the last state of the encoder for each batch
+        # (dims are given explicitly so that a batch of size 1 is not collapsed)
+        last_outputs = torch.gather(encoder_outputs, 0, lengths-1).squeeze(dim=0)
+        # forward pass through hidden layers
+        layer1_out = self.layer1_act(self.layer1(self.dropout(last_outputs)))
+        layer2_out = self.layer2_act(self.layer2(self.dropout(layer1_out)))
+        # compute and return logits
+        logits = self.clf(self.dropout(layer2_out)).squeeze(dim=1)
+        return logits
+
diff --git a/convokit/forecaster/CRAFT/runners.py b/convokit/forecaster/CRAFT/runners.py
new file mode 100644
index 00000000..d2f6dbc1
--- /dev/null
+++ b/convokit/forecaster/CRAFT/runners.py
@@ -0,0 +1,244 @@
+try:
+    import torch
+except (ModuleNotFoundError, ImportError) as e:
+    raise ModuleNotFoundError(
+        "torch is not currently installed. Run 'pip install convokit[craft]' if you would like to use the CRAFT model."
+    )
+
+import numpy as np
+import pandas as pd
+import torch.nn.functional as F
+from torch import nn
+
+# batching helpers and the MAX_LENGTH constant live alongside the rest of the
+# data preparation code in data.py
+from .data import batchIterator, MAX_LENGTH
+
+class Predictor(nn.Module):
+    """This helper module encapsulates the CRAFT pipeline, defining the logic of passing an input through each consecutive sub-module."""
+    def __init__(self, encoder, context_encoder, classifier):
+        super(Predictor, self).__init__()
+        self.encoder = encoder
+        self.context_encoder = context_encoder
+        self.classifier = classifier
+
+    def forward(self, input_batch, dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices, batch_size, max_length):
+        # Forward input through encoder model
+        _, utt_encoder_hidden = self.encoder(input_batch, utt_lengths)
+
+        # Convert utterance encoder final states to batched dialogs for use by context encoder
+        context_encoder_input = makeContextEncoderInput(utt_encoder_hidden, dialog_lengths_list, batch_size, batch_indices, dialog_indices)
+
+        # Forward pass through context encoder
+        context_encoder_outputs, context_encoder_hidden = self.context_encoder(context_encoder_input, dialog_lengths)
+
+        # Forward pass through classifier to get prediction logits
+        logits = self.classifier(context_encoder_outputs, dialog_lengths)
+
+        # Apply sigmoid activation
+        predictions = torch.sigmoid(logits)
+        return predictions
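+
+# Usage sketch (names assumed): given tensors produced by batch2TrainData in
+# data.py, a fully assembled model can be scored in one call:
+#   predictor = Predictor(encoder, context_encoder, attack_clf)
+#   scores = predictor(input_batch, dialog_lengths, dialog_lengths_list,
+#                      utt_lengths, batch_indices, dialog_indices,
+#                      true_batch_size, MAX_LENGTH)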
+
+def makeContextEncoderInput(utt_encoder_hidden, dialog_lengths, batch_size, batch_indices, dialog_indices):
+    """The utterance encoder takes in utterances in combined batches, with no knowledge of which ones go where in which conversation.
+    Its output is therefore also unordered. We correct this by using the information computed during tensor conversion to regroup
+    the utterance vectors into their proper conversational order."""
+    # first, sum the forward and backward encoder states
+    utt_encoder_summed = utt_encoder_hidden[-2,:,:] + utt_encoder_hidden[-1,:,:]
+    # we now have hidden state of shape [utterance_batch_size, hidden_size]
+    # split it into a list of [hidden_size,] x utterance_batch_size
+    last_states = [t.squeeze() for t in utt_encoder_summed.split(1, dim=0)]
+
+    # create a placeholder list of tensors to group the states by source dialog
+    states_dialog_batched = [[None for _ in range(dialog_lengths[i])] for i in range(batch_size)]
+
+    # group the states by source dialog
+    for hidden_state, batch_idx, dialog_idx in zip(last_states, batch_indices, dialog_indices):
+        states_dialog_batched[batch_idx][dialog_idx] = hidden_state
+
+    # stack each dialog into a tensor of shape [dialog_length, hidden_size]
+    states_dialog_batched = [torch.stack(d) for d in states_dialog_batched]
+
+    # finally, condense all the dialog tensors into a single zero-padded tensor
+    # of shape [max_dialog_length, batch_size, hidden_size]
+    return torch.nn.utils.rnn.pad_sequence(states_dialog_batched)
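+
+# One optimization step over a single training batch. A typical call (names
+# assumed) mirrors the fields unpacked from batch2TrainData:
+#   loss = train(input_variable, dialog_lengths, dialog_lengths_list, utt_lengths,
+#                batch_indices, dialog_indices, labels,
+#                encoder, context_encoder, attack_clf,
+#                encoder_optimizer, context_encoder_optimizer, attack_clf_optimizer,
+#                true_batch_size, clip, device)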
+
+def train(input_variable, dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices, labels, # input/output arguments
+          encoder, context_encoder, attack_clf, # network arguments
+          encoder_optimizer, context_encoder_optimizer, attack_clf_optimizer, # optimization arguments
+          batch_size, clip, device, max_length=MAX_LENGTH): # misc arguments
+
+    # Zero gradients
+    encoder_optimizer.zero_grad()
+    context_encoder_optimizer.zero_grad()
+    attack_clf_optimizer.zero_grad()
+
+    # Set device options
+    input_variable = input_variable.to(device)
+    dialog_lengths = dialog_lengths.to(device)
+    utt_lengths = utt_lengths.to(device)
+    labels = labels.to(device)
+
+    # Forward pass through utterance encoder
+    _, utt_encoder_hidden = encoder(input_variable, utt_lengths)
+
+    # Convert utterance encoder final states to batched dialogs for use by context encoder
+    context_encoder_input = makeContextEncoderInput(utt_encoder_hidden, dialog_lengths_list, batch_size, batch_indices, dialog_indices)
+
+    # Forward pass through context encoder
+    context_encoder_outputs, _ = context_encoder(context_encoder_input, dialog_lengths)
+
+    # Forward pass through classifier to get prediction logits
+    logits = attack_clf(context_encoder_outputs, dialog_lengths)
+
+    # Calculate loss
+    loss = F.binary_cross_entropy_with_logits(logits, labels)
+
+    # Perform backpropagation
+    loss.backward()
+
+    # Clip gradients: gradients are modified in place
+    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
+    _ = torch.nn.utils.clip_grad_norm_(context_encoder.parameters(), clip)
+    _ = torch.nn.utils.clip_grad_norm_(attack_clf.parameters(), clip)
+
+    # Adjust model weights
+    encoder_optimizer.step()
+    context_encoder_optimizer.step()
+    attack_clf_optimizer.step()
+
+    return loss.item()
+
+def evaluateBatch(encoder, context_encoder, predictor, voc, input_batch, dialog_lengths,
+                  dialog_lengths_list, utt_lengths, batch_indices, dialog_indices, batch_size, device, max_length=MAX_LENGTH):
+    # Set device options
+    input_batch = input_batch.to(device)
+    dialog_lengths = dialog_lengths.to(device)
+    utt_lengths = utt_lengths.to(device)
+    # Predict future attack using predictor
+    scores = predictor(input_batch, dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices, batch_size, max_length)
+    predictions = (scores > 0.5).float()
+    return predictions, scores
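+
+# Note (assumption): evaluateBatch binarizes scores at a fixed 0.5 cutoff; the
+# DECISION_THRESHOLDS table in CRAFTModel.py suggests corpus-specific cutoffs
+# are meant to be applied downstream of these raw scores.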
+
+def validate(dataset, encoder, context_encoder, predictor, voc, batch_size, device):
+    # create a batch iterator for the given data
+    batch_iterator = batchIterator(voc, dataset, batch_size, shuffle=False)
+    # find out how many iterations we will need to cover the whole dataset
+    n_iters = len(dataset) // batch_size + int(len(dataset) % batch_size > 0)
+    # containers for full prediction results so we can compute accuracy at the end
+    all_preds = []
+    all_labels = []
+    for iteration in range(1, n_iters+1):
+        batch, batch_dialogs, _, true_batch_size = next(batch_iterator)
+        # Extract fields from batch
+        input_variable, dialog_lengths, utt_lengths, batch_indices, dialog_indices, labels, convo_ids, target_variable, mask, max_target_len = batch
+        dialog_lengths_list = [len(x) for x in batch_dialogs]
+        # run the model
+        predictions, scores = evaluateBatch(encoder, context_encoder, predictor, voc, input_variable,
+                                            dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices,
+                                            true_batch_size, device)
+        # aggregate results for computing accuracy at the end
+        all_preds += [p.item() for p in predictions]
+        all_labels += [l.item() for l in labels]
+        print("Iteration: {}; Percent complete: {:.1f}%".format(iteration, iteration / n_iters * 100))
+
+    # compute and return the accuracy
+    return (np.asarray(all_preds) == np.asarray(all_labels)).mean()
+
+def trainIters(voc, pairs, val_pairs, encoder, context_encoder, attack_clf,
+               encoder_optimizer, context_encoder_optimizer, attack_clf_optimizer, embedding,
+               n_iteration, batch_size, print_every, validate_every, clip, device):
+
+    # create a batch iterator for training data
+    batch_iterator = batchIterator(voc, pairs, batch_size)
+
+    # Initializations
+    print('Initializing ...')
+    start_iteration = 1
+    print_loss = 0
+
+    # Training loop
+    print("Training...")
+    # keep track of best validation accuracy - only save when we have a model that beats the current best
+    best_acc = 0
+    best_model = None
+    for iteration in range(start_iteration, n_iteration + 1):
+        training_batch, training_dialogs, _, true_batch_size = next(batch_iterator)
+        # Extract fields from batch
+        input_variable, dialog_lengths, utt_lengths, batch_indices, dialog_indices, labels, _, target_variable, mask, max_target_len = training_batch
+        dialog_lengths_list = [len(x) for x in training_dialogs]
+
+        # Run a training iteration with batch
+        loss = train(input_variable, dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices, labels, # input/output arguments
+                     encoder, context_encoder, attack_clf, # network arguments
+                     encoder_optimizer, context_encoder_optimizer, attack_clf_optimizer, # optimization arguments
+                     true_batch_size, clip, device) # misc arguments
+        print_loss += loss
+
+        # Print progress
+        if iteration % print_every == 0:
+            print_loss_avg = print_loss / print_every
+            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
+            print_loss = 0
+
+        # Evaluate on validation set
+        if (iteration % validate_every == 0):
+            print("Validating!")
+            # put the network components into evaluation mode
+            encoder.eval()
+            context_encoder.eval()
+            attack_clf.eval()
+
+            predictor = Predictor(encoder, context_encoder, attack_clf)
+            accuracy = validate(val_pairs, encoder, context_encoder, predictor, voc, batch_size, device)
+            print("Validation set accuracy: {:.2f}%".format(accuracy * 100))
+
+            # keep track of our best model so far
+            if accuracy > best_acc:
+                print("Validation accuracy better than current best; saving model...")
+                best_acc = accuracy
+                best_model = {
+                    'iteration': iteration,
+                    'en': encoder.state_dict(),
+                    'ctx': context_encoder.state_dict(),
+                    'atk_clf': attack_clf.state_dict(),
+                    'en_opt': encoder_optimizer.state_dict(),
+                    'ctx_opt': context_encoder_optimizer.state_dict(),
+                    'atk_clf_opt': attack_clf_optimizer.state_dict(),
+                    'loss': loss,
+                    'voc_dict': voc.__dict__,
+                    'embedding': embedding.state_dict()
+                }
+
+            # put the network components back into training mode
+            encoder.train()
+            context_encoder.train()
+            attack_clf.train()
+
+    return best_model
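+
+# Batched inference over an entire dataset of (context, reply, label, id)
+# tuples: returns a pandas DataFrame of binary predictions and raw scores,
+# indexed by the id field, which the caller can later re-join with the corpus.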
+
+def evaluateDataset(dataset, encoder, context_encoder, predictor, voc, batch_size, device):
+    # create a batch iterator for the given data
+    batch_iterator = batchIterator(voc, dataset, batch_size, shuffle=False)
+    # find out how many iterations we will need to cover the whole dataset
+    n_iters = len(dataset) // batch_size + int(len(dataset) % batch_size > 0)
+    output_df = {
+        "id": [],
+        "prediction": [],
+        "score": []
+    }
+    for iteration in range(1, n_iters+1):
+        batch, batch_dialogs, _, true_batch_size = next(batch_iterator)
+        # Extract fields from batch
+        input_variable, dialog_lengths, utt_lengths, batch_indices, dialog_indices, labels, convo_ids, target_variable, mask, max_target_len = batch
+        dialog_lengths_list = [len(x) for x in batch_dialogs]
+        # run the model
+        predictions, scores = evaluateBatch(encoder, context_encoder, predictor, voc, input_variable,
+                                            dialog_lengths, dialog_lengths_list, utt_lengths, batch_indices, dialog_indices,
+                                            true_batch_size, device)
+
+        # format the output as a dataframe (which we can later re-join with the corpus)
+        for i in range(true_batch_size):
+            convo_id = convo_ids[i]
+            pred = predictions[i].item()
+            score = scores[i].item()
+            output_df["id"].append(convo_id)
+            output_df["prediction"].append(pred)
+            output_df["score"].append(score)
+
+        print("Iteration: {}; Percent complete: {:.1f}%".format(iteration, iteration / n_iters * 100))
+
+    return pd.DataFrame(output_df).set_index("id")
diff --git a/convokit/forecaster/CRAFTModel.py b/convokit/forecaster/CRAFTModel.py
index 99214454..d958f149 100644
--- a/convokit/forecaster/CRAFTModel.py
+++ b/convokit/forecaster/CRAFTModel.py
@@ -6,17 +6,16 @@
 )
 
 import pandas as pd
-from convokit.forecaster.CRAFT.CRAFTUtil import loadPrecomputedVoc, batchIterator, CONSTANTS
-from .CRAFT.CRAFTNN import initialize_model, makeContextEncoderInput, Predictor
+from convokit.forecaster.CRAFT.data import loadPrecomputedVoc, batchIterator
+from .CRAFT.runners import Predictor, makeContextEncoderInput
 from .forecasterModel import ForecasterModel
 import numpy as np
 import torch.nn.functional as F
 from torch import optim
-from sklearn.model_selection import train_test_split
-from typing import Dict
+from typing import Dict, Union
 import os
 
-default_options = {
+DEFAULT_CONFIG = {
     "hidden_size": 500,
     "encoder_n_layers": 2,
     "context_encoder_n_layers": 2,
     "decoder_n_layers": 2,
     "dropout": 0.1,
@@ -28,475 +27,43 @@
     "print_every": 10,
     "train_epochs": 30,
     "validation_size": 0.2,
-    "max_length": 80,
-    "trained_model_output_filepath": "finetuned_model.tar",
+    "max_length": 80
+}
+
+DECISION_THRESHOLDS = {
+    "craft-wiki-pretrained": 0.570617,
+    "craft-wiki-finetuned": 0.570617,
+    "craft-cmv-pretrained": 0.548580,
+    "craft-cmv-finetuned": 0.548580
 }
 
 # To understand the separation of concerns for the CRAFT files:
-# CRAFT/craftNN.py contains the class implementations needed to initialize the CRAFT Neural Network model
-# CRAFT/craftUtil.py contains utility methods for manipulating the data for it to be passed to the CRAFT model
+# CRAFT/model.py contains the pytorch modules that comprise the CRAFT neural network
+# CRAFT/data.py contains utility methods for manipulating the data for it to be passed to the CRAFT model
+# CRAFT/runners.py adapts the scripts for training and inference of a CRAFT model
 
 
 class CRAFTModel(ForecasterModel):
     """
-    CRAFTModel is one of the Forecaster models that can be used with the Forecaster Transformer.
-
-    By default, CRAFTModel will be initialized with default options
-
-    - hidden_size: 500
-    - encoder_n_layers: 2
-    - context_encoder_n_layers: 2
-    - decoder_n_layers: 2
-    - dropout: 0.1
-    - batch_size (batch size for computation, i.e. how many (context, reply, id) tuples to use per batch of evaluation): 64
-    - clip: 50.0
-    - learning_rate: 1e-5
-    - print_every: 10
-    - train_epochs (number of epochs for training): 30
-    - validation_size (percentage of training input data to use as validation): 0.2
-    - max_length (maximum utterance length in the dataset): 80
-
-    :param device_type: 'cpu' or 'cuda', default: 'cpu'
-    :param model_path: filepath to CRAFT model if loading a custom CRAFT model
-    :param options: configuration options for the neural network: uses default options otherwise.
-    :param forecast_attribute_name: name of DataFrame column containing predictions, default: "prediction"
-    :param forecast_prob_attribute_name: name of DataFrame column containing prediction scores, default: "score"
+    A ConvoKit Forecaster-adherent reimplementation of the CRAFT conversational forecasting model from
+    the paper "Trouble on the Horizon: Forecasting the Derailment of Online Conversations as they Develop"
+    (Chang and Danescu-Niculescu-Mizil, 2019).
+
+    Usage note: CRAFT is a neural network model; full end-to-end training of neural networks is considered
+    outside the scope of ConvoKit, so the ConvoKit CRAFTModel must be initialized with existing weights.
+    ConvoKit provides weights for the CGA-WIKI and CGA-CMV corpora. If you just want to run a fully-trained
+    CRAFT model on those corpora (i.e., only transform, no fit), you can use the finetuned weights
+    (craft-wiki-finetuned and craft-cmv-finetuned, respectively). If you want to take a pretrained model and
+    finetune it on your own data (i.e., both fit and transform), you can use the pretrained weights
+    (craft-wiki-pretrained and craft-cmv-pretrained, respectively), which provide trained versions of the
+    underlying utterance and conversation encoder layers but leave the classification layers at their
+    random initializations so that they can be fitted to your data.
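+
+    A usage sketch (assumed API; this class is still a work in progress):
+
+    >>> craft_model = CRAFTModel(initial_weights="craft-wiki-finetuned")
+    >>> craft_model = CRAFTModel(initial_weights="craft-cmv-pretrained", decision_threshold=0.5)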
     """
 
     def __init__(
         self,
-        device_type: str = "cpu",
-        model_path: str = None,
-        options: Dict = None,
-        forecast_attribute_name: str = "prediction",
-        forecast_feat_name=None,
-        forecast_prob_attribute_name: str = "pred_score",
-        forecast_prob_feat_name=None,
+        initial_weights: str,
+        decision_threshold: Union[float, str] = "auto",
+        config: dict = DEFAULT_CONFIG
     ):
-        super().__init__(
-            forecast_attribute_name=forecast_attribute_name,
-            forecast_feat_name=forecast_feat_name,
-            forecast_prob_attribute_name=forecast_prob_attribute_name,
-            forecast_prob_feat_name=forecast_prob_feat_name,
-        )
-        assert device_type in ["cuda", "cpu"]
-        # device: controls GPU usage: 'cuda' to enable GPU, 'cpu' to run on CPU only.
-        self.device = torch.device(device_type)
-        self.device_type = device_type
-        # voc: the vocabulary object (convokit.forecaster.craftUtil.Voc) used by predictor.
-        # Used to convert text data into numerical input for CRAFT.
-        self.voc = loadPrecomputedVoc(
-            "wikiconv", CONSTANTS["WORD2INDEX_URL"], CONSTANTS["INDEX2WORD_URL"]
-        )
-
-        if options is None:
-            self.options = default_options
-        else:
-            for k, v in default_options.items():
-                if k not in options:
-                    options[k] = v
-            self.options = options
-        print("Initializing CRAFT model with options:")
-        print(self.options)
-
-        if model_path is not None:
-            if not os.path.isfile(model_path) or not model_path.endswith(".tar"):
-                print("Could not find CRAFT model tar file at: {}".format(model_path))
-                model_path = None
-        self.predictor: Predictor = initialize_model(
-            model_path,
-            self.voc,
-            self.device,
-            self.device_type,
-            self.options["hidden_size"],
-            self.options["encoder_n_layers"],
-            self.options["dropout"],
-            self.options["context_encoder_n_layers"],
-        )
-
-    def _evaluate_batch(
-        self,
-        predictor,
-        input_batch,
-        dialog_lengths,
-        dialog_lengths_list,
-        utt_lengths,
-        batch_indices,
-        dialog_indices,
-        true_batch_size,
-    ):
-        """
-        Helper for _evaluate_dataset. Runs CRAFT evaluation on a single batch; _evaluate_dataset calls this helper iteratively to get results for the entire dataset.
-
-        :param predictor: the trained CRAFT model to use, provided as a PyTorch Model instance.
-        :param input_batch: the batch to run CRAFT on (produced by convokit.forecaster.craftUtil.batchIterator, formatted as a batch of utterances)
-        :param dialog_lengths: how many comments are in each conversation in this batch, as a PyTorch Tensor
-        :param dialog_lengths_list: same as dialog_lengths, but as a Python List
-        :param utt_lengths: for each conversation, records the number of tokens in each utterance of the conversation
-        :param batch_indices: used by CRAFT to reconstruct the original dialog batch from the given utterance batch. Records which dialog each utterance originally came from.
-        :param dialog_indices: used by CRAFT to reconstruct the original dialog batch from the given utterance batch. Records where in the dialog the utterance originally came from.
-        :param true_batch_size: number of dialogs in the original dialog batch this utterance batch was generated from.
-
-        :return: per-utterance scores and binarized predictions.
-        """
-        # Set device options
-        input_batch = input_batch.to(self.device)
-        dialog_lengths = dialog_lengths.to(self.device)
-        utt_lengths = utt_lengths.to(self.device)
-        # Predict future attack using predictor
-        scores = predictor(
-            input_batch,
-            dialog_lengths,
-            dialog_lengths_list,
-            utt_lengths,
-            batch_indices,
-            dialog_indices,
-            true_batch_size,
-            self.options["max_length"],
-        )
-        predictions = (scores > 0.5).float()
-        return predictions, scores
-
-    def _evaluate_dataset(self, predictor, dataset):
-        """
-        Run a trained CRAFT model over an entire dataset in a batched fashion.
-
-        :param predictor: the trained CRAFT model to use, provided as a PyTorch Model instance.
-        :param dataset: the dataset to evaluate on, formatted as a list of (context, reply, id_of_reply) tuples.
-        :return: a DataFrame, indexed by utterance ID, of CRAFT scores for each utterance, and the corresponding binary prediction.
-        """
-        # create a batch iterator for the given data
-        batch_iterator = batchIterator(self.voc, dataset, self.options["batch_size"], shuffle=False)
-        # find out how many iterations we will need to cover the whole dataset
-        n_iters = len(dataset) // self.options["batch_size"] + int(
-            len(dataset) % self.options["batch_size"] > 0
-        )
-        output_df = {
-            "id": [],
-            self.forecast_attribute_name: [],
-            self.forecast_prob_attribute_name: [],
-        }
-        for iteration in range(1, n_iters + 1):
-            batch, batch_dialogs, true_batch_size = next(batch_iterator)
-            # Extract fields from batch
-            (
-                input_variable,
-                dialog_lengths,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                labels,
-                batch_ids,
-                target_variable,
-                mask,
-                max_target_len,
-            ) = batch
-            dialog_lengths_list = [len(x) for x in batch_dialogs]
-            # run the model
-            predictions, scores = self._evaluate_batch(
-                predictor,
-                input_variable,
-                dialog_lengths,
-                dialog_lengths_list,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                true_batch_size,
-            )
-
-            # format the output as a dataframe (which we can later re-join with the corpus)
-            for i in range(true_batch_size):
-                utt_id = batch_ids[i]
-                pred = predictions[i].item()
-                score = scores[i].item()
-                output_df["id"].append(utt_id)
-                output_df[self.forecast_attribute_name].append(pred)
-                output_df[self.forecast_prob_attribute_name].append(score)
-
-            print(
-                "Iteration: {}; Percent complete: {:.1f}%".format(
-                    iteration, iteration / n_iters * 100
-                )
-            )
-
-        return pd.DataFrame(output_df).set_index("id")
-
-    def _train_NN(
-        self,
-        input_variable,
-        dialog_lengths,
-        dialog_lengths_list,
-        utt_lengths,
-        batch_indices,
-        dialog_indices,
-        labels,  # input/output arguments
-        encoder,
-        context_encoder,
-        attack_clf,  # network arguments
-        encoder_optimizer,
-        context_encoder_optimizer,
-        attack_clf_optimizer,  # optimization arguments
-        batch_size,
-        clip,
-    ):  # misc arguments
-        # Zero gradients
-        encoder_optimizer.zero_grad()
-        context_encoder_optimizer.zero_grad()
-        attack_clf_optimizer.zero_grad()
-
-        # Set device options
-        input_variable = input_variable.to(self.device)
-        dialog_lengths = dialog_lengths.to(self.device)
-        utt_lengths = utt_lengths.to(self.device)
-        labels = labels.to(self.device)
-
-        # Forward pass through utterance encoder
-        _, utt_encoder_hidden = encoder(input_variable, utt_lengths)
-
-        # Convert utterance encoder final states to batched dialogs for use by context encoder
-        context_encoder_input = makeContextEncoderInput(
-            utt_encoder_hidden, dialog_lengths_list, batch_size, batch_indices, dialog_indices
-        )
-
-        # Forward pass through context encoder
-        context_encoder_outputs, _ = context_encoder(context_encoder_input, dialog_lengths)
-
-        # Forward pass through classifier to get prediction logits
-        logits = attack_clf(context_encoder_outputs, dialog_lengths)
-
-        # Calculate loss
-        loss = F.binary_cross_entropy_with_logits(logits, labels)
-
-        # Perform backpropatation
-        loss.backward()
-
-        # Clip gradients: gradients are modified in place
-        _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
-        _ = torch.nn.utils.clip_grad_norm_(context_encoder.parameters(), clip)
-        _ = torch.nn.utils.clip_grad_norm_(attack_clf.parameters(), clip)
-
-        # Adjust model weights
-        encoder_optimizer.step()
-        context_encoder_optimizer.step()
-        attack_clf_optimizer.step()
-
-        return loss.item()
-
-    def _validate(self, predictor, dataset):
-        # create a batch iterator for the given data
-        batch_iterator = batchIterator(self.voc, dataset, self.options["batch_size"], shuffle=False)
-        # find out how many iterations we will need to cover the whole dataset
-        n_iters = len(dataset) // self.options["batch_size"] + int(
-            len(dataset) % self.options["batch_size"] > 0
-        )
-        # containers for full prediction results so we can compute accuracy at the end
-        all_preds = []
-        all_labels = []
-        for iteration in range(1, n_iters + 1):
-            batch, batch_dialogs, true_batch_size = next(batch_iterator)
-            # Extract fields from batch
-            (
-                input_variable,
-                dialog_lengths,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                batch_labels,
-                batch_ids,
-                target_variable,
-                mask,
-                max_target_len,
-            ) = batch
-            dialog_lengths_list = [len(x) for x in batch_dialogs]
-            # run the model
-            predictions, scores = self._evaluate_batch(
-                predictor,
-                input_variable,
-                dialog_lengths,
-                dialog_lengths_list,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                true_batch_size,
-            )
-            # aggregate results for computing accuracy at the end
-            all_preds += [p.item() for p in predictions]
-            all_labels += [l.item() for l in batch_labels]
-            print(
-                "Iteration: {}; Percent complete: {:.1f}%".format(
-                    iteration, iteration / n_iters * 100
-                )
-            )
-
-        # compute and return the accuracy
-        return (np.asarray(all_preds) == np.asarray(all_labels)).mean()
-
-    def _train_iters(
-        self,
-        train_pairs,
-        val_pairs,
-        encoder,
-        context_encoder,
-        attack_clf,
-        encoder_optimizer,
-        context_encoder_optimizer,
-        attack_clf_optimizer,
-        embedding,
-        n_iteration,
-        validate_every,
-    ):
-        # create a batch iterator for training data
-        batch_iterator = batchIterator(self.voc, train_pairs, self.options["batch_size"])
-
-        # Initializations
-        print("Initializing ...")
-        start_iteration = 1
-        print_loss = 0
-
-        # Training loop
-        print("Training...")
-        # keep track of best validation accuracy - only save when we have a model that beats the current best
-        best_acc = 0
-        for iteration in range(start_iteration, n_iteration + 1):
-            training_batch, training_dialogs, true_batch_size = next(batch_iterator)
-            # Extract fields from batch
-            (
-                input_variable,
-                dialog_lengths,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                labels,
-                batch_ids,
-                target_variable,
-                mask,
-                max_target_len,
-            ) = training_batch
-            dialog_lengths_list = [len(x) for x in training_dialogs]
-
-            # Run a training iteration with batch
-            loss = self._train_NN(
-                input_variable,
-                dialog_lengths,
-                dialog_lengths_list,
-                utt_lengths,
-                batch_indices,
-                dialog_indices,
-                labels,  # input/output arguments
-                encoder,
-                context_encoder,
-                attack_clf,  # network arguments
-                encoder_optimizer,
-                context_encoder_optimizer,
-                attack_clf_optimizer,  # optimization arguments
-                true_batch_size,
-                self.options["clip"],
-            )  # misc arguments
-            print_loss += loss
-
-            # Print progress
-            if iteration % self.options["print_every"] == 0:
-                print_loss_avg = print_loss / self.options["print_every"]
-                print(
-                    "Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(
-                        iteration, iteration / n_iteration * 100, print_loss_avg
-                    )
-                )
-                print_loss = 0
-
-            # Evaluate on validation set
-            if iteration % validate_every == 0:
-                print("Validating!")
-                # put the network components into evaluation mode
-                encoder.eval()
-                context_encoder.eval()
-                attack_clf.eval()
-
-                predictor = Predictor(encoder, context_encoder, attack_clf)
-                accuracy = self._validate(predictor, val_pairs)
-                print("Validation set accuracy: {:.2f}%".format(accuracy * 100))
-
-                # keep track of our best model so far
-                if accuracy > best_acc:
-                    print("Validation accuracy better than current best; saving model...")
-                    best_acc = accuracy
-                    torch.save(
-                        {
-                            "iteration": iteration,
-                            "en": encoder.state_dict(),
-                            "ctx": context_encoder.state_dict(),
-                            "atk_clf": attack_clf.state_dict(),
-                            "en_opt": encoder_optimizer.state_dict(),
-                            "ctx_opt": context_encoder_optimizer.state_dict(),
-                            "atk_clf_opt": attack_clf_optimizer.state_dict(),
-                            "loss": loss,
-                            "voc_dict": self.voc.__dict__,
-                            "embedding": embedding.state_dict(),
-                        },
-                        self.options["trained_model_output_filepath"],
-                    )
-
-                # put the network components back into training mode
-                encoder.train()
-                context_encoder.train()
-                attack_clf.train()
-
-    def train(self, id_to_context_reply_label):
-        ids = list(id_to_context_reply_label)
-        train_pair_ids, val_pair_ids = train_test_split(
-            ids, test_size=self.options["validation_size"]
-        )
-        train_pairs = [id_to_context_reply_label[pair_id] for pair_id in train_pair_ids]
-        val_pairs = [id_to_context_reply_label[pair_id] for pair_id in val_pair_ids]
-        # Compute the number of training iterations we will need in order to achieve the number of epochs specified in the settings at the start of the notebook
-        n_iter_per_epoch = len(train_pairs) // self.options["batch_size"] + int(
-            len(train_pairs) % self.options["batch_size"] == 1
-        )
-        n_iteration = n_iter_per_epoch * self.options["train_epochs"]
-
-        # Put dropout layers in train mode
-        self.predictor.encoder.train()
-        self.predictor.context_encoder.train()
-        self.predictor.classifier.train()
-
-        # Initialize optimizers
-        print("Building optimizers...")
-        encoder_optimizer = optim.Adam(
-            self.predictor.encoder.parameters(), lr=self.options["learning_rate"]
-        )
-        context_encoder_optimizer = optim.Adam(
-            self.predictor.context_encoder.parameters(), lr=self.options["learning_rate"]
-        )
-        attack_clf_optimizer = optim.Adam(
-            self.predictor.classifier.parameters(), lr=self.options["learning_rate"]
-        )
-
-        # Run training iterations, validating after every epoch
-        print("Starting Training!")
-        print("Will train for {} iterations".format(n_iteration))
-        self._train_iters(
-            train_pairs,
-            val_pairs,
-            self.predictor.encoder,
-            self.predictor.context_encoder,
-            self.predictor.classifier,
-            encoder_optimizer,
-            context_encoder_optimizer,
-            attack_clf_optimizer,
-            self.predictor.encoder.embedding,
-            n_iteration,
-            n_iter_per_epoch,
-        )
-
-    def forecast(self, id_to_context_reply_label):
-        """
-        Compute forecasts and forecast scores for the given dictionary of utterance id to (context, reply) pairs. Return the values in a DataFrame.
-
-        :param id_to_context_reply_label: dict mapping utterance id to (context, reply, label)
-        :return: a pandas DataFrame
-        """
-        dataset = [
-            (context, reply, label, id_)
-            for id_, (context, reply, label) in id_to_context_reply_label.items()
-        ]
-        return self._evaluate_dataset(self.predictor, dataset)
+        super().__init__()
\ No newline at end of file