From 26777573c03c21773e633297bdf79804e9d1aa98 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 23 Jan 2023 18:13:05 +0100 Subject: [PATCH 01/14] integrate transformer-smaller-training-vocab --- flair/models/pairwise_classification_model.py | 8 +++- flair/models/relation_classifier_model.py | 8 ++++ flair/models/relation_extractor_model.py | 2 +- flair/models/tars_model.py | 9 +++- flair/nn/model.py | 20 +++++++- flair/trainers/trainer.py | 46 +++++++++++++++++++ requirements.txt | 1 + 7 files changed, 90 insertions(+), 4 deletions(-) diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index a60fac5f1..bbd386d1e 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -1,10 +1,11 @@ +import typing from typing import List import torch import flair.embeddings import flair.nn -from flair.data import Sentence, TextPair +from flair.data import Sentence, TextPair, Corpus class TextPairClassifier(flair.nn.DefaultClassifier[TextPair, TextPair]): @@ -111,3 +112,8 @@ def _init_model_with_state_dict(cls, state, **kwargs): embed_separately=state.get("embed_separately"), **kwargs, ) + + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + for sentence_pair in corpus.get_all_sentences(): + yield [t.text for t in sentence_pair.first] + yield [t.text for t in sentence_pair.second] \ No newline at end of file diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 56819bc69..6fb0eda52 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -1,5 +1,6 @@ import itertools import logging +import typing from abc import ABC, abstractmethod from typing import ( Any, @@ -705,3 +706,10 @@ def zero_tag_value(self) -> str: @property def allow_unk_tag(self) -> bool: return self._allow_unk_tag + + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + yield from super().get_used_tokens(corpus) + for sentence in corpus.get_all_sentences(): + for span in sentence.get_spans(self.label_type): + yield self.encoding_strategy.encode_head(span, span.get_label(self.label_type)) + yield self.encoding_strategy.encode_tail(span, span.get_label(self.label_type)) diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index 63dc2da6d..40d5f0772 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -6,7 +6,7 @@ import flair.embeddings import flair.nn -from flair.data import Relation, Sentence +from flair.data import Relation, Sentence, Corpus from flair.file_utils import cached_path log = logging.getLogger("flair") diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index 1bae666db..0e7139f7b 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -1,4 +1,5 @@ import logging +import typing from abc import ABC from collections import OrderedDict from pathlib import Path @@ -11,7 +12,7 @@ from tqdm import tqdm import flair -from flair.data import Dictionary, Sentence, Span +from flair.data import Dictionary, Sentence, Span, Corpus from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import ( TokenEmbeddings, @@ -308,6 +309,12 @@ def predict_zero_shot( return + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + yield from super().get_used_tokens(corpus) + for label in self.get_current_label_dictionary().idx2item.keys(): 
+ yield [label.decode("utf-8")] + yield [self.separator] + class TARSTagger(FewshotClassifier): """ diff --git a/flair/nn/model.py b/flair/nn/model.py index 35fe0deb4..46c793cfc 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -14,7 +14,7 @@ from tqdm import tqdm import flair -from flair.data import DT, DT2, Dictionary, Sentence +from flair.data import DT, DT2, Dictionary, Sentence, Corpus from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import Embeddings from flair.embeddings.base import load_embeddings @@ -233,6 +233,15 @@ def print_model_card(self): "trained or was trained with Flair version < 0.9.1)" ) + @property + def supports_smaller_training_vocab(self) -> bool: + # the smaller training vocab expects classification tasks, otherwise it won't work. + return False + + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + pass + + class Classifier(Model[DT], typing.Generic[DT], ABC): """Abstract base class for all Flair models that do classification, @@ -542,6 +551,15 @@ def _print_predictions(self, batch, gold_label_type): lines.append(eval_line) return lines + @property + def supports_smaller_training_vocab(self) -> bool: + # the smaller training vocab expects classification tasks, otherwise it won't work. + return True + + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + for sentence in corpus.get_all_sentences(): + yield [t.text for t in sentence] + class DefaultClassifier(Classifier[DT], typing.Generic[DT, DT2], ABC): """Default base class for all Flair models that do classification, both diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 48422d074..6f7329227 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -13,7 +13,9 @@ import torch from torch.optim.sgd import SGD from torch.utils.data.dataset import ConcatDataset +from transformer_smaller_training_vocab import reduce_train_vocab +from flair.embeddings import TransformerEmbeddings, Embeddings, StackedEmbeddings from flair.nn import Model try: @@ -123,6 +125,7 @@ def train( optimizer_state_dict: Optional[Dict[str, Any]] = None, scheduler_state_dict: Optional[Dict[str, Any]] = None, save_optimizer_state: bool = False, + reduce_transformer_vocab: bool = False, shuffle_first_epoch: bool = False, **kwargs, ) -> dict: @@ -387,6 +390,23 @@ def train( micro_batch_size = mini_batch_chunk_size + if not self.model.supports_smaller_training_vocab: + reduce_transformer_vocab = False + + if reduce_transformer_vocab: + transformer_embeddings = get_transformer_embeddings(self) + if not transformer_embeddings: + reduce_transformer_vocab = False + else: + tokens = list(self.model.get_used_tokens(self.corpus)) + vocab_contexts = [ + reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) + for emb in transformer_embeddings + ] + for context in vocab_contexts: + context.__enter__() + + # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) dynamic_embeddings = None @@ -867,6 +887,11 @@ def train( else: final_score = 0 log.info("Test data not provided setting final score to 0") + if reduce_transformer_vocab: + for context in vocab_contexts: + context.__exit__() + if save_final_model and not param_selection_mode: + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) if create_file_logs: log_handler.close() @@ -1081,3 +1106,24 @@ def find_learning_rate( log_line(log) return Path(learning_rate_tsv) + + +def 
get_transformer_embeddings(trainer: ModelTrainer) -> List[TransformerEmbeddings]: + embeddings = getattr(trainer.model, "embeddings", None) + + if embeddings is None: + log.warning(f"Could not extract embeddings of Model of type {type(trainer.model)}") + return [] + + transformer_embeddings = set() + + def scan_embeddings(emb: Embeddings): + if isinstance(emb, StackedEmbeddings): + for sub_emb in emb.embeddings: + scan_embeddings(sub_emb) + if isinstance(emb, TransformerEmbeddings): + transformer_embeddings.add(emb) + + scan_embeddings(embeddings) + + return list(transformer_embeddings) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 67ff199f8..dcdc4f30e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,4 @@ more-itertools wikipedia-api pptree pytorch_revgrad +transformer-smaller-training-vocab From f20cd80cea771f544e5a26c54338d73ff3edd05e Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 30 Jan 2023 13:55:57 +0100 Subject: [PATCH 02/14] extract tars embeddings too --- flair/trainers/trainer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 6f7329227..19b53a120 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -16,6 +16,7 @@ from transformer_smaller_training_vocab import reduce_train_vocab from flair.embeddings import TransformerEmbeddings, Embeddings, StackedEmbeddings +from flair.models import FewshotClassifier from flair.nn import Model try: @@ -1109,7 +1110,11 @@ def find_learning_rate( def get_transformer_embeddings(trainer: ModelTrainer) -> List[TransformerEmbeddings]: - embeddings = getattr(trainer.model, "embeddings", None) + + if isinstance(trainer.model, FewshotClassifier): + embeddings = trainer.model.tars_embeddings + else: + embeddings = getattr(trainer.model, "embeddings", None) if embeddings is None: log.warning(f"Could not extract embeddings of Model of type {type(trainer.model)}") From f717385c480fdd7170e39b341d6fc196adfd0173 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 30 Jan 2023 13:59:45 +0100 Subject: [PATCH 03/14] fix tars get vocab --- flair/models/tars_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index 0e7139f7b..d0c50e7d8 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -311,7 +311,7 @@ def predict_zero_shot( def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: yield from super().get_used_tokens(corpus) - for label in self.get_current_label_dictionary().idx2item.keys(): + for label in self.get_current_label_dictionary().idx2item: yield [label.decode("utf-8")] yield [self.separator] From acd95d3b81b1c63776844b9ab5a76d909f354917 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 30 Jan 2023 14:04:56 +0100 Subject: [PATCH 04/14] fix __exit__ call --- flair/trainers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 19b53a120..a066b43f1 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -890,7 +890,7 @@ def train( log.info("Test data not provided setting final score to 0") if reduce_transformer_vocab: for context in vocab_contexts: - context.__exit__() + context.__exit__(*sys.exc_info()) if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) From 
a9177ba4102912e74ad7fb053e3e8b0936a38bf3 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 3 Feb 2023 17:30:07 +0100 Subject: [PATCH 05/14] reduce vocab before creating optimizer --- flair/trainers/trainer.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index a066b43f1..8de232e43 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -288,6 +288,22 @@ def train( weight_extractor = WeightExtractor(base_path) + if not self.model.supports_smaller_training_vocab: + reduce_transformer_vocab = False + + if reduce_transformer_vocab: + transformer_embeddings = get_transformer_embeddings(self) + if not transformer_embeddings: + reduce_transformer_vocab = False + else: + tokens = list(self.model.get_used_tokens(self.corpus)) + vocab_contexts = [ + reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) + for emb in transformer_embeddings + ] + for context in vocab_contexts: + context.__enter__() + # if optimizer class is passed, instantiate: if inspect.isclass(optimizer): kwargs["lr"] = learning_rate @@ -391,22 +407,6 @@ def train( micro_batch_size = mini_batch_chunk_size - if not self.model.supports_smaller_training_vocab: - reduce_transformer_vocab = False - - if reduce_transformer_vocab: - transformer_embeddings = get_transformer_embeddings(self) - if not transformer_embeddings: - reduce_transformer_vocab = False - else: - tokens = list(self.model.get_used_tokens(self.corpus)) - vocab_contexts = [ - reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) - for emb in transformer_embeddings - ] - for context in vocab_contexts: - context.__enter__() - # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) dynamic_embeddings = None From bfdfc45fa00c51b4fb0af23e0fa06a582037b315 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 5 Feb 2023 04:33:10 +0100 Subject: [PATCH 06/14] fix text pair classification --- flair/models/pairwise_classification_model.py | 9 +++++---- flair/nn/model.py | 8 +++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index bbd386d1e..868cf5bcf 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -38,6 +38,7 @@ def __init__( **classifierargs, embeddings=embeddings, final_embedding_size=2 * embeddings.embedding_length if embed_separately else embeddings.embedding_length, + should_embed_sentence=False, ) self._label_type = label_type @@ -48,11 +49,11 @@ def __init__( # set separator to concatenate two sentences self.sep = " " if isinstance( - self.document_embeddings, + self.embeddings, flair.embeddings.document.TransformerDocumentEmbeddings, ): - if self.document_embeddings.tokenizer.sep_token: - self.sep = " " + str(self.document_embeddings.tokenizer.sep_token) + " " + if self.embeddings.tokenizer.sep_token: + self.sep = " " + str(self.embeddings.tokenizer.sep_token) + " " else: self.sep = " [SEP] " @@ -116,4 +117,4 @@ def _init_model_with_state_dict(cls, state, **kwargs): def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: for sentence_pair in corpus.get_all_sentences(): yield [t.text for t in sentence_pair.first] - yield [t.text for t in sentence_pair.second] \ No newline at end of file + yield [t.text for t in sentence_pair.second] diff --git a/flair/nn/model.py 
b/flair/nn/model.py index 46c793cfc..0bbebaad4 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -242,7 +242,6 @@ def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: pass - - class Classifier(Model[DT], typing.Generic[DT], ABC): """Abstract base class for all Flair models that do classification, both single- and multi-label. It inherits from flair.nn.Model and adds an @@ -544,7 +543,7 @@ def _print_predictions(self, batch, gold_label_type): correct_string = " -> MISMATCH!\n" if g != p else "" # print info eval_line = ( - f"{datapoint.to_original_text()}\n" + f"{datapoint.text}\n" f" - Gold: {', '.join(label.value if label.data_point == datapoint else label.labeled_identifier for label in datapoint.get_labels(gold_label_type))}\n" f" - Pred: {', '.join(label.value if label.data_point == datapoint else label.labeled_identifier for label in datapoint.get_labels('predicted'))}\n{correct_string}\n" ) @@ -584,6 +583,7 @@ def __init__( decoder: Optional[torch.nn.Module] = None, inverse_model: bool = False, train_on_gold_pairs_only: bool = False, + should_embed_sentence: bool = True, ): super().__init__() @@ -612,6 +612,7 @@ def __init__( self.dropout: torch.nn.Dropout = torch.nn.Dropout(dropout) self.locked_dropout = flair.nn.LockedDropout(locked_dropout) self.word_dropout = flair.nn.WordDropout(word_dropout) + self.should_embed_sentence = should_embed_sentence # loss weights and loss function self.weight_dict = loss_weights @@ -705,7 +706,8 @@ def _prepare_label_tensor(self, prediction_data_points: List[DT2]) -> torch.Tens def _encode_data_points(self, sentences: List[DT], data_points: List[DT2]): # embed sentences - self.embeddings.embed(sentences) + if self.should_embed_sentence: + self.embeddings.embed(sentences) # get a tensor of data points data_point_tensor = torch.stack([self._get_embedding_for_data_point(data_point) for data_point in data_points]) From ec983c63cd2ec03f34fb940d9a1ff69941c2e97b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 5 Feb 2023 04:33:47 +0100 Subject: [PATCH 07/14] use contextlib for cleaner contextualization --- flair/trainers/trainer.py | 928 +++++++++++++++++++------------------- 1 file changed, 467 insertions(+), 461 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 8de232e43..815e1c815 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -1,3 +1,4 @@ +import contextlib import copy import datetime import inspect @@ -291,96 +292,95 @@ def train( if not self.model.supports_smaller_training_vocab: reduce_transformer_vocab = False - if reduce_transformer_vocab: - transformer_embeddings = get_transformer_embeddings(self) - if not transformer_embeddings: - reduce_transformer_vocab = False - else: - tokens = list(self.model.get_used_tokens(self.corpus)) - vocab_contexts = [ - reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) - for emb in transformer_embeddings - ] - for context in vocab_contexts: - context.__enter__() + with contextlib.ExitStack() as context_stack: + if reduce_transformer_vocab: + transformer_embeddings = get_transformer_embeddings(self) + if not transformer_embeddings: + reduce_transformer_vocab = False + else: + tokens = list(self.model.get_used_tokens(self.corpus)) + for emb in transformer_embeddings: + context_stack.enter_context( + reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) ) - # if optimizer class is passed, instantiate: - if inspect.isclass(optimizer): - kwargs["lr"] = learning_rate - optimizer = optimizer(self.model.parameters(), **kwargs) + # if optimizer class is passed, instantiate: + if inspect.isclass(optimizer): + kwargs["lr"] = learning_rate + optimizer = 
optimizer(self.model.parameters(), **kwargs) + # if optimizer class is passed, instantiate: + if inspect.isclass(optimizer): + kwargs["lr"] = learning_rate + optimizer = optimizer(self.model.parameters(), **kwargs) - if use_swa: - import torchcontrib + if use_swa: + import torchcontrib - optimizer = torchcontrib.optim.SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=learning_rate) + optimizer = torchcontrib.optim.SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=learning_rate) - # from here on, use list of learning rates - current_learning_rate: List = [group["lr"] for group in optimizer.param_groups] + # from here on, use list of learning rates + current_learning_rate: List = [group["lr"] for group in optimizer.param_groups] - if use_amp: - self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) - - optimizer = cast(torch.optim.Optimizer, optimizer) - - # load existing optimizer state dictionary if it exists - if optimizer_state_dict: - optimizer.load_state_dict(optimizer_state_dict) - - # minimize training loss if training with dev data, else maximize dev score - anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max" - best_validation_score = 100000000000 if train_with_dev or anneal_against_dev_loss else -1.0 - - dataset_size = _len_dataset(self.corpus.train) - if train_with_dev: - dataset_size += _len_dataset(self.corpus.dev) - - # if scheduler is passed as a class, instantiate - if inspect.isclass(scheduler): - if scheduler == OneCycleLR: - scheduler = OneCycleLR( - optimizer, - max_lr=current_learning_rate, - steps_per_epoch=dataset_size // mini_batch_size + 1, - epochs=max_epochs - epoch, - # if we load a checkpoint, we have already trained for epoch - pct_start=0.0, - cycle_momentum=cycle_momentum, - ) - elif scheduler == LinearSchedulerWithWarmup: - steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size - num_train_steps = int(steps_per_epoch * max_epochs) - num_warmup_steps = int(num_train_steps * warmup_fraction) - - scheduler = LinearSchedulerWithWarmup( - optimizer, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - ) - else: - scheduler = scheduler( - optimizer, - factor=anneal_factor, - patience=patience, - initial_extra_patience=initial_extra_patience, - mode=anneal_mode, - verbose=True, - ) + if use_amp: + self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) - # Determine whether to log "bad epochs" information - log_bad_epochs = True if scheduler.__class__ == AnnealOnPlateau else False + optimizer = cast(torch.optim.Optimizer, optimizer) - # load existing scheduler state dictionary if it exists - if scheduler_state_dict: - scheduler.load_state_dict(scheduler_state_dict) + # load existing optimizer state dictionary if it exists + if optimizer_state_dict: + optimizer.load_state_dict(optimizer_state_dict) - # update optimizer and scheduler in model card - model_card["training_parameters"]["optimizer"] = optimizer - model_card["training_parameters"]["scheduler"] = scheduler + # minimize training loss if training with dev data, else maximize dev score + anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max" + best_validation_score = 100000000000 if train_with_dev or anneal_against_dev_loss else -1.0 - if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: - raise ValueError("Batch growth with OneCycle policy is not implemented.") + dataset_size = _len_dataset(self.corpus.train) + if train_with_dev: + dataset_size += 
_len_dataset(self.corpus.dev) - train_data = self.corpus.train + # if scheduler is passed as a class, instantiate + if inspect.isclass(scheduler): + if scheduler == OneCycleLR: + scheduler = OneCycleLR( + optimizer, + max_lr=current_learning_rate, + steps_per_epoch=dataset_size // mini_batch_size + 1, + epochs=max_epochs - epoch, + # if we load a checkpoint, we have already trained for epoch + pct_start=0.0, + cycle_momentum=cycle_momentum, + ) + elif scheduler == LinearSchedulerWithWarmup: + steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size + num_train_steps = int(steps_per_epoch * max_epochs) + num_warmup_steps = int(num_train_steps * warmup_fraction) + + scheduler = LinearSchedulerWithWarmup( + optimizer, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + ) + else: + scheduler = scheduler( + optimizer, + factor=anneal_factor, + patience=patience, + initial_extra_patience=initial_extra_patience, + mode=anneal_mode, + verbose=True, + ) + + # Determine whether to log "bad epochs" information + log_bad_epochs = True if scheduler.__class__ == AnnealOnPlateau else False + + # load existing scheduler state dictionary if it exists + if scheduler_state_dict: + scheduler.load_state_dict(scheduler_state_dict) + + # update optimizer and scheduler in model card + model_card["training_parameters"]["optimizer"] = optimizer + model_card["training_parameters"]["scheduler"] = scheduler + + if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: + raise ValueError("Batch growth with OneCycle policy is not implemented.") + + train_data = self.corpus.train # if training also uses dev/train data, include in training set if train_with_dev or train_with_test: @@ -390,120 +390,123 @@ def train( if train_with_test and self.corpus.test: parts.append(self.corpus.test) - train_data = ConcatDataset(parts) + train_data = ConcatDataset(parts) - # initialize sampler if provided - if sampler is not None: - # init with default values if only class is provided - if inspect.isclass(sampler): - sampler = sampler() - # set dataset to sample from - sampler.set_dataset(train_data) - shuffle = False + # initialize sampler if provided + if sampler is not None: + # init with default values if only class is provided + if inspect.isclass(sampler): + sampler = sampler() + # set dataset to sample from + sampler.set_dataset(train_data) + shuffle = False - dev_score_history = [] - dev_loss_history = [] - train_loss_history = [] + dev_score_history = [] + dev_loss_history = [] + train_loss_history = [] - micro_batch_size = mini_batch_chunk_size + micro_batch_size = mini_batch_chunk_size + # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) + dynamic_embeddings = None - # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) - dynamic_embeddings = None + # At any point you can hit Ctrl + C to break out of training early. + try: + if create_file_logs: + log_handler = add_file_handler(log, base_path / "training.log") + else: + log_handler = None - # At any point you can hit Ctrl + C to break out of training early. 
- try: - if create_file_logs: - log_handler = add_file_handler(log, base_path / "training.log") - else: - log_handler = None - - lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) - - log_line(log) - log.info(f'Model: "{self.model}"') - log_line(log) - log.info(f'Corpus: "{self.corpus}"') - log_line(log) - log.info("Parameters:") - log.info(f' - learning_rate: "{lr_info}"') - log.info(f' - mini_batch_size: "{mini_batch_size}"') - log.info(f' - patience: "{patience}"') - log.info(f' - anneal_factor: "{anneal_factor}"') - log.info(f' - max_epochs: "{max_epochs}"') - log.info(f' - shuffle: "{shuffle}"') - log.info(f' - train_with_dev: "{train_with_dev}"') - log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') - log_line(log) - log.info(f'Model training base path: "{base_path}"') - log_line(log) - log.info(f"Device: {flair.device}") - log_line(log) - log.info(f"Embeddings storage mode: {embeddings_storage_mode}") - - previous_learning_rate = current_learning_rate - - momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer.param_groups] - - for epoch in range(epoch + 1, max_epochs + 1): + lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) + + log_line(log) + log.info(f'Model: "{self.model}"') + log_line(log) + log.info(f'Corpus: "{self.corpus}"') log_line(log) + log.info("Parameters:") + log.info(f' - learning_rate: "{lr_info}"') + log.info(f' - mini_batch_size: "{mini_batch_size}"') + log.info(f' - patience: "{patience}"') + log.info(f' - anneal_factor: "{anneal_factor}"') + log.info(f' - max_epochs: "{max_epochs}"') + log.info(f' - shuffle: "{shuffle}"') + log.info(f' - train_with_dev: "{train_with_dev}"') + log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') + log_line(log) + log.info(f'Model training base path: "{base_path}"') + log_line(log) + log.info(f"Device: {flair.device}") + log_line(log) + log.info(f"Embeddings storage mode: {embeddings_storage_mode}") + + previous_learning_rate = current_learning_rate - # update epoch in model card - model_card["training_parameters"]["epoch"] = epoch + momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer.param_groups] - if anneal_with_prestarts: - last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) + for epoch in range(epoch + 1, max_epochs + 1): + log_line(log) - if eval_on_train_shuffle: - train_part_indices = list(range(_len_dataset(self.corpus.train))) - random.shuffle(train_part_indices) - train_part_indices = train_part_indices[:train_part_size] - train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) + # update epoch in model card + model_card["training_parameters"]["epoch"] = epoch - # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + if anneal_with_prestarts: + last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) - lr_changed = any([lr != prev_lr for lr, prev_lr in zip(current_learning_rate, previous_learning_rate)]) + if eval_on_train_shuffle: + train_part_indices = list(range(_len_dataset(self.corpus.train))) + random.shuffle(train_part_indices) + train_part_indices = train_part_indices[:train_part_size] + train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) - if lr_changed and batch_growth_annealing: - mini_batch_size *= 2 + # get new learning rate + current_learning_rate = [group["lr"] for group in optimizer.param_groups] - # reload last best model if annealing with restarts is 
enabled - if ( - (anneal_with_restarts or anneal_with_prestarts) - and lr_changed - and os.path.exists(base_path / "best-model.pt") - ): - if anneal_with_restarts: - log.info("resetting to best model") - self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict()) - if anneal_with_prestarts: - log.info("resetting to pre-best model") - self.model.load_state_dict(self.model.load(base_path / "pre-best-model.pt").state_dict()) + lr_changed = any( + [lr != prev_lr for lr, prev_lr in zip(current_learning_rate, previous_learning_rate)] + ) - previous_learning_rate = current_learning_rate - if use_tensorboard: - if len(current_learning_rate) == 1: - writer.add_scalar("learning_rate", current_learning_rate[0], epoch) - else: - for i, lr in enumerate(current_learning_rate): - writer.add_scalar(f"learning_rate_{i}", lr, epoch) + if lr_changed and batch_growth_annealing: + mini_batch_size *= 2 - all_lrs_too_small = all([lr < min_lr for lr, min_lr in zip(current_learning_rate, min_learning_rate)]) + # reload last best model if annealing with restarts is enabled + if ( + (anneal_with_restarts or anneal_with_prestarts) + and lr_changed + and os.path.exists(base_path / "best-model.pt") + ): + if anneal_with_restarts: + log.info("resetting to best model") + self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict()) + if anneal_with_prestarts: + log.info("resetting to pre-best model") + self.model.load_state_dict(self.model.load(base_path / "pre-best-model.pt").state_dict()) + + previous_learning_rate = current_learning_rate + if use_tensorboard: + if len(current_learning_rate) == 1: + writer.add_scalar("learning_rate", current_learning_rate[0], epoch) + else: + for i, lr in enumerate(current_learning_rate): + writer.add_scalar(f"learning_rate_{i}", lr, epoch) - # stop training if learning rate becomes too small - if not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and all_lrs_too_small: - log_line(log) - log.info("learning rate too small - quitting training!") - log_line(log) - break + all_lrs_too_small = all( + [lr < min_lr for lr, min_lr in zip(current_learning_rate, min_learning_rate)] + ) + + # stop training if learning rate becomes too small + if not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and all_lrs_too_small: + log_line(log) + log.info("learning rate too small - quitting training!") + log_line(log) + break - start_time = time.time() + start_time = time.time() - # if shuffle_first_epoch==False, the first epoch is not shuffled - shuffle_data_this_epoch = shuffle - if not shuffle_first_epoch and epoch == 1: - shuffle_data_this_epoch = False + # if shuffle_first_epoch==False, the first epoch is not shuffled + shuffle_data_this_epoch = shuffle + if not shuffle_first_epoch and epoch == 1: + shuffle_data_this_epoch = False batch_loader = DataLoader( train_data, @@ -513,14 +516,14 @@ def train( sampler=sampler, ) - self.model.train() + self.model.train() - train_loss: float = 0 + train_loss: float = 0 - seen_batches = 0 - total_number_of_batches = len(batch_loader) + seen_batches = 0 + total_number_of_batches = len(batch_loader) - modulo = max(1, int(total_number_of_batches / 10)) + modulo = max(1, int(total_number_of_batches / 10)) # process mini-batches average_over = 0 @@ -529,10 +532,12 @@ def train( self.model.zero_grad() optimizer.zero_grad() - # if necessary, make batch_steps - batch_steps = [batch] - if len(batch) > micro_batch_size: - batch_steps = [batch[x : x + micro_batch_size] for x in range(0, 
len(batch), micro_batch_size)] + # if necessary, make batch_steps + batch_steps = [batch] + if len(batch) > micro_batch_size: + batch_steps = [ + batch[x : x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) + ] # forward and backward for batch for batch_step in batch_steps: @@ -552,86 +557,88 @@ def train( if dynamic_embeddings is None: dynamic_embeddings = identify_dynamic_embeddings(batch) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(batch, embeddings_storage_mode, dynamic_embeddings) - - # do the optimizer step - torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) - optimizer.step() + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(batch, embeddings_storage_mode, dynamic_embeddings) - # do the scheduler step if one-cycle or linear decay - if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): - scheduler.step() - # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + # do the optimizer step + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) + optimizer.step() - momentum = [ - group["betas"][0] if "betas" in group else group.get("momentum", 0) - for group in optimizer.param_groups - ] + # do the scheduler step if one-cycle or linear decay + if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): + scheduler.step() + # get new learning rate + current_learning_rate = [group["lr"] for group in optimizer.param_groups] - seen_batches += 1 + momentum = [ + group["betas"][0] if "betas" in group else group.get("momentum", 0) + for group in optimizer.param_groups + ] - if seen_batches % modulo == 0: - momentum_info = "" - if cycle_momentum: - momentum_info = " - momentum:" + ",".join([f"{m:.4f}" for m in momentum]) + seen_batches += 1 - lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) + if seen_batches % modulo == 0: + momentum_info = "" + if cycle_momentum: + momentum_info = " - momentum:" + ",".join([f"{m:.4f}" for m in momentum]) - intermittent_loss = train_loss / average_over if average_over > 0 else train_loss / seen_batches - end_time = time.time() - log.info( - f"epoch {epoch}" - f" - iter {seen_batches}/{total_number_of_batches}" - f" - loss {intermittent_loss:.8f}" - f" - time (sec): {end_time - start_time:.2f}" - f" - samples/sec: {average_over / (end_time - start_time):.2f}" - f" - lr: {lr_info}{momentum_info}" - ) - iteration = epoch * total_number_of_batches + batch_no - if not param_selection_mode and write_weights: - weight_extractor.extract_weights(self.model.state_dict(), iteration) + lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) - if average_over != 0: - train_loss /= average_over + intermittent_loss = ( + train_loss / average_over if average_over > 0 else train_loss / seen_batches + ) + end_time = time.time() + log.info( + f"epoch {epoch}" + f" - iter {seen_batches}/{total_number_of_batches}" + f" - loss {intermittent_loss:.8f}" + f" - time (sec): {end_time - start_time:.2f}" + f" - samples/sec: {average_over / (end_time - start_time):.2f}" + f" - lr: {lr_info}{momentum_info}" + ) + iteration = epoch * total_number_of_batches + batch_no + if not param_selection_mode and write_weights: + weight_extractor.extract_weights(self.model.state_dict(), iteration) - self.model.eval() + if average_over != 0: + train_loss /= average_over - if save_model_each_k_epochs > 0 and epoch % save_model_each_k_epochs == 0: - log.info("saving model of current epoch") 
- model_name = "model_epoch_" + str(epoch) + ".pt" - self.model.save(base_path / model_name, checkpoint=save_optimizer_state) + self.model.eval() - log_line(log) - log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {lr_info}") + if save_model_each_k_epochs > 0 and epoch % save_model_each_k_epochs == 0: + log.info("saving model of current epoch") + model_name = "model_epoch_" + str(epoch) + ".pt" + self.model.save(base_path / model_name, checkpoint=save_optimizer_state) - if use_tensorboard: - writer.add_scalar("train_loss", train_loss, epoch) - - # evaluate on train / dev / test split depending on training settings - result_line: str = "" + log_line(log) + log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {lr_info}") - if log_train: - train_eval_result = self.model.evaluate( - self.corpus.train, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{train_eval_result.loss}\t{train_eval_result.log_line}" - log.info( - f"TRAIN : loss {train_eval_result.loss} -" - f" {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]}) " - f" {round(train_eval_result.main_score, 4)}" - ) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(self.corpus.train, embeddings_storage_mode, dynamic_embeddings) + if use_tensorboard: + writer.add_scalar("train_loss", train_loss, epoch) + + # evaluate on train / dev / test split depending on training settings + result_line: str = "" + + if log_train: + train_eval_result = self.model.evaluate( + self.corpus.train, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{train_eval_result.loss}\t{train_eval_result.log_line}" + log.info( + f"TRAIN : loss {train_eval_result.loss} -" + f" {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]}) " + f" {round(train_eval_result.main_score, 4)}" + ) + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(self.corpus.train, embeddings_storage_mode, dynamic_embeddings) if log_train_part: train_part_eval_result = self.model.evaluate( @@ -659,140 +666,140 @@ def train( epoch, ) - if log_dev: - assert self.corpus.dev - dev_eval_result = self.model.evaluate( - self.corpus.dev, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - out_path=base_path / "dev.tsv", - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}" - log.info( - f"DEV : loss {dev_eval_result.loss}" - f" - {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]})" - f" {round(dev_eval_result.main_score, 4)}" - ) - # calculate scores using dev data if available - # append dev score to score history - dev_score_history.append(dev_eval_result.main_score) - dev_loss_history.append(dev_eval_result.loss) - - dev_score = dev_eval_result.main_score - - # depending on memory mode, embeddings are 
moved to CPU, GPU or deleted - store_embeddings(self.corpus.dev, embeddings_storage_mode, dynamic_embeddings) - - if use_tensorboard: - writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) - writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) - for ( - metric_class_avg_type, - metric_type, - ) in metrics_for_tensorboard: - writer.add_scalar( - f"dev_{metric_class_avg_type}_{metric_type}", - dev_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, - ) + if log_dev: + assert self.corpus.dev + dev_eval_result = self.model.evaluate( + self.corpus.dev, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + out_path=base_path / "dev.tsv", + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}" + log.info( + f"DEV : loss {dev_eval_result.loss}" + f" - {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]})" + f" {round(dev_eval_result.main_score, 4)}" + ) + # calculate scores using dev data if available + # append dev score to score history + dev_score_history.append(dev_eval_result.main_score) + dev_loss_history.append(dev_eval_result.loss) - if log_test: - assert self.corpus.test - test_eval_result = self.model.evaluate( - self.corpus.test, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - out_path=base_path / "test.tsv", - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{test_eval_result.loss}\t{test_eval_result.log_line}" - log.info( - f"TEST : loss {test_eval_result.loss} -" - f" {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]}) " - f" {round(test_eval_result.main_score, 4)}" - ) + dev_score = dev_eval_result.main_score - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(self.corpus.test, embeddings_storage_mode, dynamic_embeddings) + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(self.corpus.dev, embeddings_storage_mode, dynamic_embeddings) + + if use_tensorboard: + writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) + writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) + for ( + metric_class_avg_type, + metric_type, + ) in metrics_for_tensorboard: + writer.add_scalar( + f"dev_{metric_class_avg_type}_{metric_type}", + dev_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, + ) - if use_tensorboard: - writer.add_scalar("test_loss", test_eval_result.loss, epoch) - writer.add_scalar("test_score", test_eval_result.main_score, epoch) - for ( - metric_class_avg_type, - metric_type, - ) in metrics_for_tensorboard: - writer.add_scalar( - f"test_{metric_class_avg_type}_{metric_type}", - test_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, - ) + if log_test: + assert self.corpus.test + test_eval_result = self.model.evaluate( + self.corpus.test, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + out_path=base_path / "test.tsv", + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + 
gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{test_eval_result.loss}\t{test_eval_result.log_line}" + log.info( + f"TEST : loss {test_eval_result.loss} -" + f" {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]}) " + f" {round(test_eval_result.main_score, 4)}" + ) - # determine if this is the best model or if we need to anneal - current_epoch_has_best_model_so_far = False - # default mode: anneal against dev score - if not train_with_dev and not anneal_against_dev_loss: - if dev_score > best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = dev_score - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(dev_score, dev_eval_result.loss) - - # alternative: anneal against dev loss - if not train_with_dev and anneal_against_dev_loss: - if dev_eval_result.loss < best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = dev_eval_result.loss - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(dev_eval_result.loss) - - # alternative: anneal against train loss - if train_with_dev: - if train_loss < best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = train_loss - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(train_loss) - - train_loss_history.append(train_loss) - - # determine bad epoch number - try: - bad_epochs = scheduler.num_bad_epochs - except AttributeError: - bad_epochs = 0 - - new_learning_rate = [group["lr"] for group in optimizer.param_groups] - - if any([new_lr != prev_lr for new_lr, prev_lr in zip(new_learning_rate, previous_learning_rate)]): - bad_epochs = patience + 1 - - # lr unchanged - if all( - [ - prev_lr == initial_lr - for prev_lr, initial_lr in zip(previous_learning_rate, initial_learning_rate) - ] - ): - bad_epochs += initial_extra_patience + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(self.corpus.test, embeddings_storage_mode, dynamic_embeddings) + + if use_tensorboard: + writer.add_scalar("test_loss", test_eval_result.loss, epoch) + writer.add_scalar("test_score", test_eval_result.main_score, epoch) + for ( + metric_class_avg_type, + metric_type, + ) in metrics_for_tensorboard: + writer.add_scalar( + f"test_{metric_class_avg_type}_{metric_type}", + test_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, + ) - # log bad epochs - if log_bad_epochs: - log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") + # determine if this is the best model or if we need to anneal + current_epoch_has_best_model_so_far = False + # default mode: anneal against dev score + if not train_with_dev and not anneal_against_dev_loss: + if dev_score > best_validation_score: + current_epoch_has_best_model_so_far = True + best_validation_score = dev_score + + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_score, dev_eval_result.loss) + + # alternative: anneal against dev loss + if not train_with_dev and anneal_against_dev_loss: + if dev_eval_result.loss < best_validation_score: + current_epoch_has_best_model_so_far = True + best_validation_score = dev_eval_result.loss + + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_eval_result.loss) + + # alternative: anneal against train loss + if train_with_dev: + if train_loss < best_validation_score: + current_epoch_has_best_model_so_far = True + best_validation_score = train_loss + + if isinstance(scheduler, AnnealOnPlateau): 
+ scheduler.step(train_loss) + + train_loss_history.append(train_loss) + + # determine bad epoch number + try: + bad_epochs = scheduler.num_bad_epochs + except AttributeError: + bad_epochs = 0 + + new_learning_rate = [group["lr"] for group in optimizer.param_groups] + + if any([new_lr != prev_lr for new_lr, prev_lr in zip(new_learning_rate, previous_learning_rate)]): + bad_epochs = patience + 1 + + # lr unchanged + if all( + [ + prev_lr == initial_lr + for prev_lr, initial_lr in zip(previous_learning_rate, initial_learning_rate) + ] + ): + bad_epochs += initial_extra_patience + + # log bad epochs + if log_bad_epochs: + log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") if loss_txt is not None: # output log file @@ -802,95 +809,95 @@ def train( bad_epoch_header = "BAD_EPOCHS\t" if log_bad_epochs else "" f.write(f"EPOCH\tTIMESTAMP\t{bad_epoch_header}LEARNING_RATE\tTRAIN_LOSS") - if log_train: - f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) - - if log_train_part: - f.write( - "\tTRAIN_PART_LOSS\tTRAIN_PART_" - + "\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t")) - ) + if log_train: + f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) - if log_dev: - f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t"))) + if log_train_part: + f.write( + "\tTRAIN_PART_LOSS\tTRAIN_PART_" + + "\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t")) + ) - if log_test: - f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t"))) + if log_dev: + f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t"))) - lr_info = ",".join([f"{lr:.4f}" for lr in current_learning_rate]) + if log_test: + f.write( + "\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t")) + ) - bad_epoch_info = "\t" + str(bad_epochs) if log_bad_epochs else "" - f.write( - f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}" - f"{bad_epoch_info}" - f"\t{lr_info}\t{train_loss}" - ) - f.write(result_line) - - # if checkpoint is enabled, save model at each epoch - if checkpoint and not param_selection_mode: - self.model.save(base_path / "checkpoint.pt", checkpoint=True) - - # Check whether to save best model - if ( - (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) - and not param_selection_mode - and current_epoch_has_best_model_so_far - and not use_final_model_for_eval - ): - log.info("saving best model") - self.model.save(base_path / "best-model.pt", checkpoint=save_optimizer_state) + lr_info = ",".join([f"{lr:.4f}" for lr in current_learning_rate]) - if anneal_with_prestarts: - current_state_dict = self.model.state_dict() - self.model.load_state_dict(last_epoch_model_state_dict) - self.model.save(base_path / "pre-best-model.pt") - self.model.load_state_dict(current_state_dict) + bad_epoch_info = "\t" + str(bad_epochs) if log_bad_epochs else "" + f.write( + f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}" + f"{bad_epoch_info}" + f"\t{lr_info}\t{train_loss}" + ) + f.write(result_line) + + # if checkpoint is enabled, save model at each epoch + if checkpoint and not param_selection_mode: + self.model.save(base_path / "checkpoint.pt", checkpoint=True) + + # Check whether to save best model + if ( + (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) + and not param_selection_mode + and current_epoch_has_best_model_so_far + and not use_final_model_for_eval + ): + log.info("saving best model") + self.model.save(base_path / 
"best-model.pt", checkpoint=save_optimizer_state) - if use_swa: - import torchcontrib + if anneal_with_prestarts: + current_state_dict = self.model.state_dict() + self.model.load_state_dict(last_epoch_model_state_dict) + self.model.save(base_path / "pre-best-model.pt") + self.model.load_state_dict(current_state_dict) - cast(torchcontrib.optim.SWA, optimizer).swap_swa_sgd() + if use_swa: + import torchcontrib - # if we do not use dev data for model selection, save final model - if save_final_model and not param_selection_mode: - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + cast(torchcontrib.optim.SWA, optimizer).swap_swa_sgd() - except KeyboardInterrupt: - log_line(log) - log.info("Exiting from training early.") + # if we do not use dev data for model selection, save final model + if save_final_model and not param_selection_mode: + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) - if not param_selection_mode: - log.info("Saving model ...") - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) - log.info("Done.") - except Exception: - if create_file_logs: - log_handler.close() - log.removeHandler(log_handler) - raise - finally: - if use_tensorboard: - writer.close() - optimizer.zero_grad(set_to_none=True) - del optimizer - - # test best model if test data is present - if self.corpus.test and not train_with_test: - final_score = self.final_test( - base_path=base_path, - eval_mini_batch_size=eval_batch_size, - num_workers=num_workers, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary_for_eval=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - else: - final_score = 0 - log.info("Test data not provided setting final score to 0") + except KeyboardInterrupt: + log_line(log) + log.info("Exiting from training early.") + + if not param_selection_mode: + log.info("Saving model ...") + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + log.info("Done.") + except Exception: + if create_file_logs: + log_handler.close() + log.removeHandler(log_handler) + raise + finally: + if use_tensorboard: + writer.close() + optimizer.zero_grad(set_to_none=True) + del optimizer + + # test best model if test data is present + if self.corpus.test and not train_with_test: + final_score = self.final_test( + base_path=base_path, + eval_mini_batch_size=eval_batch_size, + num_workers=num_workers, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary_for_eval=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + else: + final_score = 0 + log.info("Test data not provided setting final score to 0") if reduce_transformer_vocab: - for context in vocab_contexts: - context.__exit__(*sys.exc_info()) if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) @@ -1110,7 +1117,6 @@ def find_learning_rate( def get_transformer_embeddings(trainer: ModelTrainer) -> List[TransformerEmbeddings]: - if isinstance(trainer.model, FewshotClassifier): embeddings = trainer.model.tars_embeddings else: @@ -1131,4 +1137,4 @@ def scan_embeddings(emb: Embeddings): scan_embeddings(embeddings) - return list(transformer_embeddings) \ No newline at end of file + return list(transformer_embeddings) From 99e727fe616ea7167bfe215fa28d3d94fe449a92 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 5 Feb 2023 04:47:36 +0100 Subject: [PATCH 08/14] fix typing and formatting --- 
flair/models/pairwise_classification_model.py | 6 +++--- flair/models/relation_classifier_model.py | 19 ++++++++++++++----- flair/models/relation_extractor_model.py | 2 +- flair/models/tars_model.py | 5 +++-- flair/nn/model.py | 8 ++++---- flair/trainers/trainer.py | 2 +- 6 files changed, 26 insertions(+), 16 deletions(-) diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index 868cf5bcf..5e5162a7b 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -5,7 +5,7 @@ import flair.embeddings import flair.nn -from flair.data import Sentence, TextPair, Corpus +from flair.data import Corpus, Sentence, TextPair, _iter_dataset class TextPairClassifier(flair.nn.DefaultClassifier[TextPair, TextPair]): @@ -114,7 +114,7 @@ def _init_model_with_state_dict(cls, state, **kwargs): **kwargs, ) - def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: - for sentence_pair in corpus.get_all_sentences(): + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: + for sentence_pair in _iter_dataset(corpus.get_all_sentences()): yield [t.text for t in sentence_pair.first] yield [t.text for t in sentence_pair.second] diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 6fb0eda52..426226714 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -20,7 +20,16 @@ from torch.utils.data.dataset import Dataset import flair -from flair.data import Corpus, Dictionary, Label, Relation, Sentence, Span, Token +from flair.data import ( + Corpus, + Dictionary, + Label, + Relation, + Sentence, + Span, + Token, + _iter_dataset, +) from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import DocumentEmbeddings, TransformerDocumentEmbeddings from flair.tokenization import SpaceTokenizer @@ -707,9 +716,9 @@ def zero_tag_value(self) -> str: def allow_unk_tag(self) -> bool: return self._allow_unk_tag - def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: yield from super().get_used_tokens(corpus) - for sentence in corpus.get_all_sentences(): + for sentence in _iter_dataset(corpus.get_all_sentences()): for span in sentence.get_spans(self.label_type): - yield self.encoding_strategy.encode_head(span, span.get_label(self.label_type)) - yield self.encoding_strategy.encode_tail(span, span.get_label(self.label_type)) + yield self.encoding_strategy.encode_head(span, span.get_label(self.label_type)).split(" ") + yield self.encoding_strategy.encode_tail(span, span.get_label(self.label_type)).split(" ") diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index 40d5f0772..63dc2da6d 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -6,7 +6,7 @@ import flair.embeddings import flair.nn -from flair.data import Relation, Sentence, Corpus +from flair.data import Relation, Sentence from flair.file_utils import cached_path log = logging.getLogger("flair") diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index d0c50e7d8..b56f63534 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -12,7 +12,7 @@ from tqdm import tqdm import flair -from flair.data import Dictionary, Sentence, Span, Corpus +from flair.data import Corpus, Dictionary, Sentence, Span from 
flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import ( TokenEmbeddings, @@ -33,6 +33,7 @@ def __init__(self): self._task_specific_attributes = {} self.label_nearest_map = None self.tars_model: flair.nn.Classifier[Sentence] + self.separator: str super(FewshotClassifier, self).__init__() @@ -309,7 +310,7 @@ def predict_zero_shot( return - def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: yield from super().get_used_tokens(corpus) for label in self.get_current_label_dictionary().idx2item: yield [label.decode("utf-8")] diff --git a/flair/nn/model.py b/flair/nn/model.py index 0bbebaad4..f8d6aa0e9 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -14,7 +14,7 @@ from tqdm import tqdm import flair -from flair.data import DT, DT2, Dictionary, Sentence, Corpus +from flair.data import DT, DT2, Corpus, Dictionary, Sentence, _iter_dataset from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings import Embeddings from flair.embeddings.base import load_embeddings @@ -238,7 +238,7 @@ def supports_smaller_training_vocab(self) -> bool: # the smaller training vocab expects classification tasks, otherwise it won't work. return False - def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: pass @@ -555,8 +555,8 @@ def supports_smaller_training_vocab(self) -> bool: # the smaller training vocab expects classification tasks, otherwise it won't work. return True - def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[str]: - for sentence in corpus.get_all_sentences(): + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: + for sentence in _iter_dataset(corpus.get_all_sentences()): yield [t.text for t in sentence] diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 815e1c815..1491dc677 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -16,7 +16,7 @@ from torch.utils.data.dataset import ConcatDataset from transformer_smaller_training_vocab import reduce_train_vocab -from flair.embeddings import TransformerEmbeddings, Embeddings, StackedEmbeddings +from flair.embeddings import Embeddings, StackedEmbeddings, TransformerEmbeddings from flair.models import FewshotClassifier from flair.nn import Model From 1a785d9b099fee6706eb84f9ee4e0b335fdc55fa Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 5 Feb 2023 11:59:20 +0100 Subject: [PATCH 09/14] reformatting with updated black release --- flair/trainers/trainer.py | 120 +++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 1491dc677..5eefb4388 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -508,13 +508,13 @@ def train( if not shuffle_first_epoch and epoch == 1: shuffle_data_this_epoch = False - batch_loader = DataLoader( - train_data, - batch_size=mini_batch_size, - shuffle=shuffle_data_this_epoch, - num_workers=0 if num_workers is None else num_workers, - sampler=sampler, - ) + batch_loader = DataLoader( + train_data, + batch_size=mini_batch_size, + shuffle=shuffle_data_this_epoch, + num_workers=0 if num_workers is None else num_workers, + sampler=sampler, + ) self.model.train() @@ -525,12 +525,12 @@ def train( modulo = max(1, int(total_number_of_batches / 10)) - # process mini-batches - average_over = 0 - for batch_no, 
batch in enumerate(batch_loader): - # zero the gradients on the model and optimizer - self.model.zero_grad() - optimizer.zero_grad() + # process mini-batches + average_over = 0 + for batch_no, batch in enumerate(batch_loader): + # zero the gradients on the model and optimizer + self.model.zero_grad() + optimizer.zero_grad() # if necessary, make batch_steps batch_steps = [batch] @@ -539,23 +539,23 @@ def train( batch[x : x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) ] - # forward and backward for batch - for batch_step in batch_steps: - # forward pass - loss, datapoint_count = self.model.forward_loss(batch_step) - average_over += datapoint_count - # Backward - if use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - train_loss += loss.item() + # forward and backward for batch + for batch_step in batch_steps: + # forward pass + loss, datapoint_count = self.model.forward_loss(batch_step) + average_over += datapoint_count + # Backward + if use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + train_loss += loss.item() - # identify dynamic embeddings (always deleted) on first sentence + # identify dynamic embeddings (always deleted) on first sentence - if dynamic_embeddings is None: - dynamic_embeddings = identify_dynamic_embeddings(batch) + if dynamic_embeddings is None: + dynamic_embeddings = identify_dynamic_embeddings(batch) # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(batch, embeddings_storage_mode, dynamic_embeddings) @@ -640,31 +640,31 @@ def train( # depending on memory mode, embeddings are moved to CPU, GPU or deleted store_embeddings(self.corpus.train, embeddings_storage_mode, dynamic_embeddings) - if log_train_part: - train_part_eval_result = self.model.evaluate( - train_part, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{train_part_eval_result.loss}" f"\t{train_part_eval_result.log_line}" - log.info( - f"TRAIN_SPLIT : loss {train_part_eval_result.loss}" - f" - {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]})" - f" {round(train_part_eval_result.main_score, 4)}" - ) - if use_tensorboard: - for metric_class_avg_type, metric_type in metrics_for_tensorboard: - writer.add_scalar( - f"train_{metric_class_avg_type}_{metric_type}", - train_part_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, - ) + if log_train_part: + train_part_eval_result = self.model.evaluate( + train_part, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{train_part_eval_result.loss}" f"\t{train_part_eval_result.log_line}" + log.info( + f"TRAIN_SPLIT : loss {train_part_eval_result.loss}" + f" - {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]})" + f" {round(train_part_eval_result.main_score, 4)}" + ) + if use_tensorboard: + for metric_class_avg_type, metric_type in metrics_for_tensorboard: + writer.add_scalar( + 
f"train_{metric_class_avg_type}_{metric_type}", + train_part_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, + ) if log_dev: assert self.corpus.dev @@ -801,13 +801,13 @@ def train( if log_bad_epochs: log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") - if loss_txt is not None: - # output log file - with open(loss_txt, "a") as f: - # make headers on first epoch - if epoch == 1: - bad_epoch_header = "BAD_EPOCHS\t" if log_bad_epochs else "" - f.write(f"EPOCH\tTIMESTAMP\t{bad_epoch_header}LEARNING_RATE\tTRAIN_LOSS") + if loss_txt is not None: + # output log file + with open(loss_txt, "a") as f: + # make headers on first epoch + if epoch == 1: + bad_epoch_header = "BAD_EPOCHS\t" if log_bad_epochs else "" + f.write(f"EPOCH\tTIMESTAMP\t{bad_epoch_header}LEARNING_RATE\tTRAIN_LOSS") if log_train: f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) From ebc100df6574ef7585d8211b4e36a40f59391d42 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 19 Feb 2023 04:35:02 +0100 Subject: [PATCH 10/14] update smaller transformer vocab and use optimizer updates --- flair/trainers/trainer.py | 19 +++++++++---------- requirements.txt | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 5eefb4388..468073cb6 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -247,11 +247,13 @@ def train( if mini_batch_chunk_size is None: mini_batch_chunk_size = mini_batch_size + # if optimizer class is passed, instantiate: if inspect.isclass(optimizer): - # if optimizer is class, trainer will create a single parameter group - initial_learning_rate = [learning_rate] - else: - initial_learning_rate = [group["lr"] for group in optimizer.param_groups] + kwargs["lr"] = learning_rate + optimizer = optimizer(self.model.parameters(), **kwargs) + assert isinstance(optimizer, torch.optim.Optimizer) + + initial_learning_rate = [group["lr"] for group in optimizer.param_groups] if not isinstance(min_learning_rate, list): min_learning_rate = [min_learning_rate] * len(initial_learning_rate) @@ -301,14 +303,11 @@ def train( tokens = list(self.model.get_used_tokens(self.corpus)) for emb in transformer_embeddings: context_stack.enter_context( - reduce_train_vocab(model=emb.model, tokenizer=emb.tokenizer, texts=tokens) + reduce_train_vocab( + model=emb.model, tokenizer=emb.tokenizer, texts=tokens, optimizer=optimizer + ) ) - # if optimizer class is passed, instantiate: - if inspect.isclass(optimizer): - kwargs["lr"] = learning_rate - optimizer = optimizer(self.model.parameters(), **kwargs) - if use_swa: import torchcontrib diff --git a/requirements.txt b/requirements.txt index dcdc4f30e..e3741940e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,4 +25,4 @@ more-itertools wikipedia-api pptree pytorch_revgrad -transformer-smaller-training-vocab +transformer-smaller-training-vocab>=0.2.0 From 3ff37133d979612f1f675a020991bf902e8c967b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 19 Feb 2023 05:47:59 +0100 Subject: [PATCH 11/14] fix indentation errors --- flair/trainers/trainer.py | 892 +++++++++++++++++++------------------- 1 file changed, 441 insertions(+), 451 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 468073cb6..734b8609e 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -389,513 +389,503 @@ def train( if train_with_test and self.corpus.test: parts.append(self.corpus.test) - train_data = 
ConcatDataset(parts) + train_data = ConcatDataset(parts) - # initialize sampler if provided - if sampler is not None: - # init with default values if only class is provided - if inspect.isclass(sampler): - sampler = sampler() - # set dataset to sample from - sampler.set_dataset(train_data) - shuffle = False + # initialize sampler if provided + if sampler is not None: + # init with default values if only class is provided + if inspect.isclass(sampler): + sampler = sampler() + # set dataset to sample from + sampler.set_dataset(train_data) + shuffle = False - dev_score_history = [] - dev_loss_history = [] - train_loss_history = [] + dev_score_history = [] + dev_loss_history = [] + train_loss_history = [] - micro_batch_size = mini_batch_chunk_size + micro_batch_size = mini_batch_chunk_size - # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) - dynamic_embeddings = None + # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) + dynamic_embeddings = None - # At any point you can hit Ctrl + C to break out of training early. - try: - if create_file_logs: - log_handler = add_file_handler(log, base_path / "training.log") - else: - log_handler = None + # At any point you can hit Ctrl + C to break out of training early. + try: + if create_file_logs: + log_handler = add_file_handler(log, base_path / "training.log") + else: + log_handler = None + + lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) + + log_line(log) + log.info(f'Model: "{self.model}"') + log_line(log) + log.info(f'Corpus: "{self.corpus}"') + log_line(log) + log.info("Parameters:") + log.info(f' - learning_rate: "{lr_info}"') + log.info(f' - mini_batch_size: "{mini_batch_size}"') + log.info(f' - patience: "{patience}"') + log.info(f' - anneal_factor: "{anneal_factor}"') + log.info(f' - max_epochs: "{max_epochs}"') + log.info(f' - shuffle: "{shuffle}"') + log.info(f' - train_with_dev: "{train_with_dev}"') + log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') + log_line(log) + log.info(f'Model training base path: "{base_path}"') + log_line(log) + log.info(f"Device: {flair.device}") + log_line(log) + log.info(f"Embeddings storage mode: {embeddings_storage_mode}") + + previous_learning_rate = current_learning_rate + + momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer.param_groups] + + for epoch in range(epoch + 1, max_epochs + 1): + log_line(log) - lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) + # update epoch in model card + model_card["training_parameters"]["epoch"] = epoch - log_line(log) - log.info(f'Model: "{self.model}"') - log_line(log) - log.info(f'Corpus: "{self.corpus}"') - log_line(log) - log.info("Parameters:") - log.info(f' - learning_rate: "{lr_info}"') - log.info(f' - mini_batch_size: "{mini_batch_size}"') - log.info(f' - patience: "{patience}"') - log.info(f' - anneal_factor: "{anneal_factor}"') - log.info(f' - max_epochs: "{max_epochs}"') - log.info(f' - shuffle: "{shuffle}"') - log.info(f' - train_with_dev: "{train_with_dev}"') - log.info(f' - batch_growth_annealing: "{batch_growth_annealing}"') - log_line(log) - log.info(f'Model training base path: "{base_path}"') - log_line(log) - log.info(f"Device: {flair.device}") - log_line(log) - log.info(f"Embeddings storage mode: {embeddings_storage_mode}") + if anneal_with_prestarts: + last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) - previous_learning_rate = 
current_learning_rate + if eval_on_train_shuffle: + train_part_indices = list(range(_len_dataset(self.corpus.train))) + random.shuffle(train_part_indices) + train_part_indices = train_part_indices[:train_part_size] + train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) - momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer.param_groups] + # get new learning rate + current_learning_rate = [group["lr"] for group in optimizer.param_groups] - for epoch in range(epoch + 1, max_epochs + 1): - log_line(log) + lr_changed = any([lr != prev_lr for lr, prev_lr in zip(current_learning_rate, previous_learning_rate)]) - # update epoch in model card - model_card["training_parameters"]["epoch"] = epoch + if lr_changed and batch_growth_annealing: + mini_batch_size *= 2 + # reload last best model if annealing with restarts is enabled + if ( + (anneal_with_restarts or anneal_with_prestarts) + and lr_changed + and os.path.exists(base_path / "best-model.pt") + ): + if anneal_with_restarts: + log.info("resetting to best model") + self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict()) if anneal_with_prestarts: - last_epoch_model_state_dict = copy.deepcopy(self.model.state_dict()) + log.info("resetting to pre-best model") + self.model.load_state_dict(self.model.load(base_path / "pre-best-model.pt").state_dict()) - if eval_on_train_shuffle: - train_part_indices = list(range(_len_dataset(self.corpus.train))) - random.shuffle(train_part_indices) - train_part_indices = train_part_indices[:train_part_size] - train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) + previous_learning_rate = current_learning_rate + if use_tensorboard: + if len(current_learning_rate) == 1: + writer.add_scalar("learning_rate", current_learning_rate[0], epoch) + else: + for i, lr in enumerate(current_learning_rate): + writer.add_scalar(f"learning_rate_{i}", lr, epoch) - # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + all_lrs_too_small = all([lr < min_lr for lr, min_lr in zip(current_learning_rate, min_learning_rate)]) - lr_changed = any( - [lr != prev_lr for lr, prev_lr in zip(current_learning_rate, previous_learning_rate)] - ) + # stop training if learning rate becomes too small + if not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and all_lrs_too_small: + log_line(log) + log.info("learning rate too small - quitting training!") + log_line(log) + break - if lr_changed and batch_growth_annealing: - mini_batch_size *= 2 + start_time = time.time() - # reload last best model if annealing with restarts is enabled - if ( - (anneal_with_restarts or anneal_with_prestarts) - and lr_changed - and os.path.exists(base_path / "best-model.pt") - ): - if anneal_with_restarts: - log.info("resetting to best model") - self.model.load_state_dict(self.model.load(base_path / "best-model.pt").state_dict()) - if anneal_with_prestarts: - log.info("resetting to pre-best model") - self.model.load_state_dict(self.model.load(base_path / "pre-best-model.pt").state_dict()) - - previous_learning_rate = current_learning_rate - if use_tensorboard: - if len(current_learning_rate) == 1: - writer.add_scalar("learning_rate", current_learning_rate[0], epoch) - else: - for i, lr in enumerate(current_learning_rate): - writer.add_scalar(f"learning_rate_{i}", lr, epoch) + # if shuffle_first_epoch==False, the first epoch is not shuffled + shuffle_data_this_epoch = shuffle + if not 
shuffle_first_epoch and epoch == 1: + shuffle_data_this_epoch = False - all_lrs_too_small = all( - [lr < min_lr for lr, min_lr in zip(current_learning_rate, min_learning_rate)] - ) + batch_loader = DataLoader( + train_data, + batch_size=mini_batch_size, + shuffle=shuffle_data_this_epoch, + num_workers=0 if num_workers is None else num_workers, + sampler=sampler, + ) - # stop training if learning rate becomes too small - if not isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)) and all_lrs_too_small: - log_line(log) - log.info("learning rate too small - quitting training!") - log_line(log) - break - - start_time = time.time() - - # if shuffle_first_epoch==False, the first epoch is not shuffled - shuffle_data_this_epoch = shuffle - if not shuffle_first_epoch and epoch == 1: - shuffle_data_this_epoch = False - - batch_loader = DataLoader( - train_data, - batch_size=mini_batch_size, - shuffle=shuffle_data_this_epoch, - num_workers=0 if num_workers is None else num_workers, - sampler=sampler, - ) + self.model.train() - self.model.train() + train_loss: float = 0 - train_loss: float = 0 + seen_batches = 0 + total_number_of_batches = len(batch_loader) - seen_batches = 0 - total_number_of_batches = len(batch_loader) + modulo = max(1, int(total_number_of_batches / 10)) - modulo = max(1, int(total_number_of_batches / 10)) + # process mini-batches + average_over = 0 + for batch_no, batch in enumerate(batch_loader): + # zero the gradients on the model and optimizer + self.model.zero_grad() + optimizer.zero_grad() - # process mini-batches - average_over = 0 - for batch_no, batch in enumerate(batch_loader): - # zero the gradients on the model and optimizer - self.model.zero_grad() - optimizer.zero_grad() + # if necessary, make batch_steps + batch_steps = [batch] + if len(batch) > micro_batch_size: + batch_steps = [batch[x : x + micro_batch_size] for x in range(0, len(batch), micro_batch_size)] - # if necessary, make batch_steps - batch_steps = [batch] - if len(batch) > micro_batch_size: - batch_steps = [ - batch[x : x + micro_batch_size] for x in range(0, len(batch), micro_batch_size) - ] + # forward and backward for batch + for batch_step in batch_steps: + # forward pass + loss, datapoint_count = self.model.forward_loss(batch_step) + average_over += datapoint_count + # Backward + if use_amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + train_loss += loss.item() - # forward and backward for batch - for batch_step in batch_steps: - # forward pass - loss, datapoint_count = self.model.forward_loss(batch_step) - average_over += datapoint_count - # Backward - if use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - train_loss += loss.item() + # identify dynamic embeddings (always deleted) on first sentence - # identify dynamic embeddings (always deleted) on first sentence + if dynamic_embeddings is None: + dynamic_embeddings = identify_dynamic_embeddings(batch) - if dynamic_embeddings is None: - dynamic_embeddings = identify_dynamic_embeddings(batch) + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(batch, embeddings_storage_mode, dynamic_embeddings) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(batch, embeddings_storage_mode, dynamic_embeddings) + # do the optimizer step + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) + optimizer.step() - # do the optimizer step - 
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) - optimizer.step() + # do the scheduler step if one-cycle or linear decay + if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): + scheduler.step() + # get new learning rate + current_learning_rate = [group["lr"] for group in optimizer.param_groups] - # do the scheduler step if one-cycle or linear decay - if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): - scheduler.step() - # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + momentum = [ + group["betas"][0] if "betas" in group else group.get("momentum", 0) + for group in optimizer.param_groups + ] - momentum = [ - group["betas"][0] if "betas" in group else group.get("momentum", 0) - for group in optimizer.param_groups - ] + seen_batches += 1 - seen_batches += 1 + if seen_batches % modulo == 0: + momentum_info = "" + if cycle_momentum: + momentum_info = " - momentum:" + ",".join([f"{m:.4f}" for m in momentum]) - if seen_batches % modulo == 0: - momentum_info = "" - if cycle_momentum: - momentum_info = " - momentum:" + ",".join([f"{m:.4f}" for m in momentum]) + lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) - lr_info = ",".join([f"{lr:.6f}" for lr in current_learning_rate]) + intermittent_loss = train_loss / average_over if average_over > 0 else train_loss / seen_batches + end_time = time.time() + log.info( + f"epoch {epoch}" + f" - iter {seen_batches}/{total_number_of_batches}" + f" - loss {intermittent_loss:.8f}" + f" - time (sec): {end_time - start_time:.2f}" + f" - samples/sec: {average_over / (end_time - start_time):.2f}" + f" - lr: {lr_info}{momentum_info}" + ) + iteration = epoch * total_number_of_batches + batch_no + if not param_selection_mode and write_weights: + weight_extractor.extract_weights(self.model.state_dict(), iteration) - intermittent_loss = ( - train_loss / average_over if average_over > 0 else train_loss / seen_batches - ) - end_time = time.time() - log.info( - f"epoch {epoch}" - f" - iter {seen_batches}/{total_number_of_batches}" - f" - loss {intermittent_loss:.8f}" - f" - time (sec): {end_time - start_time:.2f}" - f" - samples/sec: {average_over / (end_time - start_time):.2f}" - f" - lr: {lr_info}{momentum_info}" + if average_over != 0: + train_loss /= average_over + + self.model.eval() + + if save_model_each_k_epochs > 0 and epoch % save_model_each_k_epochs == 0: + log.info("saving model of current epoch") + model_name = "model_epoch_" + str(epoch) + ".pt" + self.model.save(base_path / model_name, checkpoint=save_optimizer_state) + + log_line(log) + log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {lr_info}") + + if use_tensorboard: + writer.add_scalar("train_loss", train_loss, epoch) + + # evaluate on train / dev / test split depending on training settings + result_line: str = "" + + if log_train: + train_eval_result = self.model.evaluate( + self.corpus.train, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{train_eval_result.loss}\t{train_eval_result.log_line}" + log.info( + f"TRAIN : loss {train_eval_result.loss} -" + f" {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]}) " + f" {round(train_eval_result.main_score, 4)}" + ) + # depending on memory mode, embeddings are 
moved to CPU, GPU or deleted + store_embeddings(self.corpus.train, embeddings_storage_mode, dynamic_embeddings) + + if log_train_part: + train_part_eval_result = self.model.evaluate( + train_part, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{train_part_eval_result.loss}" f"\t{train_part_eval_result.log_line}" + log.info( + f"TRAIN_SPLIT : loss {train_part_eval_result.loss}" + f" - {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]})" + f" {round(train_part_eval_result.main_score, 4)}" + ) + if use_tensorboard: + for metric_class_avg_type, metric_type in metrics_for_tensorboard: + writer.add_scalar( + f"train_{metric_class_avg_type}_{metric_type}", + train_part_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, ) - iteration = epoch * total_number_of_batches + batch_no - if not param_selection_mode and write_weights: - weight_extractor.extract_weights(self.model.state_dict(), iteration) - if average_over != 0: - train_loss /= average_over + if log_dev: + assert self.corpus.dev + dev_eval_result = self.model.evaluate( + self.corpus.dev, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + out_path=base_path / "dev.tsv", + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}" + log.info( + f"DEV : loss {dev_eval_result.loss}" + f" - {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]})" + f" {round(dev_eval_result.main_score, 4)}" + ) + # calculate scores using dev data if available + # append dev score to score history + dev_score_history.append(dev_eval_result.main_score) + dev_loss_history.append(dev_eval_result.loss) + + dev_score = dev_eval_result.main_score - self.model.eval() + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(self.corpus.dev, embeddings_storage_mode, dynamic_embeddings) - if save_model_each_k_epochs > 0 and epoch % save_model_each_k_epochs == 0: - log.info("saving model of current epoch") - model_name = "model_epoch_" + str(epoch) + ".pt" - self.model.save(base_path / model_name, checkpoint=save_optimizer_state) + if use_tensorboard: + writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) + writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) + for ( + metric_class_avg_type, + metric_type, + ) in metrics_for_tensorboard: + writer.add_scalar( + f"dev_{metric_class_avg_type}_{metric_type}", + dev_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, + ) - log_line(log) - log.info(f"EPOCH {epoch} done: loss {train_loss:.4f} - lr {lr_info}") + if log_test: + assert self.corpus.test + test_eval_result = self.model.evaluate( + self.corpus.test, + gold_label_type=self.model.label_type, + mini_batch_size=eval_batch_size, + num_workers=num_workers, + out_path=base_path / "test.tsv", + embedding_storage_mode=embeddings_storage_mode, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + result_line += 
f"\t{test_eval_result.loss}\t{test_eval_result.log_line}" + log.info( + f"TEST : loss {test_eval_result.loss} -" + f" {main_evaluation_metric[1]}" + f" ({main_evaluation_metric[0]}) " + f" {round(test_eval_result.main_score, 4)}" + ) + + # depending on memory mode, embeddings are moved to CPU, GPU or deleted + store_embeddings(self.corpus.test, embeddings_storage_mode, dynamic_embeddings) if use_tensorboard: - writer.add_scalar("train_loss", train_loss, epoch) - - # evaluate on train / dev / test split depending on training settings - result_line: str = "" - - if log_train: - train_eval_result = self.model.evaluate( - self.corpus.train, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{train_eval_result.loss}\t{train_eval_result.log_line}" - log.info( - f"TRAIN : loss {train_eval_result.loss} -" - f" {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]}) " - f" {round(train_eval_result.main_score, 4)}" - ) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(self.corpus.train, embeddings_storage_mode, dynamic_embeddings) - - if log_train_part: - train_part_eval_result = self.model.evaluate( - train_part, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{train_part_eval_result.loss}" f"\t{train_part_eval_result.log_line}" - log.info( - f"TRAIN_SPLIT : loss {train_part_eval_result.loss}" - f" - {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]})" - f" {round(train_part_eval_result.main_score, 4)}" - ) - if use_tensorboard: - for metric_class_avg_type, metric_type in metrics_for_tensorboard: - writer.add_scalar( - f"train_{metric_class_avg_type}_{metric_type}", - train_part_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, + writer.add_scalar("test_loss", test_eval_result.loss, epoch) + writer.add_scalar("test_score", test_eval_result.main_score, epoch) + for ( + metric_class_avg_type, + metric_type, + ) in metrics_for_tensorboard: + writer.add_scalar( + f"test_{metric_class_avg_type}_{metric_type}", + test_eval_result.classification_report[metric_class_avg_type][metric_type], + epoch, + ) + + # determine if this is the best model or if we need to anneal + current_epoch_has_best_model_so_far = False + # default mode: anneal against dev score + if not train_with_dev and not anneal_against_dev_loss: + if dev_score > best_validation_score: + current_epoch_has_best_model_so_far = True + best_validation_score = dev_score + + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_score, dev_eval_result.loss) + + # alternative: anneal against dev loss + if not train_with_dev and anneal_against_dev_loss: + if dev_eval_result.loss < best_validation_score: + current_epoch_has_best_model_so_far = True + best_validation_score = dev_eval_result.loss + + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(dev_eval_result.loss) + + # alternative: anneal against train loss + if train_with_dev: + if train_loss < best_validation_score: + current_epoch_has_best_model_so_far = True 
+ best_validation_score = train_loss + + if isinstance(scheduler, AnnealOnPlateau): + scheduler.step(train_loss) + + train_loss_history.append(train_loss) + + # determine bad epoch number + try: + bad_epochs = scheduler.num_bad_epochs + except AttributeError: + bad_epochs = 0 + + new_learning_rate = [group["lr"] for group in optimizer.param_groups] + + if any([new_lr != prev_lr for new_lr, prev_lr in zip(new_learning_rate, previous_learning_rate)]): + bad_epochs = patience + 1 + + # lr unchanged + if all( + [ + prev_lr == initial_lr + for prev_lr, initial_lr in zip(previous_learning_rate, initial_learning_rate) + ] + ): + bad_epochs += initial_extra_patience + + # log bad epochs + if log_bad_epochs: + log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") + + if loss_txt is not None: + # output log file + with open(loss_txt, "a") as f: + # make headers on first epoch + if epoch == 1: + bad_epoch_header = "BAD_EPOCHS\t" if log_bad_epochs else "" + f.write(f"EPOCH\tTIMESTAMP\t{bad_epoch_header}LEARNING_RATE\tTRAIN_LOSS") + + if log_train: + f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) + + if log_train_part: + f.write( + "\tTRAIN_PART_LOSS\tTRAIN_PART_" + + "\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t")) ) - if log_dev: - assert self.corpus.dev - dev_eval_result = self.model.evaluate( - self.corpus.dev, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - out_path=base_path / "dev.tsv", - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{dev_eval_result.loss}\t{dev_eval_result.log_line}" - log.info( - f"DEV : loss {dev_eval_result.loss}" - f" - {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]})" - f" {round(dev_eval_result.main_score, 4)}" - ) - # calculate scores using dev data if available - # append dev score to score history - dev_score_history.append(dev_eval_result.main_score) - dev_loss_history.append(dev_eval_result.loss) + if log_dev: + f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t"))) - dev_score = dev_eval_result.main_score + if log_test: + f.write("\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t"))) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(self.corpus.dev, embeddings_storage_mode, dynamic_embeddings) - - if use_tensorboard: - writer.add_scalar("dev_loss", dev_eval_result.loss, epoch) - writer.add_scalar("dev_score", dev_eval_result.main_score, epoch) - for ( - metric_class_avg_type, - metric_type, - ) in metrics_for_tensorboard: - writer.add_scalar( - f"dev_{metric_class_avg_type}_{metric_type}", - dev_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, - ) + lr_info = ",".join([f"{lr:.4f}" for lr in current_learning_rate]) - if log_test: - assert self.corpus.test - test_eval_result = self.model.evaluate( - self.corpus.test, - gold_label_type=self.model.label_type, - mini_batch_size=eval_batch_size, - num_workers=num_workers, - out_path=base_path / "test.tsv", - embedding_storage_mode=embeddings_storage_mode, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - result_line += f"\t{test_eval_result.loss}\t{test_eval_result.log_line}" - log.info( - f"TEST : loss 
{test_eval_result.loss} -" - f" {main_evaluation_metric[1]}" - f" ({main_evaluation_metric[0]}) " - f" {round(test_eval_result.main_score, 4)}" + bad_epoch_info = "\t" + str(bad_epochs) if log_bad_epochs else "" + f.write( + f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}" + f"{bad_epoch_info}" + f"\t{lr_info}\t{train_loss}" ) + f.write(result_line) + + # if checkpoint is enabled, save model at each epoch + if checkpoint and not param_selection_mode: + self.model.save(base_path / "checkpoint.pt", checkpoint=True) + + # Check whether to save best model + if ( + (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) + and not param_selection_mode + and current_epoch_has_best_model_so_far + and not use_final_model_for_eval + ): + log.info("saving best model") + self.model.save(base_path / "best-model.pt", checkpoint=save_optimizer_state) - # depending on memory mode, embeddings are moved to CPU, GPU or deleted - store_embeddings(self.corpus.test, embeddings_storage_mode, dynamic_embeddings) - - if use_tensorboard: - writer.add_scalar("test_loss", test_eval_result.loss, epoch) - writer.add_scalar("test_score", test_eval_result.main_score, epoch) - for ( - metric_class_avg_type, - metric_type, - ) in metrics_for_tensorboard: - writer.add_scalar( - f"test_{metric_class_avg_type}_{metric_type}", - test_eval_result.classification_report[metric_class_avg_type][metric_type], - epoch, - ) - - # determine if this is the best model or if we need to anneal - current_epoch_has_best_model_so_far = False - # default mode: anneal against dev score - if not train_with_dev and not anneal_against_dev_loss: - if dev_score > best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = dev_score - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(dev_score, dev_eval_result.loss) - - # alternative: anneal against dev loss - if not train_with_dev and anneal_against_dev_loss: - if dev_eval_result.loss < best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = dev_eval_result.loss - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(dev_eval_result.loss) - - # alternative: anneal against train loss - if train_with_dev: - if train_loss < best_validation_score: - current_epoch_has_best_model_so_far = True - best_validation_score = train_loss - - if isinstance(scheduler, AnnealOnPlateau): - scheduler.step(train_loss) - - train_loss_history.append(train_loss) - - # determine bad epoch number - try: - bad_epochs = scheduler.num_bad_epochs - except AttributeError: - bad_epochs = 0 - - new_learning_rate = [group["lr"] for group in optimizer.param_groups] - - if any([new_lr != prev_lr for new_lr, prev_lr in zip(new_learning_rate, previous_learning_rate)]): - bad_epochs = patience + 1 - - # lr unchanged - if all( - [ - prev_lr == initial_lr - for prev_lr, initial_lr in zip(previous_learning_rate, initial_learning_rate) - ] - ): - bad_epochs += initial_extra_patience - - # log bad epochs - if log_bad_epochs: - log.info(f"BAD EPOCHS (no improvement): {bad_epochs}") - - if loss_txt is not None: - # output log file - with open(loss_txt, "a") as f: - # make headers on first epoch - if epoch == 1: - bad_epoch_header = "BAD_EPOCHS\t" if log_bad_epochs else "" - f.write(f"EPOCH\tTIMESTAMP\t{bad_epoch_header}LEARNING_RATE\tTRAIN_LOSS") - - if log_train: - f.write("\tTRAIN_" + "\tTRAIN_".join(train_eval_result.log_header.split("\t"))) - - if log_train_part: - f.write( - "\tTRAIN_PART_LOSS\tTRAIN_PART_" - + 
"\tTRAIN_PART_".join(train_part_eval_result.log_header.split("\t")) - ) - - if log_dev: - f.write("\tDEV_LOSS\tDEV_" + "\tDEV_".join(dev_eval_result.log_header.split("\t"))) - - if log_test: - f.write( - "\tTEST_LOSS\tTEST_" + "\tTEST_".join(test_eval_result.log_header.split("\t")) - ) - - lr_info = ",".join([f"{lr:.4f}" for lr in current_learning_rate]) - - bad_epoch_info = "\t" + str(bad_epochs) if log_bad_epochs else "" - f.write( - f"\n{epoch}\t{datetime.datetime.now():%H:%M:%S}" - f"{bad_epoch_info}" - f"\t{lr_info}\t{train_loss}" - ) - f.write(result_line) - - # if checkpoint is enabled, save model at each epoch - if checkpoint and not param_selection_mode: - self.model.save(base_path / "checkpoint.pt", checkpoint=True) - - # Check whether to save best model - if ( - (not train_with_dev or anneal_with_restarts or anneal_with_prestarts) - and not param_selection_mode - and current_epoch_has_best_model_so_far - and not use_final_model_for_eval - ): - log.info("saving best model") - self.model.save(base_path / "best-model.pt", checkpoint=save_optimizer_state) + if anneal_with_prestarts: + current_state_dict = self.model.state_dict() + self.model.load_state_dict(last_epoch_model_state_dict) + self.model.save(base_path / "pre-best-model.pt") + self.model.load_state_dict(current_state_dict) - if anneal_with_prestarts: - current_state_dict = self.model.state_dict() - self.model.load_state_dict(last_epoch_model_state_dict) - self.model.save(base_path / "pre-best-model.pt") - self.model.load_state_dict(current_state_dict) + if use_swa: + import torchcontrib - if use_swa: - import torchcontrib + cast(torchcontrib.optim.SWA, optimizer).swap_swa_sgd() - cast(torchcontrib.optim.SWA, optimizer).swap_swa_sgd() + # if we do not use dev data for model selection, save final model + if save_final_model and not param_selection_mode: + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) - # if we do not use dev data for model selection, save final model - if save_final_model and not param_selection_mode: - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + except KeyboardInterrupt: + log_line(log) + log.info("Exiting from training early.") - except KeyboardInterrupt: - log_line(log) - log.info("Exiting from training early.") - - if not param_selection_mode: - log.info("Saving model ...") - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) - log.info("Done.") - except Exception: - if create_file_logs: - log_handler.close() - log.removeHandler(log_handler) - raise - finally: - if use_tensorboard: - writer.close() - optimizer.zero_grad(set_to_none=True) - del optimizer - - # test best model if test data is present - if self.corpus.test and not train_with_test: - final_score = self.final_test( - base_path=base_path, - eval_mini_batch_size=eval_batch_size, - num_workers=num_workers, - main_evaluation_metric=main_evaluation_metric, - gold_label_dictionary_for_eval=gold_label_dictionary_for_eval, - exclude_labels=exclude_labels, - ) - else: - final_score = 0 - log.info("Test data not provided setting final score to 0") + if not param_selection_mode: + log.info("Saving model ...") + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + log.info("Done.") + except Exception: + if create_file_logs: + log_handler.close() + log.removeHandler(log_handler) + raise + finally: + if use_tensorboard: + writer.close() + optimizer.zero_grad(set_to_none=True) + del optimizer + + # test best model if test data is 
present + if self.corpus.test and not train_with_test: + final_score = self.final_test( + base_path=base_path, + eval_mini_batch_size=eval_batch_size, + num_workers=num_workers, + main_evaluation_metric=main_evaluation_metric, + gold_label_dictionary_for_eval=gold_label_dictionary_for_eval, + exclude_labels=exclude_labels, + ) + else: + final_score = 0 + log.info("Test data not provided setting final score to 0") if reduce_transformer_vocab: if save_final_model and not param_selection_mode: self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) From 95af6f375f5b40fae2e6abfbc01c18642f906980 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sun, 19 Feb 2023 06:49:09 +0100 Subject: [PATCH 12/14] fix mypy error --- flair/trainers/trainer.py | 51 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 734b8609e..9be849744 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -250,10 +250,11 @@ def train( # if optimizer class is passed, instantiate: if inspect.isclass(optimizer): kwargs["lr"] = learning_rate - optimizer = optimizer(self.model.parameters(), **kwargs) - assert isinstance(optimizer, torch.optim.Optimizer) + optimizer_instance = optimizer(self.model.parameters(), **kwargs) + else: + optimizer_instance = optimizer - initial_learning_rate = [group["lr"] for group in optimizer.param_groups] + initial_learning_rate = [group["lr"] for group in optimizer_instance.param_groups] if not isinstance(min_learning_rate, list): min_learning_rate = [min_learning_rate] * len(initial_learning_rate) @@ -304,26 +305,28 @@ def train( for emb in transformer_embeddings: context_stack.enter_context( reduce_train_vocab( - model=emb.model, tokenizer=emb.tokenizer, texts=tokens, optimizer=optimizer + model=emb.model, tokenizer=emb.tokenizer, texts=tokens, optimizer=optimizer_instance ) ) if use_swa: import torchcontrib - optimizer = torchcontrib.optim.SWA(optimizer, swa_start=10, swa_freq=5, swa_lr=learning_rate) + optimizer_instance = torchcontrib.optim.SWA( + optimizer_instance, swa_start=10, swa_freq=5, swa_lr=learning_rate + ) # from here on, use list of learning rates - current_learning_rate: List = [group["lr"] for group in optimizer.param_groups] + current_learning_rate: List = [group["lr"] for group in optimizer_instance.param_groups] if use_amp: - self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=amp_opt_level) + self.model, optimizer_instance = amp.initialize(self.model, optimizer_instance, opt_level=amp_opt_level) - optimizer = cast(torch.optim.Optimizer, optimizer) + optimizer_instance = cast(torch.optim.Optimizer, optimizer_instance) # load existing optimizer state dictionary if it exists if optimizer_state_dict: - optimizer.load_state_dict(optimizer_state_dict) + optimizer_instance.load_state_dict(optimizer_state_dict) # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev or anneal_against_dev_loss else "max" @@ -337,7 +340,7 @@ def train( if inspect.isclass(scheduler): if scheduler == OneCycleLR: scheduler = OneCycleLR( - optimizer, + optimizer_instance, max_lr=current_learning_rate, steps_per_epoch=dataset_size // mini_batch_size + 1, epochs=max_epochs - epoch, @@ -351,13 +354,13 @@ def train( num_warmup_steps = int(num_train_steps * warmup_fraction) scheduler = LinearSchedulerWithWarmup( - optimizer, + optimizer_instance, num_train_steps=num_train_steps, 
num_warmup_steps=num_warmup_steps, ) else: scheduler = scheduler( - optimizer, + optimizer_instance, factor=anneal_factor, patience=patience, initial_extra_patience=initial_extra_patience, @@ -373,7 +376,7 @@ def train( scheduler.load_state_dict(scheduler_state_dict) # update optimizer and scheduler in model card - model_card["training_parameters"]["optimizer"] = optimizer + model_card["training_parameters"]["optimizer"] = optimizer_instance model_card["training_parameters"]["scheduler"] = scheduler if isinstance(scheduler, OneCycleLR) and batch_growth_annealing: @@ -441,7 +444,7 @@ def train( previous_learning_rate = current_learning_rate - momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer.param_groups] + momentum = [group["momentum"] if "momentum" in group else 0 for group in optimizer_instance.param_groups] for epoch in range(epoch + 1, max_epochs + 1): log_line(log) @@ -459,7 +462,7 @@ def train( train_part = torch.utils.data.dataset.Subset(self.corpus.train, train_part_indices) # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + current_learning_rate = [group["lr"] for group in optimizer_instance.param_groups] lr_changed = any([lr != prev_lr for lr, prev_lr in zip(current_learning_rate, previous_learning_rate)]) @@ -525,7 +528,7 @@ def train( for batch_no, batch in enumerate(batch_loader): # zero the gradients on the model and optimizer self.model.zero_grad() - optimizer.zero_grad() + optimizer_instance.zero_grad() # if necessary, make batch_steps batch_steps = [batch] @@ -539,7 +542,7 @@ def train( average_over += datapoint_count # Backward if use_amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: + with amp.scale_loss(loss, optimizer_instance) as scaled_loss: scaled_loss.backward() else: loss.backward() @@ -555,17 +558,17 @@ def train( # do the optimizer step torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) - optimizer.step() + optimizer_instance.step() # do the scheduler step if one-cycle or linear decay if isinstance(scheduler, (OneCycleLR, LinearSchedulerWithWarmup)): scheduler.step() # get new learning rate - current_learning_rate = [group["lr"] for group in optimizer.param_groups] + current_learning_rate = [group["lr"] for group in optimizer_instance.param_groups] momentum = [ group["betas"][0] if "betas" in group else group.get("momentum", 0) - for group in optimizer.param_groups + for group in optimizer_instance.param_groups ] seen_batches += 1 @@ -774,7 +777,7 @@ def train( except AttributeError: bad_epochs = 0 - new_learning_rate = [group["lr"] for group in optimizer.param_groups] + new_learning_rate = [group["lr"] for group in optimizer_instance.param_groups] if any([new_lr != prev_lr for new_lr, prev_lr in zip(new_learning_rate, previous_learning_rate)]): bad_epochs = patience + 1 @@ -848,7 +851,7 @@ def train( if use_swa: import torchcontrib - cast(torchcontrib.optim.SWA, optimizer).swap_swa_sgd() + cast(torchcontrib.optim.SWA, optimizer_instance).swap_swa_sgd() # if we do not use dev data for model selection, save final model if save_final_model and not param_selection_mode: @@ -870,8 +873,8 @@ def train( finally: if use_tensorboard: writer.close() - optimizer.zero_grad(set_to_none=True) - del optimizer + optimizer_instance.zero_grad(set_to_none=True) + del optimizer_instance # test best model if test data is present if self.corpus.test and not train_with_test: From 86ed221b58af58ae1e46e6ae283e700b81ae4982 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 20 
Feb 2023 15:23:08 +0100 Subject: [PATCH 13/14] fix pairwise classification model --- flair/models/pairwise_classification_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index 5e5162a7b..8d28d82ce 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -94,9 +94,6 @@ def _get_state_dict(self): "document_embeddings": self.embeddings.save_embeddings(use_state_dict=False), "label_dictionary": self.label_dictionary, "label_type": self.label_type, - "multi_label": self.multi_label, - "multi_label_threshold": self.multi_label_threshold, - "weight_dict": self.weight_dict, "embed_separately": self.embed_separately, } return model_state @@ -108,8 +105,6 @@ def _init_model_with_state_dict(cls, state, **kwargs): embeddings=state.get("document_embeddings"), label_dictionary=state.get("label_dictionary"), label_type=state.get("label_type"), - multi_label=state.get("multi_label_threshold", 0.5), - loss_weights=state.get("weight_dict"), embed_separately=state.get("embed_separately"), **kwargs, ) From fb55e61b75d8e9aae413f99e0fa24b0ecb7f9961 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 27 Feb 2023 11:41:13 +0100 Subject: [PATCH 14/14] extract functions to mixin and implement smaller transformer vocab for text regressor --- flair/models/text_regression_model.py | 11 ++++++++--- flair/nn/model.py | 13 +++---------- flair/trainers/trainer.py | 8 ++++---- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/flair/models/text_regression_model.py b/flair/models/text_regression_model.py index 30587270d..906f873f3 100644 --- a/flair/models/text_regression_model.py +++ b/flair/models/text_regression_model.py @@ -1,4 +1,5 @@ import logging +import typing from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union @@ -9,22 +10,22 @@ import flair import flair.embeddings -from flair.data import Dictionary, Sentence +from flair.data import Corpus, Dictionary, Sentence, _iter_dataset from flair.datasets import DataLoader, FlairDatapointDataset from flair.embeddings.base import load_embeddings +from flair.nn.model import ReduceTransformerVocabMixin from flair.training_utils import MetricRegression, Result, store_embeddings log = logging.getLogger("flair") -class TextRegressor(flair.nn.Model[Sentence]): +class TextRegressor(flair.nn.Model[Sentence], ReduceTransformerVocabMixin): def __init__( self, document_embeddings: flair.embeddings.DocumentEmbeddings, label_name: str = "label", ): super().__init__() - log.info("Using REGRESSION - experimental") self.document_embeddings: flair.embeddings.DocumentEmbeddings = document_embeddings self.label_name = label_name @@ -234,3 +235,7 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "TextRegressor": from typing import cast return cast("TextRegressor", super().load(model_path=model_path)) + + def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]: + for sentence in _iter_dataset(corpus.get_all_sentences()): + yield [t.text for t in sentence] diff --git a/flair/nn/model.py b/flair/nn/model.py index 06c3bf8e6..2fc45f42b 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -233,16 +233,14 @@ def print_model_card(self): "trained or was trained with Flair version < 0.9.1)" ) - @property - def supports_smaller_training_vocab(self) -> bool: - # the smaller training vocab expects classification tasks, otherwise it won't work. 
-        return False
-
+
+class ReduceTransformerVocabMixin(ABC):
+    @abstractmethod
     def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]:
         pass
 
 
-class Classifier(Model[DT], typing.Generic[DT], ABC):
+class Classifier(Model[DT], typing.Generic[DT], ReduceTransformerVocabMixin, ABC):
     """Abstract base class for all Flair models that do classification,
     both single- and multi-label. It inherits from flair.nn.Model and adds an
     unified evaluate() function so that all classification models use the same
@@ -550,11 +548,6 @@ def _print_predictions(self, batch, gold_label_type):
             lines.append(eval_line)
         return lines
 
-    @property
-    def supports_smaller_training_vocab(self) -> bool:
-        # the smaller training vocab expects classification tasks, otherwise it won't work.
-        return True
-
     def get_used_tokens(self, corpus: Corpus) -> typing.Iterable[List[str]]:
         for sentence in _iter_dataset(corpus.get_all_sentences()):
             yield [t.text for t in sentence]
diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py
index fdd0f9da9..bfbde8ab2 100644
--- a/flair/trainers/trainer.py
+++ b/flair/trainers/trainer.py
@@ -19,6 +19,7 @@
 from flair.embeddings import Embeddings, StackedEmbeddings, TransformerEmbeddings
 from flair.models import FewshotClassifier
 from flair.nn import Model
+from flair.nn.model import ReduceTransformerVocabMixin
 
 try:
     from apex import amp
@@ -292,11 +293,8 @@ def train(
 
         weight_extractor = WeightExtractor(base_path)
 
-        if not self.model.supports_smaller_training_vocab:
-            reduce_transformer_vocab = False
-
         with contextlib.ExitStack() as context_stack:
-            if reduce_transformer_vocab:
+            if reduce_transformer_vocab and isinstance(self.model, ReduceTransformerVocabMixin):
                 transformer_embeddings = get_transformer_embeddings(self)
                 if not transformer_embeddings:
                     reduce_transformer_vocab = False
@@ -308,6 +306,8 @@
                             model=emb.model, tokenizer=emb.tokenizer, texts=tokens, optimizer=optimizer_instance
                         )
                     )
+            else:
+                reduce_transformer_vocab = False
 
         if use_swa:
             import torchcontrib
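
Taken together, the series wires vocabulary reduction in as follows: a model opts in by
implementing ReduceTransformerVocabMixin.get_used_tokens(), which yields every token list
the model will see during training (sentence tokens, plus task-specific strings such as
the TARS label verbalizations or the relation classifier's encoded head and tail spans).
Before the first epoch, the trainer collects those tokens and enters reduce_train_vocab()
as a context manager, shrinking each transformer embedding's vocabulary to just the used
tokens; on exit the full vocabulary is restored, which is why final-model.pt is saved
again after the context closes.

A minimal usage sketch, assuming the reduce_transformer_vocab flag that this series adds
to ModelTrainer.train() (the full signature is not shown in these hunks); the corpus,
base model, and output path below are placeholders:

    import torch

    from flair.data import Corpus
    from flair.datasets import TREC_6
    from flair.embeddings import TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer

    # any classification corpus works; TREC_6 is only an example
    corpus: Corpus = TREC_6()
    label_type = "question_class"

    # TextClassifier inherits get_used_tokens() from flair.nn.Classifier,
    # which mixes in ReduceTransformerVocabMixin as of PATCH 14
    embeddings = TransformerDocumentEmbeddings("distilbert-base-uncased", fine_tune=True)
    classifier = TextClassifier(
        embeddings,
        label_dictionary=corpus.make_label_dictionary(label_type=label_type),
        label_type=label_type,
    )

    trainer = ModelTrainer(classifier, corpus)
    trainer.train(
        "resources/taggers/trec",
        optimizer=torch.optim.AdamW,  # a class is fine: the trainer instantiates it (PATCH 10)
        learning_rate=5.0e-5,
        mini_batch_size=16,
        max_epochs=2,
        # gather classifier.get_used_tokens(corpus), train against the reduced
        # embedding matrix, and restore the full vocabulary before the final
        # model is written to disk
        reduce_transformer_vocab=True,
    )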
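
A note on the optimizer argument introduced in PATCH 10: because the optimizer is now
instantiated before reduce_train_vocab() is entered, its parameter groups already
reference the full-size embedding matrix. Passing optimizer=optimizer_instance
presumably lets the library rewrite those references (and any per-parameter state) to
point at the resized tensors; the same commit bumps the requirement to
transformer-smaller-training-vocab>=0.2.0, which suggests optimizer support first
appeared in that release.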