From cd179e6001dd0220b7a826241e27aaecbd771b51 Mon Sep 17 00:00:00 2001
From: rchan
Date: Tue, 15 Aug 2023 11:48:51 +0100
Subject: [PATCH] add verbose option to prepare data and encoder classes

---
 src/nlpsig/data_preparation.py | 124 ++++++++++++++++-----------
 src/nlpsig/encode_text.py      | 147 ++++++++++++++++++++++-----------
 2 files changed, 175 insertions(+), 96 deletions(-)

diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py
index 9024fd9..de47afd 100644
--- a/src/nlpsig/data_preparation.py
+++ b/src/nlpsig/data_preparation.py
@@ -49,7 +49,10 @@ def __init__(
         pooled_embeddings: np.array | None = None,
         id_column: str | None = None,
         label_column: str | None = None,
+        verbose: bool = True,
     ):
+        self.verbose = verbose
+
         # perform checks that original_df have the right column names to work with
         if embeddings.ndim != 2:
             raise ValueError("`embeddings` should be a 2-dimensional array.")
@@ -79,6 +82,7 @@ def __init__(
         # obtain modelling dataframe
         self.df: pd.DataFrame | None = None
         self.df = self._get_modeling_dataframe()
+
         # set pooled embeddings if provided
         if pooled_embeddings is not None:
             if pooled_embeddings.ndim != 2:
@@ -86,10 +90,11 @@ def __init__(
                     "If provided, `pooled_embeddings` should be a 2-dimensional array."
                 )
             if len(self.df[self.id_column].unique()) != pooled_embeddings.shape[0]:
-                print(
-                    f"[INFO] `len(self.df[self.id_column].unique())`={len(self.df[self.id_column].unique())}"
-                    f" and `pooled_embeddings.shape[0]`={pooled_embeddings.shape[0]}."
-                )
+                if self.verbose:
+                    print(
+                        f"[INFO] `len(self.df[self.id_column].unique())`={len(self.df[self.id_column].unique())}"
+                        f" and `pooled_embeddings.shape[0]`={pooled_embeddings.shape[0]}."
+                    )
                 raise ValueError(
                     "If provided, `pooled_embeddings` should have the same number "
                     "of rows as there are different ids in the id-column."
@@ -123,17 +128,21 @@ def _get_modeling_dataframe(self) -> pd.DataFrame:
         if self.df is not None:
             return self.df
 
-        print("[INFO] Concatenating the embeddings to the dataframe...")
-        print("[INFO] - columns beginning with 'e' denote the full embddings.")
+        if self.verbose:
+            print("[INFO] Concatenating the embeddings to the dataframe...")
+            print("[INFO] - columns beginning with 'e' denote the full embddings.")
+
         embedding_df = pd.DataFrame(
             self.embeddings,
             columns=[f"e{i+1}" for i in range(self.embeddings.shape[1])],
         )
 
         if self.embeddings_reduced is not None:
-            print(
-                "[INFO] - columns beginning with 'd' denote the dimension reduced embeddings."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] - columns beginning with 'd' denote the dimension reduced embeddings."
+                )
+
             embeddings_reduced_df = pd.DataFrame(
                 self.embeddings_reduced,
                 columns=[f"d{i+1}" for i in range(self.embeddings_reduced.shape[1])],
@@ -151,17 +160,21 @@ def _get_modeling_dataframe(self) -> pd.DataFrame:
                 [self.original_df.reset_index(drop=True), embedding_df],
                 axis=1,
             )
+
         if self.id_column is None:
             self.id_column = "dummy_id"
-            print(
-                f"[INFO] No id_column was passed, so setting id_column to '{self.id_column}'."
-            )
+            if self.verbose:
+                print(
+                    f"[INFO] No id_column was passed, so setting id_column to '{self.id_column}'."
+                )
+
         if self.id_column not in self.original_df.columns:
+            if self.verbose:
+                print(
+                    f"[INFO] There is no column in `.original_df` called '{self.id_column}'. "
+                    f"Adding a new column named '{self.id_column}' of zeros."
+                )
             # set default value to id_column
-            print(
-                f"[INFO] There is no column in `.original_df` called '{self.id_column}'. "
-                f"Adding a new column named '{self.id_column}' of zeros."
-            )
             df[self.id_column] = 0
 
         return df
 
@@ -203,9 +216,13 @@ def _set_time_features(self) -> pd.DataFrame:
             Updated dataframe with time features.
         """
 
         if self.time_features_added:
-            print("Time features have already been added.")
+            if self.verbose:
+                print("Time features have already been added.")
             return None
-        print("[INFO] Adding time feature columns into dataframe in `.df`.")
+
+        if self.verbose:
+            print("[INFO] Adding time feature columns into dataframe in `.df`.")
+
         if "datetime" in self.df.columns:
             self._feature_list += ["time_encoding", "time_diff"]
@@ -213,7 +230,9 @@ def _set_time_features(self) -> pd.DataFrame:
             self.df["datetime"] = pd.to_datetime(self.df["datetime"])
 
             # obtain time encoding by computing the fraction of year it is in
-            print("[INFO] Adding 'time_encoding' feature...")
+            if self.verbose:
+                print("[INFO] Adding 'time_encoding' feature...")
+
             self.df["time_encoding"] = self.df["datetime"].map(
                 lambda t: self._time_fraction(t)
             )
@@ -224,7 +243,9 @@ def _set_time_features(self) -> pd.DataFrame:
            self.df = self.df.sort_values(by=[self.id_column, "datetime"])
 
             # calculate time difference between posts
-            print("[INFO] Adding 'time_diff' feature...")
+            if self.verbose:
+                print("[INFO] Adding 'time_diff' feature...")
+
             self.df["time_diff"] = list(
                 self.df.groupby(self.id_column)
                 .apply(
@@ -240,18 +261,22 @@ def _set_time_features(self) -> pd.DataFrame:
                 .explode()
             )
         else:
-            print(
-                "[INFO] Note 'datetime' is not a column in `.df`, "
-                "so only 'timeline_index' is added."
-            )
-            print(
-                "[INFO] As 'datetime' is not a column in `.df`, "
-                "we assume that the data is ordered by time with respect to the id."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] Note 'datetime' is not a column in `.df`, "
+                    "so only 'timeline_index' is added."
+                )
+                print(
+                    "[INFO] As 'datetime' is not a column in `.df`, "
+                    "we assume that the data is ordered by time with respect to the id."
+                )
+
         # assign index for each post in each timeline
         self._feature_list += ["timeline_index"]
 
-        print("[INFO] Adding 'timeline_index' feature...")
+        if self.verbose:
+            print("[INFO] Adding 'timeline_index' feature...")
+
         self.df["timeline_index"] = list(
             self.df.groupby(self.id_column)
             .apply(lambda x: list(range(1, len(x) + 1)))
@@ -756,9 +781,11 @@ def pad(
             dimension reduced embeddings, time features)
 
         """
-        print(
-            "[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes."
-        )
+        if self.verbose:
+            print(
+                "[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes."
+            )
+
         if pad_by not in ["id", "history"]:
             raise ValueError("`pad_by` must be either 'id' or 'history'.")
 
@@ -1017,11 +1044,13 @@ def get_torch_path_for_SWNUNetwork(
         if include_embedding_in_input:
             # repeat the embeddings which will be concatenated to the path later
             if self.pad_method == "id":
-                print(
-                    f"[INFO] The path was created for each {self.id_column} in the dataframe, "
-                    "so to include embeddings in the FFN input, we concatenate the "
-                    "pooled embeddings."
-                )
+                if self.verbose:
+                    print(
+                        f"[INFO] The path was created for each {self.id_column} in the dataframe, "
+                        "so to include embeddings in the FFN input, we concatenate the "
+                        "pooled embeddings."
+                    )
+
                 if self.pooled_embeddings is None:
                     raise ValueError(
                         "There were no pooled embeddings passed into the class."
@@ -1035,11 +1064,13 @@ def get_torch_path_for_SWNUNetwork(
                 )
             emb = torch.from_numpy(self.pooled_embeddings.astype("float")).float()
         elif self.pad_method == "history":
-            print(
-                "[INFO] The path was created for each item in the dataframe, "
-                "by looking at its history, so to include embeddings in the FFN input, "
-                "we concatenate the embeddings for each sentence / text."
-            )
+            if self.verbose:
+                print(
+                    "[INFO] The path was created for each item in the dataframe, "
+                    "by looking at its history, so to include embeddings in the FFN input, "
+                    "we concatenate the embeddings for each sentence / text."
+                )
+
             if reduced_embeddings:
                 if self.embeddings_reduced is None:
                     raise ValueError(
@@ -1148,10 +1179,11 @@ def check_history_length_for_SeqSigNet(
         required_history_length = shift * n + (window_size - shift)
         if self.array_padded.shape[1] != required_history_length:
             # required history length not met
-            print(
-                f"A history length of size {required_history_length} is required, "
-                f"but we have history length size of {self.array_padded.shape[1]}"
-            )
+            if self.verbose:
+                print(
+                    f"A history length of size {required_history_length} is required, "
+                    f"but we have history length size of {self.array_padded.shape[1]}"
+                )
             return False
 
         # we have the required history length
diff --git a/src/nlpsig/encode_text.py b/src/nlpsig/encode_text.py
index f0bfeea..dc2ed0b 100644
--- a/src/nlpsig/encode_text.py
+++ b/src/nlpsig/encode_text.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import pickle
+import warnings
 from typing import Callable, Iterable
 
 import numpy as np
@@ -93,10 +94,13 @@ def __init__(
         model_modules: Iterable[nn.Module] | None = None,
         model_encoder_args: dict | None = None,
         model_fit_args: dict | None = None,
+        verbose: bool = True,
     ):
+        self.verbose = verbose
+
         self.df = df
         if feature_name not in df.columns:
-            raise KeyError(f"{feature_name} is not a column in df")
+            raise KeyError(f"{feature_name} is not a column in df.")
         self.feature_name = feature_name
         self.sentence_embeddings = None
         self.model_name = model_name
@@ -142,7 +146,7 @@ def load_pre_computed_embeddings(self, pre_computed_embeddings_file: str) -> Non
             raise ValueError(
                 f"the loaded embeddings from {pre_computed_embeddings_file} "
                 "must be a (n x d) array where n is the number of sentences "
-                "and d is the dimension of the embeddings"
+                "and d is the dimension of the embeddings."
             )
         self.model_name = "pre-computed"
         self.model_modules = None
@@ -170,14 +174,15 @@ def load_pretrained_model(self, force_reload: bool = False) -> None:
         See https://www.sbert.net/docs/pretrained_models.html for examples.
         """
         if (not force_reload) and (self.model is not None):
-            print(f"[INFO] '{self.model_name}' model is already loaded")
+            warnings.warn(f"'{self.model_name}' model is already loaded.", stacklevel=3)
             return
+
         if (force_reload) and (self.model == "pre-computed"):
-            print(
-                "[INFO] The current embeddings were computed before "
-                "and were loaded into this class"
+            warnings.warn(
+                "The current embeddings were pre-computed and loaded.", stacklevel=3
             )
             return
+
         try:
             self.model = SentenceTransformer(model_name_or_path=self.model_name)
         except Exception as err:
@@ -208,18 +213,20 @@ def load_custom_model(self, force_reload: bool = False) -> None:
         for examples.
""" if (not force_reload) and (self.model is not None): - print(f"[INFO] '{self.model_name}' model is already loaded") + warnings.warn(f"'{self.model_name}' model is already loaded.", stacklevel=3) return + if (force_reload) and (self.model == "pre-computed"): - print( - "[INFO] The current embeddings were computed before " - "and were loaded into this class" + warnings.warn( + "The current embeddings were pre-computed and loaded.", stacklevel=3 ) return + if self.model_modules is None: raise ValueError( "`.model_modules` must be a list of modules which define the network architecture." ) + try: self.model = SentenceTransformer(modules=self.model_modules) except Exception as err: @@ -250,10 +257,14 @@ def obtain_embeddings(self) -> np.array: "or `.load_custom_model()` methods first" ) sentences = self.df[self.feature_name].to_list() - print(f"[INFO] number of sentences to encode: {len(sentences)}") + + if self.verbose: + print(f"[INFO] number of sentences to encode: {len(sentences)}") + self.sentence_embeddings = np.array( self.model.encode(sentences, **self.model_encoder_args) ) + return self.sentence_embeddings def fit_transformer( @@ -327,7 +338,10 @@ def __init__( config: PretrainedConfig | None = None, tokenizer: PreTrainedTokenizer | None = None, data_collator: DataCollator | None = None, + verbose: bool = True, ): + self.verbose = verbose + # check feature name is a string or list of length 1 or 2 of strings if isinstance(feature_name, str): # convert to list of one element @@ -405,19 +419,23 @@ def load_pretrained_model(self, force_reload: bool = False) -> None: Whether or not to overwrite current loaded model, by default False. """ if (not force_reload) and (self.model is not None): - print(f"[INFO] '{self.model_name}' model is already loaded.") + warnings.warn(f"'{self.model_name}' model is already loaded.", stacklevel=3) return + if self.model_name is None: raise TypeError("") + + if self.verbose: + print( + "[INFO] By default, `.load_pretrained_model()` uses " + "`AutoModel` to load in the model. " + "If you want to load the model for a specific task, " + "reset the `.model` attribute." + ) + self.config = AutoConfig.from_pretrained(self.model_name) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) self.data_collator = DataCollatorWithPadding(self.tokenizer) - Warning( - "[INFO] By default, `.load_pretrained_model()` uses " - "`AutoModel` to load in the model. " - "If you want to load the model for a specific task, " - "reset the `.model` attribute." - ) self.model = AutoModel.from_pretrained(self.model_name) self.model.eval() @@ -434,10 +452,12 @@ def initialise_transformer(self, force_reload: bool = False, **config_args) -> N Passed along to `AutoConfig.from_pretrained()` method. 
""" if (not force_reload) and (self.model is not None): - print(f"[INFO] '{self.model_name}' model is already loaded.") + warnings.warn(f"'{self.model_name}' model is already loaded.", stacklevel=3) return + if self.model_name is None: raise TypeError("") + self.config = AutoConfig.from_pretrained(self.model_name, **config_args) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) self.data_collator = DataCollatorWithPadding(self.tokenizer) @@ -533,8 +553,11 @@ def tokenize_text( # by default does not perform padding initially, # as will utilise dynamic padding later on tokenizer_args = {"padding": False, "truncation": True} + if not tokenizer_args.get("return_special_tokens_mask"): - print("[INFO] Setting return_special_tokens_mask=True") + if self.verbose: + print("[INFO] Setting return_special_tokens_mask=True") + tokenizer_args["return_special_tokens_mask"] = True # define tokenize_function for mapping to Dataset object @@ -556,7 +579,9 @@ def tokenize_function(dataset): ) # tokenize the dataset and save the tokens in .tokens attribute - print("[INFO] Tokenizing the dataset...") + if self.verbose: + print("[INFO] Tokenizing the dataset...") + self.dataset = self.dataset.map( tokenize_function, batched=batched, @@ -565,9 +590,10 @@ def tokenize_function(dataset): self.tokens = self.dataset.remove_columns(self._features) # save the tokenized text to `.df["tokens"] (does not include special tokens) - print( - "[INFO] Saving the tokenized text for each sentence into `.df['tokens']`..." - ) + if self.verbose: + print( + "[INFO] Saving the tokenized text for each sentence into `.df['tokens']`..." + ) cls_token_avail = self.tokenizer.cls_token is not None @@ -593,6 +619,7 @@ def tokenize_decoder(dataset): ) return {"tokens": tokens} + # token apply tokenize_decoder to dataset to obtain tokens self.dataset = self.dataset.map( tokenize_decoder, batched=batched, @@ -601,18 +628,23 @@ def tokenize_decoder(dataset): self.df["tokens"] = self.dataset["tokens"] # create new tokenized dataframe - print( - "[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute..." - ) + if self.verbose: + print( + "[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute..." + ) + self.tokenized_df = self.df.drop( columns=self.feature_name, errors="ignore", ).explode("tokens") self.tokenized_df = self.tokenized_df.reset_index() - print( - f"[INFO] Note: '{text_id_col_name}' is the " - "column name for denoting the corresponding text id" - ) + + if self.verbose: + print( + f"[INFO] Note: '{text_id_col_name}' is the " + "column name for denoting the corresponding text id" + ) + self.tokenized_df = self.tokenized_df.rename( columns={"index": text_id_col_name} ) @@ -1081,9 +1113,10 @@ def split_dataset( (if `valid_size` is not None), and test (`test`) Datasets. """ if self.dataset_split is not None: - print( - "[INFO] Dataset has already been split. " - "If required to split again, first set `.dataset_split` attribute to None" + warnings.warn( + "Dataset has already been split. If required to split again, first set " + "`.dataset_split` attribute to None", + stacklevel=3, ) return self.dataset_split @@ -1106,16 +1139,17 @@ def split_dataset( ) else: # indices are not provided, so split the dataset - if valid_size is None: - print( - "[INFO] Splitting up dataset into train / test sets, " - "and saving to `.dataset_split`." - ) - else: - print( - "[INFO] Splitting up dataset into train / validation / test sets, " - "and saving to `.dataset_split`." 
-                )
+            if self.verbose:
+                if valid_size is None:
+                    print(
+                        "[INFO] Splitting up dataset into train / test sets, "
+                        "and saving to `.dataset_split`."
+                    )
+                else:
+                    print(
+                        "[INFO] Splitting up dataset into train / validation / test sets, "
+                        "and saving to `.dataset_split`."
+                    )
 
             # first split data into train/valid set, test set
             train_test = self.dataset.train_test_split(
@@ -1166,9 +1200,11 @@ def set_up_training_args(self, output_dir: str, **kwargs) -> TrainingArguments:
         TrainingArguments
             `TrainingArguments` object.
         """
-        print(
-            "[INFO] Setting up TrainingArguments object and saving to `.training_args`."
-        )
+        if self.verbose:
+            print(
+                "[INFO] Setting up TrainingArguments object and saving to `.training_args`."
+            )
+
         if kwargs is None:
             kwargs = {}
         if "evaluation_strategy" not in kwargs:
@@ -1209,17 +1245,21 @@ def set_up_trainer(
         # check model, tokenizer and data_collator have been passed into the class
         self._check_model()
 
-        print("[INFO] Setting up Trainer object, and saving to `.trainer`.")
+        if self.verbose:
+            print("[INFO] Setting up Trainer object, and saving to `.trainer`.")
+
         if self.training_args is None:
             raise NotImplementedError(
                 "TrainingArgments have not been set in `.training_args`. "
                 "Call `.set_up_training_args()` first."
             )
+
         if self.dataset_split is None:
             raise ValueError(
                 "Dataset has not been split up into train / test (and validation) sets. "
                 "Call `.split_dataset()` first."
             )
+
         if data_collator is None:
             # use the existing data collator
             data_collator = self.data_collator
@@ -1286,6 +1326,13 @@ def fit_transformer_with_trainer_api(
             **trainer_args,
         )
 
-        print(f"[INFO] Training model with {self.model.num_parameters()} parameters...")
+        if self.verbose:
+            print(
+                f"[INFO] Training model with {self.model.num_parameters()} parameters..."
+            )
+
+        # train model
         self.trainer.train()
-        print("[INFO] Training completed!")
+
+        if self.verbose:
+            print("[INFO] Training completed!")
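
Usage sketch (not part of the patch above): a minimal example of how the new `verbose` flag would be passed in practice. It assumes the class patched in `src/nlpsig/data_preparation.py` is importable as `PrepareData` and that its leading arguments are named `original_df` and `embeddings`; neither name appears in the hunks above, so treat them as illustrative, and the toy data is made up.

    import numpy as np
    import pandas as pd

    # assumed import path and class name; the diff only shows the __init__ hunks
    from nlpsig.data_preparation import PrepareData

    # toy inputs: five texts with 4-dimensional embeddings and an id column
    df = pd.DataFrame({"text": list("abcde"), "id": [0, 0, 1, 1, 1]})
    embeddings = np.random.default_rng(0).normal(size=(5, 4))

    # verbose=True (the default) keeps the existing [INFO] prints;
    # verbose=False silences them, which is what this patch introduces
    prep = PrepareData(
        original_df=df,
        embeddings=embeddings,
        id_column="id",
        verbose=False,
    )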