diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 96776c9..c437f6c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -85,7 +85,7 @@ jobs: - name: Check products run: pipx run twine check dist/* - - uses: pypa/gh-action-pypi-publish@v1.8.7 + - uses: pypa/gh-action-pypi-publish@v1.8.10 if: github.event_name == 'release' && github.event.action == 'published' with: # Remember to generate this and set it in "GitHub Secrets" diff --git a/src/nlpsig/classification_utils.py b/src/nlpsig/classification_utils.py index dc24203..93bc5df 100644 --- a/src/nlpsig/classification_utils.py +++ b/src/nlpsig/classification_utils.py @@ -110,7 +110,6 @@ def __init__( else: if self.groups is not None: # see https://github.com/scikit-learn/scikit-learn/issues/9193 - print("[INFO] Splitting data by provided groups") self.shuffle = False if x_data.shape[0] != len(self.groups): diff --git a/src/nlpsig/data_preparation.py b/src/nlpsig/data_preparation.py index 77bda80..9024fd9 100644 --- a/src/nlpsig/data_preparation.py +++ b/src/nlpsig/data_preparation.py @@ -97,7 +97,7 @@ def __init__( self.pooled_embeddings: np.array | None = pooled_embeddings # obtain time features - self._time_feature_choices: list[str] = [] + self._feature_list: list[str] = [] self.time_features_added: bool = False self.df = self._set_time_features() self.df_padded: pd.DataFrame | None = None @@ -207,13 +207,13 @@ def _set_time_features(self) -> pd.DataFrame: return None print("[INFO] Adding time feature columns into dataframe in `.df`.") if "datetime" in self.df.columns: - self._time_feature_choices += ["time_encoding", "time_diff"] + self._feature_list += ["time_encoding", "time_diff"] # checking 'datetime' column is datatime type self.df["datetime"] = pd.to_datetime(self.df["datetime"]) # obtain time encoding by computing the fraction of year it is in - print("[INFO] Adding 'time_encoding' and feature...") + print("[INFO] Adding 'time_encoding' feature...") self.df["time_encoding"] = self.df["datetime"].map( lambda t: self._time_fraction(t) ) @@ -224,7 +224,7 @@ def _set_time_features(self) -> pd.DataFrame: self.df = self.df.sort_values(by=[self.id_column, "datetime"]) # calculate time difference between posts - print("[INFO] Adding 'time_diff' and feature...") + print("[INFO] Adding 'time_diff' feature...") self.df["time_diff"] = list( self.df.groupby(self.id_column) .apply( @@ -249,7 +249,7 @@ def _set_time_features(self) -> pd.DataFrame: "we assume that the data is ordered by time with respect to the id." ) # assign index for each post in each timeline - self._time_feature_choices += ["timeline_index"] + self._feature_list += ["timeline_index"] print("[INFO] Adding 'timeline_index' feature...") self.df["timeline_index"] = list( @@ -266,7 +266,7 @@ def _set_time_features(self) -> pd.DataFrame: return self.df - def _obtain_colnames(self, embeddings: str) -> list[str]: + def _obtain_embedding_colnames(self, embeddings: str) -> list[str]: """ [Private] Obtains the column names storing the embeddings. @@ -308,62 +308,85 @@ def _obtain_colnames(self, embeddings: str) -> list[str]: return colnames - def _obtain_time_feature_columns( + def _check_feature_exists(self, feature: str) -> bool: + """ + [Private] Checks if `feature` is a column in `self._feature_list`. If not, + check if `self.df` dataframe and if it is, add this to `self._feature_list`. + + Parameters + ---------- + feature : str + Feature name. + + Returns + ------- + bool + True if `feature` is in `self._feature_list` or is a + column name in `self.df`. + """ + if (feature not in self._feature_list) and (feature in self.df.columns): + # not in ._feature_list, but is a valid column name in self.df, + # so add to feature list + self._feature_list += [feature] + + return feature in self._feature_list + + def _obtain_feature_columns( self, - time_feature: list[str] | str | None, + features: list[str] | str | None, ) -> list[str]: """ - [Private] Obtains the column names storing the time features requested. + [Private] Obtains the column names storing the feature(s) requested. If a string or list is passed, it essentially just checks if it is an - available time feature that is stored in `_time_feature_choices` and returns - the time features in a list. + available feature that is stored in `_feature_list` and returns + the feature(s) in a list. Parameters ---------- - time_feature : list[str] | str | None + features : list[str] | str | None If is a string, it must be in the list found in - `_time_feature_choices` attribute. If is a list, + `_feature_list` attribute. If is a list, each item must be a string and it must be in the - list found in `_time_feature_choices` attribute. + list found in `_feature_list` attribute. Returns ------- list[str] - List of column names which store the time features. + List of column names which store the feature(s). Raises ------ ValueError - if `time_feature` is a string, and it is not found in `_time_feature_choices`. - ValueError - if `time_feature` is a list of strings, and one of the items - is not found in `_time_feature_choices`. - TypeError - if `time_feature` is neither a string or a list. + if `features` is a string, and it is not found in + `_feature_list` attribute or if `features` is a + list of strings, and one of the items + is not found in `_feature_list` attribute. """ - if time_feature is None: - time_feature = [] + if features is None: + # no features are wanted, return an empty list + features = [] else: - if not self.time_features_added: - self.set_time_features() - if isinstance(time_feature, str): - if time_feature not in self._time_feature_choices: - raise ValueError( - "If `time_feature` is a string, it must " - f"be in {self._time_feature_choices}." - ) - time_feature = [time_feature] - elif isinstance(time_feature, list): - if not all(item in self._time_feature_choices for item in time_feature): - raise ValueError( - f"Each item in `time_feature` should be in {self._time_feature_choices}." - ) + # convert to list of strings + if isinstance(features, str): + features = [features] + + if isinstance(features, list): + # check each item in features is in self._feature_list + # if it isn't, but is a column in self.df, it will add + # it to self._feature_list + for item in features: + if not self._check_feature_exists(feature=item): + raise ValueError( + f"{item} must be in `self.feature_list`: {self._feature_list}, " + "or a column in `self.df`." + ) else: + # features is neither None, a string or a list raise TypeError( "`time_feature` must be either None, a string, or a list of strings." ) - return time_feature + return features def _pad_dataframe( self, @@ -371,7 +394,7 @@ def _pad_dataframe( k: int, zero_padding: bool, colnames: list[str], - time_feature: list[str], + features: list[str], id: int, pad_from_below: bool, ) -> pd.DataFrame: @@ -393,8 +416,8 @@ def _pad_dataframe( text associated to the id. colnames : list[str] List of column names that we wish to keep from the dataframe. - time_feature : list[str] - List of time feature column names that we wish to keep from the dataframe. + features : list[str] + List of feature column names that we wish to keep from the dataframe. id : int Which id are we padding. pad_from_below: bool @@ -412,7 +435,7 @@ def _pad_dataframe( """ if k <= 0: raise ValueError("`k` must be a positive integer.") - columns = time_feature + colnames + [self.id_column] + columns = features + colnames + [self.id_column] if self.label_column is not None: columns += [self.label_column] @@ -424,7 +447,7 @@ def _pad_dataframe( if self.label_column is not None: # set labels to be -1 to indicate that they're padded values data_dict = { - **dict.fromkeys(time_feature, [0]), + **dict.fromkeys(features, [0]), **{c: [0] for c in colnames}, self.id_column: [id], self.label_column: [-1], @@ -432,7 +455,7 @@ def _pad_dataframe( else: # no label column to add data_dict = { - **dict.fromkeys(time_feature, [0]), + **dict.fromkeys(features, [0]), **{c: [0] for c in colnames}, self.id_column: [id], } @@ -471,7 +494,7 @@ def _pad_id( k: int, zero_padding: bool, colnames: list[str], - time_feature: list[str], + features: list[str], id: int, pad_from_below: bool, ) -> pd.DataFrame: @@ -497,8 +520,8 @@ def _pad_id( text associated to the id. colnames : list[str] List of column names that we wish to keep from the dataframe. - time_feature : list[str] - List of time feature column names that we wish to keep from the dataframe. + features : list[str] + List of feature column names that we wish to keep from the dataframe. id : int Which id are we padding. pad_from_below: bool @@ -523,7 +546,7 @@ def _pad_id( k=k, zero_padding=zero_padding, colnames=colnames, - time_feature=time_feature, + features=features, id=id, pad_from_below=pad_from_below, ) @@ -533,7 +556,7 @@ def _pad_history( k: int, zero_padding: bool, colnames: list[str], - time_feature: list[str], + features: list[str], index: int, include_current_embedding: bool, pad_from_below: bool, @@ -559,8 +582,8 @@ def _pad_history( text associated to the id. colnames : list[str] List of column names that we wish to keep from the dataframe. - time_feature : list[str] - List of time feature column names that we wish to keep from the dataframe. + features : list[str] + List of features column names that we wish to keep from the dataframe. index : int Which index of the dataframe are we padding. pad_from_below: bool @@ -606,7 +629,7 @@ def _pad_history( k=k, zero_padding=zero_padding, colnames=colnames, - time_feature=time_feature, + features=features, id=id, pad_from_below=pad_from_below, ) @@ -616,18 +639,18 @@ def _standardise_pd( vec: pd.Series, method: str | None ) -> dict[str, pd.Series | Callable]: # standardised pandas series - implemented = ["standardise", "normalise", "minmax", None] + implemented = ["z_score", "sum_divide", "minmax", None] if method not in implemented: - raise ValueError(f"`method` must be in {implemented}.") + raise ValueError(f"`method`: {method} must be in {implemented}.") - if method == "standardise": + if method == "z_score": mean = vec.mean() std = vec.std() def transform(x): return (x - mean) / std - elif method == "normalise": + elif method == "sum_divide": sum = vec.sum() def transform(x): @@ -653,7 +676,7 @@ def pad( method: str = "k_last", zero_padding: bool = True, k: int = 5, - time_feature: list[str] | str | None = None, + features: list[str] | str | None = None, standardise_method: list[str] | str | None = None, embeddings: str = "full", include_current_embedding: bool = True, @@ -697,13 +720,16 @@ def pad( text associated to the id. k : int, optional The requested length of the path, default 5. This is ignored if `method="max"`. - time_feature : list[str] | str | None, optional - Which time feature(s) to keep. If None, then doesn't keep any. - standardise_method : str | None, optional - If not None, applies standardisation to the time features, default None. Options: + features : list[str] | str | None, optional + Which feature(s) to keep. If None, then doesn't keep any. + standardise_method : list[str] | str | None, optional + If not None, applies standardisation to the features, default None. + If a list is passed, must be the same length as `features`. Options: - - "standardise": transforms by subtracting the mean and dividing by standard deviation - - "normalise": transforms by dividing by the sum + - "z_score": transforms by subtracting the mean and dividing by standard deviation + - "sum_divide": transforms by dividing by the sum + - "minmax": transform by return (x-min(x)) / (max(x)-min(x)) where x + is the vector to standardise embeddings : str, optional Which embeddings to keep, by default "full". Options: @@ -750,35 +776,33 @@ def pad( else: raise ValueError("`method` must be either 'k_last' or 'max'.") - # obtain time feature colnames - time_feature_colnames = self._obtain_time_feature_columns( - time_feature=time_feature - ) - if len(time_feature_colnames) > 0: + # obtain feature colnames + feature_colnames = self._obtain_feature_columns(features=features) + if len(feature_colnames) > 0: if isinstance(standardise_method, str): - standardise_method = [standardise_method] * len(time_feature_colnames) + standardise_method = [standardise_method] * len(feature_colnames) elif isinstance(standardise_method, list) and ( - len(standardise_method) != len(time_feature_colnames) + len(standardise_method) != len(feature_colnames) ): raise ValueError( "if `standardise_method` is a list, it must have the same length " - f"as the number of time features requested: {len(time_feature_colnames)}." + f"as the number of features requested: {len(feature_colnames)}." ) if standardise_method is not None: - # standardises the time features in .df + # standardises the features in .df self.standardise_transform = {} - for i in range(len(time_feature_colnames)): + for i in range(len(feature_colnames)): standardise = self._standardise_pd( - vec=self.df[time_feature_colnames[i]], method=standardise_method[i] + vec=self.df[feature_colnames[i]], method=standardise_method[i] ) - self.standardise_transform[time_feature_colnames[i]] = standardise[ + self.standardise_transform[feature_colnames[i]] = standardise[ "transform" ] - self.df[time_feature_colnames[i]] = standardise["standardised_pd"] + self.df[feature_colnames[i]] = standardise["standardised_pd"] # obtain colnames of embeddings - colnames = self._obtain_colnames(embeddings=embeddings) + colnames = self._obtain_embedding_colnames(embeddings=embeddings) if pad_by == "id": # pad each of the ids in id_column and store them in a list @@ -787,7 +811,7 @@ def pad( k=k, zero_padding=zero_padding, colnames=colnames, - time_feature=time_feature_colnames, + features=feature_colnames, id=id, pad_from_below=pad_from_below, ) @@ -801,7 +825,7 @@ def pad( k=k, zero_padding=zero_padding, colnames=colnames, - time_feature=time_feature_colnames, + features=feature_colnames, index=index, include_current_embedding=include_current_embedding, pad_from_below=pad_from_below, @@ -824,11 +848,11 @@ def pad( def get_time_feature( self, time_feature: str = "timeline_index", - standardise_method: str = "standardise", + standardise_method: str = "z_score", ) -> dict[str, np.array | Callable | None]: """ Returns a `np.array` object of the time_feature that is requested - (the string passed has to be one of the strings in `._time_feature_choices`). + (the string passed has to be one of the strings in `._feature_list`). Parameters ---------- @@ -836,8 +860,10 @@ def get_time_feature( Which time feature to obtain `np.array` for, by default "timeline_index". standardise_method : str | None, optional If not None, applies standardisation to the time features, default None. Options: - - "standardise": transforms by subtracting the mean and dividing by standard deviation - - "normalise": transforms by dividing by the sum + - "z_score": transforms by subtracting the mean and dividing by standard deviation + - "sum_divide": transforms by dividing by the sum + - "minmax": transform by return (x-min(x)) / (max(x)-min(x)) where x + is the vector to standardise Returns ------- @@ -850,12 +876,10 @@ def get_time_feature( ------ ValueError if `time_feature` is not in the possible time_features - (can be found in `._time_feature_choices` attribute). + (can be found in `._feature_list` attribute). """ - if time_feature not in self._time_feature_choices: - raise ValueError( - f"`time_feature` should be in {self._time_feature_choices}." - ) + if time_feature not in self._feature_list: + raise ValueError(f"`time_feature` should be in {self._feature_list}.") if not self.time_features_added: self.set_time_features() @@ -873,15 +897,15 @@ def get_time_feature( return {"time_feature": np.array(self.df[time_feature]), "transform": None} - def get_path(self, include_time_features: bool = True) -> np.array: + def get_path(self, include_features: bool = True) -> np.array: """ Returns a `np.array` object of the path. - Includes the time features by default (if they are present after the padding). + Includes the features by default (if they are present after the padding). Parameters ---------- - include_time_features : bool, optional - Whether or not to keep the time features, by default True. + include_features : bool, optional + Whether or not to keep the features, by default True. Returns ------- @@ -906,14 +930,14 @@ def get_path(self, include_time_features: bool = True) -> np.array: # (which stores id_column) path = self.array_padded[:, :, :-1] - if not include_time_features: - # computes how many time features there are currently - # (which occur in the first n_time_features columns) - n_time_features = len( - [item for item in self._time_feature_choices if item in self.df_padded] + if not include_features: + # computes how many features there are currently + # (which occur in the first n_features columns) + n_features = len( + [item for item in self._feature_list if item in self.df_padded] ) - # removes any time features (if they're present) - path = path[:, :, n_time_features:] + # removes any features (if they're present) + path = path[:, :, n_features:] return path.astype("float") @@ -945,8 +969,8 @@ def get_embeddings(self, reduced_embeddings: bool = False) -> np.array: def get_torch_path_for_SWNUNetwork( self, - include_time_features_in_path: bool, - include_time_features_in_input: bool, + include_features_in_path: bool, + include_features_in_input: bool, include_embedding_in_input: bool, reduced_embeddings: bool = False, ) -> tuple[torch.tensor, int]: @@ -955,10 +979,10 @@ def get_torch_path_for_SWNUNetwork( Parameters ---------- - include_time_features_in_path : bool - Whether or not to keep time features within the path. - include_time_features_in_input : bool - Whether or not to concatenate the time feature into the feed-forward neural + include_features_in_path : bool + Whether or not to keep the additional features (e.g. time features) within the path. + include_features_in_input : bool + Whether or not to concatenate the additional features into the feed-forward neural network in the `nlpsig_networks.SWNUNetwork` model. include_embedding_in_input : bool Whether or not to concatenate the embeddings into the feed-forward neural @@ -984,10 +1008,10 @@ def get_torch_path_for_SWNUNetwork( raise ValueError("Need to first call to create the path `.pad()`.") # obtains a torch tensor which can be inputted into deepsignet - # computes how many time features there are currently - # (which occur in the first n_time_features columns) - n_time_features = len( - [item for item in self._time_feature_choices if item in self.df_padded] + # computes how many features there are currently + # (which occur in the first n_features columns) + n_features = len( + [item for item in self._feature_list if item in self.df_padded] ) if include_embedding_in_input: @@ -1047,39 +1071,39 @@ def get_torch_path_for_SWNUNetwork( .transpose(1, 2) ) - if include_time_features_in_path: - # make sure path includes the time features - path = torch.from_numpy(self.get_path(include_time_features=True)) + if include_features_in_path: + # make sure path includes the features + path = torch.from_numpy(self.get_path(include_features=True)) input_channels = path.shape[2] - if include_time_features_in_input: - # need to repeat the time feature columns - # if there are no time features, then we don't need to repeat anything - if n_time_features == 1: + if include_features_in_input: + # need to repeat the feature columns + # if there are no features, then we don't need to repeat anything + if n_features == 1: path = torch.cat([path, path[:, :, 0].unsqueeze(2)], dim=2) - elif n_time_features > 1: - path = torch.cat([path, path[:, :, 0:n_time_features]], dim=2) + elif n_features > 1: + path = torch.cat([path, path[:, :, 0:n_features]], dim=2) else: - if include_time_features_in_input: - # path doesn't need to include the time features + if include_features_in_input: + # path doesn't need to include the features # but we still want to include them in the input to the FFN for classification - path = torch.from_numpy(self.get_path(include_time_features=True)) - input_channels = path.shape[2] - n_time_features - # need to move time features to the end of the path - # if there are no time features, then we don't need to move anything - if n_time_features == 1: + path = torch.from_numpy(self.get_path(include_features=True)) + input_channels = path.shape[2] - n_features + # need to move features to the end of the path + # if there are no features, then we don't need to move anything + if n_features == 1: path = torch.cat( - [path[:, :, n_time_features:], path[:, :, 0].unsqueeze(2)], + [path[:, :, n_features:], path[:, :, 0].unsqueeze(2)], dim=2, ) - elif n_time_features > 1: + elif n_features > 1: path = torch.cat( - [path[:, :, n_time_features:], path[:, :, 0:n_time_features]], + [path[:, :, n_features:], path[:, :, 0:n_features]], dim=2, ) else: - # path doesn't need to include the time features + # path doesn't need to include the features # and don't need to include them in the input to the FFN for classification - path = torch.from_numpy(self.get_path(include_time_features=False)) + path = torch.from_numpy(self.get_path(include_features=False)) input_channels = path.shape[2] if include_embedding_in_input: @@ -1138,8 +1162,8 @@ def get_torch_path_for_SeqSigNet( shift: int, window_size: int, n: int, - include_time_features_in_path: bool, - include_time_features_in_input: bool, + include_features_in_path: bool, + include_features_in_input: bool, include_embedding_in_input: bool, reduced_embeddings: bool = False, ) -> tuple[torch.tensor, int]: @@ -1154,10 +1178,10 @@ def get_torch_path_for_SeqSigNet( Size of the window we use over the texts. n : int Number of units we wish to use in SeqSigNet. - include_time_features_in_path : bool - Whether or not to keep time features within the path. - include_time_features_in_input : bool - Whether or not to concatenate the time feature into the feed-forward neural + include_features_in_path : bool + Whether or not to keep the additional features (e.g. time features) within the path. + include_features_in_input : bool + Whether or not to concatenate the additional features into the feed-forward neural network in the `nlpsig_networks.SeqSigNet` model. include_embedding_in_input : bool Whether or not to concatenate the embeddings into the feed-forward neural @@ -1194,8 +1218,8 @@ def get_torch_path_for_SeqSigNet( # obtain 3 dimensional tensor with dimensions [batch, history, channels] swnu_path, input_channels = self.get_torch_path_for_SWNUNetwork( - include_time_features_in_path=include_time_features_in_path, - include_time_features_in_input=include_time_features_in_input, + include_features_in_path=include_features_in_path, + include_features_in_input=include_features_in_input, include_embedding_in_input=include_embedding_in_input, reduced_embeddings=reduced_embeddings, ) diff --git a/tests/conftest.py b/tests/conftest.py index b45f3e4..cda224c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,6 +29,8 @@ def test_df_with_datetime(): return pd.DataFrame( { "text": [f"text_{i}" for i in range(n_entries)], + "binary_var": [rng.choice([0, 1]) for i in range(n_entries)], + "continuous_var": rng.random(n_entries), "id_col": [0 for i in range(100)] + [rng.integers(1, 5) for i in range(n_entries - 100)], "label_col": [rng.integers(0, 4) for i in range(n_entries)], @@ -43,6 +45,8 @@ def test_df_no_time(): return pd.DataFrame( { "text": [f"text_{i}" for i in range(n_entries)], + "binary_var": [rng.choice([0, 1]) for i in range(n_entries)], + "continuous_var": rng.random(n_entries), "id_col": [0 for i in range(100)] + [rng.integers(1, 5) for i in range(n_entries - 100)], "label_col": [rng.integers(0, 4) for i in range(n_entries)], @@ -56,6 +60,8 @@ def test_df_to_pad(): return pd.DataFrame( { "text": [f"text_{i}" for i in range(n_entries)], + "binary_var": [rng.choice([0, 1]) for i in range(n_entries)], + "continuous_var": rng.random(n_entries), "id_col": 0, "label_col": [rng.integers(0, 4) for i in range(n_entries)], } diff --git a/tests/test_data_preparation.py b/tests/test_data_preparation.py index a1d99f5..35d7746 100644 --- a/tests/test_data_preparation.py +++ b/tests/test_data_preparation.py @@ -28,14 +28,10 @@ def test_default_initialisation_datetime( # 1 dummy id column assert obj.df.shape == ( len(obj.original_df.index), - 1 - + len(obj.original_df.columns) - + emb.shape[1] - + len(obj._time_feature_choices) - + 1, + 1 + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list) + 1, ) assert obj.pooled_embeddings is None - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", @@ -67,13 +63,10 @@ def test_default_initialisation_no_time( # 1 dummy id column assert obj.df.shape == ( len(obj.original_df.index), - len(obj.original_df.columns) - + emb.shape[1] - + len(obj._time_feature_choices) - + 1, + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list) + 1, ) assert obj.pooled_embeddings is None - assert obj._time_feature_choices == ["timeline_index"] + assert obj._feature_list == ["timeline_index"] assert obj.time_features_added assert obj.df_padded is None assert obj.array_padded is None @@ -105,13 +98,10 @@ def test_initialisation_with_id_and_label_datetime( # 3 time features assert obj.df.shape == ( len(obj.original_df.index), - 1 - + len(obj.original_df.columns) - + emb.shape[1] - + len(obj._time_feature_choices), + 1 + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list), ) assert obj.pooled_embeddings is None - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", @@ -148,10 +138,10 @@ def test_initialisation_with_id_and_label_no_time( # 1 label column assert obj.df.shape == ( len(obj.original_df.index), - len(obj.original_df.columns) + emb.shape[1] + len(obj._time_feature_choices), + len(obj.original_df.columns) + emb.shape[1] + len(obj._feature_list), ) assert obj.pooled_embeddings is None - assert obj._time_feature_choices == ["timeline_index"] + assert obj._feature_list == ["timeline_index"] assert obj.time_features_added assert obj.df_padded is None assert obj.array_padded is None @@ -205,11 +195,11 @@ def test_initialisation_with_reduced_emb_datetime( + len(obj.original_df.columns) + emb.shape[1] + emb_reduced.shape[1] - + len(obj._time_feature_choices) + + len(obj._feature_list) + 1, ) assert obj.pooled_embeddings is None - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", @@ -248,11 +238,11 @@ def test_initialisation_with_reduced_emb_no_time( len(obj.original_df.columns) + emb.shape[1] + emb_reduced.shape[1] - + len(obj._time_feature_choices) + + len(obj._feature_list) + 1, ) assert obj.pooled_embeddings is None - assert obj._time_feature_choices == ["timeline_index"] + assert obj._feature_list == ["timeline_index"] assert obj.time_features_added assert obj.df_padded is None assert obj.array_padded is None @@ -275,9 +265,6 @@ def test_initialisation_with_pooled_emb_datetime( pooled_embeddings=emb_pooled, id_column="id_col", ) - # should have an error as we haven't passed in the id column, - # and so it expects the number of rows in emb_pooled to - # equal the number of rows in the dataframe pd.testing.assert_frame_equal(obj.original_df, test_df_with_datetime) assert obj.id_column == "id_col" assert obj.label_column is None @@ -297,10 +284,10 @@ def test_initialisation_with_pooled_emb_datetime( + len(obj.original_df.columns) + emb.shape[1] + emb_reduced.shape[1] - + len(obj._time_feature_choices), + + len(obj._feature_list), ) assert (obj.pooled_embeddings == emb_pooled).all() - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", @@ -327,9 +314,6 @@ def test_initialisation_with_pooled_emb_no_time( pooled_embeddings=emb_pooled, id_column="id_col", ) - # should have an error as we haven't passed in the id column, - # and so it expects the number of rows in emb_pooled to - # equal the number of rows in the dataframe pd.testing.assert_frame_equal(obj.original_df, test_df_no_time) assert obj.id_column == "id_col" assert obj.label_column is None @@ -347,10 +331,10 @@ def test_initialisation_with_pooled_emb_no_time( len(obj.original_df.columns) + emb.shape[1] + emb_reduced.shape[1] - + len(obj._time_feature_choices), + + len(obj._feature_list), ) assert (obj.pooled_embeddings == emb_pooled).all() - assert obj._time_feature_choices == ["timeline_index"] + assert obj._feature_list == ["timeline_index"] assert obj.time_features_added assert obj.df_padded is None assert obj.array_padded is None @@ -500,14 +484,14 @@ def test_PrepareData_obtain_colnames_emb(test_df_with_datetime, emb): # test cases where only embeddings are passed obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert obj._obtain_colnames(embeddings="full") == emb_names - assert obj._obtain_colnames(embeddings="dim_reduced") == [] - assert obj._obtain_colnames(embeddings="both") == emb_names + assert obj._obtain_embedding_colnames(embeddings="full") == emb_names + assert obj._obtain_embedding_colnames(embeddings="dim_reduced") == [] + assert obj._obtain_embedding_colnames(embeddings="both") == emb_names with pytest.raises( ValueError, match="Embeddings must be either 'dim_reduced', 'full', or 'both'" ): - obj._obtain_colnames(embeddings="") + obj._obtain_embedding_colnames(embeddings="") def test_obtain_colnames_both(test_df_with_datetime, emb, emb_reduced): @@ -520,85 +504,160 @@ def test_obtain_colnames_both(test_df_with_datetime, emb, emb_reduced): embeddings=emb, embeddings_reduced=emb_reduced, ) - assert obj._obtain_colnames(embeddings="full") == emb_names - assert obj._obtain_colnames(embeddings="dim_reduced") == emb_reduced_names - assert obj._obtain_colnames(embeddings="both") == emb_reduced_names + emb_names + assert obj._obtain_embedding_colnames(embeddings="full") == emb_names + assert obj._obtain_embedding_colnames(embeddings="dim_reduced") == emb_reduced_names + assert ( + obj._obtain_embedding_colnames(embeddings="both") + == emb_reduced_names + emb_names + ) + + +def test_obtain_feature_columns_string(test_df_with_datetime, emb): + # default initialisation + obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + } + assert obj._obtain_feature_columns("timeline_index") == ["timeline_index"] + + +def test_obtain_feature_columns_string_additional_binary(test_df_with_datetime, emb): + # default initialisation + obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) + # originally only have the time features + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + } + # pass in string of column name that isn't in _feature_list but + # is a column in self.df + assert obj._obtain_feature_columns("binary_var") == ["binary_var"] + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + "binary_var", + } + + +def test_obtain_feature_columns_string_additional_continuous( + test_df_with_datetime, emb +): + # default initialisation + obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) + # originally only have the time features + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + } + # pass in string of column name that isn't in _feature_list but + # is a column in self.df + assert obj._obtain_feature_columns("continuous_var") == ["continuous_var"] + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + "continuous_var", + } -def test_obtain_time_feature_columns_string(test_df_with_datetime, emb): +def test_obtain_feature_columns_list(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", } - assert obj._obtain_time_feature_columns("timeline_index") == ["timeline_index"] + assert obj._obtain_feature_columns(["time_encoding", "timeline_index"]) == [ + "time_encoding", + "timeline_index", + ] -def test_obtain_time_feature_columns_list(test_df_with_datetime, emb): +def test_obtain_feature_columns_list_additional(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", } - assert obj._obtain_time_feature_columns(["time_encoding", "timeline_index"]) == [ + assert obj._obtain_feature_columns( + ["time_encoding", "timeline_index", "binary_var", "continuous_var"] + ) == [ "time_encoding", "timeline_index", + "binary_var", + "continuous_var", ] + # check that it has added binary_var and continuous_var to ._feature_list + assert set(obj._feature_list) == { + "time_encoding", + "time_diff", + "timeline_index", + "binary_var", + "continuous_var", + } -def test_obtain_time_feature_columns_none(test_df_with_datetime, emb): +def test_obtain_feature_columns_none(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", } - assert obj._obtain_time_feature_columns(None) == [] + assert obj._obtain_feature_columns(None) == [] -def test_obtain_time_feature_columns_string_not_in(test_df_with_datetime, emb): +def test_obtain_feature_columns_string_not_in(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", } + incorrect_name = "TEST_COLUMN" with pytest.raises( ValueError, match=re.escape( - f"If `time_feature` is a string, it must be in {obj._time_feature_choices}." + f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list}, " + "or a column in `self.df`." ), ): - obj._obtain_time_feature_columns("TEST_COLUMN") + obj._obtain_feature_columns(incorrect_name) -def test_obtain_time_feature_columns_list_not_in(test_df_with_datetime, emb): +def test_obtain_feature_columns_list_not_in(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", } + incorrect_name = "TEST_COLUMN" with pytest.raises( ValueError, match=re.escape( - f"Each item in `time_feature` should be in {obj._time_feature_choices}." + f"{incorrect_name} must be in `self.feature_list`: {obj._feature_list}, " + "or a column in `self.df`." ), ): - obj._obtain_time_feature_columns(["timeline_index", "TEST_COLUMN"]) + obj._obtain_feature_columns(["timeline_index", incorrect_name]) -def test_obtain_time_feature_columns_type(test_df_with_datetime, emb): +def test_obtain_feature_columns_type(test_df_with_datetime, emb): # default initialisation obj = PrepareData(original_df=test_df_with_datetime, embeddings=emb) - assert set(obj._time_feature_choices) == { + assert set(obj._feature_list) == { "time_encoding", "time_diff", "timeline_index", @@ -607,13 +666,13 @@ def test_obtain_time_feature_columns_type(test_df_with_datetime, emb): TypeError, match="`time_feature` must be either None, a string, or a list of strings.", ): - obj._obtain_time_feature_columns(0) + obj._obtain_feature_columns(0) -def test_standardise_pd_standardise(vec_to_standardise, test_df_no_time, emb): - # testing _standardise_pd with method=="standardise" +def test_standardise_pd_z_score(vec_to_standardise, test_df_no_time, emb): + # testing _standardise_pd with method=="z_score" obj = PrepareData(original_df=test_df_no_time, embeddings=emb) - standardise = obj._standardise_pd(vec=vec_to_standardise, method="standardise") + standardise = obj._standardise_pd(vec=vec_to_standardise, method="z_score") assert type(standardise) == dict assert type(standardise["standardised_pd"]) == pd.Series pd.testing.assert_series_equal( @@ -624,10 +683,10 @@ def test_standardise_pd_standardise(vec_to_standardise, test_df_no_time, emb): ) -def test_standardise_pd_normalise(vec_to_standardise, test_df_no_time, emb): - # testing _standardise_pd with method=="normalise" +def test_standardise_pd_sum_divide(vec_to_standardise, test_df_no_time, emb): + # testing _standardise_pd with method=="sum_divide" obj = PrepareData(original_df=test_df_no_time, embeddings=emb) - standardise = obj._standardise_pd(vec=vec_to_standardise, method="normalise") + standardise = obj._standardise_pd(vec=vec_to_standardise, method="sum_divide") assert type(standardise) == dict assert type(standardise["standardised_pd"]) == pd.Series pd.testing.assert_series_equal( @@ -639,7 +698,7 @@ def test_standardise_pd_normalise(vec_to_standardise, test_df_no_time, emb): def test_standardise_pd_minmax(vec_to_standardise, test_df_no_time, emb): - # testing _standardise_pd with method=="normalise" + # testing _standardise_pd with method=="sum_divide" obj = PrepareData(original_df=test_df_no_time, embeddings=emb) standardise = obj._standardise_pd(vec=vec_to_standardise, method="minmax") assert type(standardise) == dict @@ -654,7 +713,7 @@ def test_standardise_pd_minmax(vec_to_standardise, test_df_no_time, emb): def test_standardise_pd_None(vec_to_standardise, test_df_no_time, emb): - # testing _standardise_pd with method=="normalise" + # testing _standardise_pd with method=="sum_divide" obj = PrepareData(original_df=test_df_no_time, embeddings=emb) standardise = obj._standardise_pd(vec=vec_to_standardise, method=None) assert type(standardise) == dict @@ -668,9 +727,11 @@ def test_standardise_pd_None(vec_to_standardise, test_df_no_time, emb): def test_standardise_pd_wrong_method(vec_to_standardise, test_df_no_time, emb): # testing _standardise_pd with method that isn't implemented - implemented = ["standardise", "normalise", "minmax", None] + implemented = ["z_score", "sum_divide", "minmax", None] obj = PrepareData(original_df=test_df_no_time, embeddings=emb) + incorrect_method = "fake_method" with pytest.raises( - ValueError, match=re.escape(f"`method` must be in {implemented}.") + ValueError, + match=re.escape(f"`method`: {incorrect_method} must be in {implemented}."), ): - obj._standardise_pd(vec=vec_to_standardise, method="fake_method") + obj._standardise_pd(vec=vec_to_standardise, method=incorrect_method) diff --git a/tests/test_padding.py b/tests/test_padding.py index 177ebee..e2612a3 100644 --- a/tests/test_padding.py +++ b/tests/test_padding.py @@ -22,7 +22,7 @@ def test_pad_dataframe_zero_padding_from_below_without_label(test_df_no_time, em k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=True, ) @@ -61,7 +61,7 @@ def test_pad_dataframe_zero_padding_from_below_with_label(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=True, ) @@ -99,7 +99,7 @@ def test_pad_dataframe_zero_padding_from_above_without_label(test_df_no_time, em k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -137,7 +137,7 @@ def test_pad_dataframe_zero_padding_from_above_with_label(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -176,7 +176,7 @@ def test_pad_dataframe_non_zero_padding_from_below(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=True, ) @@ -208,7 +208,7 @@ def test_pad_dataframe_non_zero_padding_from_above(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -236,8 +236,8 @@ def test_pad_dataframe_k_equal_zero(test_df_no_time, test_df_to_pad, emb): df=test_df_to_pad, k=0, zero_padding=False, - colnames=obj._obtain_colnames("full"), - time_feature=["timeline_index"], + colnames=obj._obtain_embedding_colnames("full"), + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -257,8 +257,8 @@ def test_pad_dataframe_k_negative(test_df_no_time, test_df_to_pad, emb): df=test_df_to_pad, k=-1, zero_padding=False, - colnames=obj._obtain_colnames("full"), - time_feature=["timeline_index"], + colnames=obj._obtain_embedding_colnames("full"), + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -279,7 +279,7 @@ def test_pad_dataframe_no_pad(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -304,7 +304,7 @@ def test_pad_dataframe_cutoff(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -326,8 +326,8 @@ def test_pad_id_k_equal_zero(test_df_no_time, emb): obj._pad_id( k=-1, zero_padding=False, - colnames=obj._obtain_colnames("full"), - time_feature=["timeline_index"], + colnames=obj._obtain_embedding_colnames("full"), + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -345,8 +345,8 @@ def test_pad_id_k_negative(test_df_no_time, emb): obj._pad_id( k=-1, zero_padding=False, - colnames=obj._obtain_colnames("full"), - time_feature=["timeline_index"], + colnames=obj._obtain_embedding_colnames("full"), + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -366,7 +366,7 @@ def test_pad_id_zero_padding_from_below(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=True, ) @@ -402,7 +402,7 @@ def test_pad_id_zero_padding_from_above(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -443,7 +443,7 @@ def test_pad_id_non_zero_padding_from_below(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=True, ) @@ -477,7 +477,7 @@ def test_pad_id_non_zero_padding_from_above(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -508,7 +508,7 @@ def test_pad_id_no_pad(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -536,7 +536,7 @@ def test_pad_id_cutoff(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], id=0, pad_from_below=False, ) @@ -557,7 +557,7 @@ def test_pad_history_zero_padding_no_history_from_below(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=True, @@ -587,7 +587,7 @@ def test_pad_history_zero_padding_no_history_from_above(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -619,7 +619,7 @@ def test_pad_history_zero_padding_some_history_from_below(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=True, @@ -655,7 +655,7 @@ def test_pad_history_zero_padding_some_history_from_above(test_df_no_time, emb): k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -688,7 +688,7 @@ def test_pad_history_non_zero_padding_no_history_from_below(test_df_no_time, emb k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=True, @@ -718,7 +718,7 @@ def test_pad_history_non_zero_padding_no_history_from_above(test_df_no_time, emb k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -750,7 +750,7 @@ def test_pad_history_non_zero_padding_some_history_from_below(test_df_no_time, e k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=True, @@ -780,7 +780,7 @@ def test_pad_history_non_zero_padding_some_history_from_above(test_df_no_time, e k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -814,7 +814,7 @@ def test_pad_history_just_enough_history(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -845,7 +845,7 @@ def test_pad_history_many_history_cutoff(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=False, pad_from_below=False, @@ -872,7 +872,7 @@ def test_pad_history_no_history_zero_padding_include_current_from_below( k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=True, @@ -911,7 +911,7 @@ def test_pad_history_no_history_zero_padding_include_current_from_above( k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -950,7 +950,7 @@ def test_pad_history_some_history_zero_padding_include_current_from_below( k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=True, @@ -990,7 +990,7 @@ def test_pad_history_some_history_zero_padding_include_current_from_above( k=k, zero_padding=True, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -1028,7 +1028,7 @@ def test_pad_history_no_history_non_zero_padding_include_current_from_below( k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=True, @@ -1056,7 +1056,7 @@ def test_pad_history_no_history_non_zero_padding_include_current_from_above( k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -1086,7 +1086,7 @@ def test_pad_history_some_history_non_zero_padding_include_current_from_below( k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=True, @@ -1119,7 +1119,7 @@ def test_pad_history_some_history_non_zero_padding_include_current_from_above( k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -1152,7 +1152,7 @@ def test_pad_history_just_enough_history_include_current(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -1182,7 +1182,7 @@ def test_pad_history_many_history_include_current(test_df_no_time, emb): k=k, zero_padding=False, colnames=colnames, - time_feature=["timeline_index"], + features=["timeline_index"], index=index, include_current_embedding=True, pad_from_below=False, @@ -1193,9 +1193,9 @@ def test_pad_history_many_history_include_current(test_df_no_time, emb): ) -def test_pad_by_id_k_last(test_df_no_time, emb): +def test_pad_by_id_k_last(test_df_with_datetime, emb): obj = PrepareData( - original_df=test_df_no_time, + original_df=test_df_with_datetime, embeddings=emb, id_column="id_col", label_column="label_col", @@ -1206,15 +1206,15 @@ def test_pad_by_id_k_last(test_df_no_time, emb): method="k_last", zero_padding=True, k=k, - time_feature="timeline_index", + features="timeline_index", standardise_method=None, embeddings="full", include_current_embedding=True, pad_from_below=True, ) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # timeline_index column + number of columns in emb + id col + label col + ncol = 1 + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) assert type(obj.array_padded) == np.ndarray @@ -1222,9 +1222,39 @@ def test_pad_by_id_k_last(test_df_no_time, emb): assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol) -def test_pad_by_id_max(test_df_no_time, emb): +def test_pad_by_id_k_last_additional(test_df_with_datetime, emb): obj = PrepareData( - original_df=test_df_no_time, + original_df=test_df_with_datetime, + embeddings=emb, + id_column="id_col", + label_column="label_col", + ) + k = 10 + features = ["timeline_index", "binary_var", "continuous_var"] + padded_array = obj.pad( + pad_by="id", + method="k_last", + zero_padding=True, + k=k, + features=features, + standardise_method=None, + embeddings="full", + include_current_embedding=True, + pad_from_below=True, + ) + # number of columns is: + # number of features requested + number of columns in emb + id col + label col + ncol = len(features) + emb.shape[1] + 1 + 1 + assert type(obj.df_padded) == pd.DataFrame + assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) + assert type(obj.array_padded) == np.ndarray + assert np.array_equal(padded_array, obj.array_padded) + assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol) + + +def test_pad_by_id_max(test_df_with_datetime, emb): + obj = PrepareData( + original_df=test_df_with_datetime, embeddings=emb, id_column="id_col", label_column="label_col", @@ -1233,15 +1263,15 @@ def test_pad_by_id_max(test_df_no_time, emb): pad_by="id", method="max", zero_padding=True, - time_feature="timeline_index", + features="timeline_index", standardise_method=None, embeddings="full", include_current_embedding=True, pad_from_below=True, ) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # timeline_index column + number of columns in emb + id col + label col + ncol = 1 + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame k = obj.original_df["id_col"].value_counts().max() assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) @@ -1250,9 +1280,38 @@ def test_pad_by_id_max(test_df_no_time, emb): assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol) -def test_pad_by_history_k_last(test_df_no_time, emb): +def test_pad_by_id_max_additional(test_df_with_datetime, emb): obj = PrepareData( - original_df=test_df_no_time, + original_df=test_df_with_datetime, + embeddings=emb, + id_column="id_col", + label_column="label_col", + ) + features = ["timeline_index", "binary_var", "continuous_var"] + padded_array = obj.pad( + pad_by="id", + method="max", + zero_padding=True, + features=features, + standardise_method=None, + embeddings="full", + include_current_embedding=True, + pad_from_below=True, + ) + # number of columns is: + # number of features requested + number of columns in emb + id col + label col + ncol = len(features) + emb.shape[1] + 1 + 1 + assert type(obj.df_padded) == pd.DataFrame + k = obj.original_df["id_col"].value_counts().max() + assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) + assert type(obj.array_padded) == np.ndarray + assert np.array_equal(padded_array, obj.array_padded) + assert obj.array_padded.shape == (len(obj.original_df["id_col"].unique()), k, ncol) + + +def test_pad_by_history_k_last(test_df_with_datetime, emb): + obj = PrepareData( + original_df=test_df_with_datetime, embeddings=emb, id_column="id_col", label_column="label_col", @@ -1263,15 +1322,15 @@ def test_pad_by_history_k_last(test_df_no_time, emb): method="k_last", zero_padding=True, k=k, - time_feature="timeline_index", + features="timeline_index", standardise_method=None, embeddings="full", include_current_embedding=True, pad_from_below=True, ) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # timeline_index column + number of columns in emb + id col + label col + ncol = 1 + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol) assert type(obj.array_padded) == np.ndarray @@ -1279,26 +1338,85 @@ def test_pad_by_history_k_last(test_df_no_time, emb): assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol) -def test_pad_by_history_max(test_df_no_time, emb): +def test_pad_by_history_k_last_additional(test_df_with_datetime, emb): obj = PrepareData( - original_df=test_df_no_time, + original_df=test_df_with_datetime, + embeddings=emb, + id_column="id_col", + label_column="label_col", + ) + k = 10 + features = ["timeline_index", "binary_var", "continuous_var"] + padded_array = obj.pad( + pad_by="history", + method="k_last", + zero_padding=True, + k=k, + features=features, + standardise_method=None, + embeddings="full", + include_current_embedding=True, + pad_from_below=True, + ) + # number of columns is: + # number of features requested + number of columns in emb + id col + label col + ncol = len(features) + emb.shape[1] + 1 + 1 + assert type(obj.df_padded) == pd.DataFrame + assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol) + assert type(obj.array_padded) == np.ndarray + assert np.array_equal(padded_array, obj.array_padded) + assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol) + + +def test_pad_by_history_max(test_df_with_datetime, emb): + obj = PrepareData( + original_df=test_df_with_datetime, + embeddings=emb, + id_column="id_col", + label_column="label_col", + ) + padded_array = obj.pad( + pad_by="history", + method="max", + zero_padding=True, + features="timeline_index", + standardise_method=None, + embeddings="full", + include_current_embedding=True, + pad_from_below=True, + ) + # number of columns is: + # timeline_index column + number of columns in emb + id col + label col + ncol = 1 + emb.shape[1] + 1 + 1 + assert type(obj.df_padded) == pd.DataFrame + k = obj.original_df["id_col"].value_counts().max() + assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol) + assert type(obj.array_padded) == np.ndarray + assert np.array_equal(padded_array, obj.array_padded) + assert obj.array_padded.shape == (len(obj.original_df.index), k, ncol) + + +def test_pad_by_history_max_additional(test_df_with_datetime, emb): + obj = PrepareData( + original_df=test_df_with_datetime, embeddings=emb, id_column="id_col", label_column="label_col", ) + features = ["timeline_index", "binary_var", "continuous_var"] padded_array = obj.pad( pad_by="history", method="max", zero_padding=True, - time_feature="timeline_index", + features=features, standardise_method=None, embeddings="full", include_current_embedding=True, pad_from_below=True, ) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # number of features requested + number of columns in emb + id col + label col + ncol = len(features) + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame k = obj.original_df["id_col"].value_counts().max() assert obj.df_padded.shape == (k * len(obj.original_df.index), ncol) @@ -1319,7 +1437,7 @@ def test_pad_wrong_pad_by(test_df_no_time, emb): pad_by="fake_pad_by", method="max", zero_padding=True, - time_feature="timeline_index", + features="timeline_index", standardise_method=None, embeddings="full", include_current_embedding=True, @@ -1339,7 +1457,7 @@ def test_pad_wrong_method(test_df_no_time, emb): pad_by="id", method="fake_method", zero_padding=True, - time_feature="timeline_index", + features="timeline_index", standardise_method=None, embeddings="full", include_current_embedding=True, @@ -1360,19 +1478,19 @@ def test_pad_by_id_k_last_standardise_standardise(test_df_no_time, emb): method="k_last", zero_padding=True, k=k, - time_feature="timeline_index", - standardise_method="standardise", + features="timeline_index", + standardise_method="z_score", embeddings="full", include_current_embedding=True, pad_from_below=True, ) standardise_vec = obj._standardise_pd( - vec=obj.df["timeline_index"], method="standardise" + vec=obj.df["timeline_index"], method="z_score" )["standardised_pd"] pd.testing.assert_series_equal(obj.df["timeline_index"], standardise_vec) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # number of features + number of columns in emb + id col + label col + ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) assert type(obj.array_padded) == np.ndarray @@ -1393,19 +1511,19 @@ def test_pad_by_id_k_last_standardise_normalise(test_df_no_time, emb): method="k_last", zero_padding=True, k=k, - time_feature="timeline_index", - standardise_method="normalise", + features="timeline_index", + standardise_method="sum_divide", embeddings="full", include_current_embedding=True, pad_from_below=True, ) normalise_vec = obj._standardise_pd( - vec=obj.df["timeline_index"], method="normalise" + vec=obj.df["timeline_index"], method="sum_divide" )["standardised_pd"] pd.testing.assert_series_equal(obj.df["timeline_index"], normalise_vec) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # number of features + number of columns in emb + id col + label col + ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) assert type(obj.array_padded) == np.ndarray @@ -1426,7 +1544,7 @@ def test_pad_by_id_k_last_standardise_minmax(test_df_no_time, emb): method="k_last", zero_padding=True, k=k, - time_feature="timeline_index", + features="timeline_index", standardise_method="minmax", embeddings="full", include_current_embedding=True, @@ -1437,8 +1555,8 @@ def test_pad_by_id_k_last_standardise_minmax(test_df_no_time, emb): ] pd.testing.assert_series_equal(obj.df["timeline_index"], minmax_vec) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(obj._time_feature_choices) + emb.shape[1] + 1 + 1 + # number of features + number of columns in emb + id col + label col + ncol = len(obj._feature_list) + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) assert type(obj.array_padded) == np.ndarray @@ -1454,13 +1572,13 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb): label_column="label_col", ) k = 10 - time_features = ["timeline_index", "time_encoding", "time_diff"] + features = ["timeline_index", "time_encoding", "time_diff"] # expected standardised vectors standardised_vec = obj._standardise_pd( - vec=obj.df["timeline_index"], method="standardise" + vec=obj.df["timeline_index"], method="z_score" )["standardised_pd"] normalised_vec = obj._standardise_pd( - vec=obj.df["time_encoding"], method="normalise" + vec=obj.df["time_encoding"], method="sum_divide" )["standardised_pd"] none_standardisation_vec = obj.df["time_diff"] # pad and perform standardisation @@ -1469,8 +1587,8 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb): method="k_last", zero_padding=True, k=k, - time_feature=time_features, - standardise_method=["standardise", "normalise", None], + features=features, + standardise_method=["z_score", "sum_divide", None], embeddings="full", include_current_embedding=True, pad_from_below=True, @@ -1479,8 +1597,8 @@ def test_pad_by_id_k_last_standardise_multiple(test_df_with_datetime, emb): pd.testing.assert_series_equal(obj.df["time_encoding"], normalised_vec) pd.testing.assert_series_equal(obj.df["time_diff"], none_standardisation_vec) # number of columns is: - # number of time features + number of columns in emb + id col + label col - ncol = len(time_features) + emb.shape[1] + 1 + 1 + # number of features + number of columns in emb + id col + label col + ncol = len(features) + emb.shape[1] + 1 + 1 assert type(obj.df_padded) == pd.DataFrame assert obj.df_padded.shape == (k * len(obj.original_df["id_col"].unique()), ncol) assert type(obj.array_padded) == np.ndarray