diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7114d157..9b331450 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,13 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - `Debias` mechanism for classification, ranking and auc metrics. New parameter `is_debiased` to `calc_from_confusion_df`, `calc_per_user_from_confusion_df` methods of classification metrics, `calc_from_fitted`, `calc_per_user_from_fitted` methods of auc and ranking (`MAP`) metrics, `calc_from_merged`, `calc_per_user_from_merged` methods of ranking (`NDCG`, `MRR`) metrics. ([#152](https://github.com/MobileTeleSystems/RecTools/pull/152))
 - `nbformat >= 4.2.0` dependency to `[visuals]` extra ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))
+- `filter_interactions` method of `Dataset` ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
+- `on_unsupported_targets` parameter to `recommend` and `recommend_to_items` model methods ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
 
 ### Fixed
 - `display()` method in `MetricsApp` ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))
-
-### Fixed
+- `IntraListDiversity` metric computation in `cross_validate` ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
 - Allow warp-kos loss for LightFMWrapperModel ([#175](https://github.com/MobileTeleSystems/RecTools/pull/175))
 
+### Removed
+- [Breaking] `assume_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
+
 ## [0.7.0] - 29.07.2024
 
 ### Added
diff --git a/README.md b/README.md
index dbd626a4..980d0d36 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ RecTools is on PyPI, so you can use `pip` to install it.
 ```
 pip install rectools
 ```
-The default version doesn't contain all the dependencies, because some of them are needed only for specific models. Available user extensions are the following:
+The default version doesn't contain all the dependencies, because some of them are needed only for specific functionality. Available user extensions are the following:
 
 - `lightfm`: adds wrapper for LightFM model,
 - `torch`: adds models based on neural nets,
diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py
index 8906704d..cd68433b 100644
--- a/rectools/dataset/dataset.py
+++ b/rectools/dataset/dataset.py
@@ -17,6 +17,7 @@
 import typing as tp
 
 import attr
+import numpy as np
 import pandas as pd
 from scipy import sparse
 
@@ -245,3 +246,66 @@ def get_raw_interactions(self, include_weight: bool = True, include_datetime: bo
             pd.DataFrame
         """
         return self.interactions.to_external(self.user_id_map, self.item_id_map, include_weight, include_datetime)
+
+    def filter_interactions(
+        self,
+        row_indexes_to_keep: np.ndarray,
+        keep_external_ids: bool = True,
+        keep_features_for_removed_entities: bool = True,
+    ) -> "Dataset":
+        """
+        Generate a filtered dataset that keeps only the rows of the original interactions
+        dataframe selected by `row_indexes_to_keep`.
+        The resulting dataset will get a new id mapping for both users and items.
+
+        Parameters
+        ----------
+        row_indexes_to_keep : np.ndarray
+            Row indexes of the original dataset interactions df that are to be kept
+        keep_external_ids : bool, default `True`
+            Whether to keep an external ids -> 2x internal ids mapping (default).
+            Otherwise an internal -> 2x internal ids mapping will be created.
+        keep_features_for_removed_entities : bool, default `True`
+            Whether to keep all features for users and items that are no longer hot.
+
+        Returns
+        -------
+        Dataset
+            Filtered dataset with only the selected interactions, a new ids mapping and processed features.
+        """
+        interactions_df = self.interactions.df.iloc[row_indexes_to_keep]
+
+        # 1x internal -> 2x internal
+        user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
+        item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
+        interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
+
+        def _handle_features(
+            features: tp.Optional[Features], target_id_map: IdMap, dataset_id_map: IdMap
+        ) -> tp.Tuple[tp.Optional[Features], IdMap]:
+            if features is None:
+                return None, target_id_map
+
+            if keep_features_for_removed_entities:
+                all_features_ids = np.arange(len(features))
+                target_id_map = target_id_map.add_ids(all_features_ids, raise_if_already_present=False)
+
+            needed_ids = target_id_map.get_external_sorted_by_internal()
+            features = features.take(needed_ids)
+            return features, target_id_map
+
+        user_features_new, user_id_map = _handle_features(self.user_features, user_id_map, self.user_id_map)
+        item_features_new, item_id_map = _handle_features(self.item_features, item_id_map, self.item_id_map)
+
+        if keep_external_ids:  # external -> 2x internal
+            user_id_map = IdMap(self.user_id_map.convert_to_external(user_id_map.external_ids))
+            item_id_map = IdMap(self.item_id_map.convert_to_external(item_id_map.external_ids))
+
+        filtered_dataset = Dataset(
+            user_id_map=user_id_map,
+            item_id_map=item_id_map,
+            interactions=interactions,
+            user_features=user_features_new,
+            item_features=item_features_new,
+        )
+        return filtered_dataset
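A minimal usage sketch of the new `filter_interactions` method follows; the `dataset` object and the number of kept rows are illustrative assumptions, not part of this diff:

```python
import numpy as np

# `dataset` is assumed to be an already built rectools Dataset.
# Keep only the first 1000 rows of the interactions dataframe,
# e.g. a train part produced by a splitter.
train_dataset = dataset.filter_interactions(
    row_indexes_to_keep=np.arange(1000),
    keep_external_ids=True,  # new ids map external -> 2x internal
    keep_features_for_removed_entities=True,  # removed users/items stay warm
)
```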
diff --git a/rectools/model_selection/cross_validate.py b/rectools/model_selection/cross_validate.py
index ca0c69d4..510c48b1 100644
--- a/rectools/model_selection/cross_validate.py
+++ b/rectools/model_selection/cross_validate.py
@@ -14,58 +14,16 @@
 
 import typing as tp
 
-import numpy as np
-import pandas as pd
-
 from rectools.columns import Columns
-from rectools.dataset import Dataset, Features, IdMap, Interactions
+from rectools.dataset import Dataset
 from rectools.metrics import calc_metrics
 from rectools.metrics.base import MetricAtK
-from rectools.models.base import ModelBase
+from rectools.models.base import ErrorBehaviour, ModelBase
 from rectools.types import ExternalIds
 
 from .splitter import Splitter
 
 
-def _gen_2x_internal_ids_dataset(
-    interactions_internal_df: pd.DataFrame,
-    user_features: tp.Optional[Features],
-    item_features: tp.Optional[Features],
-    prefer_warm_inference_over_cold: bool,
-) -> Dataset:
-    """
-    Make new dataset based on given interactions and features from base dataset.
-    Assume that interactions dataframe contains internal ids.
-    Returned dataset contains 2nd level of internal ids.
-    """
-    user_id_map = IdMap.from_values(interactions_internal_df[Columns.User].values)  # 1x internal -> 2x internal
-    item_id_map = IdMap.from_values(interactions_internal_df[Columns.Item].values)  # 1x internal -> 2x internal
-    interactions_train = Interactions.from_raw(interactions_internal_df, user_id_map, item_id_map)  # 2x internal
-
-    def _handle_features(features: tp.Optional[Features], id_map: IdMap) -> tp.Tuple[tp.Optional[Features], IdMap]:
-        if features is None:
-            return None, id_map
-
-        if prefer_warm_inference_over_cold:
-            all_features_ids = np.arange(len(features))  # 1x internal
-            id_map = id_map.add_ids(all_features_ids, raise_if_already_present=False)
-
-        features = features.take(id_map.get_external_sorted_by_internal())  # 2x internal
-        return features, id_map
-
-    user_features_new, user_id_map = _handle_features(user_features, user_id_map)
-    item_features_new, item_id_map = _handle_features(item_features, item_id_map)
-
-    dataset = Dataset(
-        user_id_map=user_id_map,
-        item_id_map=item_id_map,
-        interactions=interactions_train,
-        user_features=user_features_new,
-        item_features=item_features_new,
-    )
-    return dataset
-
-
 def cross_validate(  # pylint: disable=too-many-locals
     dataset: Dataset,
     splitter: Splitter,
@@ -77,6 +35,7 @@ def cross_validate(  # pylint: disable=too-many-locals
     prefer_warm_inference_over_cold: bool = True,
     ref_models: tp.Optional[tp.List[str]] = None,
     validate_ref_models: bool = False,
+    on_unsupported_targets: ErrorBehaviour = "warn",
 ) -> tp.Dict[str, tp.Any]:
     """
     Run cross validation on multiple models with multiple metrics.
@@ -113,6 +72,15 @@ def cross_validate(  # pylint: disable=too-many-locals
     validate_ref_models : bool, default False
         If True include models specified in `ref_models` to all metrics calculations
         and receive their metrics from cross-validation.
+    on_unsupported_targets : Literal["raise", "warn", "ignore"], default "warn"
+        How to handle warm/cold target users when a model doesn't support warm/cold inference.
+        Specify "warn" to filter unsupported targets out with a warning (the default in `cross_validate`).
+        Specify "ignore" to filter unsupported targets out without a warning.
+        In both cases it is highly recommended to pass the `CoveredUsers` DQ metric to catch all
+        models with insufficient recommendations for each fold.
+        Specify "raise" to raise a ValueError when unsupported targets are passed. In cross-validation
+        this may cause unexpected errors for some of the more complex models.
+
 
     Returns
     -------
@@ -132,9 +100,7 @@ def cross_validate(  # pylint: disable=too-many-locals
             ...
         ]
     }
     """
-    interactions = dataset.interactions
-
-    split_iterator = splitter.split(interactions, collect_fold_stats=True)
+    split_iterator = splitter.split(dataset.interactions, collect_fold_stats=True)
 
     split_infos = []
     metrics_all = []
@@ -142,24 +108,18 @@ def cross_validate(  # pylint: disable=too-many-locals
     for train_ids, test_ids, split_info in split_iterator:
         split_infos.append(split_info)
 
-        # ### Prepare split data
-        interactions_df_train = interactions.df.iloc[train_ids]  # 1x internal
-        # We need to avoid fitting models on sparse matrices with all zero rows/columns =>
-        # => we need to create a fold dataset which contains only hot users and items for current training
-        fold_dataset = _gen_2x_internal_ids_dataset(
-            interactions_df_train, dataset.user_features, dataset.item_features, prefer_warm_inference_over_cold
+        fold_dataset = dataset.filter_interactions(
+            row_indexes_to_keep=train_ids,
+            keep_external_ids=True,
+            keep_features_for_removed_entities=prefer_warm_inference_over_cold,
         )
+        interactions_df_test = dataset.interactions.df.loc[test_ids]
+        interactions_df_test[Columns.User] = dataset.user_id_map.convert_to_external(interactions_df_test[Columns.User])
+        interactions_df_test[Columns.Item] = dataset.item_id_map.convert_to_external(interactions_df_test[Columns.Item])
 
-        interactions_df_test = interactions.df.iloc[test_ids]  # 1x internal
-        test_users = interactions_df_test[Columns.User].unique()  # 1x internal
-        catalog = interactions_df_train[Columns.Item].unique()  # 1x internal
-
-        if items_to_recommend is not None:
-            item_ids_to_recommend = dataset.item_id_map.convert_to_internal(
-                items_to_recommend, strict=False
-            )  # 1x internal
-        else:
-            item_ids_to_recommend = None
+        test_users = interactions_df_test[Columns.User].unique()
+        prev_interactions = fold_dataset.get_raw_interactions()
+        catalog = prev_interactions[Columns.Item].unique()
 
         # ### Train ref models if any
         ref_reco = {}
@@ -171,7 +131,8 @@ def cross_validate(  # pylint: disable=too-many-locals
                 dataset=fold_dataset,
                 k=k,
                 filter_viewed=filter_viewed,
-                items_to_recommend=item_ids_to_recommend,
+                items_to_recommend=items_to_recommend,
+                on_unsupported_targets=on_unsupported_targets,
             )
 
         # ### Generate recommendations and calc metrics
@@ -183,19 +144,20 @@ def cross_validate(  # pylint: disable=too-many-locals
                 reco = ref_reco[model_name]
             else:
                 model.fit(fold_dataset)
-                reco = model.recommend(  # 1x internal
+                reco = model.recommend(
                     users=test_users,
                     dataset=fold_dataset,
                     k=k,
                     filter_viewed=filter_viewed,
-                    items_to_recommend=item_ids_to_recommend,
+                    items_to_recommend=items_to_recommend,
+                    on_unsupported_targets=on_unsupported_targets,
                 )
 
             metric_values = calc_metrics(
                 metrics,
                 reco=reco,
                 interactions=interactions_df_test,
-                prev_interactions=interactions_df_train,
+                prev_interactions=prev_interactions,
                 catalog=catalog,
                 ref_reco=ref_reco,
             )
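To show how the new parameter flows through `cross_validate`, here is a minimal usage sketch; the dataset, splitter settings, metric and model choices are illustrative assumptions:

```python
from rectools.metrics import Precision, Recall
from rectools.model_selection import LastNSplitter, cross_validate
from rectools.models import PopularModel

results = cross_validate(
    dataset=dataset,  # assumed to be prepared beforehand
    splitter=LastNSplitter(n=1, n_splits=2),
    metrics={"precision@2": Precision(k=2), "recall@1": Recall(k=1)},
    models={"popular": PopularModel()},
    k=2,
    filter_viewed=True,
    on_unsupported_targets="warn",  # filter unsupported targets out with a warning
)
```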
diff --git a/rectools/models/base.py b/rectools/models/base.py
index 44ab7014..79e528c8 100644
--- a/rectools/models/base.py
+++ b/rectools/models/base.py
@@ -15,25 +15,27 @@
 """Base model."""
 
 import typing as tp
+import warnings
 
 import numpy as np
 import pandas as pd
 
-from rectools import AnyIds, Columns, InternalIds
+from rectools import Columns, ExternalIds, InternalIds
 from rectools.dataset import Dataset
 from rectools.dataset.identifiers import IdMap
 from rectools.exceptions import NotFittedError
-from rectools.types import AnyIdsArray, InternalIdsArray
+from rectools.types import ExternalIdsArray, InternalIdsArray
 
 T = tp.TypeVar("T", bound="ModelBase")
 ScoresArray = np.ndarray
 Scores = tp.Union[tp.Sequence[float], ScoresArray]
+ErrorBehaviour = tp.Literal["ignore", "warn", "raise"]
 
 InternalRecoTriplet = tp.Tuple[InternalIds, InternalIds, Scores]
-SemiInternalRecoTriplet = tp.Tuple[AnyIds, InternalIds, Scores]
-RecoTriplet = tp.Tuple[AnyIds, AnyIds, Scores]
+SemiInternalRecoTriplet = tp.Tuple[ExternalIds, InternalIds, Scores]
+ExternalRecoTriplet = tp.Tuple[ExternalIds, ExternalIds, Scores]
 
-RecoTriplet_T = tp.TypeVar("RecoTriplet_T", InternalRecoTriplet, SemiInternalRecoTriplet, RecoTriplet)
+RecoTriplet_T = tp.TypeVar("RecoTriplet_T", InternalRecoTriplet, SemiInternalRecoTriplet, ExternalRecoTriplet)
 
 
 class ModelBase:
@@ -71,15 +73,29 @@ def fit(self: T, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> T:
     def _fit(self, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> None:
         raise NotImplementedError()
 
+    def _custom_transform_dataset_u2i(
+        self, dataset: Dataset, users: ExternalIds, on_unsupported_targets: ErrorBehaviour
+    ) -> Dataset:
+        # This method should be overridden by models that require dataset processing for u2i recommendations
+        # E.g.: interactions filtering or changing mapping of internal ids based on model-specific logic
+        return dataset
+
+    def _custom_transform_dataset_i2i(
+        self, dataset: Dataset, target_items: ExternalIds, on_unsupported_targets: ErrorBehaviour
+    ) -> Dataset:
+        # This method should be overridden by models that require dataset processing for i2i recommendations
+        # E.g.: interactions filtering or changing mapping of internal ids based on model-specific logic
+        return dataset
+
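The two hooks above are the new extension point for dataset preprocessing at inference time. A purely illustrative sketch of an override follows; the model name and the "last 10 interactions per user" rule are hypothetical, and the sketch assumes the interactions dataframe keeps its default RangeIndex:

```python
from rectools import Columns, ExternalIds
from rectools.dataset import Dataset
from rectools.models.base import ErrorBehaviour, ModelBase


class LastInteractionsModel(ModelBase):  # hypothetical model, not part of this diff
    """Keep only each user's 10 most recent interactions for u2i inference."""

    def _custom_transform_dataset_u2i(
        self, dataset: Dataset, users: ExternalIds, on_unsupported_targets: ErrorBehaviour
    ) -> Dataset:
        df = dataset.interactions.df
        # Positional row indexes of the 10 most recent interactions per user
        # (valid while the interactions dataframe has a default RangeIndex)
        row_indexes = df.sort_values(Columns.Datetime).groupby(Columns.User).tail(10).index.to_numpy()
        return dataset.filter_interactions(row_indexes, keep_external_ids=True)
```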
     def recommend(
         self,
-        users: AnyIds,
+        users: ExternalIds,
         dataset: Dataset,
         k: int,
         filter_viewed: bool,
-        items_to_recommend: tp.Optional[AnyIds] = None,
+        items_to_recommend: tp.Optional[ExternalIds] = None,
         add_rank_col: bool = True,
-        assume_external_ids: bool = True,
+        on_unsupported_targets: ErrorBehaviour = "raise",
     ) -> pd.DataFrame:
         r"""
         Recommend items for users.
@@ -89,9 +105,7 @@ def recommend(
         Parameters
         ----------
         users : array-like
-            Array of user ids to recommend for.
-            User ids are supposed to be external if `assume_external_ids` is ``True`` (default).
-            Internal otherwise.
+            Array of user ids to recommend for. User ids are supposed to be external.
         dataset : Dataset
             Dataset with input data.
             Usually it's the same dataset that was used to fit model.
@@ -105,18 +119,18 @@ def recommend(
             Whitelist of item ids.
             If given, only these items will be used for recommendations.
             Otherwise all items from dataset will be used.
-            Item ids are supposed to be external if `assume_external_ids` is `True`` (default).
-            Internal otherwise.
+            Item ids are supposed to be external.
         add_rank_col : bool, default True
             Whether to add rank column to recommendations.
             If True column `Columns.Rank` will be added.
             This column contain integers from 1 to ``number of user recommendations``.
             In any case recommendations are sorted per rank for every user.
             The lesser the rank the more recommendation is relevant.
-        assume_external_ids : bool, default True
-            When ``True`` all input user and item ids are supposed to be external.
-            Ids in returning recommendations table will be external as well.
-            Internal otherwise. Works faster with ``False``.
+        on_unsupported_targets : Literal["raise", "warn", "ignore"], default "raise"
+            How to handle warm/cold target users when the model doesn't support warm/cold inference.
+ Specify "raise" to raise ValueError in case unsupported targets are passed (default). + Specify "ignore" to filter unsupported targets. + Specify "warn" to filter with warning. Returns ------- @@ -135,24 +149,25 @@ def recommend( TypeError, ValueError If arguments have inappropriate type or value ValueError - If some of given users are warm/cold and model doesn't support such type of users. + If some of given users are warm/cold and model doesn't support such type of users and + `on_unsupported_targets` is set to "raise". """ self._check_is_fitted() self._check_k(k) - sorted_item_ids_to_recommend = self._get_sorted_item_ids_to_recommend( - items_to_recommend, dataset, assume_external_ids - ) + dataset = self._custom_transform_dataset_u2i(dataset, users, on_unsupported_targets) - # Here for hot and warm we get internal ids, for cold we keep given ids + sorted_item_ids_to_recommend = self._get_sorted_item_ids_to_recommend(items_to_recommend, dataset) + + # Here for hot and warm we get internal ids, for cold we keep external ids hot_user_ids, warm_user_ids, cold_user_ids = self._split_targets_by_hot_warm_cold( users, - dataset.user_id_map, - dataset.n_hot_users, - assume_external_ids, + dataset, "user", ) - self._check_targets_are_valid(hot_user_ids, warm_user_ids, cold_user_ids, "user") + hot_user_ids, warm_user_ids, cold_user_ids = self._check_targets_are_valid( + hot_user_ids, warm_user_ids, cold_user_ids, "user", on_unsupported_targets + ) reco_hot = self._init_internal_reco_triplet() reco_warm = self._init_internal_reco_triplet() @@ -175,12 +190,10 @@ def recommend( reco_warm = self._adjust_reco_types(reco_warm) reco_cold = self._adjust_reco_types(reco_cold, target_type=dataset.user_id_map.external_dtype) - if assume_external_ids: - reco_hot_final = self._reco_to_external(reco_hot, dataset.user_id_map, dataset.item_id_map) - reco_warm_final = self._reco_to_external(reco_warm, dataset.user_id_map, dataset.item_id_map) - reco_cold_final = self._reco_items_to_external(reco_cold, dataset.item_id_map) - else: - reco_hot_final, reco_warm_final, reco_cold_final = reco_hot, reco_warm, reco_cold + reco_hot_final = self._reco_to_external(reco_hot, dataset.user_id_map, dataset.item_id_map) + reco_warm_final = self._reco_to_external(reco_warm, dataset.user_id_map, dataset.item_id_map) + reco_cold_final = self._reco_items_to_external(reco_cold, dataset.item_id_map) + del reco_hot, reco_warm, reco_cold reco_all = self._concat_reco((reco_hot_final, reco_warm_final, reco_cold_final)) @@ -190,13 +203,13 @@ def recommend( def recommend_to_items( # pylint: disable=too-many-branches self, - target_items: AnyIds, + target_items: ExternalIds, dataset: Dataset, k: int, filter_itself: bool = True, - items_to_recommend: tp.Optional[AnyIds] = None, + items_to_recommend: tp.Optional[ExternalIds] = None, add_rank_col: bool = True, - assume_external_ids: bool = True, + on_unsupported_targets: ErrorBehaviour = "raise", ) -> pd.DataFrame: """ Recommend items for target items. @@ -206,9 +219,7 @@ def recommend_to_items( # pylint: disable=too-many-branches Parameters ---------- target_items : array-like - Array of item ids to recommend for. - Item ids are supposed to be external if `assume_external_ids` is `True`` (default). - Internal otherwise. + Array of item ids to recommend for. Item ids are supposed to be external. dataset : Dataset Dataset with input data. Usually it's the same dataset that was used to fit model. 
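For illustration, a minimal sketch of the reworked `recommend` call with the new error-handling behaviour; the model choice and user ids are assumptions, not part of this diff:

```python
from rectools.models import PopularModel

model = PopularModel().fit(dataset)  # `dataset` is assumed to exist
reco = model.recommend(
    users=["u1", "u2", "brand_new_user"],  # external ids; the last one may be cold
    dataset=dataset,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn",  # filter unsupported targets out with a warning
)
```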
@@ -221,18 +232,18 @@ def recommend_to_items(  # pylint: disable=too-many-branches
             Whitelist of item ids.
             If given, only these items will be used for recommendations.
             Otherwise all items from dataset will be used.
-            Item ids are supposed to be external if `assume_external_ids` is `True`` (default).
-            Internal otherwise.
+            Item ids are supposed to be external.
         add_rank_col : bool, default True
             Whether to add rank column to recommendations.
             If True column `Columns.Rank` will be added.
             This column contain integers from 1 to ``number of item recommendations``.
             In any case recommendations are sorted per rank for every target item.
             Less rank means more relevant recommendation.
-        assume_external_ids : bool, default True
-            When ``True`` all input item ids are supposed to be external.
-            Ids in returning recommendations table will be external as well.
-            Internal otherwise. Works faster with ``False``.
+        on_unsupported_targets : Literal["raise", "warn", "ignore"], default "raise"
+            How to handle warm/cold target items when the model doesn't support warm/cold inference.
+            Specify "raise" to raise ValueError in case unsupported targets are passed (default).
+            Specify "ignore" to filter unsupported targets.
+            Specify "warn" to filter with warning.
 
         Returns
         -------
@@ -250,25 +261,26 @@ def recommend_to_items(  # pylint: disable=too-many-branches
             If called for not fitted model.
         TypeError, ValueError
             If arguments have inappropriate type or value
-        KeyError
-            If some of given target items are not in `dataset.item_id_map`
+        ValueError
+            If some of the given target items are warm/cold and the model doesn't support such type of items and
+            `on_unsupported_targets` is set to "raise".
         """
         self._check_is_fitted()
         self._check_k(k)
 
-        sorted_item_ids_to_recommend = self._get_sorted_item_ids_to_recommend(
-            items_to_recommend, dataset, assume_external_ids
-        )
+        dataset = self._custom_transform_dataset_i2i(dataset, target_items, on_unsupported_targets)
+
+        sorted_item_ids_to_recommend = self._get_sorted_item_ids_to_recommend(items_to_recommend, dataset)
 
-        # Here for hot and warm we get internal ids, for cold we keep given ids
+        # Here for hot and warm we get internal ids, for cold we keep external ids
         hot_target_ids, warm_target_ids, cold_target_ids = self._split_targets_by_hot_warm_cold(
             target_items,
-            dataset.item_id_map,
-            dataset.n_hot_items,
-            assume_external_ids,
+            dataset,
             "item",
         )
-        self._check_targets_are_valid(hot_target_ids, warm_target_ids, cold_target_ids, "item")
+        hot_target_ids, warm_target_ids, cold_target_ids = self._check_targets_are_valid(
+            hot_target_ids, warm_target_ids, cold_target_ids, "item", on_unsupported_targets
+        )
 
         requested_k = k + 1 if filter_itself else k
 
@@ -301,12 +313,9 @@ def recommend_to_items(  # pylint: disable=too-many-branches
             reco_warm = self._filter_item_itself_from_i2i_reco(reco_warm, k)
         # We don't filter cold reco since we never recommend cold items
 
-        if assume_external_ids:
-            reco_hot_final = self._reco_to_external(reco_hot, dataset.item_id_map, dataset.item_id_map)
-            reco_warm_final = self._reco_to_external(reco_warm, dataset.item_id_map, dataset.item_id_map)
-            reco_cold_final = self._reco_items_to_external(reco_cold, dataset.item_id_map)
-        else:
-            reco_hot_final, reco_warm_final, reco_cold_final = reco_hot, reco_warm, reco_cold
+        reco_hot_final = self._reco_to_external(reco_hot, dataset.item_id_map, dataset.item_id_map)
+        reco_warm_final = self._reco_to_external(reco_warm, dataset.item_id_map, dataset.item_id_map)
+        reco_cold_final = self._reco_items_to_external(reco_cold,
dataset.item_id_map) del reco_hot, reco_warm, reco_cold reco_all = self._concat_reco((reco_hot_final, reco_warm_final, reco_cold_final)) @@ -333,42 +342,35 @@ def _init_internal_reco_triplet(cls) -> InternalRecoTriplet: @classmethod def _get_sorted_item_ids_to_recommend( - cls, items_to_recommend: tp.Optional[AnyIds], dataset: Dataset, assume_external_ids: bool + cls, items_to_recommend: tp.Optional[ExternalIds], dataset: Dataset ) -> tp.Optional[InternalIdsArray]: if items_to_recommend is None: return None - if assume_external_ids: - item_ids_to_recommend = dataset.item_id_map.convert_to_internal(items_to_recommend, strict=False) - else: - item_ids_to_recommend = cls._ensure_internal_ids_valid(items_to_recommend) + internal_ids_to_recommend = dataset.item_id_map.convert_to_internal(items_to_recommend, strict=False) - sorted_item_ids_to_recommend = np.unique(item_ids_to_recommend) - return sorted_item_ids_to_recommend + return np.unique(internal_ids_to_recommend) @classmethod def _split_targets_by_hot_warm_cold( cls, - targets: AnyIds, # users for U2I or target items for I2I - id_map: IdMap, - n_hot: int, - assume_external_ids: bool, + targets: ExternalIds, # users for U2I or target items for I2I + dataset: Dataset, entity: tp.Literal["user", "item"], - ) -> tp.Tuple[InternalIdsArray, InternalIdsArray, AnyIdsArray]: - if assume_external_ids: - known_ids, cold_ids = id_map.convert_to_internal(targets, strict=False, return_missing=True) - try: - cold_ids = cold_ids.astype(id_map.external_dtype) - except ValueError: - raise TypeError( - f"Given {entity} ids must be convertible to the " - f"{entity}_id` type in dataset ({id_map.external_dtype})" - ) + ) -> tp.Tuple[InternalIdsArray, InternalIdsArray, ExternalIdsArray]: + if entity == "user": + id_map, n_hot = dataset.user_id_map, dataset.n_hot_users else: - target_ids = cls._ensure_internal_ids_valid(targets) - known_mask = target_ids < id_map.size - known_ids = target_ids[known_mask] - cold_ids = target_ids[~known_mask] + id_map, n_hot = dataset.item_id_map, dataset.n_hot_items + + known_ids, cold_ids = id_map.convert_to_internal(targets, strict=False, return_missing=True) + try: + cold_ids = cold_ids.astype(id_map.external_dtype) + except ValueError: + raise TypeError( + f"Given {entity} ids must be convertible to the " + f"{entity}_id` type in dataset ({id_map.external_dtype})" + ) hot_mask = known_ids < n_hot hot_ids = known_ids[hot_mask] @@ -380,29 +382,32 @@ def _check_targets_are_valid( cls, hot_targets: InternalIdsArray, warm_targets: InternalIdsArray, - cold_targets: AnyIdsArray, + cold_targets: ExternalIdsArray, entity: tp.Literal["user", "item"], - ) -> None: + on_unsupported_targets: ErrorBehaviour, + ) -> tp.Tuple[InternalIdsArray, InternalIdsArray, ExternalIdsArray]: if warm_targets.size > 0 and not cls.recommends_for_warm and not cls.recommends_for_cold: - raise ValueError( - f"Model `{cls}` doesn't support recommendations for warm and cold {entity}s, " - f"but some of given {entity}s are warm: they are not in the interactions" - ) + explanation = f""" + Model `{cls}` doesn't support recommendations for warm and cold {entity}s, + but some of given {entity}s are warm: they are not in the interactions + """ + if on_unsupported_targets == "warn": + warnings.warn(explanation) + elif on_unsupported_targets == "raise": + raise ValueError(explanation) + warm_targets = np.asarray([]) if cold_targets.size > 0 and not cls.recommends_for_cold: - raise ValueError( - f"Model `{cls}` doesn't support recommendations for cold {entity}s, " - 
f"but some of given {entity}s are cold: they are not in the `dataset.{entity}_id_map`" - ) - - @classmethod - def _ensure_internal_ids_valid(cls, internal_ids: AnyIds) -> InternalIdsArray: - ids = np.asarray(internal_ids) - if not np.issubdtype(ids.dtype, np.integer): - raise TypeError("Internal ids are always integer") - if ids.min() < 0: - raise ValueError("Internal ids should be non-negative integers") - return ids + explanation = f""" + Model `{cls}` doesn't support recommendations for cold {entity}s, + but some of given {entity}s are cold: they are not in the `dataset.{entity}_id_map` + """ + if on_unsupported_targets == "warn": + warnings.warn(explanation) + elif on_unsupported_targets == "raise": + raise ValueError(explanation) + cold_targets = np.asarray([]) + return hot_targets, warm_targets, cold_targets @classmethod def _adjust_reco_types(cls, reco: RecoTriplet_T, target_type: tp.Type = np.int64) -> RecoTriplet_T: @@ -424,27 +429,29 @@ def _filter_item_itself_from_i2i_reco(cls, reco: RecoTriplet_T, k: int) -> RecoT return df_reco["tid"].values, df_reco["iid"].values, df_reco["score"].values @classmethod - def _reco_to_external(cls, reco: InternalRecoTriplet, target_id_map: IdMap, item_id_map: IdMap) -> RecoTriplet: + def _reco_to_external( + cls, reco: InternalRecoTriplet, target_id_map: IdMap, item_id_map: IdMap + ) -> ExternalRecoTriplet: target_ids, item_ids, scores = reco target_ids = target_id_map.convert_to_external(target_ids) item_ids = item_id_map.convert_to_external(item_ids) return target_ids, item_ids, scores @classmethod - def _reco_items_to_external(cls, reco: SemiInternalRecoTriplet, item_id_map: IdMap) -> RecoTriplet: + def _reco_items_to_external(cls, reco: SemiInternalRecoTriplet, item_id_map: IdMap) -> ExternalRecoTriplet: target_ids, item_ids, scores = reco item_ids = item_id_map.convert_to_external(item_ids) return target_ids, item_ids, scores @classmethod - def _concat_reco(cls, parts: tp.Sequence[RecoTriplet]) -> RecoTriplet: + def _concat_reco(cls, parts: tp.Sequence[RecoTriplet_T]) -> RecoTriplet_T: targets = np.concatenate([part[0] for part in parts]) items = np.concatenate([part[1] for part in parts]) scores = np.concatenate([part[2] for part in parts]) return targets, items, scores @classmethod - def _make_reco_table(cls, reco: RecoTriplet, target_col: str, add_rank_col: bool) -> pd.DataFrame: + def _make_reco_table(cls, reco: ExternalRecoTriplet, target_col: str, add_rank_col: bool) -> pd.DataFrame: target_ids, item_ids, scores = reco df = pd.DataFrame( { @@ -461,7 +468,7 @@ def _make_reco_table(cls, reco: RecoTriplet, target_col: str, add_rank_col: bool def _recommend_cold( self, - target_ids: AnyIdsArray, + target_ids: ExternalIdsArray, dataset: Dataset, k: int, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray], @@ -515,7 +522,7 @@ class FixedColdRecoModelMixin: def _recommend_cold( self, - target_ids: AnyIdsArray, + target_ids: ExternalIdsArray, dataset: Dataset, k: int, sorted_item_ids_to_recommend: tp.Optional[InternalIdsArray], diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index fbf9e62a..e9c9dc48 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -17,6 +17,7 @@ import typing as tp from datetime import datetime +import numpy as np import pandas as pd import pytest from scipy import sparse @@ -284,3 +285,145 @@ def test_get_raw_interactions(self, include_weight: bool, include_datetime: bool if not include_datetime: expected.drop(columns=Columns.Datetime, inplace=True) 
pd.testing.assert_frame_equal(actual, expected) + + @pytest.fixture + def dataset_to_filter(self) -> Dataset: + item_id_map = IdMap.from_values([10, 20, 30, 40, 50]) + user_id_map = IdMap.from_values([10, 11, 12, 13, 14]) + df = pd.DataFrame( + [ + [0, 0, 1, "2021-09-01"], + [4, 2, 1, "2021-09-02"], + [2, 1, 1, "2021-09-02"], + [2, 2, 1, "2021-09-03"], + [3, 2, 1, "2021-09-03"], + [3, 3, 1, "2021-09-03"], + [3, 4, 1, "2021-09-04"], + [1, 2, 1, "2021-09-04"], + [3, 1, 1, "2021-09-05"], + [4, 2, 1, "2021-09-05"], + [3, 3, 1, "2021-09-06"], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ).astype({Columns.Datetime: "datetime64[ns]"}) + interactions = Interactions(df) + return Dataset(user_id_map, item_id_map, interactions) + + @pytest.fixture + def dataset_with_features_to_filter(self, dataset_to_filter: Dataset) -> Dataset: + user_features = DenseFeatures( + values=np.array([[1, 10], [2, 20], [3, 30], [4, 40], [5, 50]]), + names=("f1", "f2"), + ) + item_features = SparseFeatures( + values=sparse.csr_matrix( + [ + [3.2, 0, 1], + [2.4, 2, 0], + [0.0, 0, 1], + [1.0, 5, 1], + [2.0, 1, 1], + ], + ), + names=(("f1", None), ("f2", 100), ("f2", 200)), + ) + return Dataset( + dataset_to_filter.user_id_map, + dataset_to_filter.item_id_map, + dataset_to_filter.interactions, + user_features, + item_features, + ) + + @pytest.mark.parametrize("keep_features_for_removed_entities", (True, False)) + @pytest.mark.parametrize( + "keep_external_ids, expected_external_item_ids, expected_external_user_ids", + ((True, np.array([10, 30, 20]), np.array([10, 14, 12])), (False, np.array([0, 2, 1]), np.array([0, 4, 2]))), + ) + def test_filter_dataset_interactions_df_rows_without_features( + self, + dataset_to_filter: Dataset, + keep_features_for_removed_entities: bool, + keep_external_ids: bool, + expected_external_item_ids: np.ndarray, + expected_external_user_ids: np.ndarray, + ) -> None: + rows_to_keep = np.arange(4) + filtered_dataset = dataset_to_filter.filter_interactions( + rows_to_keep, + keep_external_ids=keep_external_ids, + keep_features_for_removed_entities=keep_features_for_removed_entities, + ) + expected_interactions_2x_internal_df = pd.DataFrame( + [ + [0, 0, 1, "2021-09-01"], + [1, 1, 1, "2021-09-02"], + [2, 2, 1, "2021-09-02"], + [2, 1, 1, "2021-09-03"], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) + np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids) + np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids) + pd.testing.assert_frame_equal(filtered_dataset.interactions.df, expected_interactions_2x_internal_df) + assert filtered_dataset.user_features is None + assert filtered_dataset.item_features is None + + @pytest.mark.parametrize( + "keep_external_ids, keep_features_for_removed_entities, expected_external_item_ids, expected_external_user_ids", + ( + (True, False, np.array([10, 30, 20]), np.array([10, 14, 12])), + (False, False, np.array([0, 2, 1]), np.array([0, 4, 2])), + (True, True, np.array([10, 30, 20, 40, 50]), np.array([10, 14, 12, 11, 13])), + (False, True, np.array([0, 2, 1, 3, 4]), np.array([0, 4, 2, 1, 3])), + ), + ) + def test_filter_dataset_interactions_df_rows_with_features( + self, + dataset_with_features_to_filter: Dataset, + keep_features_for_removed_entities: bool, + keep_external_ids: bool, + expected_external_item_ids: np.ndarray, + expected_external_user_ids: 
np.ndarray, + ) -> None: + rows_to_keep = np.arange(4) + filtered_dataset = dataset_with_features_to_filter.filter_interactions( + rows_to_keep, + keep_external_ids=keep_external_ids, + keep_features_for_removed_entities=keep_features_for_removed_entities, + ) + expected_interactions_2x_internal_df = pd.DataFrame( + [ + [0, 0, 1, "2021-09-01"], + [1, 1, 1, "2021-09-02"], + [2, 2, 1, "2021-09-02"], + [2, 1, 1, "2021-09-03"], + ], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) + np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids) + np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids) + pd.testing.assert_frame_equal(filtered_dataset.interactions.df, expected_interactions_2x_internal_df) + + # Check features + old_user_features = dataset_with_features_to_filter.user_features + old_item_features = dataset_with_features_to_filter.item_features + new_user_features = filtered_dataset.user_features + new_item_features = filtered_dataset.item_features + assert new_user_features is not None and new_item_features is not None # for mypy + assert old_user_features is not None and old_item_features is not None # for mypy + + kept_internal_user_ids = ( + dataset_with_features_to_filter.user_id_map.convert_to_internal(expected_external_user_ids) + if keep_external_ids + else expected_external_user_ids + ) + kept_internal_item_ids = ( + dataset_with_features_to_filter.item_id_map.convert_to_internal(expected_external_item_ids) + if keep_external_ids + else expected_external_item_ids + ) + np.testing.assert_equal(new_user_features.values, old_user_features.values[kept_internal_user_ids]) + assert new_user_features.names == old_user_features.names + assert_sparse_matrix_equal(new_item_features.values, old_item_features.values[kept_internal_item_ids]) + assert new_item_features.names == old_item_features.names diff --git a/tests/model_selection/test_cross_validate.py b/tests/model_selection/test_cross_validate.py index d5d9dd87..f00eb084 100644 --- a/tests/model_selection/test_cross_validate.py +++ b/tests/model_selection/test_cross_validate.py @@ -16,102 +16,21 @@ import typing as tp -import numpy as np import pandas as pd import pytest from implicit.als import AlternatingLeastSquares -from scipy import sparse from rectools import Columns, ExternalIds -from rectools.dataset import Dataset, DenseFeatures, SparseFeatures +from rectools.dataset import Dataset from rectools.metrics import Intersection, Precision, Recall from rectools.metrics.base import MetricAtK from rectools.model_selection import LastNSplitter, cross_validate -from rectools.model_selection.cross_validate import _gen_2x_internal_ids_dataset -from rectools.models import ImplicitALSWrapperModel, PopularInCategoryModel, PopularModel, RandomModel +from rectools.models import ImplicitALSWrapperModel, PopularModel, RandomModel from rectools.models.base import ModelBase -from tests.testing_utils import assert_sparse_matrix_equal a = pytest.approx -class TestGen2xInternalIdsDataset: - def setup_method(self) -> None: - self.interactions_internal_df = pd.DataFrame( - [ - [0, 0, 1, 101], - [0, 1, 1, 102], - [0, 0, 1, 103], - [3, 0, 1, 101], - [3, 2, 1, 102], - ], - columns=Columns.Interactions, - ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) - - self.expected_interactions_2x_internal_df = pd.DataFrame( - [ - [0, 0, 1, 101], - [0, 1, 1, 102], - [0, 0, 
1, 103], - [1, 0, 1, 101], - [1, 2, 1, 102], - ], - columns=Columns.Interactions, - ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) - - @pytest.mark.parametrize("prefer_warm_inference_over_cold", (True, False)) - def test_without_features(self, prefer_warm_inference_over_cold: bool) -> None: - dataset = _gen_2x_internal_ids_dataset( - self.interactions_internal_df, None, None, prefer_warm_inference_over_cold - ) - - np.testing.assert_equal(dataset.user_id_map.external_ids, np.array([0, 3])) - np.testing.assert_equal(dataset.item_id_map.external_ids, np.array([0, 1, 2])) - pd.testing.assert_frame_equal(dataset.interactions.df, self.expected_interactions_2x_internal_df) - assert dataset.user_features is None - assert dataset.item_features is None - - @pytest.mark.parametrize( - "prefer_warm_inference_over_cold, expected_user_ids, expected_item_ids", - ( - (False, [0, 3], [0, 1, 2]), - (True, [0, 3, 1, 2], [0, 1, 2, 3]), - ), - ) - def test_with_features( - self, prefer_warm_inference_over_cold: bool, expected_user_ids: tp.List[int], expected_item_ids: tp.List[int] - ) -> None: - user_features = DenseFeatures( - values=np.array([[1, 10], [2, 20], [3, 30], [4, 40]]), - names=("f1", "f2"), - ) - item_features = SparseFeatures( - values=sparse.csr_matrix( - [ - [3.2, 0, 1], - [2.4, 2, 0], - [0.0, 0, 1], - [1.0, 5, 1], - ], - ), - names=(("f1", None), ("f2", 100), ("f2", 200)), - ) - - dataset = _gen_2x_internal_ids_dataset( - self.interactions_internal_df, user_features, item_features, prefer_warm_inference_over_cold - ) - - np.testing.assert_equal(dataset.user_id_map.external_ids, np.array(expected_user_ids)) - np.testing.assert_equal(dataset.item_id_map.external_ids, np.array(expected_item_ids)) - pd.testing.assert_frame_equal(dataset.interactions.df, self.expected_interactions_2x_internal_df) - - assert dataset.user_features is not None and dataset.item_features is not None # for mypy - np.testing.assert_equal(dataset.user_features.values, user_features.values[expected_user_ids]) - assert dataset.user_features.names == user_features.names - assert_sparse_matrix_equal(dataset.item_features.values, item_features.values[expected_item_ids]) - assert dataset.item_features.names == item_features.names - - class TestCrossValidate: def setup_method(self) -> None: interactions_df = pd.DataFrame( @@ -146,7 +65,6 @@ def setup_method(self) -> None: [14, "f2", 1], [11, "f1", "y"], [11, "f2", 2], - [12, "f1", "y"], ], columns=["id", "feature", "value"], ) @@ -248,7 +166,6 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) - models: tp.Dict[str, ModelBase] = { "als": ImplicitALSWrapperModel(AlternatingLeastSquares(factors=2, iterations=2, random_state=42)), - "pop_in_cat": PopularInCategoryModel(category_feature="f1", n_categories=2), } actual = cross_validate( @@ -284,9 +201,7 @@ def test_happy_path_with_features(self, prefer_warm_inference_over_cold: bool) - ], "metrics": [ {"model": "als", "i_split": 0, "precision@2": 0.5, "recall@1": 0.0}, - {"model": "pop_in_cat", "i_split": 0, "precision@2": 0.5, "recall@1": 0.5}, - {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.0}, - {"model": "pop_in_cat", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25}, + {"model": "als", "i_split": 1, "precision@2": 0.375, "recall@1": 0.25}, ], } diff --git a/tests/models/test_base.py b/tests/models/test_base.py index 06e5071b..9dedaf3d 100644 --- a/tests/models/test_base.py +++ b/tests/models/test_base.py @@ -15,23 +15,24 @@ # pylint: 
disable=attribute-defined-outside-init import typing as tp +import warnings import numpy as np import pandas as pd import pytest -from pytest_mock import MockerFixture from rectools import Columns from rectools.dataset import Dataset from rectools.exceptions import NotFittedError from rectools.models.base import ( + ErrorBehaviour, FixedColdRecoModelMixin, InternalRecoTriplet, ModelBase, Scores, SemiInternalRecoTriplet, ) -from rectools.types import AnyIds, ExternalIds, InternalIds +from rectools.types import ExternalIds, InternalIds from .data import DATASET, INTERACTIONS @@ -82,135 +83,6 @@ def test_raise_when_k_is_not_positive_i2i(k: int) -> None: ) -class TestRecommendWithInternalIds: - def setup_method(self) -> None: - class SomeModel(ModelBase): - def _fit(self, dataset: Dataset, *args: tp.Any, **kwargs: tp.Any) -> None: - pass - - def _recommend_u2i( - self, - user_ids: np.ndarray, - dataset: Dataset, - k: int, - filter_viewed: bool, - sorted_item_ids_to_recommend: tp.Optional[np.ndarray], - ) -> tp.Tuple[InternalIds, InternalIds, Scores]: - return [0, 0, 1], [0, 1, 2], [0.1, 0.2, 0.3] - - def _recommend_i2i( - self, - target_ids: np.ndarray, - dataset: Dataset, - k: int, - sorted_item_ids_to_recommend: tp.Optional[np.ndarray], - ) -> tp.Tuple[InternalIds, InternalIds, Scores]: - return [0, 0, 1], [0, 1, 2], [0.1, 0.2, 0.3] - - self.model = SomeModel().fit(DATASET) - - def test_u2i_success(self, mocker: MockerFixture) -> None: - model = self.model - users = [0, 1] - items_to_recommend = [0, 1, 2] - - spy = mocker.spy(model, "_recommend_u2i") - reco = model.recommend( - users=users, - dataset=DATASET, - k=2, - filter_viewed=False, - items_to_recommend=items_to_recommend, - assume_external_ids=False, - add_rank_col=False, - ) - - args, _ = spy.call_args # args and kwargs properties are unavailable in Python < 3.8 - assert list(args[0]) == users - assert list(args[4]) == items_to_recommend - - excepted = pd.DataFrame( - { - Columns.User: [0, 0, 1], - Columns.Item: [0, 1, 2], - Columns.Score: [0.1, 0.2, 0.3], - } - ) - pd.testing.assert_frame_equal(reco, excepted.astype({Columns.Score: np.float32})) - - @pytest.mark.parametrize( - "users, items_to_recommend, error_type", - ( - (["u1", "u2"], [0, 1], TypeError), - ([0, 1], ["i1", "i2"], TypeError), - (["u1", "u2"], ["i1", "i2"], TypeError), - ([0, 1], [-1, 1], ValueError), - ([-1, 1], [0, 1], ValueError), - ), - ) - def test_u2i_with_incorrect_ids(self, users: AnyIds, items_to_recommend: AnyIds, error_type: tp.Type) -> None: - with pytest.raises(error_type): - self.model.recommend( - users=users, - dataset=DATASET, - k=2, - filter_viewed=False, - items_to_recommend=items_to_recommend, - assume_external_ids=False, - ) - - def test_i2i_success(self, mocker: MockerFixture) -> None: - model = self.model - target_items = [0, 1, 2] - items_to_recommend = [0, 1, 2] - - spy = mocker.spy(model, "_recommend_i2i") - reco = model.recommend_to_items( - target_items=target_items, - dataset=DATASET, - k=2, - items_to_recommend=items_to_recommend, - assume_external_ids=False, - add_rank_col=False, - filter_itself=False, - ) - - args, _ = spy.call_args # args and kwargs properties are unavailable in Python < 3.8 - assert list(args[0]) == target_items - assert list(args[3]) == items_to_recommend - - excepted = pd.DataFrame( - { - Columns.TargetItem: [0, 0, 1], - Columns.Item: [0, 1, 2], - Columns.Score: [0.1, 0.2, 0.3], - } - ) - pd.testing.assert_frame_equal(reco, excepted.astype({Columns.Score: np.float32})) - - @pytest.mark.parametrize( - 
"target_items, items_to_recommend, error_type", - ( - (["i1", "i2"], [0, 1], TypeError), - ([0, 1], ["i1", "i2"], TypeError), - (["i1", "i2"], ["i1", "i2"], TypeError), - ([0, 1], [-1, 1], ValueError), - ([-1, 1], [0, 1], ValueError), - ), - ) - def test_i2i_with_incorrect_ids( - self, target_items: AnyIds, items_to_recommend: AnyIds, error_type: tp.Type - ) -> None: - with pytest.raises(error_type): - self.model.recommend_to_items( - target_items=target_items, - dataset=DATASET, - k=2, - items_to_recommend=items_to_recommend, - assume_external_ids=False, - ) - - class TestHotWarmCold: def setup_method(self) -> None: class HotModel(ModelBase): @@ -331,7 +203,14 @@ class HotWarmColdModel(HotWarmModel, HotColdModel): self.warms = {"u2i": [50], "i2i": [16]} self.colds = {"u2i": [60], "i2i": [18]} - def _get_reco(self, targets: ExternalIds, model_key: str, dataset_key: str, kind: str) -> pd.DataFrame: + def _get_reco( + self, + targets: ExternalIds, + model_key: str, + dataset_key: str, + kind: str, + on_unsupported_targets: ErrorBehaviour = "raise", + ) -> pd.DataFrame: model = self.models[model_key] if kind == "u2i": reco = model.recommend( @@ -340,6 +219,7 @@ def _get_reco(self, targets: ExternalIds, model_key: str, dataset_key: str, kind k=2, filter_viewed=False, add_rank_col=False, + on_unsupported_targets=on_unsupported_targets, ) reco.rename(columns={Columns.User: "target"}, inplace=True) elif kind == "i2i": @@ -349,6 +229,7 @@ def _get_reco(self, targets: ExternalIds, model_key: str, dataset_key: str, kind k=2, add_rank_col=False, filter_itself=False, + on_unsupported_targets=on_unsupported_targets, ) reco.rename(columns={Columns.TargetItem: "target"}, inplace=True) else: @@ -461,16 +342,81 @@ def test_full_model_works_for_all_without_features(self, kind: str) -> None: @pytest.mark.parametrize("dataset_key", ("no_features", "with_features")) @pytest.mark.parametrize("kind", ("u2i", "i2i")) @pytest.mark.parametrize("model_key", ("hot", "hot_warm")) - def test_not_cold_models_raise_on_cold(self, dataset_key: str, kind: str, model_key: str) -> None: - targets = self.colds[kind] + def test_not_cold_models_with_cold_targets_raise(self, dataset_key: str, kind: str, model_key: str) -> None: + targets = self.colds[kind] + self.hots[kind] with pytest.raises(ValueError, match="doesn't support recommendations for cold"): - self._get_reco(targets, model_key, dataset_key, kind) + self._get_reco(targets, model_key, dataset_key, kind, on_unsupported_targets="raise") + @pytest.mark.parametrize("dataset_key", ("no_features", "with_features")) @pytest.mark.parametrize("kind", ("u2i", "i2i")) - def test_warm_only_model_raises_on_warm_without_features(self, kind: str) -> None: - targets = self.warms[kind] + @pytest.mark.parametrize("model_key", ("hot", "hot_warm")) + def test_not_cold_models_with_cold_targets_ignore(self, dataset_key: str, kind: str, model_key: str) -> None: + targets = self.colds[kind] + self.hots[kind] + actual = self._get_reco(targets, model_key, dataset_key, kind, on_unsupported_targets="ignore") + expected_targets = self.hots[kind] + expected = self._get_reco(expected_targets, model_key, dataset_key, kind) + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("dataset_key", ("no_features", "with_features")) + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + @pytest.mark.parametrize("model_key", ("hot", "hot_warm")) + def test_not_cold_models_with_cold_targets_warn(self, dataset_key: str, kind: str, model_key: str) -> None: + targets = self.colds[kind] + 
self.hots[kind] + with warnings.catch_warnings(record=True) as w: + self._get_reco(targets, model_key, dataset_key, kind, on_unsupported_targets="warn") + assert len(w) == 1 + for phrase in ("support", "cold"): + assert phrase in str(w[-1].message) + assert "warm" not in str(w[-1].message) + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_warm_only_model_with_warm_targets_without_features_raise(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] with pytest.raises(ValueError, match="doesn't support recommendations for cold"): - self._get_reco(targets, "hot_warm", "no_features", kind) + self._get_reco(targets, "hot_warm", "no_features", kind, on_unsupported_targets="raise") + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_warm_only_model_with_warm_targets_without_features_ignore(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] + + # ignore + actual = self._get_reco(targets, "hot_warm", "no_features", kind, on_unsupported_targets="ignore") + expected_targets = self.hots[kind] + expected = self._get_reco(expected_targets, "hot_warm", "no_features", kind) + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_warm_only_model_with_warm_targets_without_features_warn(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] + with warnings.catch_warnings(record=True) as w: + self._get_reco(targets, "hot_warm", "no_features", kind, on_unsupported_targets="warn") + assert len(w) == 1 + for phrase in ("support", "cold"): + assert phrase in str(w[-1].message) + assert "warm" not in str(w[-1].message) + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_hot_only_model_with_warm_targets_raise(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] + with pytest.raises(ValueError, match="doesn't support recommendations for warm"): + self._get_reco(targets, "hot", "with_features", kind, on_unsupported_targets="raise") + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_hot_only_model_with_warm_targets_ignore(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] + actual = self._get_reco(targets, "hot", "with_features", kind, on_unsupported_targets="ignore") + expected_targets = self.hots[kind] + expected = self._get_reco(expected_targets, "hot", "with_features", kind) + pd.testing.assert_frame_equal(actual, expected) + + @pytest.mark.parametrize("kind", ("u2i", "i2i")) + def test_hot_only_model_with_warm_targets_warn(self, kind: str) -> None: + targets = self.warms[kind] + self.hots[kind] + with warnings.catch_warnings(record=True) as w: + self._get_reco(targets, "hot", "with_features", kind, on_unsupported_targets="warn") + assert len(w) == 1 + for phrase in ("support", "cold", "warm"): + assert phrase in str(w[-1].message) @pytest.mark.parametrize("dataset_key", ("no_features", "with_features")) @pytest.mark.parametrize("kind", ("u2i", "i2i"))
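Finally, a schematic sketch of the behaviour these tests pin down (all names are assumptions; the model is one that supports neither warm nor cold inference): with `"warn"`, unsupported targets are dropped from the reco table and a single warning is emitted.

```python
import warnings

from rectools import Columns

# `model`, `dataset`, `hot_users` and `cold_users` are assumed to exist,
# with `model` not supporting cold inference.
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    reco = model.recommend(
        users=hot_users + cold_users,
        dataset=dataset,
        k=2,
        filter_viewed=False,
        on_unsupported_targets="warn",
    )
assert len(w) == 1  # one warning about unsupported targets
assert set(reco[Columns.User]) <= set(hot_users)  # cold targets were filtered out
```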