Removed internal ids usage outside models (#177)
- Removed `assume_external_ids` parameter in `recommend` and `recommend_to_items` model methods
- Added `on_unsupported_targets` parameter to `recommend` and `recommend_to_items` model methods
- Added `filter_interactions` method to `Dataset`
- Fixed `IntraListDiversity` metric computation in `cross_validate`
blondered authored Aug 15, 2024
1 parent f73d054 · commit ae77f27
Showing 8 changed files with 447 additions and 406 deletions.
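In short: after this commit, `recommend` and `recommend_to_items` always treat target ids as external, and the new `on_unsupported_targets` parameter controls what happens to warm/cold targets a model cannot serve. A minimal sketch of the new call follows; the toy dataframe and the choice of `PopularModel` are illustrative assumptions, not part of this commit:

```python
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import PopularModel

# Toy interactions with external (raw) ids, illustrative only
interactions = pd.DataFrame(
    {
        Columns.User: ["u1", "u1", "u2", "u2"],
        Columns.Item: ["i1", "i2", "i1", "i3"],
        Columns.Weight: [1.0, 1.0, 1.0, 1.0],
        Columns.Datetime: pd.to_datetime(["2024-08-01", "2024-08-02", "2024-08-03", "2024-08-04"]),
    }
)
dataset = Dataset.construct(interactions)

model = PopularModel().fit(dataset)

# `users` are always external ids now; `assume_external_ids` no longer exists.
# `on_unsupported_targets` decides how unsupported warm/cold targets are handled:
# "raise", "warn" (filter with a warning) or "ignore" (filter silently).
reco = model.recommend(
    users=["u1", "u2"],
    dataset=dataset,
    k=2,
    filter_viewed=True,
    on_unsupported_targets="warn",
)
```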
8 changes: 6 additions & 2 deletions CHANGELOG.md
@@ -11,13 +11,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- `Debias` mechanism for classification, ranking and auc metrics. New parameter `is_debiased` to `calc_from_confusion_df`, `calc_per_user_from_confusion_df` methods of classification metrics, `calc_from_fitted`, `calc_per_user_from_fitted` methods of auc and ranking (`MAP`) metrics, `calc_from_merged`, `calc_per_user_from_merged` methods of ranking (`NDCG`, `MRR`) metrics. ([#152](https://github.com/MobileTeleSystems/RecTools/pull/152))
- `nbformat >= 4.2.0` dependency to `[visuals]` extra ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))
- `filter_interactions` method of `Dataset` ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
- `on_unsupported_targets` parameter to `recommend` and `recommend_to_items` model methods ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))

### Fixed
- `display()` method in `MetricsApp` ([#169](https://github.com/MobileTeleSystems/RecTools/pull/169))
- `IntraListDiversity` metric computation in `cross_validate` ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))
- Allow warp-kos loss for `LightFMWrapperModel` ([#175](https://github.com/MobileTeleSystems/RecTools/pull/175))

### Removed
- [Breaking] `assume_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#177](https://github.com/MobileTeleSystems/RecTools/pull/177))

## [0.7.0] - 29.07.2024

### Added
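Since the removal of `assume_external_ids` is a breaking change, a hedged migration sketch may help; `model`, `dataset` and the id values below are hypothetical:

```python
import numpy as np

internal_user_ids = np.array([0, 1])  # hypothetical 1x internal ids

# Before this commit, internal ids could be passed directly:
# reco = model.recommend(
#     users=internal_user_ids,
#     dataset=dataset,
#     k=10,
#     filter_viewed=True,
#     assume_external_ids=False,  # parameter removed in #177
# )

# After this commit `recommend` always expects external ids,
# so convert internal ids explicitly first:
external_user_ids = dataset.user_id_map.convert_to_external(internal_user_ids)
reco = model.recommend(
    users=external_user_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)
```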
2 changes: 1 addition & 1 deletion README.md
@@ -79,7 +79,7 @@ RecTools is on PyPI, so you can use `pip` to install it.
```
pip install rectools
```
The default version doesn't contain all the dependencies, because some of them are needed only for specific models. Available user extensions are the following:
The default version doesn't contain all the dependencies, because some of them are needed only for specific functionality. Available user extensions are the following:

- `lightfm`: adds wrapper for LightFM model,
- `torch`: adds models based on neural nets,
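
For example, adding one of these extras uses standard pip extras syntax:

```
pip install rectools[lightfm]
```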
64 changes: 64 additions & 0 deletions rectools/dataset/dataset.py
Expand Up @@ -17,6 +17,7 @@
import typing as tp

import attr
import numpy as np
import pandas as pd
from scipy import sparse

@@ -245,3 +246,66 @@ def get_raw_interactions(self, include_weight: bool = True, include_datetime: bo
        pd.DataFrame
        """
        return self.interactions.to_external(self.user_id_map, self.item_id_map, include_weight, include_datetime)

    def filter_interactions(
        self,
        row_indexes_to_keep: np.ndarray,
        keep_external_ids: bool = True,
        keep_features_for_removed_entities: bool = True,
    ) -> "Dataset":
        """
        Generate a filtered dataset that contains only the provided `row_indexes_to_keep` from the
        original dataset interactions dataframe.
        The resulting dataset will get a new id mapping for both users and items.

        Parameters
        ----------
        row_indexes_to_keep : np.ndarray
            Row indexes of the original dataset interactions df that are to be kept.
        keep_external_ids : bool, default `True`
            Whether to create an external -> 2x internal ids mapping (default).
            Otherwise an internal -> 2x internal ids mapping will be created.
        keep_features_for_removed_entities : bool, default `True`
            Whether to keep all features for users and items that are not hot any more.

        Returns
        -------
        Dataset
            Filtered dataset with only the selected interactions, a new ids mapping and processed features.
        """
        interactions_df = self.interactions.df.iloc[row_indexes_to_keep]

        # 1x internal -> 2x internal
        user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
        item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
        interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)

        def _handle_features(
            features: tp.Optional[Features], target_id_map: IdMap, dataset_id_map: IdMap
        ) -> tp.Tuple[tp.Optional[Features], IdMap]:
            if features is None:
                return None, target_id_map

            if keep_features_for_removed_entities:
                # Extend the new id map with all entities that have features,
                # so features of no-longer-hot entities stay available
                all_features_ids = np.arange(len(features))
                target_id_map = target_id_map.add_ids(all_features_ids, raise_if_already_present=False)

            needed_ids = target_id_map.get_external_sorted_by_internal()
            features = features.take(needed_ids)
            return features, target_id_map

        user_features_new, user_id_map = _handle_features(self.user_features, user_id_map, self.user_id_map)
        item_features_new, item_id_map = _handle_features(self.item_features, item_id_map, self.item_id_map)

        if keep_external_ids:  # external -> 2x internal
            user_id_map = IdMap(self.user_id_map.convert_to_external(user_id_map.external_ids))
            item_id_map = IdMap(self.item_id_map.convert_to_external(item_id_map.external_ids))

        filtered_dataset = Dataset(
            user_id_map=user_id_map,
            item_id_map=item_id_map,
            interactions=interactions,
            user_features=user_features_new,
            item_features=item_features_new,
        )
        return filtered_dataset
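
A usage sketch for the new method; the dataset and the row indexes below are assumptions for illustration, not taken from this commit:

```python
import numpy as np

# `dataset` is assumed to be a previously constructed rectools `Dataset`.
# Keep only the interactions at row positions 0 and 1 of `dataset.interactions.df`:
filtered = dataset.filter_interactions(
    row_indexes_to_keep=np.array([0, 1]),
    keep_external_ids=True,  # new 2x internal ids map back to original external ids
    keep_features_for_removed_entities=True,  # keep features of no-longer-hot entities
)

# The filtered dataset exposes the kept interactions with external ids again
print(filtered.get_raw_interactions())
```

This is exactly what `cross_validate` now does per fold (see `cross_validate.py` below), replacing the removed `_gen_2x_internal_ids_dataset` helper.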
96 changes: 29 additions & 67 deletions rectools/model_selection/cross_validate.py
@@ -14,58 +14,16 @@

import typing as tp

import numpy as np
import pandas as pd

from rectools.columns import Columns
from rectools.dataset import Dataset, Features, IdMap, Interactions
from rectools.dataset import Dataset
from rectools.metrics import calc_metrics
from rectools.metrics.base import MetricAtK
from rectools.models.base import ModelBase
from rectools.models.base import ErrorBehaviour, ModelBase
from rectools.types import ExternalIds

from .splitter import Splitter


def _gen_2x_internal_ids_dataset(
    interactions_internal_df: pd.DataFrame,
    user_features: tp.Optional[Features],
    item_features: tp.Optional[Features],
    prefer_warm_inference_over_cold: bool,
) -> Dataset:
    """
    Make new dataset based on given interactions and features from base dataset.
    Assume that interactions dataframe contains internal ids.
    Returned dataset contains 2nd level of internal ids.
    """
    user_id_map = IdMap.from_values(interactions_internal_df[Columns.User].values)  # 1x internal -> 2x internal
    item_id_map = IdMap.from_values(interactions_internal_df[Columns.Item].values)  # 1x internal -> 2x internal
    interactions_train = Interactions.from_raw(interactions_internal_df, user_id_map, item_id_map)  # 2x internal

    def _handle_features(features: tp.Optional[Features], id_map: IdMap) -> tp.Tuple[tp.Optional[Features], IdMap]:
        if features is None:
            return None, id_map

        if prefer_warm_inference_over_cold:
            all_features_ids = np.arange(len(features))  # 1x internal
            id_map = id_map.add_ids(all_features_ids, raise_if_already_present=False)

        features = features.take(id_map.get_external_sorted_by_internal())  # 2x internal
        return features, id_map

    user_features_new, user_id_map = _handle_features(user_features, user_id_map)
    item_features_new, item_id_map = _handle_features(item_features, item_id_map)

    dataset = Dataset(
        user_id_map=user_id_map,
        item_id_map=item_id_map,
        interactions=interactions_train,
        user_features=user_features_new,
        item_features=item_features_new,
    )
    return dataset

def cross_validate(  # pylint: disable=too-many-locals
    dataset: Dataset,
    splitter: Splitter,
@@ -77,6 +35,7 @@ def cross_validate(  # pylint: disable=too-many-locals
    prefer_warm_inference_over_cold: bool = True,
    ref_models: tp.Optional[tp.List[str]] = None,
    validate_ref_models: bool = False,
    on_unsupported_targets: ErrorBehaviour = "warn",
) -> tp.Dict[str, tp.Any]:
    """
    Run cross validation on multiple models with multiple metrics.
@@ -113,6 +72,15 @@
    validate_ref_models : bool, default False
        If True, include models specified in `ref_models` in all metrics calculations
        and receive their metrics from cross-validation.
    on_unsupported_targets : Literal["raise", "warn", "ignore"], default "warn"
        How to handle warm/cold target users when a model doesn't support warm/cold inference.
        Specify "warn" to filter unsupported targets with a warning (the default in `cross_validate`).
        Specify "ignore" to filter unsupported targets without a warning.
        With "warn" or "ignore" it is highly recommended to pass the `CoveredUsers` DQ metric to
        catch models that produce insufficient recommendations for some folds.
        Specify "raise" to raise a ValueError when unsupported targets are passed. In cross-validation
        this may cause unexpected errors for some of the more complex models.

    Returns
    -------
@@ -132,34 +100,26 @@
            ]
        }
    """
    interactions = dataset.interactions

    split_iterator = splitter.split(interactions, collect_fold_stats=True)
    split_iterator = splitter.split(dataset.interactions, collect_fold_stats=True)

    split_infos = []
    metrics_all = []

    for train_ids, test_ids, split_info in split_iterator:
        split_infos.append(split_info)

        # ### Prepare split data
        interactions_df_train = interactions.df.iloc[train_ids]  # 1x internal
        # We need to avoid fitting models on sparse matrices with all zero rows/columns =>
        # => we need to create a fold dataset which contains only hot users and items for current training
        fold_dataset = _gen_2x_internal_ids_dataset(
            interactions_df_train, dataset.user_features, dataset.item_features, prefer_warm_inference_over_cold
        fold_dataset = dataset.filter_interactions(
            row_indexes_to_keep=train_ids,
            keep_external_ids=True,
            keep_features_for_removed_entities=prefer_warm_inference_over_cold,
        )
        interactions_df_test = dataset.interactions.df.loc[test_ids]
        interactions_df_test[Columns.User] = dataset.user_id_map.convert_to_external(interactions_df_test[Columns.User])
        interactions_df_test[Columns.Item] = dataset.item_id_map.convert_to_external(interactions_df_test[Columns.Item])

        interactions_df_test = interactions.df.iloc[test_ids]  # 1x internal
        test_users = interactions_df_test[Columns.User].unique()  # 1x internal
        catalog = interactions_df_train[Columns.Item].unique()  # 1x internal

        if items_to_recommend is not None:
            item_ids_to_recommend = dataset.item_id_map.convert_to_internal(
                items_to_recommend, strict=False
            )  # 1x internal
        else:
            item_ids_to_recommend = None
        test_users = interactions_df_test[Columns.User].unique()
        prev_interactions = fold_dataset.get_raw_interactions()
        catalog = prev_interactions[Columns.Item].unique()

        # ### Train ref models if any
        ref_reco = {}
@@ -171,7 +131,8 @@
                dataset=fold_dataset,
                k=k,
                filter_viewed=filter_viewed,
                items_to_recommend=item_ids_to_recommend,
                items_to_recommend=items_to_recommend,
                on_unsupported_targets=on_unsupported_targets,
            )

        # ### Generate recommendations and calc metrics
@@ -183,19 +144,20 @@
                reco = ref_reco[model_name]
            else:
                model.fit(fold_dataset)
                reco = model.recommend(  # 1x internal
                reco = model.recommend(
                    users=test_users,
                    dataset=fold_dataset,
                    k=k,
                    filter_viewed=filter_viewed,
                    items_to_recommend=item_ids_to_recommend,
                    items_to_recommend=items_to_recommend,
                    on_unsupported_targets=on_unsupported_targets,
                )

            metric_values = calc_metrics(
                metrics,
                reco=reco,
                interactions=interactions_df_test,
                prev_interactions=interactions_df_train,
                prev_interactions=prev_interactions,
                catalog=catalog,
                ref_reco=ref_reco,
            )
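
Putting it together, a hedged sketch of a `cross_validate` call with the new parameter; the dataset, splitter and metric choices here are assumptions for illustration:

```python
from rectools.metrics import Recall
from rectools.model_selection import LastNSplitter, cross_validate
from rectools.models import PopularModel

# `dataset` is assumed to be a previously constructed rectools `Dataset`.
results = cross_validate(
    dataset=dataset,
    splitter=LastNSplitter(n=1, n_splits=2),
    metrics={"recall@10": Recall(k=10)},
    models={"popular": PopularModel()},
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn",  # filter unsupported targets with a warning (the default here)
)
```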