Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added user info to movielens 100k and 1m #2178

Open
wants to merge 1 commit into
base: staging
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 183 additions & 10 deletions recommenders/datasets/movielens.py
Original file line number Diff line number Diff line change
@@ -47,6 +47,7 @@ def __init__(
has_header=False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got these errors when running the MovieLens data-validation tests (12 of the 71 collected tests fail):

(recommenders311) miguel@miguel:~/MS/recommenders$ pytest tests/data_validation/recommenders/datasets/test_movielens.py --disable-warnings --durations 0
========================================================================= test session starts =========================================================================
platform linux -- Python 3.11.9, pytest-8.2.2, pluggy-1.5.0
rootdir: /home/miguel/MS/recommenders
configfile: pyproject.toml
plugins: anyio-4.4.0, cov-5.0.0, typeguard-4.3.0, hypothesis-6.104.2, mock-3.14.0
collected 71 items

tests/data_validation/recommenders/datasets/test_movielens.py ...................................FFFFFF..FFFF....................FF.. [100%]

=================================== FAILURES ====================================
___________________ test_download_and_extract_movielens[100k] ___________________
size = '100k', tmp = '/tmp/pytest-of-miguel/pytest-77/tmphd6aiueo'

    @pytest.mark.parametrize("size", ["100k", "1m", "10m", "20m"])
    def test_download_and_extract_movielens(size, tmp):
        """Test movielens data download and extract"""
        zip_path = os.path.join(tmp, "ml.zip")
        download_movielens(size, dest_path=zip_path)
        assert len(os.listdir(tmp)) == 1
        assert os.path.exists(zip_path) is True

        rating_path = os.path.join(tmp, "rating.dat")
        item_path = os.path.join(tmp, "item.dat")
>       extract_movielens(
            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
        )
E       TypeError: extract_movielens() missing 1 required positional argument: 'user_path'

tests/data_validation/recommenders/datasets/test_movielens.py:125: TypeError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 4.81k/4.81k [00:01<00:00, 3.97kKB/s]
____________________ test_download_and_extract_movielens[1m] ____________________

size = '1m', tmp = '/tmp/pytest-of-miguel/pytest-77/tmpt47xilf3'

    @pytest.mark.parametrize("size", ["100k", "1m", "10m", "20m"])
    def test_download_and_extract_movielens(size, tmp):
        """Test movielens data download and extract"""
        zip_path = os.path.join(tmp, "ml.zip")
        download_movielens(size, dest_path=zip_path)
        assert len(os.listdir(tmp)) == 1
        assert os.path.exists(zip_path) is True

        rating_path = os.path.join(tmp, "rating.dat")
        item_path = os.path.join(tmp, "item.dat")
>       extract_movielens(
            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
        )
E       TypeError: extract_movielens() missing 1 required positional argument: 'user_path'

tests/data_validation/recommenders/datasets/test_movielens.py:125: TypeError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 5.78k/5.78k [00:01<00:00, 4.59kKB/s]
___________________ test_download_and_extract_movielens[10m] ____________________

size = '10m', tmp = '/tmp/pytest-of-miguel/pytest-77/tmpjrveosih'

    @pytest.mark.parametrize("size", ["100k", "1m", "10m", "20m"])
    def test_download_and_extract_movielens(size, tmp):
        """Test movielens data download and extract"""
        zip_path = os.path.join(tmp, "ml.zip")
        download_movielens(size, dest_path=zip_path)
        assert len(os.listdir(tmp)) == 1
        assert os.path.exists(zip_path) is True

        rating_path = os.path.join(tmp, "rating.dat")
        item_path = os.path.join(tmp, "item.dat")
>       extract_movielens(
            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
        )
E       TypeError: extract_movielens() missing 1 required positional argument: 'user_path'

tests/data_validation/recommenders/datasets/test_movielens.py:125: TypeError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 64.0k/64.0k [00:03<00:00, 16.4kKB/s]
___________________ test_download_and_extract_movielens[20m] ____________________

size = '20m', tmp = '/tmp/pytest-of-miguel/pytest-77/tmpw2vclyex'

    @pytest.mark.parametrize("size", ["100k", "1m", "10m", "20m"])
    def test_download_and_extract_movielens(size, tmp):
        """Test movielens data download and extract"""
        zip_path = os.path.join(tmp, "ml.zip")
        download_movielens(size, dest_path=zip_path)
        assert len(os.listdir(tmp)) == 1
        assert os.path.exists(zip_path) is True

        rating_path = os.path.join(tmp, "rating.dat")
        item_path = os.path.join(tmp, "item.dat")
>       extract_movielens(
            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
        )
E       TypeError: extract_movielens() missing 1 required positional argument: 'user_path'

tests/data_validation/recommenders/datasets/test_movielens.py:125: TypeError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 194k/194k [00:10<00:00, 18.7kKB/s]
_ test_load_pandas_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '100k', num_samples = 100000, num_movies = 1682, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmp0woedy4a'

    @pytest.mark.parametrize(
        "size, num_samples, num_movies, movie_example, title_example, genres_example, year_example",
        [
            (
                "100k",
                100000,
                1682,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "1m",
                1000209,
                3883,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "10m",
                10000054,
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                20000263,
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_pandas_df(
        size,
        num_samples,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test MovieLens dataset load as pd.DataFrame"""
        # Test if correct data are loaded
        header = ["a", "b", "c"]
        df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
        assert len(df) == num_samples
        assert len(df.columns) == len(header)
        # Test if raw-zip file, rating file, and item file are cached
>       assert len(os.listdir(tmp)) == 3
E       AssertionError: assert 4 == 3
E        +  where 4 = len(['u.data', 'u.user', 'u.item', 'ml-100k.zip'])
E        +    where ['u.data', 'u.user', 'u.item', 'ml-100k.zip'] = <built-in function listdir>('/tmp/pytest-of-miguel/pytest-77/tmp0woedy4a')
E        +      where <built-in function listdir> = os.listdir

tests/data_validation/recommenders/datasets/test_movielens.py:192: AssertionError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 4.81k/4.81k [00:01<00:00, 4.20kKB/s]
_ test_load_pandas_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '1m', num_samples = 1000209, num_movies = 3883, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmpnjmy_mcj'

    @pytest.mark.parametrize(
        "size, num_samples, num_movies, movie_example, title_example, genres_example, year_example",
        [
            (
                "100k",
                100000,
                1682,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "1m",
                1000209,
                3883,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "10m",
                10000054,
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                20000263,
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_pandas_df(
        size,
        num_samples,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test MovieLens dataset load as pd.DataFrame"""
        # Test if correct data are loaded
        header = ["a", "b", "c"]
        df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
        assert len(df) == num_samples
        assert len(df.columns) == len(header)
        # Test if raw-zip file, rating file, and item file are cached
>       assert len(os.listdir(tmp)) == 3
E       AssertionError: assert 4 == 3
E        +  where 4 = len(['users.dat', 'ml-1m.zip', 'ratings.dat', 'movies.dat'])
E        +    where ['users.dat', 'ml-1m.zip', 'ratings.dat', 'movies.dat'] = <built-in function listdir>('/tmp/pytest-of-miguel/pytest-77/tmpnjmy_mcj')
E        +      where <built-in function listdir> = os.listdir

tests/data_validation/recommenders/datasets/test_movielens.py:192: AssertionError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 5.78k/5.78k [00:01<00:00, 4.63kKB/s]
_ test_load_item_df[100k-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '100k', num_movies = 1682, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmpq4bt6q3n'

    @pytest.mark.parametrize(
        "size, num_movies, movie_example, title_example, genres_example, year_example",
        [
            ("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            ("1m", 3883, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            (
                "10m",
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_item_df(
        size,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test movielens item data load (not rating data)"""
>       df = load_item_df(size, local_cache_path=tmp, title_col="title")

tests/data_validation/recommenders/datasets/test_movielens.py:264:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

size = '100k', local_cache_path = '/tmp/pytest-of-miguel/pytest-77/tmpq4bt6q3n'
movie_col = 'itemID', title_col = 'title', genres_col = None, year_col = None

    def load_item_df(
        size="100k",
        local_cache_path=None,
        movie_col=DEFAULT_ITEM_COL,
        title_col=None,
        genres_col=None,
        year_col=None,
    ):
        """Loads Movie info.

        Args:
            size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
            local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
                If None, all the intermediate files will be stored in a temporary directory and removed after use.
            movie_col (str): Movie id column name.
            title_col (str): Movie title column name. If None, the column will not be loaded.
            genres_col (str): Genres column name. Genres are '|' separated string.
                If None, the column will not be loaded.
            year_col (str): Movie release year column name. If None, the column will not be loaded.

        Returns:
            pandas.DataFrame: Movie information data, such as title, genres, and release year.
        """
        size = size.lower()
        if size not in DATA_FORMAT:
            raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)

        with download_path(local_cache_path) as path:
            filepath = os.path.join(path, "ml-{}.zip".format(size))
>           _, item_datapath = _maybe_download_and_extract(size, filepath)
E           ValueError: too many values to unpack (expected 2)

recommenders/datasets/movielens.py:335: ValueError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 4.81k/4.81k [00:01<00:00, 3.37kKB/s]
_ test_load_item_df[1m-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '1m', num_movies = 3883, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmpirqfi1bs'

    @pytest.mark.parametrize(
        "size, num_movies, movie_example, title_example, genres_example, year_example",
        [
            ("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            ("1m", 3883, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            (
                "10m",
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_item_df(
        size,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test movielens item data load (not rating data)"""
>       df = load_item_df(size, local_cache_path=tmp, title_col="title")

tests/data_validation/recommenders/datasets/test_movielens.py:264:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

size = '1m', local_cache_path = '/tmp/pytest-of-miguel/pytest-77/tmpirqfi1bs'
movie_col = 'itemID', title_col = 'title', genres_col = None, year_col = None

    def load_item_df(
        size="100k",
        local_cache_path=None,
        movie_col=DEFAULT_ITEM_COL,
        title_col=None,
        genres_col=None,
        year_col=None,
    ):
        """Loads Movie info.

        Args:
            size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
            local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
                If None, all the intermediate files will be stored in a temporary directory and removed after use.
            movie_col (str): Movie id column name.
            title_col (str): Movie title column name. If None, the column will not be loaded.
            genres_col (str): Genres column name. Genres are '|' separated string.
                If None, the column will not be loaded.
            year_col (str): Movie release year column name. If None, the column will not be loaded.

        Returns:
            pandas.DataFrame: Movie information data, such as title, genres, and release year.
        """
        size = size.lower()
        if size not in DATA_FORMAT:
            raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)

        with download_path(local_cache_path) as path:
            filepath = os.path.join(path, "ml-{}.zip".format(size))
>           _, item_datapath = _maybe_download_and_extract(size, filepath)
E           ValueError: too many values to unpack (expected 2)

recommenders/datasets/movielens.py:335: ValueError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 5.78k/5.78k [00:01<00:00, 4.72kKB/s]
_ test_load_item_df[10m-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995] _

size = '10m', num_movies = 10681, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = 'Adventure|Animation|Children|Comedy|Fantasy'
year_example = '1995', tmp = '/tmp/pytest-of-miguel/pytest-77/tmp068p6mvt'

    @pytest.mark.parametrize(
        "size, num_movies, movie_example, title_example, genres_example, year_example",
        [
            ("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            ("1m", 3883, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            (
                "10m",
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_item_df(
        size,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test movielens item data load (not rating data)"""
>       df = load_item_df(size, local_cache_path=tmp, title_col="title")

tests/data_validation/recommenders/datasets/test_movielens.py:264:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

size = '10m', local_cache_path = '/tmp/pytest-of-miguel/pytest-77/tmp068p6mvt'
movie_col = 'itemID', title_col = 'title', genres_col = None, year_col = None

    def load_item_df(
        size="100k",
        local_cache_path=None,
        movie_col=DEFAULT_ITEM_COL,
        title_col=None,
        genres_col=None,
        year_col=None,
    ):
        """Loads Movie info.

        Args:
            size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
            local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
                If None, all the intermediate files will be stored in a temporary directory and removed after use.
            movie_col (str): Movie id column name.
            title_col (str): Movie title column name. If None, the column will not be loaded.
            genres_col (str): Genres column name. Genres are '|' separated string.
                If None, the column will not be loaded.
            year_col (str): Movie release year column name. If None, the column will not be loaded.

        Returns:
            pandas.DataFrame: Movie information data, such as title, genres, and release year.
        """
        size = size.lower()
        if size not in DATA_FORMAT:
            raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)

        with download_path(local_cache_path) as path:
            filepath = os.path.join(path, "ml-{}.zip".format(size))
>           _, item_datapath = _maybe_download_and_extract(size, filepath)
E           ValueError: too many values to unpack (expected 2)

recommenders/datasets/movielens.py:335: ValueError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 64.0k/64.0k [00:03<00:00, 16.5kKB/s]
_ test_load_item_df[20m-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995] _

size = '20m', num_movies = 27278, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = 'Adventure|Animation|Children|Comedy|Fantasy'
year_example = '1995', tmp = '/tmp/pytest-of-miguel/pytest-77/tmp81nnj7fw'

    @pytest.mark.parametrize(
        "size, num_movies, movie_example, title_example, genres_example, year_example",
        [
            ("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            ("1m", 3883, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995"),
            (
                "10m",
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_item_df(
        size,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
    ):
        """Test movielens item data load (not rating data)"""
>       df = load_item_df(size, local_cache_path=tmp, title_col="title")

tests/data_validation/recommenders/datasets/test_movielens.py:264:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

size = '20m', local_cache_path = '/tmp/pytest-of-miguel/pytest-77/tmp81nnj7fw'
movie_col = 'itemID', title_col = 'title', genres_col = None, year_col = None

    def load_item_df(
        size="100k",
        local_cache_path=None,
        movie_col=DEFAULT_ITEM_COL,
        title_col=None,
        genres_col=None,
        year_col=None,
    ):
        """Loads Movie info.

        Args:
            size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
            local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
                If None, all the intermediate files will be stored in a temporary directory and removed after use.
            movie_col (str): Movie id column name.
            title_col (str): Movie title column name. If None, the column will not be loaded.
            genres_col (str): Genres column name. Genres are '|' separated string.
                If None, the column will not be loaded.
            year_col (str): Movie release year column name. If None, the column will not be loaded.

        Returns:
            pandas.DataFrame: Movie information data, such as title, genres, and release year.
        """
        size = size.lower()
        if size not in DATA_FORMAT:
            raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)

        with download_path(local_cache_path) as path:
            filepath = os.path.join(path, "ml-{}.zip".format(size))
>           _, item_datapath = _maybe_download_and_extract(size, filepath)
E           ValueError: too many values to unpack (expected 2)

recommenders/datasets/movielens.py:335: ValueError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 194k/194k [00:09<00:00, 19.6kKB/s]
_ test_load_spark_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '100k', num_samples = 100000, num_movies = 1682, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmpi7dpzm72'
spark = <pyspark.sql.session.SparkSession object at 0x7fc65a36bbd0>

    @pytest.mark.spark
    @pytest.mark.parametrize(
        "size, num_samples, num_movies, movie_example, title_example, genres_example, year_example",
        [
            (
                "100k",
                100000,
                1682,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "1m",
                1000209,
                3883,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "10m",
                10000054,
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                20000263,
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_spark_df(
        size,
        num_samples,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
        spark,
    ):
        """Test MovieLens dataset load into pySpark.DataFrame"""

        # Test if correct data are loaded
        header = ["1", "2", "3"]
        schema = StructType(
            [
                StructField("u", IntegerType()),
                StructField("m", IntegerType()),
            ]
        )
        with pytest.warns(Warning):
            df = load_spark_df(
                spark, size=size, local_cache_path=tmp, header=header, schema=schema
            )
            assert df.count() == num_samples
            # Test if schema is used when both schema and header are provided
            assert len(df.columns) == len(schema)
            # Test if raw-zip file, rating file, and item file are cached
>           assert len(os.listdir(tmp)) == 3
E           AssertionError: assert 4 == 3
E            +  where 4 = len(['u.data', 'u.user', 'u.item', 'ml-100k.zip'])
E            +    where ['u.data', 'u.user', 'u.item', 'ml-100k.zip'] = <built-in function listdir>('/tmp/pytest-of-miguel/pytest-77/tmpi7dpzm72')
E            +      where <built-in function listdir> = os.listdir

tests/data_validation/recommenders/datasets/test_movielens.py:488: AssertionError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 4.81k/4.81k [00:01<00:00, 4.35kKB/s]

_ test_load_spark_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] _

size = '1m', num_samples = 1000209, num_movies = 3883, movie_example = 1
title_example = 'Toy Story (1995)'
genres_example = "Animation|Children's|Comedy", year_example = '1995'
tmp = '/tmp/pytest-of-miguel/pytest-77/tmpipedkdgk'
spark = <pyspark.sql.session.SparkSession object at 0x7fc65a36bbd0>

    @pytest.mark.spark
    @pytest.mark.parametrize(
        "size, num_samples, num_movies, movie_example, title_example, genres_example, year_example",
        [
            (
                "100k",
                100000,
                1682,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "1m",
                1000209,
                3883,
                1,
                "Toy Story (1995)",
                "Animation|Children's|Comedy",
                "1995",
            ),
            (
                "10m",
                10000054,
                10681,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
            (
                "20m",
                20000263,
                27278,
                1,
                "Toy Story (1995)",
                "Adventure|Animation|Children|Comedy|Fantasy",
                "1995",
            ),
        ],
    )
    def test_load_spark_df(
        size,
        num_samples,
        num_movies,
        movie_example,
        title_example,
        genres_example,
        year_example,
        tmp,
        spark,
    ):
        """Test MovieLens dataset load into pySpark.DataFrame"""

        # Test if correct data are loaded
        header = ["1", "2", "3"]
        schema = StructType(
            [
                StructField("u", IntegerType()),
                StructField("m", IntegerType()),
            ]
        )
        with pytest.warns(Warning):
            df = load_spark_df(
                spark, size=size, local_cache_path=tmp, header=header, schema=schema
            )
            assert df.count() == num_samples
            # Test if schema is used when both schema and header are provided
            assert len(df.columns) == len(schema)
            # Test if raw-zip file, rating file, and item file are cached
>           assert len(os.listdir(tmp)) == 3
E           AssertionError: assert 4 == 3
E            +  where 4 = len(['users.dat', 'ml-1m.zip', 'ratings.dat', 'movies.dat'])
E            +    where ['users.dat', 'ml-1m.zip', 'ratings.dat', 'movies.dat'] = <built-in function listdir>('/tmp/pytest-of-miguel/pytest-77/tmpipedkdgk')
E            +      where <built-in function listdir> = os.listdir

tests/data_validation/recommenders/datasets/test_movielens.py:488: AssertionError
----------------------------- Captured stderr call ------------------------------
100%|██████████| 5.78k/5.78k [00:01<00:00, 4.72kKB/s]

=============================== slowest durations ===============================
229.46s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[20m-20000263-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
187.80s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[20m-20000263-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
108.28s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[10m-10000054-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
105.43s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[10m-10000054-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
79.74s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df_mock_100__with_custom_param__succeed
71.77s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df_mock_100__with_default_param__succeed
55.13s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_default_param__succeed
54.47s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df_mock_100__with_custom_param__succeed
14.17s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__has_default_col_names[100]
13.83s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[20m-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
12.26s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df_error[20m]
10.89s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[20m]
6.74s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[10m-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
6.13s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df_error[10m]
5.83s setup    tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[0-101-True-True]
5.76s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[0-101-True-True]
5.19s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
4.80s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
4.46s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[10m]
3.49s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[3-101-True-True]
3.30s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__has_default_col_names[10]
3.24s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[10-101-True-True]
3.18s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[10-101-False-False]
3.13s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-None-False-True]
3.09s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
2.96s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[10-101-False-True]
2.86s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[10-101-True-False]
2.85s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-None-True-False]
2.78s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-None-True-True]
2.78s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[3-101-False-True]
2.65s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[3-101-False-False]
2.64s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[3-101-True-False]
2.53s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-None-False-False]
2.38s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df_remove_default_col__return_success[4]
2.26s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__data_serialization_default_param
2.26s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
2.25s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-None-False-False]
2.25s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-None-True-False]
2.15s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-None-False-True]
2.07s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__store_tmp_file
2.05s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df[1m-6040-1-1-F-K-12 student-48067]
2.00s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-None-True-True]
1.99s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[100k-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
1.98s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[1m-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
1.96s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[1m]
1.95s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[100k]
1.73s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df[100k-943-1-24-M-technician-85711]
1.45s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df_remove_default_col__return_success[2]
1.40s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df_remove_default_col__return_success[3]
1.36s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-2-True-True]
1.35s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-2-True-False]
1.28s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-2-False-True]
1.25s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-2-False-True]
1.23s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-2-True-True]
1.21s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-2-True-False]
1.19s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[20m-20000263-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
1.09s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[10--1-2-False-False]
0.98s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[3--1-2-False-False]
0.47s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[0-101-True-False]
0.41s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[0-101-False-True]
0.39s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_spark_df__return_success[0-101-False-False]
0.17s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[20m-20000263-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
0.09s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[20m-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
0.06s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df_error[20m]
0.05s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[10m-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
0.03s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[20m]
0.03s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df_error[10m]
0.03s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[10m-10000054-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
0.02s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[10m-10000054-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-None-False-True]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-2-True-True]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-2-True-False]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-2-False-False]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-None-False-False]
0.02s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-2-False-True]
0.01s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-None-True-False]
0.01s call     tests/data_validation/recommenders/datasets/test_movielens.py::test_mock_movielens_schema__get_df__return_success[0--1-None-True-True]
0.01s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[10m]
0.01s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[100k-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
0.01s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[1m-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995]
0.01s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_user_df[1m-6040-1-1-F-K-12 student-48067]
0.01s teardown tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995]

(131 durations < 0.005s hidden.  Use -vv to show these durations.)
============================ short test summary info ============================
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[100k] - TypeError: extract_movielens() missing 1 required positional argument: 'user...
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[1m] - TypeError: extract_movielens() missing 1 required positional argument: 'user...
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[10m] - TypeError: extract_movielens() missing 1 required positional argument: 'user...
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_download_and_extract_movielens[20m] - TypeError: extract_movielens() missing 1 required positional argument: 'user...
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - AssertionError: assert 4 == 3
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_pandas_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - AssertionError: assert 4 == 3
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[100k-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - ValueError: too many values to unpack (expected 2)
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[1m-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - ValueError: too many values to unpack (expected 2)
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[10m-10681-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995] - ValueError: too many values to unpack (expected 2)
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_item_df[20m-27278-1-Toy Story (1995)-Adventure|Animation|Children|Comedy|Fantasy-1995] - ValueError: too many values to unpack (expected 2)
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[100k-100000-1682-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - AssertionError: assert 4 == 3
FAILED tests/data_validation/recommenders/datasets/test_movielens.py::test_load_spark_df[1m-1000209-3883-1-Toy Story (1995)-Animation|Children's|Comedy-1995] - AssertionError: assert 4 == 3
=========== 12 failed, 59 passed, 6057 warnings in 2526.02s (0:42:06) ===========

item_sep=None,
item_path=None,
user_path=None,
item_has_header=False,
):
"""MovieLens data format container as a different size of MovieLens data file
Expand All @@ -58,6 +59,7 @@ def __init__(
has_header (bool): Whether the rating data contains a header line or not
item_sep (str): Item data delimiter
item_path (str): Item data path within the original zip file
user_path (str): User data path within the original zip file
item_has_header (bool): Whether the item data contains a header line or not
"""

Expand All @@ -69,6 +71,7 @@ def __init__(
# Item file
self._item_sep = item_sep
self._item_path = item_path
self._user_path = user_path
self._item_has_header = item_has_header

@property
Expand All @@ -91,21 +94,25 @@ def item_separator(self):
def item_path(self):
return self._item_path

@property
def user_path(self):
return self._user_path

@property
def item_has_header(self):
return self._item_has_header


# 10m and 20m data do not have user data
DATA_FORMAT = {
"100k": _DataFormat("\t", "ml-100k/u.data", False, "|", "ml-100k/u.item", False),
"100k": _DataFormat("\t", "ml-100k/u.data", False, "|", "ml-100k/u.item", "ml-100k/u.user", False),
"1m": _DataFormat(
"::", "ml-1m/ratings.dat", False, "::", "ml-1m/movies.dat", False
"::", "ml-1m/ratings.dat", False, "::", "ml-1m/movies.dat", "ml-1m/users.dat", False
),
"10m": _DataFormat(
"::", "ml-10M100K/ratings.dat", False, "::", "ml-10M100K/movies.dat", False
"::", "ml-10M100K/ratings.dat", False, "::", "ml-10M100K/movies.dat", None, False
),
"20m": _DataFormat(",", "ml-20m/ratings.csv", True, ",", "ml-20m/movies.csv", True),
"20m": _DataFormat(",", "ml-20m/ratings.csv", True, ",", "ml-20m/movies.csv", None, True),
}

# Fake data for testing only
Expand Down Expand Up @@ -136,6 +143,31 @@ def item_has_header(self):
"Western",
)

# 1m data occupation index to string mapper. For 100k, the occupation labels are already in the dataset.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@daviddavo the extra info is only for 100k and 1M?

OCCUPATIONS = (
"Other",
"Academic/Educator",
"Artist",
"Clerical/Admin",
"College/Grad student",
"Customer service",
"Doctor/Health care",
"Executive/Managerial",
"Farmer",
"Homemaker",
"K-12 student",
"Lawyer",
"Programmer",
"Retired",
"Sales/Marketing",
"Scientist",
"Self-employed",
"Technician/Engineer",
"Tradesman/Craftsman",
"Unemployed",
"Writer",
)


# Warning and error messages
WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns
Expand All @@ -153,9 +185,15 @@ def load_pandas_df(
size="100k",
header=None,
local_cache_path=None,
# Movie properties
title_col=None,
genres_col=None,
year_col=None,
# User properties
age_col=None,
gender_col=None,
occupation_col=None,
zip_code_col=None,
):
"""Loads the MovieLens dataset as pd.DataFrame.

Expand All @@ -174,6 +212,10 @@ def load_pandas_df(
If None, the column will not be loaded.
year_col (str): Movie release year column name. If None, the column will not be loaded.
If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
age_col (str): User age column name. If None, the column will not be loaded.
gender_col (str): User gender column name. If None, the column will not be loaded.
occupation_col (str): User occupation column name. If None, the column will not be loaded.
zip_code_col (str): User zip code column name. If None, the column will not be loaded.

Returns:
pandas.DataFrame: Movie rating dataset.
Expand Down Expand Up @@ -219,11 +261,17 @@ def load_pandas_df(
], # supply the rest of the kwarg with the dictionary
)

user_col = header[0]
movie_col = header[1]

with download_path(local_cache_path) as path:
filepath = os.path.join(path, "ml-{}.zip".format(size))
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
datapath, item_datapath, user_datapath = _maybe_download_and_extract(size, filepath)

# Load user features such as age, gender, occupation, or zip code
user_df = _load_user_df(
size, user_datapath, user_col, age_col, gender_col, occupation_col, zip_code_col
)

# Load movie features such as title, genres, and release year
item_df = _load_item_df(
Expand All @@ -244,6 +292,10 @@ def load_pandas_df(
if len(header) > 2:
df[header[2]] = df[header[2]].astype(float)

# Merge rating df w/ user_df
if user_df is not None:
df = df.merge(user_df, on=header[0])

# Merge rating df w/ item_df
if item_df is not None:
df = df.merge(item_df, on=header[1])
Expand Down Expand Up @@ -353,6 +405,96 @@ def parse_year(t):
return item_df


def load_user_df(
    size="100k",
    local_cache_path=None,
    user_col=DEFAULT_USER_COL,
    age_col=None,
    gender_col=None,
    occupation_col=None,
    zip_code_col=None,
) -> pd.DataFrame:
    """Loads user information (age, gender, occupation, zip code).

    Args:
        size (str, optional): Size of the data to load. One of ("100k", "1m", "10m", "20m").
            Defaults to "100k". The value is lower-cased before it is looked up.
        local_cache_path (str, optional): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        user_col (str): User id column name. Defaults to DEFAULT_USER_COL.
        age_col (str): User age column name. If None, the column will not be loaded.
        gender_col (str): User gender column name (M/F only). If None, the column will not be loaded.
        occupation_col (str): User occupation column name. If None, the column will not be loaded.
        zip_code_col (str): User zip code column name. If None, the column will not be loaded.

    Returns:
        pandas.DataFrame: User information data, or None when none of the
        age/gender/occupation/zip code columns is requested.

    Raises:
        ValueError: If `size` is not a known dataset size, or if user columns are
            requested for a dataset that does not ship user data.
    """
    size = size.lower()

    if size not in DATA_FORMAT:
        raise ValueError(f"Size: {size}. " + ERROR_MOVIE_LENS_SIZE)

    # Fail fast: datasets without a user file can never satisfy the request, so
    # reject it before spending time downloading and extracting the archive.
    requested_cols = (age_col, gender_col, occupation_col, zip_code_col)
    if DATA_FORMAT[size].user_path is None and any(
        col is not None for col in requested_cols
    ):
        raise ValueError(
            f"Movielens {size} does not support user info. Do not request user info columns."
        )

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        _, _, user_datapath = _maybe_download_and_extract(size, filepath)
        # Read inside the context manager: the extracted file may live in a
        # temporary directory that is removed when the block exits.
        user_df = _load_user_df(
            size,
            user_datapath,
            user_col,
            age_col,
            gender_col,
            occupation_col,
            zip_code_col,
        )

    return user_df


def _load_user_df(size, user_datapath, user_col, age_col, gender_col, occupation_col, zip_code_col):
    """Loads the requested user columns from an extracted MovieLens user file.

    Args:
        size (str): Dataset size key into DATA_FORMAT (e.g. "100k", "1m").
        user_datapath (str): Path of the extracted user data file.
        user_col (str): User id column name (always loaded).
        age_col (str): User age column name. If None, the column will not be loaded.
        gender_col (str): User gender column name. If None, the column will not be loaded.
        occupation_col (str): User occupation column name. If None, the column will not be loaded.
        zip_code_col (str): User zip code column name. If None, the column will not be loaded.

    Returns:
        pandas.DataFrame: One row per user with `user_col` plus the requested
        columns, or None if no user property column was requested.

    Raises:
        ValueError: If user columns are requested for a dataset without user data.
    """
    if all(c is None for c in (age_col, gender_col, occupation_col, zip_code_col)):
        return None

    data_format = DATA_FORMAT[size]
    if data_format.user_path is None:
        raise ValueError(f"Movielens {size} does not support user info. Do not request user info columns.")

    # 100k has the gender and age columns order swapped
    if size == "100k":
        age_idx, gender_idx = 1, 2
    else:
        gender_idx, age_idx = 1, 2

    # Map file column position -> output column name, keeping only requested columns.
    header = {0: user_col}
    positions = (
        (age_idx, age_col),
        (gender_idx, gender_col),
        (3, occupation_col),
        (4, zip_code_col),
    )
    for idx, name in positions:
        if name is not None:
            header[idx] = name

    usecols = sorted(header)
    user_header = [header[k] for k in usecols]

    # The user file is read with the same separator and header flag as the item file.
    user_df = pd.read_csv(
        user_datapath,
        sep=data_format.item_separator,
        engine="python",
        names=user_header,
        usecols=usecols,
        header=0 if data_format.item_has_header else None,
        encoding="ISO-8859-1",
    )

    # 100k has the labels, but the rest store an integer code that indexes OCCUPATIONS.
    # Only translate when the occupation column was actually loaded; otherwise
    # `user_df[None]` would raise a KeyError.
    if occupation_col is not None and size != "100k":
        user_df[occupation_col] = user_df[occupation_col].map(lambda code: OCCUPATIONS[code])

    return user_df


def load_spark_df(
spark,
size="100k",
Expand All @@ -363,6 +505,10 @@ def load_spark_df(
title_col=None,
genres_col=None,
year_col=None,
age_col=None,
gender_col=None,
occupation_col=None,
zip_code_col=None,
):
"""Loads the MovieLens dataset as `pyspark.sql.DataFrame`.

Expand All @@ -386,6 +532,10 @@ def load_spark_df(
If None, the column will not be loaded.
year_col (str): Movie release year column name. If None, the column will not be loaded.
If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored.
age_col (str): User age column name. If None, the column will not be loaded.
gender_col (str): User gender column name. If None, the column will not be loaded.
occupation_col (str): User occupation column name. If None, the column will not be loaded.
zip_code_col (str): User zip code column name. If None, the column will not be loaded.

Returns:
pyspark.sql.DataFrame: Movie rating dataset.
Expand Down Expand Up @@ -438,11 +588,12 @@ def load_spark_df(
if len(schema) < 2:
raise ValueError(ERROR_HEADER)

user_col = schema[0].name
movie_col = schema[1].name

with download_path(local_cache_path) as path:
filepath = os.path.join(path, "ml-{}.zip".format(size))
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
datapath, item_datapath, user_datapath = _maybe_download_and_extract(size, filepath)
spark_datapath = "file:///" + datapath # shorten form of file://localhost/

# Load movie features such as title, genres, and release year.
Expand All @@ -453,6 +604,14 @@ def load_spark_df(
)
item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None

# Load user features such as age, gender, occupation and zip code
# Since the file size is small, we directly load as pd.DataFrame from the driver node
# and then convert into pyspark.sql.DataFrame
user_pd_df = _load_user_df(
size, user_datapath, user_col, age_col, gender_col, occupation_col, zip_code_col,
)
user_df = spark.createDataFrame(user_pd_df) if user_pd_df is not None else None

if is_databricks():
if dbutils is None:
raise ValueError(
Expand Down Expand Up @@ -487,6 +646,10 @@ def load_spark_df(
if item_df is not None:
df = df.join(item_df, movie_col, "left")

# Merge rating w/ user_df
if user_df is not None:
df = df.join(user_df, user_col, "left")

# Cache and force trigger action since data-file might be removed.
df.cache()
df.count()
Expand Down Expand Up @@ -535,11 +698,17 @@ def _maybe_download_and_extract(size, dest_path):
_, item_filename = os.path.split(DATA_FORMAT[size].item_path)
item_path = os.path.join(dirs, item_filename)

if not os.path.exists(rating_path) or not os.path.exists(item_path):
if DATA_FORMAT[size].user_path is None:
user_path = None
else:
_, user_filename = os.path.split(DATA_FORMAT[size].user_path)
user_path = os.path.join(dirs, user_filename)

if not all(p is None or os.path.exists(p) for p in [rating_path, item_path, user_path]):
download_movielens(size, dest_path)
extract_movielens(size, rating_path, item_path, dest_path)
extract_movielens(size, rating_path, item_path, user_path, dest_path)

return rating_path, item_path
return rating_path, item_path, user_path


def download_movielens(size, dest_path):
Expand All @@ -557,7 +726,7 @@ def download_movielens(size, dest_path):
maybe_download(url, file, work_directory=dirs)


def extract_movielens(size, rating_path, item_path, zip_path):
def extract_movielens(size, rating_path, item_path, user_path, zip_path):
"""Extract MovieLens rating and item datafiles from the MovieLens raw zip file.

To extract all files instead of just rating and item datafiles,
Expand All @@ -567,13 +736,17 @@ def extract_movielens(size, rating_path, item_path, zip_path):
size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
rating_path (str): Destination path for rating datafile
item_path (str): Destination path for item datafile
user_path (str): Destination path for user datafile
zip_path (str): zipfile path
"""
with ZipFile(zip_path, "r") as z:
with z.open(DATA_FORMAT[size].path) as zf, open(rating_path, "wb") as f:
shutil.copyfileobj(zf, f)
with z.open(DATA_FORMAT[size].item_path) as zf, open(item_path, "wb") as f:
shutil.copyfileobj(zf, f)
if DATA_FORMAT[size].user_path is not None:
with z.open(DATA_FORMAT[size].user_path) as zf, open(user_path, "wb") as f:
shutil.copyfileobj(zf, f)


# For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html
Expand Down
Loading
Loading