Add Airbnb Reviews Data (#125)

* Add Airbnb Reviews Data * Update changelog.md * Lint * Subsample listing_id from data * Update Airbnb Reviews with Multiple Cities * Downsample further and update readme * Linting * Rounded up ratings
trane-dev · Aug 2, 2023 · 03b2d9c · 03b2d9c
1 parent c8a8893
commit 03b2d9c
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 18 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -7,6 +7,7 @@ v0.6.0 (, 2023)
 ===============
 * Enhancements
     * Add pyarrow dependency and use pyarrow backed dtypes [#120][#120]
+    * Add Airbnb Reviews dataset [#125][#125]
 * Fixes
     * Rename `_execute_operations_on_df` to `target` in executed prediction problem dataframe [#124][#124]
     * Clean up operation description generation [#118][#118]
@@ -16,6 +17,7 @@ v0.6.0 (, 2023)
     [#124]: <https://github.com/trane-dev/Trane/pull/124>
     [#118]: <https://github.com/trane-dev/Trane/pull/118>
     [#120]: <https://github.com/trane-dev/Trane/pull/120>
+    [#125]: <https://github.com/trane-dev/Trane/pull/125>
 
 
 v0.5.0 (July 27, 2023)

diff --git a/tests/test_load_functions.py b/tests/test_load_functions.py
@@ -1,4 +1,5 @@
 from trane.datasets.load_functions import (
+    load_airbnb_reviews,
     load_covid,
     load_covid_metadata,
     load_youtube,
@@ -58,6 +59,15 @@ def test_load_youtube():
     assert df["category_id"].dtype == "category"
 
 
+def test_load_airbnb_reviews():
+    df = load_airbnb_reviews()
+
+    assert df["date"].dtype == "datetime64[ns]"
+    assert df["listing_id"].dtype == "int64[pyarrow]"
+    assert df["id"].dtype == "int64[pyarrow]"
+    assert df["rating"].dtype == "int64[pyarrow]"
+
+
 def check_column_schema(columns, df, metadata):
     for col in columns:
         assert col in df.columns

diff --git a/trane/datasets/__init__.py b/trane/datasets/__init__.py
@@ -3,4 +3,5 @@
     load_covid_metadata,
     load_youtube,
     load_youtube_metadata,
+    load_airbnb_reviews,
 )
diff --git a/trane/datasets/data/airbnb_reviews/README.md b/trane/datasets/data/airbnb_reviews/README.md
@@ -0,0 +1,26 @@
+## Modified Airbnb Reviews Demo Dataset Attribution
+
+The demo dataset used in this package has been modified from its original version, which is licensed under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).
+
+**Original Dataset Information:**
+
+- Titles:
+    - New York City, New York, United States - Detailed Review Data (05 June, 2023)
+    - London, England, United Kingdom - Detailed Review Data (05 June, 2023)
+    - Paris, Île-de-France, France - Detailed Review Data (05 June, 2023)
+    - San Francisco, California, United States - Detailed Review Data (05 June, 2023)
+- Author(s): Inside Airbnb
+- License: [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
+- Source: [Inside Airbnb Data](http://insideairbnb.com/get-the-data)
+
+**Modifications:**
+
+In this package, we have made the following modifications to the original dataset:
+
+- Combined data from New York City, London, Paris, and San Francisco.
+- Applied sentiment analysis to the comments column to derive rating scores for each entry.
+- Subsampled the data by listing_id to allow for quicker analysis.
+
+Please note that while we have made modifications to the dataset, the original data is still covered under the CC BY 4.0 license. The modifications performed in this package do not change the license of the original dataset.
+
+The dataset is provided as-is, and the authors do not bear any responsibility for the usage or accuracy of the data or the sentiment analysis results.
diff --git a/trane/datasets/data/airbnb_reviews/airbnb_reviews.csv.bz2 b/trane/datasets/data/airbnb_reviews/airbnb_reviews.csv.bz2
diff --git a/trane/datasets/load_functions.py b/trane/datasets/load_functions.py
@@ -41,24 +41,6 @@ def load_covid():
     return df
 
 
-def load_youtube():
-    time_col = "trending_date"
-    filepath = generate_local_filepath("USvideos.csv")
-    df = pd.read_csv(
-        filepath,
-        dtype_backend="pyarrow",
-    )
-    df[time_col] = pd.to_datetime(df[time_col], format="%y.%d.%m")
-    df = df.astype(
-        {
-            "channel_title": "category",
-            "category_id": "category",
-        },
-    )
-    df = df.sort_values(by=[time_col])
-    return df
-
-
 def load_covid_metadata():
     table_meta = {
         "Province/State": ColumnSchema(
@@ -79,6 +61,24 @@ def load_covid_metadata():
     return table_meta
 
 
+def load_youtube():
+    time_col = "trending_date"
+    filepath = generate_local_filepath("USvideos.csv")
+    df = pd.read_csv(
+        filepath,
+        dtype_backend="pyarrow",
+    )
+    df[time_col] = pd.to_datetime(df[time_col], format="%y.%d.%m")
+    df = df.astype(
+        {
+            "channel_title": "category",
+            "category_id": "category",
+        },
+    )
+    df = df.sort_values(by=[time_col])
+    return df
+
+
 def load_youtube_metadata():
     table_meta = {
         "trending_date": ColumnSchema(logical_type=Datetime),
@@ -98,6 +98,17 @@ def load_youtube_metadata():
     return table_meta
 
 
+def load_airbnb_reviews():
+    time_col = "date"
+    filepath = generate_local_filepath("data/airbnb_reviews/airbnb_reviews.csv.bz2")
+    df = pd.read_csv(filepath, dtype_backend="pyarrow")
+    df = df.dropna()
+    df[time_col] = pd.to_datetime(df[time_col], format="%Y-%m-%d")
+    df = df.sort_values(by=["date"])
+
+    return df
+
+
 def generate_local_filepath(key):
     dir_path = os.path.dirname(os.path.realpath(__file__))
     return os.path.join(dir_path, key)