Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
gsheni committed Jul 14, 2023
1 parent 137b872 commit 8c23992
Show file tree
Hide file tree
Showing 6 changed files with 790 additions and 31 deletions.
85 changes: 65 additions & 20 deletions tests/test_data_parser.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,84 @@
import pandas as pd
import pytest

from trane.utils.data_parser import denormalize


def test_denormalize_simple():
users_df = pd.DataFrame(
@pytest.fixture()
def products_df():
    """Return a three-row products table with a unique ``id`` and a ``price``."""
    # Only ``id`` and ``price`` belong here: the tests in this file assert on
    # exactly these product columns after denormalizing.
    return pd.DataFrame(
        {
            "id": [1, 2, 3],
            "price": [10, 20, 30],
        },
    )
orders_df = pd.DataFrame(


@pytest.fixture()
def logs_df():
    """Return a five-row log table.

    Every row has a unique ``id`` plus a ``product_id`` and a ``session_id``
    column.
    """
    columns = {
        "id": [1, 2, 3, 4, 5],
        "product_id": [1, 2, 3, 1, 2],
        "session_id": [1, 1, 2, 2, 2],
    }
    return pd.DataFrame(columns)


@pytest.fixture()
def sessions_df():
    """Return a two-row sessions table whose only column is a unique ``id``."""
    # A single column keeps the frame well-formed: mixing it with longer
    # columns would make pandas reject the dict for unequal lengths.
    return pd.DataFrame(
        {
            "id": [1, 2],
        },
    )
denormalized = denormalize(


def test_denormalize_simple(products_df, logs_df):
    """Denormalize one one-to-many relationship (products -> log)."""
    # Only the products relationship is exercised here, so drop the session key.
    logs_df = logs_df.drop(columns=["session_id"])
    assert products_df["id"].is_unique
    assert logs_df["id"].is_unique
    relationships = [
        # one to many relationship
        ("products", "id", "log", "product_id"),
    ]
    flattened = denormalize(
        dataframes={
            "products": products_df,
            "log": logs_df,
        },
        relationships=relationships,
    )
    assert flattened.shape == (5, 3)
    assert flattened["id"].is_unique
    # list.sort() sorts in place and returns None, so comparing two .sort()
    # calls is always True; compare a sorted copy against the expected names.
    assert sorted(flattened.columns.tolist()) == ["id", "price", "product_id"]
    for price in flattened["price"].tolist():
        assert price in [10, 20, 30]


def test_denormalize_three_tables(products_df, logs_df, sessions_df):
    """Denormalize two one-to-many relationships that both point at log."""
    assert sessions_df["id"].is_unique
    relationships = [
        # one to many relationship
        ("products", "id", "log", "product_id"),
        ("sessions", "id", "log", "session_id"),
    ]
    flattened = denormalize(
        dataframes={
            "products": products_df,
            "log": logs_df,
            "sessions": sessions_df,
        },
        relationships=relationships,
    )
    assert flattened.shape == (5, 4)
    assert flattened["id"].is_unique
    # list.sort() sorts in place and returns None, so comparing two .sort()
    # calls is always True; compare a sorted copy against the expected names.
    assert sorted(flattened.columns.tolist()) == [
        "id",
        "price",
        "product_id",
        "session_id",
    ]
    for price in flattened["price"].tolist():
        assert price in [10, 20, 30]


# def test_denormalize_complex():
Expand Down
Empty file added tests/test_mock_dataset.py
Empty file.
6 changes: 4 additions & 2 deletions trane/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from trane.ops.filter_ops import FilterOpBase
from trane.typing.column_schema import ColumnSchema
from trane.typing.logical_types import (
ALL_LOGICAL_TYPES,
Boolean,
Categorical,
Datetime,
Double,
Integer,
LogicalType,
)

TYPE_MAPPING = {
Expand All @@ -34,7 +34,9 @@ def clean_date(date):


def _parse_table_meta(table_meta):
str_to_logical_type = {ltype.__name__.lower(): ltype for ltype in ALL_LOGICAL_TYPES}
str_to_logical_type = {
ltype.__name__.lower(): ltype for ltype in LogicalType.__subclasses__()
}
parsed_schema = {}
for col, schema in table_meta.items():
if isinstance(schema, str):
Expand Down
Loading

0 comments on commit 8c23992

Please sign in to comment.