Skip to content

Commit

Permalink
feat: add tests to produce isthmus relation plans (#98)
Browse files Browse the repository at this point in the history
* feat: add tests to produce isthmus relation plans

* fix: workflow

* fix: handle schema file mismatch

* fix: bypass uri validation

* fix: override warnings for isthmus producer
  • Loading branch information
richtia authored Aug 28, 2024
1 parent e924d20 commit 01ffc03
Show file tree
Hide file tree
Showing 73 changed files with 6,706 additions and 2,512 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ jobs:

- name: Build & run
run: docker run --rm $(docker build -q --file ./ci/docker/ibis_producer.Dockerfile .)
isthmus_producer:
name: Run Isthmus producer tests
isthmus_producer_functions:
name: Run Isthmus producer function tests
runs-on: ubuntu-latest
steps:
- name: Checkout
Expand All @@ -115,4 +115,15 @@ jobs:
submodules: recursive

- name: Build & run
run: docker run --rm $(docker build -q --file ./ci/docker/isthmus_producer.Dockerfile .)
run: docker run --rm $(docker build -q --file ./ci/docker/isthmus_producer_functions.Dockerfile .)
isthmus_producer_relations:
name: Run Isthmus producer relation tests
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: recursive

- name: Build & run
run: docker run --rm $(docker build -q --file ./ci/docker/isthmus_producer_relations.Dockerfile .)
File renamed without changes.
24 changes: 24 additions & 0 deletions ci/docker/isthmus_producer_relations.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
FROM ubuntu:22.04

RUN apt-get update -y \
&& apt-get install -y software-properties-common \
&& add-apt-repository ppa:openjdk/ppa \
&& apt-get install openjdk-17-jdk -y \
&& apt-get install python3-pip -y \
&& apt-get -y install git \
&& apt-get install -y python3.10 \
&& ln -sf python3 /usr/bin/python \
&& export JAVA_HOME \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
ENV JAVA_HOME /usr/lib/jvm/java-17-openjdk-amd64
RUN export JAVA_HOME
RUN apt install -y pip
RUN pip install --upgrade pip setuptools pytest pytest-snapshot substrait pyarrow protobuf duckdb filelock datafusion==40.1.0 ibis_substrait JPype1 substrait-validator

WORKDIR /consumer-testing
COPY . .

CMD git submodule update --init \
&& ./build-and-copy-isthmus-shadow-jar.sh \
&& /usr/bin/python -mpytest -m produce_substrait_snapshot --producer=IsthmusProducer substrait_consumer/tests/functional/relations
14 changes: 13 additions & 1 deletion substrait_consumer/context.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
import substrait_validator as sv

import jpype.imports

Expand All @@ -8,7 +9,7 @@
schema_file = Path.joinpath(REPO_DIR, "substrait_consumer/data/tpch_parquet/schema.sql")


def produce_isthmus_substrait(sql_string, schema_list):
def produce_isthmus_substrait(sql_string, schema_list, validate=False):
"""
Produce the substrait plan using Isthmus.
Expand All @@ -17,6 +18,8 @@ def produce_isthmus_substrait(sql_string, schema_list):
SQL query.
schema_list:
List of schemas.
validate:
Validate the Substrait plan.
Returns:
Substrait plan in json format.
Expand All @@ -29,6 +32,13 @@ def produce_isthmus_substrait(sql_string, schema_list):
java_sql_string = jpype.java.lang.String(sql_string)
plan = sql_to_substrait.execute(java_sql_string, schema_list)
json_plan = json_formatter.printer().print_(plan)
if validate:
config = sv.Config()
config.override_diagnostic_level(1002, "warning", "info") # error
config.override_diagnostic_level(2001, "warning", "info") # warning
config.override_diagnostic_level(3005, "warning", "info") # warning
config.override_diagnostic_level(1, "warning", "info") # warning
sv.check_plan_valid(json_plan, config)
return json_plan


Expand All @@ -50,6 +60,8 @@ def get_schema(file_names):
text_schema_file = open(schema_file)
schema_string = text_schema_file.read().replace("\n", " ").split(";")[:-1]
for create_table in schema_string:
if "small" not in file_names[0]:
create_table = create_table.replace("_small", "")
java_obj = jpype.JObject @ jpype.JString(create_table)
arr.add(java_obj)
java_obj = jpype.JObject @ jpype.JString(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from substrait_consumer.producers.duckdb_producer import DuckDBProducer
from substrait_consumer.producers.datafusion_producer import DataFusionProducer
from substrait_consumer.producers.isthmus_producer import IsthmusProducer

FILTER_RELATIONS = {
"where_equal_multi_col": (
Expand All @@ -10,7 +11,7 @@
ORDER BY L_DISCOUNT
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_not_equal_multi_col": (
"""
Expand All @@ -20,7 +21,7 @@
ORDER BY L_DISCOUNT, L_TAX
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_gt_multi_col": (
"""
Expand All @@ -30,7 +31,7 @@
ORDER BY L_DISCOUNT, L_TAX
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_gte_multi_col": (
"""
Expand All @@ -40,7 +41,7 @@
ORDER BY L_DISCOUNT, L_TAX
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_lt_multi_col": (
"""
Expand All @@ -50,7 +51,7 @@
ORDER BY L_DISCOUNT, L_TAX
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_lte_multi_col": (
"""
Expand All @@ -60,7 +61,7 @@
ORDER BY L_DISCOUNT, L_TAX
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_like": (
"""
Expand All @@ -70,7 +71,7 @@
ORDER BY L_ORDERKEY
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_between": (
"""
Expand All @@ -79,31 +80,31 @@
WHERE L_ORDERKEY BETWEEN 20 AND 50
LIMIT 20;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_in": (
"""
SELECT L_ORDERKEY
FROM '{}'
WHERE L_ORDERKEY IN (1, 2, 3)
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_or": (
"""
SELECT L_ORDERKEY, L_SHIPINSTRUCT
FROM '{}'
WHERE L_ORDERKEY = 2 OR L_ORDERKEY = 3
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"where_and": (
"""
SELECT L_ORDERKEY, L_SHIPINSTRUCT
FROM '{}'
WHERE L_ORDERKEY = 2 AND L_SHIPINSTRUCT = 'TAKE BACK RETURN'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"having": (
"""
Expand All @@ -113,6 +114,6 @@
HAVING COUNT(*) > 12100
ORDER BY L_QUANTITY
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
}
Original file line number Diff line number Diff line change
@@ -1,34 +1,35 @@
from substrait_consumer.producers.duckdb_producer import DuckDBProducer
from substrait_consumer.producers.datafusion_producer import DataFusionProducer
from substrait_consumer.producers.isthmus_producer import IsthmusProducer

PROJECT_RELATIONS = {
"project_single_col": (
"""
SELECT *
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"project_multi_col": (
"""
SELECT L_DISCOUNT, L_TAX
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"project_all_col": (
"""
SELECT *
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"extended_project": (
"""
SELECT L_QUANTITY, L_EXTENDEDPRICE*10 AS MULTI_PRICE
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"subquery_in_project": (
"""
Expand All @@ -39,21 +40,21 @@
AS total_price
FROM {}
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"distinct_in_project": (
"""
SELECT DISTINCT L_LINESTATUS
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"count_distinct_in_project": (
"""
SELECT COUNT(DISTINCT L_EXTENDEDPRICE)
FROM '{}'
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from substrait_consumer.producers.duckdb_producer import DuckDBProducer
from substrait_consumer.producers.datafusion_producer import DataFusionProducer
from substrait_consumer.producers.isthmus_producer import IsthmusProducer

SORT_RELATIONS = {
"single_col_default_sort": (
Expand All @@ -9,7 +10,7 @@
ORDER BY PS_AVAILQTY
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"single_col_asc": (
"""
Expand All @@ -18,7 +19,7 @@
ORDER BY PS_SUPPLYCOST ASC
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"single_col_desc": (
"""
Expand All @@ -27,7 +28,7 @@
ORDER BY PS_SUPPLYCOST DESC
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"multi_col_asc": (
"""
Expand All @@ -36,7 +37,7 @@
ORDER BY PS_SUPPLYCOST ASC, PS_AVAILQTY
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"multi_col_desc": (
"""
Expand All @@ -45,7 +46,7 @@
ORDER BY PS_SUPPLYCOST DESC
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"multi_col_asc_desc": (
"""
Expand All @@ -54,7 +55,7 @@
ORDER BY PS_SUPPLYCOST ASC, PS_AVAILQTY DESC
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"multi_col_desc_asc": (
"""
Expand All @@ -63,7 +64,7 @@
ORDER BY PS_SUPPLYCOST DESC, PS_AVAILQTY ASC
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
"order_by_col_number": (
"""
Expand All @@ -72,6 +73,6 @@
ORDER BY 1, 2
LIMIT 10;
""",
[DuckDBProducer, DataFusionProducer],
[DuckDBProducer, DataFusionProducer, IsthmusProducer],
),
}
6 changes: 4 additions & 2 deletions substrait_consumer/producers/isthmus_producer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,20 @@ def __init__(self, db_connection=None):
def set_db_connection(self, db_connection):
self._db_connection = db_connection

def produce_substrait(self, sql_query: str, ibis_expr: str = None) -> str:
def produce_substrait(self, sql_query: str, validate=False, ibis_expr: str=None) -> str:
"""
Produce the Isthmus substrait plan using the given SQL query.
Parameters:
sql_query:
SQL query.
validate:
Validate the Substrait plan.
Returns:
Substrait query plan in json format.
"""
schema_list = get_schema(self.file_names)
substrait_plan_str = produce_isthmus_substrait(sql_query, schema_list)
substrait_plan_str = produce_isthmus_substrait(sql_query, schema_list, validate)

return substrait_plan_str

Expand Down
1 change: 0 additions & 1 deletion substrait_consumer/producers/producer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def load_tables_from_parquet(
table_names = []
for file_name, file_path in zip(file_names, parquet_file_paths):
table_name = Path(file_name).stem
table_name = table_name.translate(str.maketrans("", "", string.punctuation))
if table_name not in created_tables:
create_table_sql = f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{file_path}');"
db_connection.execute(create_table_sql)
Expand Down
Loading

0 comments on commit 01ffc03

Please sign in to comment.