From 285c2b6e567f1b7bb26dc55c6c7e528bd801c6dc Mon Sep 17 00:00:00 2001 From: Zach Liu Date: Tue, 6 Aug 2024 23:36:58 -0400 Subject: [PATCH] Add data type to athena query runner (#7112) Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- redash/query_runner/athena.py | 22 ++++++++++++++++------ tests/query_runner/test_athena.py | 21 +++++++++++++++------ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/redash/query_runner/athena.py b/redash/query_runner/athena.py index 0d70a2a0e3..8ff9b3d1af 100644 --- a/redash/query_runner/athena.py +++ b/redash/query_runner/athena.py @@ -199,10 +199,20 @@ def __get_schema_from_glue(self, catalog_id=""): logger.warning("Glue table doesn't have StorageDescriptor: %s", table_name) continue if table_name not in schema: - column = [columns["Name"] for columns in table["StorageDescriptor"]["Columns"]] - schema[table_name] = {"name": table_name, "columns": column} - for partition in table.get("PartitionKeys", []): - schema[table_name]["columns"].append(partition["Name"]) + schema[table_name] = {"name": table_name, "columns": []} + + for column_data in table["StorageDescriptor"]["Columns"]: + column = { + "name": column_data["Name"], + "type": column_data["Type"] if "Type" in column_data else None, + } + schema[table_name]["columns"].append(column) + for partition in table.get("PartitionKeys", []): + partition_column = { + "name": partition["Name"], + "type": partition["Type"] if "Type" in partition else None, + } + schema[table_name]["columns"].append(partition_column) return list(schema.values()) def get_schema(self, get_stats=False): @@ -212,7 +222,7 @@ def get_schema(self, get_stats=False): schema = {} query = """ - SELECT table_schema, table_name, column_name + SELECT table_schema, table_name, column_name, data_type FROM information_schema.columns WHERE table_schema NOT IN ('information_schema') """ @@ -225,7 +235,7 @@ def get_schema(self, get_stats=False): table_name = "{0}.{1}".format(row["table_schema"], row["table_name"]) if table_name not in schema: schema[table_name] = {"name": table_name, "columns": []} - schema[table_name]["columns"].append(row["column_name"]) + schema[table_name]["columns"].append({"name": row["column_name"], "type": row["data_type"]}) return list(schema.values()) diff --git a/tests/query_runner/test_athena.py b/tests/query_runner/test_athena.py index 6cda21c03b..6027c8c8a2 100644 --- a/tests/query_runner/test_athena.py +++ b/tests/query_runner/test_athena.py @@ -75,7 +75,9 @@ def test_external_table(self): {"DatabaseName": "test1"}, ) with self.stubber: - assert query_runner.get_schema() == [{"columns": ["row_id"], "name": "test1.jdbc_table"}] + assert query_runner.get_schema() == [ + {"columns": [{"name": "row_id", "type": "int"}], "name": "test1.jdbc_table"} + ] def test_partitioned_table(self): """ @@ -124,7 +126,12 @@ def test_partitioned_table(self): {"DatabaseName": "test1"}, ) with self.stubber: - assert query_runner.get_schema() == [{"columns": ["sk", "category"], "name": "test1.partitioned_table"}] + assert query_runner.get_schema() == [ + { + "columns": [{"name": "sk", "type": "int"}, {"name": "category", "type": "int"}], + "name": "test1.partitioned_table", + } + ] def test_view(self): query_runner = Athena({"glue": True, "region": "mars-east-1"}) @@ -156,7 +163,7 @@ def test_view(self): {"DatabaseName": "test1"}, ) with self.stubber: - assert query_runner.get_schema() == [{"columns": ["sk"], "name": "test1.view"}] + assert query_runner.get_schema() == [{"columns": [{"name": "sk", "type": "int"}], "name": "test1.view"}] def test_dodgy_table_does_not_break_schema_listing(self): """ @@ -196,7 +203,9 @@ def test_dodgy_table_does_not_break_schema_listing(self): {"DatabaseName": "test1"}, ) with self.stubber: - assert query_runner.get_schema() == [{"columns": ["region"], "name": "test1.csv"}] + assert query_runner.get_schema() == [ + {"columns": [{"name": "region", "type": "string"}], "name": "test1.csv"} + ] def test_no_storage_descriptor_table(self): """ @@ -312,6 +321,6 @@ def test_multi_catalog_tables(self): ) with self.stubber: assert query_runner.get_schema() == [ - {"columns": ["row_id"], "name": "test1.jdbc_table"}, - {"columns": ["row_id"], "name": "test2.jdbc_table"}, + {"columns": [{"name": "row_id", "type": "int"}], "name": "test1.jdbc_table"}, + {"columns": [{"name": "row_id", "type": "int"}], "name": "test2.jdbc_table"}, ]