feat: ignore nulls for container types (#2636)
* propagate the ignore_null flag for recursive calls to pyarrow2athena

* update tests

* lint

---------

Co-authored-by: Leon Luttenberger <[email protected]>
raaidarshad and LeonLuttenberger authored Jan 24, 2024
1 parent 2f3ca80 commit 860431f
Showing 2 changed files with 8 additions and 6 deletions.
10 changes: 6 additions & 4 deletions awswrangler/_data_types.py
@@ -49,15 +49,17 @@ def pyarrow2athena(  # noqa: PLR0911,PLR0912
     if pa.types.is_binary(dtype):
         return "binary"
     if pa.types.is_dictionary(dtype):
-        return pyarrow2athena(dtype=dtype.value_type)
+        return pyarrow2athena(dtype=dtype.value_type, ignore_null=ignore_null)
     if pa.types.is_decimal(dtype):
         return f"decimal({dtype.precision},{dtype.scale})"
     if pa.types.is_list(dtype):
-        return f"array<{pyarrow2athena(dtype=dtype.value_type)}>"
+        return f"array<{pyarrow2athena(dtype=dtype.value_type, ignore_null=ignore_null)}>"
     if pa.types.is_struct(dtype):
-        return f"struct<{','.join([f'{f.name}:{pyarrow2athena(dtype=f.type)}' for f in dtype])}>"
+        return (
+            f"struct<{','.join([f'{f.name}:{pyarrow2athena(dtype=f.type, ignore_null=ignore_null)}' for f in dtype])}>"
+        )
     if pa.types.is_map(dtype):
-        return f"map<{pyarrow2athena(dtype=dtype.key_type)},{pyarrow2athena(dtype=dtype.item_type)}>"
+        return f"map<{pyarrow2athena(dtype=dtype.key_type, ignore_null=ignore_null)},{pyarrow2athena(dtype=dtype.item_type, ignore_null=ignore_null)}>"
     if dtype == pa.null():
         if ignore_null:
             return ""
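For context, here is a minimal, self-contained sketch of the pattern this commit fixes: the ignore_null flag has to be forwarded on every recursive call, otherwise nested null types inside array, struct, and map columns are evaluated with the default ignore_null=False. This is an illustrative simplification, not the awswrangler implementation; the reduced type coverage, the helper name, and the exception text are assumptions.

# Illustrative sketch only (not the awswrangler code): type coverage is
# trimmed down and the error messages are assumptions.
import pyarrow as pa


def pyarrow2athena_sketch(dtype: pa.DataType, ignore_null: bool = False) -> str:
    if pa.types.is_int64(dtype):
        return "bigint"
    if pa.types.is_string(dtype):
        return "string"
    if pa.types.is_list(dtype):
        # The fix: forward ignore_null into the recursive call so that a
        # null-typed element does not hit the "not supported" branch below.
        return f"array<{pyarrow2athena_sketch(dtype.value_type, ignore_null)}>"
    if pa.types.is_struct(dtype):
        fields = ",".join(f"{f.name}:{pyarrow2athena_sketch(f.type, ignore_null)}" for f in dtype)
        return f"struct<{fields}>"
    if dtype == pa.null():
        if ignore_null:
            return ""
        raise ValueError("Null column type is not supported")
    raise ValueError(f"Unsupported pyarrow type: {dtype}")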
4 changes: 2 additions & 2 deletions tests/unit/test_athena_parquet.py
@@ -376,14 +376,14 @@ def test_store_metadata_partitions_sample_dataset(glue_database, glue_table, pat

 def test_store_metadata_ignore_null_columns(glue_database, glue_table, path):
     df = pd.DataFrame({"c0": [0, 1, 2], "c1": [3, 4, 5], "c2_null": [None, None, None], "c3_null": [None, None, None]})
-    wr.s3.to_parquet(df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "int"})
+    wr.s3.to_parquet(df=df, path=path, dataset=True, dtype={"c2_null": "int", "c3_null": "array<int>"})
     wr.s3.store_parquet_metadata(
         path=path,
         database=glue_database,
         table=glue_table,
         ignore_null=True,
         dataset=True,
-        dtype={"c2_null": "int", "c3_null": "int"},
+        dtype={"c2_null": "int", "c3_null": "array<int>"},
     )


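Using the sketch from above, the behavioral difference the propagation makes can be seen on a nested null type; before this change, the flag was dropped on recursion, so even an ignore_null=True caller ended up on the error path for the inner null. The printed values reflect the sketch, not necessarily the exact strings produced by awswrangler.

import pyarrow as pa

nested = pa.list_(pa.null())

# With the flag forwarded, the inner null type is tolerated and simply
# yields an empty element type in the sketch.
print(pyarrow2athena_sketch(nested, ignore_null=True))  # array<>

# Without ignore_null, the inner null type is still rejected.
try:
    pyarrow2athena_sketch(nested, ignore_null=False)
except ValueError as exc:
    print(exc)  # Null column type is not supported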
