Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added col stat helpers #10

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions levi/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import re
from typing import List

from deltalake import DeltaTable


Expand Down Expand Up @@ -43,6 +45,28 @@ def latest_version(delta_table: DeltaTable):
return delta_table.version()


def get_all_columns_with_statistics(delta_table: DeltaTable) -> List[str]:
df = delta_table.get_add_actions(flatten=True).to_pandas()

columns_with_statistics = set()
for col in df:
if col.startswith("null_count."):
columns_with_statistics.add(col[11:])

return list(columns_with_statistics)


def get_files_without_statistics_for_column(delta_table: DeltaTable, col: str) -> List[str]:
df = delta_table.get_add_actions(flatten=True).to_pandas()

files_without_statistics = []
for row in df.iterrows():
if f"null_count.{col}" not in row[1]:
files_without_statistics.append(row[1]["path"])

return files_without_statistics


def delta_file_sizes(delta_table: DeltaTable, boundaries=None):
if boundaries is None:
boundaries = ["<1mb", "1mb-500mb", "500mb-1gb", "1gb-2gb", ">2gb"]
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"7af43bcf-3822-4650-8601-f5c11452841f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"col1\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col3\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.dataSkippingNumIndexedCols":"3"},"createdTime":1678625165713}}
{"add":{"path":"part-00000-afe4717a-0bf0-4076-a5e8-564f849ba901-c000.snappy.parquet","partitionValues":{},"size":923,"modificationTime":1678625168936,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"col1\":1,\"col2\":\"a\",\"col3\":\"b\"},\"maxValues\":{\"col1\":3,\"col2\":\"c\",\"col3\":\"c\"},\"nullCount\":{\"col1\":0,\"col2\":0,\"col3\":1}}"}}
{"commitInfo":{"timestamp":1678625169127,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"923"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"966af096-fe00-4e0d-900e-cce53ac9fe42"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"metaData":{"id":"7af43bcf-3822-4650-8601-f5c11452841f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"col1\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col3\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.dataSkippingNumIndexedCols":"1"},"createdTime":1678625165713}}
{"commitInfo":{"timestamp":1678625203902,"operation":"SET TBLPROPERTIES","operationParameters":{"properties":"{\"delta.dataSkippingNumIndexedCols\":\"1\"}"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"9f432d11-eeed-4f95-8f19-90c49f4685e7"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"add":{"path":"part-00000-1344c9ae-d450-49ca-aea9-ef32ad16fdf2-c000.snappy.parquet","partitionValues":{},"size":923,"modificationTime":1678625206973,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"col1\":1},\"maxValues\":{\"col1\":3},\"nullCount\":{\"col1\":0}}"}}
{"commitInfo":{"timestamp":1678625207032,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"923"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"70d649cb-e31b-4601-aecd-8473bc30bd4e"}}
Binary file not shown.
Binary file not shown.
21 changes: 18 additions & 3 deletions tests/test_public_interface.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

import levi
from deltalake import DeltaTable

Expand All @@ -19,21 +21,34 @@ def test_filters_to_sql():
assert levi.filter_to_sql(("a_float", "=", 4.5)) == "(`min.a_float` <= 4.5 and `max.a_float` >= 4.5)"



def test_filter_to_sql():
assert levi.filter_to_sql(("a_float", "=", 4.5)) == "(`min.a_float` <= 4.5 and `max.a_float` >= 4.5)"
assert levi.filter_to_sql(("a_float", ">", 3)) == "(`min.a_float` < 3)"




def test_delta_file_sizes():
dt = DeltaTable("./tests/reader_tests/generated/basic_append/delta")
res = levi.delta_file_sizes(dt, ["<300b", "300b-1kb", "1kb-100kb", ">100kb"])
expected = {'num_files_<300b': 0, 'num_files_300b-1kb': 2, 'num_files_1kb-100kb': 0, 'num_files_>100kb': 0}
assert res == expected


@pytest.mark.skip(reason="There is a bug in the .get_actions implementation of delta-rs")
def test_get_all_columns_with_statistics():
dt = DeltaTable("./tests/reader_tests/generated/with_changing_number_of_stat_cols/delta")
columns = levi.get_all_columns_with_statistics(dt)
assert columns == ['col1', 'col2', 'col3']


@pytest.mark.skip(reason="There is a bug in the .get_actions implementation of delta-rs")
def test_get_files_without_statistics_for_column():
dt = DeltaTable("./tests/reader_tests/generated/with_changing_number_of_stat_cols/delta")
res_col1 = levi.get_files_without_statistics_for_column(dt, 'col1')
res_col2 = levi.get_files_without_statistics_for_column(dt, 'col2')
assert res_col1 == []
assert res_col2 == ['part-00000-1344c9ae-d450-49ca-aea9-ef32ad16fdf2-c000.snappy.parquet']


def test_latest_version():
dt = DeltaTable("./tests/reader_tests/generated/multi_partitioned/delta")
res = levi.latest_version(dt)
Expand Down