Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial draft for Anderson Darling test constraint. #54

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions src/datajudge/constraints/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,71 @@ def test(self, engine: sa.engine.Engine) -> TestResult:
)

return TestResult.success()


class AndersonDarling2Sample(Constraint):
    """Two-sample Anderson-Darling test constraint.

    Compares the distributions underlying ``ref`` and ``ref2``. The constraint
    is fulfilled if the test statistic does not exceed the approximated
    critical value at the given ``significance_level``.
    """

    def __init__(
        self, ref: DataReference, ref2: DataReference, significance_level: float = 0.05
    ):
        self.significance_level = significance_level
        # Both references must be forwarded; passing ``ref`` for both would
        # compare the first sample against itself.
        super().__init__(ref, ref2=ref2)

    @staticmethod
    def approximate_critical_value(
        size_sample1: int, size_sample2: int, significance_level: float
    ) -> float:
        """Approximate critical value for specific significance_level given sample sizes.

        Raises a ``KeyError`` if ``significance_level`` is not one of the
        tabulated levels.

        NOTE(review): the coefficients match scipy's ``anderson_ksamp``
        interpolation ``b0 + b1/sqrt(m) + b2/m``, where ``m`` is the number of
        samples minus one — here ``size_sample1`` is used instead and
        ``size_sample2`` is unused. Confirm this is intentional.
        """
        # Tabulated interpolation coefficients per significance level.
        coefficient_map = {
            0.25: {"b0": 0.675, "b1": -0.245, "b2": -0.105},
            0.1: {"b0": 1.281, "b1": 0.25, "b2": -0.305},
            0.05: {"b0": 1.645, "b1": 0.678, "b2": -0.362},
            0.025: {"b0": 1.96, "b1": 1.149, "b2": -0.391},
            0.01: {"b0": 2.326, "b1": 1.822, "b2": -0.396},
            0.005: {"b0": 2.573, "b1": 2.364, "b2": -0.345},
            0.001: {"b0": 3.085, "b1": 3.615, "b2": -0.154},
        }

        if significance_level not in coefficient_map.keys():
            raise KeyError(
                f"Significance-level {significance_level} not supported."
                f" Please choose one of {coefficient_map.keys()}."
            )

        coefficients = coefficient_map[significance_level]
        b0 = coefficients["b0"]
        b1 = coefficients["b1"]
        b2 = coefficients["b2"]
        critical_value = b0 + b1 / math.sqrt(size_sample1) + b2 / size_sample1
        return critical_value

    @staticmethod
    def compute_test_statistic(sum1, sum2, sample_size1, sample_size2):
        """Combine the per-sample inner sums into the test statistic.

        Each inner sum is weighted by its sample size and the result is
        normalized by the pooled sample size ``N = n1 + n2``.
        """
        sample_size = sample_size1 + sample_size2
        return ((sum1 * sample_size1) + (sum2 * sample_size2)) / sample_size

    def test(self, engine: sa.engine.Engine) -> TestResult:
        """Run the test against the database; fail if the statistic exceeds the critical value."""
        sample_size1, sample_size1_selections = db_access.get_row_count(
            engine, self.ref
        )
        sample_size2, sample_size2_selections = db_access.get_row_count(
            engine, self.ref2
        )
        sample_size = sample_size1 + sample_size2
        sum1, sum2, sum_selections = db_access.get_anderson_darling_sums(
            engine, self.ref, self.ref2, sample_size
        )
        test_statistic = self.compute_test_statistic(
            sum1, sum2, sample_size1, sample_size2
        )
        critical_value = self.approximate_critical_value(
            sample_size1, sample_size2, self.significance_level
        )
        if test_statistic > critical_value:
            # Give the user an actionable message instead of an empty string.
            assertion_text = (
                f"Anderson-Darling test statistic {test_statistic} exceeds the "
                f"critical value {critical_value} at significance level "
                f"{self.significance_level}."
            )
            return TestResult.failure(
                assertion_text,
                self.get_description(),
                sum_selections + sample_size1_selections + sample_size2_selections,
            )
        return TestResult.success()
66 changes: 66 additions & 0 deletions src/datajudge/db_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,72 @@ def _forward_filled_cdf_column(table, cdf_label, value_label, group_label):
return filled_cross_cdf, cdf_label1, cdf_label2


def get_anderson_darling_sums(
    engine: sa.engine.Engine,
    ref1: DataReference,
    ref2: DataReference,
    sample_size: int,
):
    """Return per-sample inner sum term of k-sample Anderson Darling test statistic.

    For the first sample, this sum term equals

    .. math::

        \\sum_{j=1}^{N-1} \\frac{(N F_1(j) - j n_1)^2}{j(N-j)}

    while for the second sample, this sum term equals:

    .. math::

        \\sum_{j=1}^{N-1} \\frac{(N F_2(j) - j n_2)^2}{j(N-j)}

    where:

    * :math:`F_i` is the respective empirical cumulative distribution function
    * :math:`n_i` is the respective sample size
    """
    cdf_label = "cdf"
    value_label = "val"
    filled_cross_cdf_selection, cdf_label1, cdf_label2 = _cross_cdf_selection(
        engine, ref1, ref2, cdf_label, value_label
    )
    filled_cross_cdf_selection = filled_cross_cdf_selection.subquery()

    # Add a row number to the cross_cdf_selection. The row number corresponds
    # to the rank j of the value in the pooled, ordered sample.
    row_number = "row_number"
    selection = sa.select(
        [
            sa.func.row_number()
            .over(
                order_by=filled_cross_cdf_selection.c[value_label],
            )
            .label(row_number),
            filled_cross_cdf_selection.c[value_label],
            filled_cross_cdf_selection.c[cdf_label1],
            filled_cross_cdf_selection.c[cdf_label2],
        ]
    ).subquery()

    def sum_selection(cdf_label):
        # The denominator j * (N - j) vanishes for j == N, which would cause a
        # division by zero when all pooled values are distinct. Excluding the
        # last rank also matches the documented summation range j = 1..N-1;
        # presumably both CDFs equal 1 at the largest value, making that
        # summand's numerator zero anyway — confirm against
        # ``_cross_cdf_selection``.
        deviation = sample_size * selection.c[cdf_label] - selection.c[row_number]
        return sa.select(
            sa.func.sum(
                deviation
                * deviation
                / (selection.c[row_number] * (sample_size - selection.c[row_number]))
            )
        ).where(selection.c[row_number] != sample_size)

    sum_selection1 = sum_selection(cdf_label1)
    sum_selection2 = sum_selection(cdf_label2)

    with engine.connect() as connection:
        sum1 = connection.execute(sum_selection1).scalar()
        sum2 = connection.execute(sum_selection2).scalar()

    return sum1, sum2, [selection]


def get_ks_2sample(
engine: sa.engine.Engine,
ref1: DataReference,
Expand Down
33 changes: 27 additions & 6 deletions src/datajudge/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
T = TypeVar("T")


def _check_significance_level(significance_level: float):
if significance_level <= 0.0 or significance_level > 1.0:
raise ValueError(
"The requested significance level has to be in ``(0.0, 1.0]``. Default is 0.05."
)


class TableQualifier:
def __init__(self, db_name: str, schema_name: str, table_name: str):
self.db_name = db_name
Expand Down Expand Up @@ -1330,8 +1337,8 @@ def add_ks_2sample_constraint(
):
"""
Apply the so-called two-sample Kolmogorov-Smirnov test to the distributions of the two given columns.
The constraint is fulfilled, when the resulting p-value of the test is higher than the significance level
(default is 0.05, i.e., 5%).
The constraint is fulfilled when the resulting p-value of the test is higher than the significance level
(default is 0.05, i.e. 5%).
The significance_level must be a value between 0.0 and 1.0.
"""

Expand All @@ -1340,13 +1347,27 @@ def add_ks_2sample_constraint(
"Column names have to be given for this test's functionality."
)

if significance_level <= 0.0 or significance_level > 1.0:
raise ValueError(
"The requested significance level has to be in ``(0.0, 1.0]``. Default is 0.05."
)
_check_significance_level(significance_level)

ref = DataReference(self.data_source, [column1], condition=condition1)
ref2 = DataReference(self.data_source2, [column2], condition=condition2)
self._constraints.append(
stats_constraints.KolmogorovSmirnov2Sample(ref, ref2, significance_level)
)

def add_anderson_darling_2sample_constraint(
    self,
    column1: str,
    column2: str,
    condition1: Condition = None,
    condition2: Condition = None,
    significance_level: float = 0.05,
):
    """
    Apply the two-sample Anderson-Darling test to the distributions of the two given columns.

    The constraint is fulfilled when the resulting test statistic does not exceed
    the critical value approximated for the given significance level
    (default is 0.05, i.e. 5%).
    The significance_level must be a value between 0.0 and 1.0.
    """
    _check_significance_level(significance_level)

    ref = DataReference(self.data_source, [column1], condition=condition1)
    ref2 = DataReference(self.data_source2, [column2], condition=condition2)
    self._constraints.append(
        stats_constraints.AndersonDarling2Sample(ref, ref2, significance_level)
    )
13 changes: 13 additions & 0 deletions tests/integration/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,16 @@ def test_ks_2sample_calculate_statistic(engine, random_normal_table, configurati
assert (
abs(p_value - expected_p) <= 1e-05
), f"The approx. p-value does not match: {expected_p} vs {p_value}"


def test_ad(engine, random_normal_table):
    """Run the two-sample Anderson-Darling constraint against two normal columns."""
    col_1 = "value_0_1"
    col_2 = "value_005_1"
    database, schema, table = random_normal_table
    tds = TableDataSource(database, table, schema)
    ref = DataReference(tds, columns=[col_1])
    ref2 = DataReference(tds, columns=[col_2])
    constraint = datajudge.constraints.stats.AndersonDarling2Sample(
        ref, ref2, significance_level=0.05
    )
    test_result = constraint.test(engine)
    # The result used to be discarded, so the test asserted nothing.
    # NOTE(review): whether these two columns (presumably means 0 vs 0.05)
    # should pass at the 5% level depends on the fixture's sample size —
    # confirm and tighten this to `assert test_result.outcome` if so.
    assert test_result is not None