Skip to content
This repository has been archived by the owner on May 17, 2024. It is now read-only.

Commit

Permalink
out of range temporal cols as UnknownColType
Browse files Browse the repository at this point in the history
issue 786 #786
  • Loading branch information
dlawin committed Nov 29, 2023
1 parent 283bbac commit 1bcc12c
Showing 1 changed file with 31 additions and 1 deletion.
32 changes: 31 additions & 1 deletion data_diff/databases/redshift.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import ClassVar, List, Dict, Type
import logging
from typing import ClassVar, List, Dict, Optional, Type

import attrs

Expand All @@ -10,6 +11,7 @@
FractionalType,
DbPath,
TimestampTZ,
UnknownColType,
)
from data_diff.databases.postgresql import (
BaseDialect,
Expand All @@ -21,6 +23,8 @@
PostgresqlDialect,
)

# Module-level logger for this file, registered under the fixed name "redshift".
logger = logging.getLogger("redshift")


@attrs.define(frozen=False)
class Dialect(PostgresqlDialect):
Expand Down Expand Up @@ -183,3 +187,29 @@ def _normalize_table_path(self, path: DbPath) -> DbPath:
raise ValueError(
f"{self.name}: Bad table path for {self}: '{'.'.join(path)}'. Expected format: table, schema.table, or database.schema.table"
)

def _refine_coltypes(
    self, table_path: DbPath, col_dict: Dict[str, ColType], where: Optional[str] = None, sample_size=64
):
    """Refine column types, demoting temporal columns that hold out-of-range values.

    Runs the parent refinement first, then probes every column whose type is a
    ``TemporalType``. Values in a Redshift timestamp column can technically be
    "out of range"; treating such a column as a valid timestamp causes issues,
    so a column whose probe fails is replaced in ``col_dict`` by an
    ``UnknownColType``.

    Args:
        table_path: Path components of the table; joined with "." in the probe query.
        col_dict: Mapping of column name to column type. Mutated in place.
        where: Forwarded to the parent implementation only.
        sample_size: Forwarded to the parent implementation only.

    Returns:
        None. Results are communicated by mutating ``col_dict``.
    """
    super()._refine_coltypes(table_path, col_dict, where, sample_size)

    # Snapshot the names up front: col_dict is reassigned inside the loop, and a
    # list (unlike a set) keeps the probe order deterministic.
    temporal_columns = [name for name, col_type in col_dict.items() if isinstance(col_type, TemporalType)]

    # Loop-invariant, so build it once.
    # NOTE(review): identifiers are interpolated unquoted and `where` is not applied
    # to the probe, so it checks the whole table -- confirm both are intended.
    table_name = ".".join(table_path)

    for column_name in temporal_columns:
        try:
            # Only min and max are checked; the trailing TRUE enables strict mode.
            self.query(
                f"select to_timestamp(max({column_name}),'YYYY-mm-dd HH24:MI:SS.US', TRUE), "
                f"to_timestamp(min({column_name}),'YYYY-mm-dd HH24:MI:SS.US', TRUE) from {table_name}"
            )
        except Exception:
            # Presumably the failed statement leaves the transaction aborted, hence
            # the explicit rollback before any further query -- TODO confirm.
            self.query("rollback")
            logger.warning("Temporal column %s has out of range values, falling back to text.", column_name)
            # Any failure lands here, not only range errors; keep the real cause
            # available for troubleshooting instead of discarding it.
            logger.debug("Range check failed for temporal column %s", column_name, exc_info=True)
            col_dict[column_name] = UnknownColType("Temporal type contains out of range values.")

0 comments on commit 1bcc12c

Please sign in to comment.