diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py
index dcf061c43..415ac9982 100644
--- a/data_diff/databases/redshift.py
+++ b/data_diff/databases/redshift.py
@@ -1,4 +1,5 @@
-from typing import ClassVar, List, Dict, Type
+import logging
+from typing import ClassVar, List, Dict, Optional, Type
 
 import attrs
 
@@ -10,6 +11,7 @@
     FractionalType,
     DbPath,
     TimestampTZ,
+    UnknownColType,
 )
 from data_diff.databases.postgresql import (
     BaseDialect,
@@ -21,6 +23,8 @@
     PostgresqlDialect,
 )
 
+logger = logging.getLogger("redshift")
+
 
 @attrs.define(frozen=False)
 class Dialect(PostgresqlDialect):
@@ -183,3 +187,51 @@ def _normalize_table_path(self, path: DbPath) -> DbPath:
         raise ValueError(
             f"{self.name}: Bad table path for {self}: '{'.'.join(path)}'. Expected format: table, schema.table, or database.schema.table"
         )
+
+    def _refine_coltypes(
+        self, table_path: DbPath, col_dict: Dict[str, ColType], where: Optional[str] = None, sample_size: int = 64
+    ):
+        """Refine column types, demoting temporal columns that hold out-of-range values.
+
+        The parent refinement runs first. Every column that is still a ``TemporalType``
+        afterwards is probed by converting its min and max with a strict ``to_timestamp``.
+        If the probe query raises, the column's entry in ``col_dict`` is replaced with an
+        ``UnknownColType`` so it is no longer treated as a ``TemporalType``.
+
+        Args:
+            table_path: Table path components; joined with "." in the probe query.
+            col_dict: Mapping of column name to column type. Mutated in place.
+            where: Row filter, forwarded to the parent refinement only.
+            sample_size: Forwarded to the parent refinement.
+
+        Returns:
+            None. Results are reported by mutating ``col_dict``.
+        """
+        super()._refine_coltypes(table_path, col_dict, where, sample_size)
+
+        # A list (rather than a set) keeps probes and warnings in col_dict's iteration order.
+        temporal_columns = [name for name, col_type in col_dict.items() if isinstance(col_type, TemporalType)]
+        if not temporal_columns:
+            return
+
+        # NOTE(review): table and column names are interpolated unquoted, and `where` is not
+        # applied to the probe. Any probe failure, not only a range error, demotes the column.
+        qualified_table = ".".join(table_path)
+
+        # Values in a Redshift timestamp column can technically be "out of range". Treating
+        # them as valid timestamps causes issues, so such a column must not stay a TemporalType.
+        for column_name in temporal_columns:
+            # The trailing TRUE asks to_timestamp for strict mode. Only the column's min and
+            # max are probed.
+            probe = (
+                f"select to_timestamp(max({column_name}),'YYYY-mm-dd HH24:MI:SS.US', TRUE), "
+                f"to_timestamp(min({column_name}),'YYYY-mm-dd HH24:MI:SS.US', TRUE) from {qualified_table}"
+            )
+            try:
+                self.query(probe)
+            except Exception:
+                # Roll back after the failed probe before issuing any further queries.
+                self.query("rollback")
+                logger.warning("Temporal column %s has out of range values, falling back to text.", column_name)
+                logger.debug("Range probe failed for temporal column %s", column_name, exc_info=True)
+                col_dict[column_name] = UnknownColType("Temporal type contains out of range values.")