Fix pandas large ints (#454)

* Fix pandas inserts into big int columns
* Tinkering with pandas insert data types
ClickHouse · Jan 14, 2025 · 697794e · 697794e
1 parent 8c94d92
commit 697794e
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 9 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,11 @@ release (0.9.0), unrecognized arguments/keywords for these methods of creating a
 instead of being passed as ClickHouse server settings. This is in conjunction with some refactoring in Client construction.
 The supported method of passing ClickHouse server settings is to prefix such arguments/query parameters with`ch_`.
 
+## 0.8.14, 2025-01-13
+### Bug Fix
+- Fix an edge case where a Pandas DataFrame column that contains _only_ Int64 (or smaller) values would cause an exception when
+inserting into a ClickHouse "big int" column (Int128, UInt128, Int256, or UInt256).  Closes https://github.com/ClickHouse/clickhouse-connect/issues/452
+
 ## 0.8.13, 2025-01-07
 ### Bug Fix
 - Fix missing default for new access_token parameter.  Thanks to [Lukas Thaler](https://github.com/lukasthalerINNIO) for the PR.

diff --git a/clickhouse_connect/__version__.py b/clickhouse_connect/__version__.py
@@ -1 +1 @@
-version = '0.8.13'
+version = '0.8.14'
diff --git a/clickhouse_connect/datatypes/numeric.py b/clickhouse_connect/datatypes/numeric.py
@@ -98,6 +98,7 @@ def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
 class BigInt(ClickHouseType, registered=False):
     _signed = True
     valid_formats = 'string', 'native'
+    python_type = int
 
     def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any):
         signed = self._signed

diff --git a/clickhouse_connect/driver/insert.py b/clickhouse_connect/driver/insert.py
@@ -148,19 +148,20 @@ def _convert_pandas(self, df):
         data = []
         for df_col_name, col_name, ch_type in zip(df.columns, self.column_names, self.column_types):
             df_col = df[df_col_name]
-            d_type = str(df_col.dtype)
+            d_type_kind = df_col.dtype.kind
             if ch_type.python_type == int:
-                if 'float' in d_type:
+                if d_type_kind == 'f':
                     df_col = df_col.round().astype(ch_type.base_type, copy=False)
-                else:
-                    df_col = df_col.astype(ch_type.base_type, copy=False)
-            elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in d_type):
+                elif d_type_kind in ('i', 'u') and not df_col.hasnans:
+                    data.append(df_col.to_list())
+                    continue
+            elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in str(df_col.dtype)):
                 div = ch_type.nano_divisor
                 data.append([None if pd.isnull(x) else x.value // div for x in df_col])
                 self.column_formats[col_name] = 'int'
                 continue
             if ch_type.nullable:
-                if d_type == 'object':
+                if d_type_kind == 'O':
                     #  This is ugly, but the multiple replaces seem required as a result of this bug:
                     #  https://github.com/pandas-dev/pandas/issues/29024
                     df_col = df_col.replace({pd.NaT: None}).replace({np.nan: None})

diff --git a/tests/integration_tests/test_pandas.py b/tests/integration_tests/test_pandas.py
@@ -131,13 +131,14 @@ def test_pandas_low_card(test_client: Client, table_context: Callable):
 
 
 def test_pandas_large_types(test_client: Client, table_context: Callable):
-    columns = ['key String', 'value Int256']
+    columns = ['key String', 'value Int256', 'u_value UInt256'
+               ]
     key2_value = 30000000000000000000000000000000000
     if not test_client.min_version('21'):
         columns = ['key String', 'value Int64']
         key2_value = 3000000000000000000
     with table_context('test_pandas_big_int', columns):
-        df = pd.DataFrame([['key1', 2000], ['key2', key2_value]], columns=['key', 'value'])
+        df = pd.DataFrame([['key1', 2000, 50], ['key2', key2_value, 70], ['key3', -2350, 70]], columns=['key', 'value', 'u_value'])
         source_df = df.copy()
         test_client.insert_df('test_pandas_big_int', df)
         result_df = test_client.query_df('SELECT * FROM test_pandas_big_int')