Skip to content

Commit

Permalink
Fix pandas large ints (#454)
Browse files Browse the repository at this point in the history
* Fix pandas inserts into big int columns

* Tinkering with pandas insert data types
  • Loading branch information
genzgd authored Jan 14, 2025
1 parent 8c94d92 commit 697794e
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 9 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ release (0.9.0), unrecognized arguments/keywords for these methods of creating a
instead of being passed as ClickHouse server settings. This is in conjunction with some refactoring in Client construction.
The supported method of passing ClickHouse server settings is to prefix such arguments/query parameters with `ch_`.

## 0.8.14, 2025-01-13
### Bug Fix
- Fix an edge case where a Pandas DataFrame that contains _only_ Int64 (or smaller) values would cause an exception when
inserting into a ClickHouse "big int" column (Int128, UInt128, Int256, or UInt256). Closes https://github.com/ClickHouse/clickhouse-connect/issues/452

## 0.8.13, 2025-01-07
### Bug Fix
- Fix missing default for new access_token parameter. Thanks to [Lukas Thaler](https://github.com/lukasthalerINNIO) for the PR.
Expand Down
2 changes: 1 addition & 1 deletion clickhouse_connect/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = '0.8.13'
version = '0.8.14'
1 change: 1 addition & 0 deletions clickhouse_connect/datatypes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def _finalize_column(self, column: Sequence, ctx: QueryContext) -> Sequence:
class BigInt(ClickHouseType, registered=False):
_signed = True
valid_formats = 'string', 'native'
python_type = int

def _read_column_binary(self, source: ByteSource, num_rows: int, ctx: QueryContext, _read_state: Any):
signed = self._signed
Expand Down
13 changes: 7 additions & 6 deletions clickhouse_connect/driver/insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,20 @@ def _convert_pandas(self, df):
data = []
for df_col_name, col_name, ch_type in zip(df.columns, self.column_names, self.column_types):
df_col = df[df_col_name]
d_type = str(df_col.dtype)
d_type_kind = df_col.dtype.kind
if ch_type.python_type == int:
if 'float' in d_type:
if d_type_kind == 'f':
df_col = df_col.round().astype(ch_type.base_type, copy=False)
else:
df_col = df_col.astype(ch_type.base_type, copy=False)
elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in d_type):
elif d_type_kind in ('i', 'u') and not df_col.hasnans:
data.append(df_col.to_list())
continue
elif 'datetime' in ch_type.np_type and (pd_time_test(df_col) or 'datetime64[ns' in str(df_col.dtype)):
div = ch_type.nano_divisor
data.append([None if pd.isnull(x) else x.value // div for x in df_col])
self.column_formats[col_name] = 'int'
continue
if ch_type.nullable:
if d_type == 'object':
if d_type_kind == 'O':
# This is ugly, but the multiple replaces seem required as a result of this bug:
# https://github.com/pandas-dev/pandas/issues/29024
df_col = df_col.replace({pd.NaT: None}).replace({np.nan: None})
Expand Down
5 changes: 3 additions & 2 deletions tests/integration_tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ def test_pandas_low_card(test_client: Client, table_context: Callable):


def test_pandas_large_types(test_client: Client, table_context: Callable):
columns = ['key String', 'value Int256']
columns = ['key String', 'value Int256', 'u_value UInt256'
]
key2_value = 30000000000000000000000000000000000
if not test_client.min_version('21'):
columns = ['key String', 'value Int64']
key2_value = 3000000000000000000
with table_context('test_pandas_big_int', columns):
df = pd.DataFrame([['key1', 2000], ['key2', key2_value]], columns=['key', 'value'])
df = pd.DataFrame([['key1', 2000, 50], ['key2', key2_value, 70], ['key3', -2350, 70]], columns=['key', 'value', 'u_value'])
source_df = df.copy()
test_client.insert_df('test_pandas_big_int', df)
result_df = test_client.query_df('SELECT * FROM test_pandas_big_int')
Expand Down

0 comments on commit 697794e

Please sign in to comment.