From 9810e24542a1c37a2b76140adf32863bbdf7b2be Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 20 Dec 2024 14:35:10 -0800 Subject: [PATCH 1/9] SNOW-1805840: add method_call_count and interchange_call_count to telemetry Signed-off-by: Labanya Mukhopadhyay --- .../modin/plugin/_internal/telemetry.py | 36 ++++++++- .../compiler/snowflake_query_compiler.py | 2 + tests/integ/modin/test_telemetry.py | 79 +++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py index 3ea77ca0025..df5ae686cae 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py @@ -18,6 +18,7 @@ from snowflake.snowpark.modin.plugin._internal.utils import ( is_snowpark_pandas_dataframe_or_series_type, ) +from collections import Counter from snowflake.snowpark.query_history import QueryHistory from snowflake.snowpark.session import Session @@ -36,6 +37,8 @@ class SnowparkPandasTelemetryField(Enum): ARGS = "argument" # fallback flag IS_FALLBACK = "is_fallback" + CALL_COUNT = "call_count" + INTERCHANGE_CALL_COUNT = "interchange_call_count" # Argument truncating size after converted to str. Size amount can be later specified after analysis and needs. @@ -58,6 +61,8 @@ def _send_snowpark_pandas_telemetry_helper( func_name: str, query_history: Optional[QueryHistory], api_calls: Union[str, list[dict[str, Any]]], + method_call_count: Counter[str], + interchange_call_count: Counter[str], ) -> None: """ A helper function that sends Snowpark pandas API telemetry data. @@ -71,14 +76,30 @@ def _send_snowpark_pandas_telemetry_helper( query_history: The query history context manager to record queries that are pushed down to the Snowflake database in the session. api_calls: Optional list of Snowpark pandas API calls made during the function execution. + method_call_count: Number of times a method has been called. + interchange_call_count: Number of times __dataframe__ has been called. Returns: None """ - data: dict[str, Union[str, list[dict[str, Any]], list[str], Optional[str]]] = { + data: dict[ + str, Union[str, list[dict[str, Any]], list[str], Optional[str], Counter[str]] + ] = { TelemetryField.KEY_FUNC_NAME.value: func_name, TelemetryField.KEY_CATEGORY.value: SnowparkPandasTelemetryField.FUNC_CATEGORY_SNOWPARK_PANDAS.value, TelemetryField.KEY_ERROR_MSG.value: error_msg, + **( + {SnowparkPandasTelemetryField.CALL_COUNT.value: method_call_count} + if method_call_count is not None + else {} + ), + **( + { + SnowparkPandasTelemetryField.INTERCHANGE_CALL_COUNT.value: interchange_call_count + } + if interchange_call_count is not None + else {} + ), } if len(api_calls) > 0: data[TelemetryField.KEY_API_CALLS.value] = api_calls @@ -274,6 +295,8 @@ def _telemetry_helper( # Moving existing api call out first can avoid to generate duplicates. existing_api_calls = [] need_to_restore_args0_api_calls = False + method_call_count = None + interchange_call_count = None # If the decorated func is a class method or a standalone function, we need to get an active session: if is_standalone_function or (len(args) > 0 and isinstance(args[0], type)): @@ -295,6 +318,13 @@ def _telemetry_helper( need_to_restore_args0_api_calls = True session = args[0]._query_compiler._modin_frame.ordered_dataframe.session class_prefix = args[0].__class__.__name__ + args[0]._query_compiler._method_call_counts[func.__name__] += 1 + method_call_count = args[0]._query_compiler._method_call_counts[ + func.__name__ + ] + interchange_call_count = args[0]._query_compiler._method_call_counts[ + "__dataframe__" + ] except (TypeError, IndexError, AttributeError): # TypeError: args might not support indexing; IndexError: args is empty; AttributeError: args[0] might not # have _query_compiler attribute. @@ -337,6 +367,8 @@ def _telemetry_helper( func_name=func_name, query_history=query_history, api_calls=existing_api_calls + [curr_api_call], + method_call_count=method_call_count, + interchange_call_count=interchange_call_count, ) raise e @@ -371,6 +403,8 @@ def _telemetry_helper( func_name=func_name, query_history=query_history, api_calls=existing_api_calls + [curr_api_call], + method_call_count=method_call_count, + interchange_call_count=interchange_call_count, ) if need_to_restore_args0_api_calls: args[0]._query_compiler.snowpark_pandas_api_calls = existing_api_calls diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 5849dba0745..7b597e67d1f 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -10,6 +10,7 @@ import json import logging import re +from collections import Counter import typing import uuid from collections.abc import Hashable, Iterable, Mapping, Sequence @@ -533,6 +534,7 @@ def __init__(self, frame: InternalFrame) -> None: # Copying and modifying self.snowpark_pandas_api_calls is taken care of in telemetry decorators self.snowpark_pandas_api_calls: list = [] self._attrs: dict[Any, Any] = {} + self._method_call_counts: Counter[str] = Counter[str]() def _raise_not_implemented_error_for_timedelta( self, frame: InternalFrame = None diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index 3caa81741b3..f08b8d116a2 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -559,6 +559,85 @@ def test_telemetry_repr(): ] +@sql_count_checker(query_count=6, join_count=4) +def test_telemetry_interchange_call_count(): + s = pd.DataFrame([1, 2, 3, 4]) + t = pd.DataFrame([5]) + s.__dataframe__() + s.__dataframe__() + t.__dataframe__() + + s.iloc[0, 0] = 7 + s.__dataframe__() + s.__dataframe__() + t.__dataframe__() + + def _get_data(call): + try: + return call.to_dict()["message"][TelemetryField.KEY_DATA.value] + except Exception: + return None + + telemetry_data = [ + _get_data(call) + for call in pd.session._conn._telemetry_client.telemetry._log_batch + if _get_data(call) is not None + and "func_name" in _get_data(call) + and _get_data(call)["func_name"] == "DataFrame.__dataframe__" + ] + assert len(telemetry_data) == 6 + # s calls __dataframe__() for the first time. + assert telemetry_data[0]["call_count"] == 1 + assert telemetry_data[0]["interchange_call_count"] == 1 + # s calls __dataframe__() for the second time. + assert telemetry_data[1]["call_count"] == 2 + assert telemetry_data[1]["interchange_call_count"] == 2 + # t calls __dataframe__() for the first time. + assert telemetry_data[2]["call_count"] == 1 + assert telemetry_data[2]["interchange_call_count"] == 1 + # the new version of s calls __dataframe__() for the first time. + assert telemetry_data[3]["call_count"] == 1 + assert telemetry_data[3]["interchange_call_count"] == 1 + # the new version of s calls __dataframe__() for the second time. + assert telemetry_data[4]["call_count"] == 2 + assert telemetry_data[4]["interchange_call_count"] == 2 + # t calls __dataframe__() for the second time. + assert telemetry_data[5]["call_count"] == 2 + assert telemetry_data[5]["interchange_call_count"] == 2 + + +@sql_count_checker(query_count=4) +def test_telemetry_func_call_count(): + s = pd.DataFrame([1, 2, np.nan, 4]) + t = pd.DataFrame([5]) + + s.__repr__() + s.__repr__() + s.__repr__() + + t.__repr__() + + def _get_data(call): + try: + return call.to_dict()["message"][TelemetryField.KEY_DATA.value] + except Exception: + return None + + telemetry_data = [ + _get_data(call) + for call in pd.session._conn._telemetry_client.telemetry._log_batch + if _get_data(call) is not None + and "func_name" in _get_data(call) + and _get_data(call)["func_name"] == "DataFrame.__repr__" + ] + assert len(telemetry_data) == 4 + + assert telemetry_data[2]["call_count"] == 3 + assert telemetry_data[2]["interchange_call_count"] == 0 + assert telemetry_data[3]["call_count"] == 1 + assert telemetry_data[2]["interchange_call_count"] == 0 + + @sql_count_checker(query_count=0) def test_telemetry_copy(): # copy() is defined in upstream Modin's BasePandasDataset class, and not overridden by any From 733f4b060f1c037c57645ab3e883e5139bdbae3e Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 20 Dec 2024 14:45:33 -0800 Subject: [PATCH 2/9] update changelog Signed-off-by: Labanya Mukhopadhyay --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b48523e8f51..b58f46c93c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,7 +54,8 @@ - Updated integration testing for `session.lineage.trace` to exclude deleted objects - Added documentation for `DataFrame.map`. - Improve performance of `DataFrame.apply` by mapping numpy functions to snowpark functions if possible. -- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn +- Added documentation on the extent of Snowpark pandas interoperability with scikit-learn. +- Added `method_call_count` and `interchange_call_count` to telemetry. ## 1.26.0 (2024-12-05) From ed2bd258b44d35dbf45ae27cbabd748f849c2d70 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 20 Dec 2024 15:39:28 -0800 Subject: [PATCH 3/9] fix telem tests Signed-off-by: Labanya Mukhopadhyay --- tests/integ/modin/test_telemetry.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index f08b8d116a2..d32817fe7a3 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -143,6 +143,8 @@ def test_snowpark_pandas_telemetry_method_decorator(test_table_name): "sfqids", "func_name", "error_msg", + "call_count", + "interchange_call_count", } assert data["category"] == "snowpark_pandas" assert data["api_calls"] == df1_expected_api_calls + [ @@ -178,6 +180,8 @@ def test_send_snowpark_pandas_telemetry_helper(send_mock): func_name="test_send_func", query_history=None, api_calls=[], + method_call_count=None, + interchange_call_count=None, ) send_mock.assert_called_with( { @@ -630,12 +634,14 @@ def _get_data(call): and "func_name" in _get_data(call) and _get_data(call)["func_name"] == "DataFrame.__repr__" ] - assert len(telemetry_data) == 4 - assert telemetry_data[2]["call_count"] == 3 - assert telemetry_data[2]["interchange_call_count"] == 0 - assert telemetry_data[3]["call_count"] == 1 - assert telemetry_data[2]["interchange_call_count"] == 0 + # second to last call from telemetry data + assert telemetry_data[-2]["call_count"] == 3 + assert telemetry_data[-2]["interchange_call_count"] == 0 + + # last call from telemetry data + assert telemetry_data[-1]["call_count"] == 1 + assert telemetry_data[-1]["interchange_call_count"] == 0 @sql_count_checker(query_count=0) From 0801060199477d93effa4487a38002d3f8f3cc79 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 3 Jan 2025 16:43:25 -0800 Subject: [PATCH 4/9] fix unit telem tests Signed-off-by: Labanya Mukhopadhyay --- .../snowpark/modin/plugin/_internal/telemetry.py | 4 ++-- tests/unit/modin/test_telemetry.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py index df5ae686cae..a6a87718938 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py @@ -318,9 +318,9 @@ def _telemetry_helper( need_to_restore_args0_api_calls = True session = args[0]._query_compiler._modin_frame.ordered_dataframe.session class_prefix = args[0].__class__.__name__ - args[0]._query_compiler._method_call_counts[func.__name__] += 1 + args[0]._query_compiler._method_call_counts[func.__qualname__] += 1 method_call_count = args[0]._query_compiler._method_call_counts[ - func.__name__ + func.__qualname__ ] interchange_call_count = args[0]._query_compiler._method_call_counts[ "__dataframe__" diff --git a/tests/unit/modin/test_telemetry.py b/tests/unit/modin/test_telemetry.py index 92617906970..61cf7e093d5 100644 --- a/tests/unit/modin/test_telemetry.py +++ b/tests/unit/modin/test_telemetry.py @@ -54,6 +54,8 @@ def snowpark_pandas_error_test_helper( query_history=ANY, telemetry_type=telemetry_type, error_msg=error_msg, + method_call_count=ANY, + interchange_call_count=ANY, ) @@ -115,6 +117,8 @@ def raise_real_type_error(_): query_history=ANY, telemetry_type="snowpark_pandas_type_error", error_msg=None, + method_call_count=ANY, + interchange_call_count=ANY, ) assert len(mock_arg2._query_compiler.snowpark_pandas_api_calls) == 0 @@ -133,6 +137,8 @@ def raise_real_type_error(_): query_history=ANY, telemetry_type="snowpark_pandas_type_error", error_msg=None, + method_call_count=ANY, + interchange_call_count=ANY, ) @@ -165,6 +171,8 @@ def test_snowpark_pandas_telemetry_method_error(error): error_msg="test" if isinstance(error, (AssertionError, NotImplementedError)) else None, + method_call_count=0, + interchange_call_count=0, ) From 69a2cf2f7279aa6ca67685fcc63766f5bc7720d5 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Fri, 3 Jan 2025 16:47:10 -0800 Subject: [PATCH 5/9] fix telem unit error test Signed-off-by: Labanya Mukhopadhyay --- tests/unit/modin/test_telemetry.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/unit/modin/test_telemetry.py b/tests/unit/modin/test_telemetry.py index 61cf7e093d5..d40633e03fb 100644 --- a/tests/unit/modin/test_telemetry.py +++ b/tests/unit/modin/test_telemetry.py @@ -171,8 +171,6 @@ def test_snowpark_pandas_telemetry_method_error(error): error_msg="test" if isinstance(error, (AssertionError, NotImplementedError)) else None, - method_call_count=0, - interchange_call_count=0, ) From cccc1e20ce299563ccb1c4ee86627fecbe7b8f54 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Mon, 6 Jan 2025 15:01:29 -0800 Subject: [PATCH 6/9] remove interchange call count and address comments Signed-off-by: Labanya Mukhopadhyay --- .../modin/plugin/_internal/telemetry.py | 33 +++++-------------- .../compiler/snowflake_query_compiler.py | 3 +- tests/integ/modin/test_telemetry.py | 12 ++----- tests/unit/modin/test_telemetry.py | 3 -- 4 files changed, 12 insertions(+), 39 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py index a6a87718938..26d82ad5c48 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py @@ -18,7 +18,6 @@ from snowflake.snowpark.modin.plugin._internal.utils import ( is_snowpark_pandas_dataframe_or_series_type, ) -from collections import Counter from snowflake.snowpark.query_history import QueryHistory from snowflake.snowpark.session import Session @@ -37,8 +36,8 @@ class SnowparkPandasTelemetryField(Enum): ARGS = "argument" # fallback flag IS_FALLBACK = "is_fallback" + # number of times a method has been called on the same QC CALL_COUNT = "call_count" - INTERCHANGE_CALL_COUNT = "interchange_call_count" # Argument truncating size after converted to str. Size amount can be later specified after analysis and needs. @@ -61,8 +60,7 @@ def _send_snowpark_pandas_telemetry_helper( func_name: str, query_history: Optional[QueryHistory], api_calls: Union[str, list[dict[str, Any]]], - method_call_count: Counter[str], - interchange_call_count: Counter[str], + method_call_count: str, ) -> None: """ A helper function that sends Snowpark pandas API telemetry data. @@ -77,14 +75,11 @@ def _send_snowpark_pandas_telemetry_helper( database in the session. api_calls: Optional list of Snowpark pandas API calls made during the function execution. method_call_count: Number of times a method has been called. - interchange_call_count: Number of times __dataframe__ has been called. Returns: None """ - data: dict[ - str, Union[str, list[dict[str, Any]], list[str], Optional[str], Counter[str]] - ] = { + data: dict[str, Union[str, list[dict[str, Any]], list[str], Optional[str]]] = { TelemetryField.KEY_FUNC_NAME.value: func_name, TelemetryField.KEY_CATEGORY.value: SnowparkPandasTelemetryField.FUNC_CATEGORY_SNOWPARK_PANDAS.value, TelemetryField.KEY_ERROR_MSG.value: error_msg, @@ -93,13 +88,6 @@ def _send_snowpark_pandas_telemetry_helper( if method_call_count is not None else {} ), - **( - { - SnowparkPandasTelemetryField.INTERCHANGE_CALL_COUNT.value: interchange_call_count - } - if interchange_call_count is not None - else {} - ), } if len(api_calls) > 0: data[TelemetryField.KEY_API_CALLS.value] = api_calls @@ -296,7 +284,6 @@ def _telemetry_helper( existing_api_calls = [] need_to_restore_args0_api_calls = False method_call_count = None - interchange_call_count = None # If the decorated func is a class method or a standalone function, we need to get an active session: if is_standalone_function or (len(args) > 0 and isinstance(args[0], type)): @@ -318,13 +305,11 @@ def _telemetry_helper( need_to_restore_args0_api_calls = True session = args[0]._query_compiler._modin_frame.ordered_dataframe.session class_prefix = args[0].__class__.__name__ - args[0]._query_compiler._method_call_counts[func.__qualname__] += 1 - method_call_count = args[0]._query_compiler._method_call_counts[ - func.__qualname__ - ] - interchange_call_count = args[0]._query_compiler._method_call_counts[ - "__dataframe__" - ] + func_name = _gen_func_name( + class_prefix, func, property_name, property_method_type + ) + args[0]._query_compiler._method_call_counts[func_name] += 1 + method_call_count = args[0]._query_compiler._method_call_counts[func_name] except (TypeError, IndexError, AttributeError): # TypeError: args might not support indexing; IndexError: args is empty; AttributeError: args[0] might not # have _query_compiler attribute. @@ -368,7 +353,6 @@ def _telemetry_helper( query_history=query_history, api_calls=existing_api_calls + [curr_api_call], method_call_count=method_call_count, - interchange_call_count=interchange_call_count, ) raise e @@ -404,7 +388,6 @@ def _telemetry_helper( query_history=query_history, api_calls=existing_api_calls + [curr_api_call], method_call_count=method_call_count, - interchange_call_count=interchange_call_count, ) if need_to_restore_args0_api_calls: args[0]._query_compiler.snowpark_pandas_api_calls = existing_api_calls diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py index 1f6dbe992c6..ab71addf5a3 100644 --- a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py +++ b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py @@ -531,7 +531,8 @@ def __init__(self, frame: InternalFrame) -> None: ), "frame is None or not a InternalFrame" self._modin_frame = frame # self.snowpark_pandas_api_calls a list of lazy Snowpark pandas telemetry api calls - # Copying and modifying self.snowpark_pandas_api_calls is taken care of in telemetry decorators + # Copying and modifying self.snowpark_pandas_api_calls and self._method_call_counts + # is taken care of in telemetry decorators self.snowpark_pandas_api_calls: list = [] self._attrs: dict[Any, Any] = {} self._method_call_counts: Counter[str] = Counter[str]() diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index d32817fe7a3..aa4a7fd4873 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -144,7 +144,6 @@ def test_snowpark_pandas_telemetry_method_decorator(test_table_name): "func_name", "error_msg", "call_count", - "interchange_call_count", } assert data["category"] == "snowpark_pandas" assert data["api_calls"] == df1_expected_api_calls + [ @@ -181,7 +180,6 @@ def test_send_snowpark_pandas_telemetry_helper(send_mock): query_history=None, api_calls=[], method_call_count=None, - interchange_call_count=None, ) send_mock.assert_called_with( { @@ -592,22 +590,16 @@ def _get_data(call): assert len(telemetry_data) == 6 # s calls __dataframe__() for the first time. assert telemetry_data[0]["call_count"] == 1 - assert telemetry_data[0]["interchange_call_count"] == 1 # s calls __dataframe__() for the second time. assert telemetry_data[1]["call_count"] == 2 - assert telemetry_data[1]["interchange_call_count"] == 2 # t calls __dataframe__() for the first time. assert telemetry_data[2]["call_count"] == 1 - assert telemetry_data[2]["interchange_call_count"] == 1 # the new version of s calls __dataframe__() for the first time. assert telemetry_data[3]["call_count"] == 1 - assert telemetry_data[3]["interchange_call_count"] == 1 # the new version of s calls __dataframe__() for the second time. assert telemetry_data[4]["call_count"] == 2 - assert telemetry_data[4]["interchange_call_count"] == 2 # t calls __dataframe__() for the second time. assert telemetry_data[5]["call_count"] == 2 - assert telemetry_data[5]["interchange_call_count"] == 2 @sql_count_checker(query_count=4) @@ -636,12 +628,12 @@ def _get_data(call): ] # second to last call from telemetry data + # s called __repr__() 3 times. assert telemetry_data[-2]["call_count"] == 3 - assert telemetry_data[-2]["interchange_call_count"] == 0 # last call from telemetry data + # t called __repr__() 1 time. assert telemetry_data[-1]["call_count"] == 1 - assert telemetry_data[-1]["interchange_call_count"] == 0 @sql_count_checker(query_count=0) diff --git a/tests/unit/modin/test_telemetry.py b/tests/unit/modin/test_telemetry.py index d40633e03fb..d8ffbb97dcd 100644 --- a/tests/unit/modin/test_telemetry.py +++ b/tests/unit/modin/test_telemetry.py @@ -55,7 +55,6 @@ def snowpark_pandas_error_test_helper( telemetry_type=telemetry_type, error_msg=error_msg, method_call_count=ANY, - interchange_call_count=ANY, ) @@ -118,7 +117,6 @@ def raise_real_type_error(_): telemetry_type="snowpark_pandas_type_error", error_msg=None, method_call_count=ANY, - interchange_call_count=ANY, ) assert len(mock_arg2._query_compiler.snowpark_pandas_api_calls) == 0 @@ -138,7 +136,6 @@ def raise_real_type_error(_): telemetry_type="snowpark_pandas_type_error", error_msg=None, method_call_count=ANY, - interchange_call_count=ANY, ) From e04b0911e76cbd38a597eb91366239457d5b92c5 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Mon, 6 Jan 2025 15:03:33 -0800 Subject: [PATCH 7/9] update changelog Signed-off-by: Labanya Mukhopadhyay --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b58f46c93c5..5d3f7191398 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,7 +55,7 @@ - Added documentation for `DataFrame.map`. - Improve performance of `DataFrame.apply` by mapping numpy functions to snowpark functions if possible. - Added documentation on the extent of Snowpark pandas interoperability with scikit-learn. -- Added `method_call_count` and `interchange_call_count` to telemetry. +- Added `method_call_count` to telemetry. ## 1.26.0 (2024-12-05) From 05204b0a160a65d16cf09d68dcfc494d3b40878b Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Tue, 7 Jan 2025 11:32:44 -0800 Subject: [PATCH 8/9] add test for multiple funcs on same qc Signed-off-by: Labanya Mukhopadhyay --- CHANGELOG.md | 2 +- tests/integ/modin/test_telemetry.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d3f7191398..6f51d61cb54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,7 +55,7 @@ - Added documentation for `DataFrame.map`. - Improve performance of `DataFrame.apply` by mapping numpy functions to snowpark functions if possible. - Added documentation on the extent of Snowpark pandas interoperability with scikit-learn. -- Added `method_call_count` to telemetry. +- Added `call_count` to telemetry that counts method calls including interchange protocol calls. ## 1.26.0 (2024-12-05) diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index aa4a7fd4873..425b4a37435 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -636,6 +636,46 @@ def _get_data(call): assert telemetry_data[-1]["call_count"] == 1 +@sql_count_checker(query_count=3) +def test_telemetry_multiple_func_call_count(): + s = pd.DataFrame([1, 2, np.nan, 4]) + + s.__repr__() + s.__repr__() + s.__dataframe__() + + def _get_data(call): + try: + return call.to_dict()["message"][TelemetryField.KEY_DATA.value] + except Exception: + return None + + repr_telemetry_data = [ + _get_data(call) + for call in pd.session._conn._telemetry_client.telemetry._log_batch + if _get_data(call) is not None + and "func_name" in _get_data(call) + and _get_data(call)["func_name"] == "DataFrame.__repr__" + ] + dataframe_telemetry_data = [ + _get_data(call) + for call in pd.session._conn._telemetry_client.telemetry._log_batch + if _get_data(call) is not None + and "func_name" in _get_data(call) + and _get_data(call)["func_name"] == "DataFrame.__dataframe__" + ] + + # last call from telemetry data + # s called __repr__() 2 times. + print(repr_telemetry_data) + print(dataframe_telemetry_data) + assert repr_telemetry_data[-1]["call_count"] == 2 + + # last call from telemetry data + # s called __dataframe__() 2 times. + assert dataframe_telemetry_data[-1]["call_count"] == 1 + + @sql_count_checker(query_count=0) def test_telemetry_copy(): # copy() is defined in upstream Modin's BasePandasDataset class, and not overridden by any From b38e30417537125ed1bf7e08a2b7863e8b7393f5 Mon Sep 17 00:00:00 2001 From: Labanya Mukhopadhyay Date: Tue, 7 Jan 2025 16:02:32 -0800 Subject: [PATCH 9/9] address comments Signed-off-by: Labanya Mukhopadhyay --- src/snowflake/snowpark/modin/plugin/_internal/telemetry.py | 2 +- tests/integ/modin/test_telemetry.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py index 26d82ad5c48..42a60d90bb6 100644 --- a/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py +++ b/src/snowflake/snowpark/modin/plugin/_internal/telemetry.py @@ -36,7 +36,7 @@ class SnowparkPandasTelemetryField(Enum): ARGS = "argument" # fallback flag IS_FALLBACK = "is_fallback" - # number of times a method has been called on the same QC + # number of times a method has been called on the same query compiler CALL_COUNT = "call_count" diff --git a/tests/integ/modin/test_telemetry.py b/tests/integ/modin/test_telemetry.py index 425b4a37435..a5a904c7431 100644 --- a/tests/integ/modin/test_telemetry.py +++ b/tests/integ/modin/test_telemetry.py @@ -667,8 +667,6 @@ def _get_data(call): # last call from telemetry data # s called __repr__() 2 times. - print(repr_telemetry_data) - print(dataframe_telemetry_data) assert repr_telemetry_data[-1]["call_count"] == 2 # last call from telemetry data