From 2c23de8d3fd73d33006511ec481fe74bccd065be Mon Sep 17 00:00:00 2001
From: davidbehar
Date: Mon, 18 Nov 2024 14:46:15 +0100
Subject: [PATCH] limit when dataset is too large

---
 python-lib/backend/api_utils.py |  4 +++-
 python-lib/backend/fetch_api.py | 12 +++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/python-lib/backend/api_utils.py b/python-lib/backend/api_utils.py
index 90c98ac4..63695962 100644
--- a/python-lib/backend/api_utils.py
+++ b/python-lib/backend/api_utils.py
@@ -33,7 +33,7 @@ def natural_sort_key(s):
     import re
     return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(s))]
 
-def calculate_base_levels(df, exposure_column=None):
+def calculate_base_levels(df, exposure_column=None, max_nb_options=100):
     cols_json = []
     # Sort the columns using natural sorting
     sorted_columns = sorted(df.columns, key=natural_sort_key)
@@ -47,8 +47,10 @@
 
         if is_numeric:
             options = sorted([str(val) for val in df[col].unique()], key=float)
+            options = options[:max_nb_options]
         else:
             options = sorted([str(val) for val in df[col].unique()], key=natural_sort_key)
+            options = options[:max_nb_options]
 
         if exposure_column and exposure_column in df.columns:
             # Exposure-based calculation
diff --git a/python-lib/backend/fetch_api.py b/python-lib/backend/fetch_api.py
index 543a4f2c..cf84fcf6 100644
--- a/python-lib/backend/fetch_api.py
+++ b/python-lib/backend/fetch_api.py
@@ -18,7 +18,9 @@
 from .api_utils import calculate_base_levels
 
 visual_ml_trainer = model_cache = model_deployer =relativities_calculator = None
-is_local = False
+is_local = True
+
+LIMIT = 100000
 
 logger.debug(f"Starting web application with is_local: {is_local}")
 
@@ -578,7 +580,7 @@ def get_excluded_columns():
 def get_dataset_columns():
     try:
         if is_local:
-            dataset_name = "claim_train"
+            dataset_name = "big_test_prepared"
             exposure_column = "exposure"
         else:
             web_app_config = get_webapp_config()
@@ -587,9 +589,9 @@
 
         current_app.logger.info(f"Training Dataset name selected is: {dataset_name}")
 
-        df = dataiku.Dataset(dataset_name).get_dataframe()
+        df = dataiku.Dataset(dataset_name).get_dataframe(limit=LIMIT)
         cols_json = calculate_base_levels(df, exposure_column)
-        
+
         current_app.logger.info(f"Successfully retrieved column for dataset '{dataset_name}': {[col['column'] for col in cols_json]}")
 
         return jsonify(cols_json)
@@ -606,7 +608,7 @@ def get_train_dataset_column_names():
     try:
         if is_local:
-            dataset_name = "claim_train"
+            dataset_name = "big_test_prepared"
         else:
             dataset_name = visual_ml_config.input_dataset
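
---

A minimal sketch of the option-capping behavior this patch introduces, runnable without Dataiku. The `natural_sort_key` body and the `max_nb_options=100` default mirror the patch; the standalone `distinct_options` helper, the dtype check via `pd.api.types.is_numeric_dtype`, and the toy data are illustrative assumptions (the plugin does this inline inside `calculate_base_levels`):

```python
import re

import pandas as pd


def natural_sort_key(s):
    # Same natural sort as api_utils: digit runs compare numerically,
    # text segments compare case-insensitively.
    return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(s))]


def distinct_options(series, max_nb_options=100):
    """Return at most max_nb_options sorted distinct values, as strings.

    Hypothetical helper; sketches the truncation added to calculate_base_levels.
    """
    # Assumption: the plugin's is_numeric flag is approximated by a dtype check.
    if pd.api.types.is_numeric_dtype(series):
        options = sorted([str(val) for val in series.unique()], key=float)
    else:
        options = sorted([str(val) for val in series.unique()], key=natural_sort_key)
    # The patch's fix: slice after sorting, so a high-cardinality column
    # contributes a bounded number of options to the JSON payload.
    return options[:max_nb_options]


if __name__ == "__main__":
    df = pd.DataFrame({"zone": [f"zone_{i}" for i in range(500)]})
    opts = distinct_options(df["zone"], max_nb_options=100)
    print(len(opts))   # 100, not 500
    print(opts[:3])    # ['zone_0', 'zone_1', 'zone_2'] thanks to natural sorting
```

The row-count cap works the same way at the other end: `get_dataframe(limit=LIMIT)` bounds how many rows are loaded before the per-column options are computed, so both the row and option dimensions of the payload stay bounded for large datasets.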