From 2c23de8d3fd73d33006511ec481fe74bccd065be Mon Sep 17 00:00:00 2001
From: davidbehar
Date: Mon, 18 Nov 2024 14:46:15 +0100
Subject: [PATCH] limit when dataset is too large

---
 python-lib/backend/api_utils.py |  4 +++-
 python-lib/backend/fetch_api.py | 12 +++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/python-lib/backend/api_utils.py b/python-lib/backend/api_utils.py
index 90c98ac4..63695962 100644
--- a/python-lib/backend/api_utils.py
+++ b/python-lib/backend/api_utils.py
@@ -33,7 +33,7 @@ def natural_sort_key(s):
     import re
     return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(s))]
 
-def calculate_base_levels(df, exposure_column=None):
+def calculate_base_levels(df, exposure_column=None, max_nb_options=100):
     cols_json = []
     # Sort the columns using natural sorting
     sorted_columns = sorted(df.columns, key=natural_sort_key)
@@ -47,8 +47,10 @@
 
         if is_numeric:
             options = sorted([str(val) for val in df[col].unique()], key=float)
+            options = options[:max_nb_options]
         else:
             options = sorted([str(val) for val in df[col].unique()], key=natural_sort_key)
+            options = options[:max_nb_options]
 
         if exposure_column and exposure_column in df.columns:
             # Exposure-based calculation
diff --git a/python-lib/backend/fetch_api.py b/python-lib/backend/fetch_api.py
index 543a4f2c..cf84fcf6 100644
--- a/python-lib/backend/fetch_api.py
+++ b/python-lib/backend/fetch_api.py
@@ -18,7 +18,9 @@
 from .api_utils import calculate_base_levels
 
 visual_ml_trainer = model_cache = model_deployer =relativities_calculator = None
-is_local = False
+is_local = True
+
+LIMIT = 100000
 
 logger.debug(f"Starting web application with is_local: {is_local}")
 
@@ -578,7 +580,7 @@ def get_excluded_columns():
 def get_dataset_columns():
     try:
         if is_local:
-            dataset_name = "claim_train"
+            dataset_name = "big_test_prepared"
             exposure_column = "exposure"
         else:
             web_app_config = get_webapp_config()
@@ -587,9 +589,9 @@
 
         current_app.logger.info(f"Training Dataset name selected is: {dataset_name}")
 
-        df = dataiku.Dataset(dataset_name).get_dataframe()
+        df = dataiku.Dataset(dataset_name).get_dataframe(limit=LIMIT)
         cols_json = calculate_base_levels(df, exposure_column)
-        
+
         current_app.logger.info(f"Successfully retrieved column for dataset '{dataset_name}': {[col['column'] for col in cols_json]}")
 
         return jsonify(cols_json)
@@ -606,7 +608,7 @@ def get_train_dataset_column_names():
     try:
         if is_local:
-            dataset_name = "claim_train"
+            dataset_name = "big_test_prepared"
         else:
             dataset_name = visual_ml_config.input_dataset
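
---

A minimal sketch of the option-capping behavior this patch introduces, runnable without Dataiku. The `natural_sort_key` body and the `max_nb_options=100` default mirror the patch; the standalone `distinct_options` helper, the dtype check via `pd.api.types.is_numeric_dtype`, and the toy data are illustrative assumptions (the plugin does this inline inside `calculate_base_levels`):

```python
import re

import pandas as pd


def natural_sort_key(s):
    # Same natural sort as api_utils: digit runs compare numerically,
    # text segments compare case-insensitively.
    return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(s))]


def distinct_options(series, max_nb_options=100):
    """Return at most max_nb_options sorted distinct values, as strings.

    Hypothetical helper; sketches the truncation added to calculate_base_levels.
    """
    # Assumption: the plugin's is_numeric flag is approximated by a dtype check.
    if pd.api.types.is_numeric_dtype(series):
        options = sorted([str(val) for val in series.unique()], key=float)
    else:
        options = sorted([str(val) for val in series.unique()], key=natural_sort_key)
    # The patch's fix: slice after sorting, so a high-cardinality column
    # contributes a bounded number of options to the JSON payload.
    return options[:max_nb_options]


if __name__ == "__main__":
    df = pd.DataFrame({"zone": [f"zone_{i}" for i in range(500)]})
    opts = distinct_options(df["zone"], max_nb_options=100)
    print(len(opts))   # 100, not 500
    print(opts[:3])    # ['zone_0', 'zone_1', 'zone_2'] thanks to natural sorting
```

The row-count cap works the same way at the other end: `get_dataframe(limit=LIMIT)` bounds how many rows are loaded before the per-column options are computed, so both the row and option dimensions of the payload stay bounded for large datasets.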