Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

limit when dataset is too large #93

Open
wants to merge 1 commit into
base: release/1.0.5
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python-lib/backend/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def natural_sort_key(s):
import re
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(s))]

def calculate_base_levels(df, exposure_column=None):
def calculate_base_levels(df, exposure_column=None, max_nb_options=100):
cols_json = []
# Sort the columns using natural sorting
sorted_columns = sorted(df.columns, key=natural_sort_key)
Expand All @@ -47,8 +47,10 @@ def calculate_base_levels(df, exposure_column=None):

if is_numeric:
options = sorted([str(val) for val in df[col].unique()], key=float)
options = options[:max_nb_options]
else:
options = sorted([str(val) for val in df[col].unique()], key=natural_sort_key)
options = options[:max_nb_options]

if exposure_column and exposure_column in df.columns:
# Exposure-based calculation
Expand Down
12 changes: 7 additions & 5 deletions python-lib/backend/fetch_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
from .api_utils import calculate_base_levels

visual_ml_trainer = model_cache = model_deployer =relativities_calculator = None
is_local = False
is_local = True

LIMIT = 100000

logger.debug(f"Starting web application with is_local: {is_local}")

Expand Down Expand Up @@ -578,7 +580,7 @@ def get_excluded_columns():
def get_dataset_columns():
try:
if is_local:
dataset_name = "claim_train"
dataset_name = "big_test_prepared"
exposure_column = "exposure"
else:
web_app_config = get_webapp_config()
Expand All @@ -587,9 +589,9 @@ def get_dataset_columns():

current_app.logger.info(f"Training Dataset name selected is: {dataset_name}")

df = dataiku.Dataset(dataset_name).get_dataframe()
df = dataiku.Dataset(dataset_name).get_dataframe(limit=LIMIT)
cols_json = calculate_base_levels(df, exposure_column)

current_app.logger.info(f"Successfully retrieved column for dataset '{dataset_name}': {[col['column'] for col in cols_json]}")

return jsonify(cols_json)
Expand All @@ -606,7 +608,7 @@ def get_dataset_columns():
def get_train_dataset_column_names():
try:
if is_local:
dataset_name = "claim_train"
dataset_name = "big_test_prepared"
else:
dataset_name = visual_ml_config.input_dataset

Expand Down