From 181719d0fc578bdb55fe12b8e992aff105aabf74 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 7 Jan 2025 09:19:26 +0100 Subject: [PATCH 01/48] add base md Signed-off-by: mahdikhashan --- .../user-guides/hp-tuning/llm-hp-optimization.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md new file mode 100644 index 0000000000..3aeb65e28e --- /dev/null +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -0,0 +1,12 @@ ++++ +title = "How to Configure Algorithms" +description = "List of supported algorithms for hyperparameter tuning" +weight = 20 ++++ + +This page describes LLM hyperparameter (HP) optimization API that Katib supports and how to configure +it. + +## LLM Hyper Parameter Optimization + +TODO \ No newline at end of file From aa3b2befdb076690d1e022828c201d5e082161df Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 7 Jan 2025 10:21:40 +0100 Subject: [PATCH 02/48] update title and description Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 3aeb65e28e..a1ba7b279d 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -1,6 +1,6 @@ +++ -title = "How to Configure Algorithms" -description = "List of supported algorithms for hyperparameter tuning" +title = "How to Optimize LLM Hyperparameters" +description = "API description" weight = 20 +++ From 36f2c1e10f9368127ddc7f0b50a81440030deccc Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 12:38:23 +0100 Subject: [PATCH 03/48] add draft code Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 71 ++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index a1ba7b279d..6cd6ec7626 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -7,6 +7,73 @@ weight = 20 This page describes LLM hyperparameter (HP) optimization API that Katib supports and how to configure it. 
-## LLM Hyper Parameter Optimization +## LLM Hyperparameters Optimization -TODO \ No newline at end of file + +```python +import kubeflow.katib as katib +from kubeflow.katib import KatibClient + +import transformers +from peft import LoraConfig + +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, +) + +hf_model = HuggingFaceModelParams( + model_uri = "hf://meta-llama/Llama-3.2-1B", + transformer_type = transformers.AutoModelForSequenceClassification, +) + +# Train the model on 1000 movie reviews from imdb +# https://huggingface.co/datasets/stanfordnlp/imdb +hf_dataset = HuggingFaceDatasetParams( + repo_id = "imdb", + split = "train[:1000]", +) + +hf_tuning_parameters = HuggingFaceTrainerParams( + training_parameters = transformers.TrainingArguments( + output_dir = "results", + save_strategy = "no", + learning_rate = katib.search.double(min=1e-05, max=5e-05), + num_train_epochs=3, + ), + # Set LoRA config to reduce number of trainable model parameters. + lora_config = LoraConfig( + r = katib.search.int(min=8, max=32), + lora_alpha = 8, + lora_dropout = 0.1, + bias = "none", + ), +) + +cl = KatibClient(namespace="kubeflow") + +# Fine-tuning for Binary Classification +exp_name = "Llama-3.2-fine-tune" +cl.tune( + name = exp_name, + model_provider_parameters = hf_model, + dataset_provider_parameters = hf_dataset, + trainer_parameters = hf_tuning_parameters, + objective_metric_name = "train_loss", + objective_type = "minimize", + algorithm_name = "random", + max_trial_count = 10, + parallel_trial_count = 2, + resources_per_trial={ + "gpu": "2", + "cpu": "4", + "memory": "10G", + }, +) + +cl.wait_for_experiment_condition(name=exp_name) + +# Get the best hyperparameters. +print(cl.get_optimal_hyperparameters(exp_name)) +``` From b7120e0e0d4577ee5657d7a430c467982e31f893 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 13:12:20 +0100 Subject: [PATCH 04/48] add prerequisites Signed-off-by: mahdikhashan --- .../user-guides/hp-tuning/llm-hp-optimization.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 6cd6ec7626..bbd0782a33 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -4,9 +4,20 @@ description = "API description" weight = 20 +++ -This page describes LLM hyperparameter (HP) optimization API that Katib supports and how to configure +This page describes LLM hyperparameter (HP) optimization Python API that Katib supports and how to configure it. +## Prerequisites + +You need to install the following Katib components to run code in this guide: + +- Katib control plane [install](/docs/components/katib/installation/#installing-control-plane). +- Katib Python SDK [install](/docs/components/katib/installation/#installing-python-sdk). + +Additionally install following python packages to run the example: + +- Transformers from Hugging Face [install](https://pypi.org/project/transformers/). 
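A quick sanity check like the one below can confirm that the required packages are importable before running the examples in this guide; the distribution names are assumed to match the PyPI packages linked above.

```python
# Minimal environment check (assumes the PyPI distribution names above).
from importlib.metadata import PackageNotFoundError, version

for dist in ["kubeflow-katib", "transformers"]:
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist} is not installed")
```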
+ ## LLM Hyperparameters Optimization From 182b493f4725cc7029451144439fcd11de3e65ca Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 18:39:54 +0100 Subject: [PATCH 05/48] add huggingface api details,s3 api, update example Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 151 +++++++++++++++++- 1 file changed, 147 insertions(+), 4 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index bbd0782a33..27797fc1dc 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -18,14 +18,157 @@ Additionally install following python packages to run the example: - Transformers from Hugging Face [install](https://pypi.org/project/transformers/). -## LLM Hyperparameters Optimization +## Load Model and Dataset + +To fine-tune a pre-trained model, it is essential to load the model and dataset from a provider. Currently, this can be done using external platforms like **Hugging Face** and **S3-compatible object storage** (e.g., Amazon S3) through the `storage_initializer` API from Kubeflow. + +### Hugging Face Integration + +The Hugging Face provider enables seamless integration of models and datasets for training and evaluation. You can import the necessary components for Hugging Face using the following code: + +```python +from kubeflow.storage_initializer.hugging_face import ( + HuggingFaceModelParams, + HuggingFaceDatasetParams, + HuggingFaceTrainerParams, +) +``` + +### S3-Compatible Object Storage Integration + +In addition to Hugging Face, you can integrate with S3-compatible object storage platforms to load datasets. To work with S3, use the `S3DatasetParams` class to define your dataset parameters. + +```python +from kubeflow.storage_initializer.s3 import S3DatasetParams +``` + +### HuggingFaceModelParams Description + +The `HuggingFaceModelParams` dataclass holds configuration parameters for initializing Hugging Face models with validation checks. + +| **Attribute** | **Type** | **Description** | +|------------------|---------------------------------------|----------------------------------------------------------------| +| `model_uri` | `str` | URI or path to the Hugging Face model (must not be empty). | +| `transformer_type` | `TRANSFORMER_TYPES` | Specifies the model type for various NLP/ML tasks. | +| `access_token` | `Optional[str]` (default: `None`) | Token for accessing private models on Hugging Face. | +| `num_labels` | `Optional[int]` (default: `None`) | Number of output labels (used for classification tasks). 
| + +### Supported Transformer Types (`TRANSFORMER_TYPES`) + +| **Model Type** | **Task** | +|------------------------------------------------|-----------------------------| +| `AutoModelForSequenceClassification` | Text classification | +| `AutoModelForTokenClassification` | Named entity recognition | +| `AutoModelForQuestionAnswering` | Question answering | +| `AutoModelForCausalLM` | Text generation (causal) | +| `AutoModelForMaskedLM` | Masked language modeling | +| `AutoModelForImageClassification` | Image classification | + +#### Example Usage + +```python +from transformers import AutoModelForSequenceClassification + +params = HuggingFaceModelParams( + model_uri="bert-base-uncased", + transformer_type=AutoModelForSequenceClassification, + access_token="huggingface_access_token", + num_labels=2 # For binary classification +) +``` + +### HuggingFaceDatasetParams Description + +The `HuggingFaceDatasetParams` class holds configuration parameters for loading datasets from Hugging Face with validation checks. + +| **Attribute** | **Type** | **Description** | +|------------------|-------------------------|----------------------------------------------------------------| +| `repo_id` | `str` | Identifier of the dataset repository on Hugging Face (must not be empty). | +| `access_token` | `Optional[str]` (default: `None`) | Token for accessing private datasets on Hugging Face. | +| `split` | `Optional[str]` (default: `None`) | Dataset split to load (e.g., `"train"`, `"test"`). | + +### S3DatasetParams Description + +The `S3DatasetParams` class is used for loading datasets from S3-compatible object storage. The parameters are defined as follows: + +| **Parameter** | **Type** | **Description** | +|-------------------|--------------------|-------------------------------------------------------------------| +| `endpoint_url` | `str` | URL of the S3-compatible storage service. | +| `bucket_name` | `str` | Name of the S3 bucket containing the dataset. | +| `file_key` | `str` | Key (path) to the dataset file within the bucket. | +| `region_name` | `str`, optional | The AWS region of the S3 bucket (optional). | +| `access_key` | `str`, optional | The access key for authentication with S3 (optional). | +| `secret_key` | `str`, optional | The secret key for authentication with S3 (optional). | + +#### Example Usage + +##### Hugging Face + +```python +dataset_params = HuggingFaceDatasetParams( + repo_id="imdb", # Public dataset repository ID on Hugging Face + split="train", # Dataset split to load + access_token=None # Not needed for public datasets +) +``` + +##### S3 + +```python +s3_params = S3DatasetParams( + endpoint_url="https://s3.amazonaws.com", + bucket_name="my-dataset-bucket", + file_key="datasets/train.csv", + region_name="us-west-2", + access_key="YOUR_ACCESS_KEY", + secret_key="YOUR_SECRET_KEY" +) +``` + + +### Hugging Face trainer params + +TODO + +### Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset + +This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. 
+ +#### Model: +- [**Llama-3.2** from Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B) + +#### Dataset: +- [**IMDB movie reviews**](https://huggingface.co/datasets/stanfordnlp/imdb) (1000 samples for training) + +#### Training: +- Fine-tuning for binary classification +- Hyperparameter tuning with Katib + +### Katib Configuration + +The following table outlines the Katib configuration used for hyperparameter tuning in the fine-tuning process: + +| **Parameter** | **Description** | +|----------------------------|-----------------------------------------------------------------------| +| `exp_name` | Name of the experiment (`Llama-3.2-fine-tune`). | +| `model_provider_parameters`| Parameters for the Hugging Face model (Llama-3.2). | +| `dataset_provider_parameters`| Parameters for the IMDB dataset (1000 movie reviews). | +| `trainer_parameters` | Parameters for the Hugging Face trainer, including LoRA settings. | +| `objective_metric_name` | The objective metric to minimize, in this case, `"train_loss"`. | +| `objective_type` | Type of optimization: `"minimize"` for training loss. | +| `algorithm_name` | The optimization algorithm used, set to `"random"` for random search.| +| `max_trial_count` | Maximum number of trials to run, set to `10`. | +| `parallel_trial_count` | Number of trials to run in parallel, set to `2`. | +| `resources_per_trial` | Resources allocated for each trial: 2 GPUs, 4 CPUs, 10GB memory. | + +This configuration is used to find the best hyperparameters for fine-tuning the Llama-3.2 model using Katib. ```python import kubeflow.katib as katib from kubeflow.katib import KatibClient -import transformers +from transformers import AutoModelForSequenceClassification, TrainingArguments from peft import LoraConfig from kubeflow.storage_initializer.hugging_face import ( @@ -36,7 +179,7 @@ from kubeflow.storage_initializer.hugging_face import ( hf_model = HuggingFaceModelParams( model_uri = "hf://meta-llama/Llama-3.2-1B", - transformer_type = transformers.AutoModelForSequenceClassification, + transformer_type = AutoModelForSequenceClassification, ) # Train the model on 1000 movie reviews from imdb @@ -47,7 +190,7 @@ hf_dataset = HuggingFaceDatasetParams( ) hf_tuning_parameters = HuggingFaceTrainerParams( - training_parameters = transformers.TrainingArguments( + training_parameters = TrainingArguments( output_dir = "results", save_strategy = "no", learning_rate = katib.search.double(min=1e-05, max=5e-05), From 21646a886cce7aefe26bface790607c89392d170 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 18:48:43 +0100 Subject: [PATCH 06/48] remove redundant text Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 27797fc1dc..f125acbc7e 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -161,9 +161,6 @@ The following table outlines the Katib configuration used for hyperparameter tun | `parallel_trial_count` | Number of trials to run in parallel, set to `2`. | | `resources_per_trial` | Resources allocated for each trial: 2 GPUs, 4 CPUs, 10GB memory. | -This configuration is used to find the best hyperparameters for fine-tuning the Llama-3.2 model using Katib. 
- - ```python import kubeflow.katib as katib from kubeflow.katib import KatibClient From cc71dee89baea595dd243a81aafec9559ade87e2 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 19:06:26 +0100 Subject: [PATCH 07/48] add HuggingFaceTrainerParams description Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index f125acbc7e..b64e765b7b 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -126,9 +126,40 @@ s3_params = S3DatasetParams( ``` -### Hugging Face trainer params +### HuggingFaceTrainerParams Description -TODO +The `HuggingFaceTrainerParams` class is used to define parameters for the training process in the Hugging Face framework. It includes the training arguments and LoRA configuration to optimize model training. + +| **Parameter** | **Type** | **Description** | +|----------------------------|-------------------------------------|-------------------------------------------------------------------------------------------------| +| `training_parameters` | `transformers.TrainingArguments` | Contains the training arguments like learning rate, epochs, batch size, etc. | +| `lora_config` | `LoraConfig` | LoRA configuration to reduce the number of trainable parameters in the model. | + +### Example Usage + +This is an **example** of how to use the `HuggingFaceTrainerParams` class to define the training and LoRA parameters. + +```python +from transformers import TrainingArguments +from peft import LoraConfig +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + +# Set up training and LoRA configuration +trainer_params = HuggingFaceTrainerParams( + training_parameters=TrainingArguments( + output_dir="results", + learning_rate=1e-5, + num_train_epochs=3, + per_device_train_batch_size=8, + ), + lora_config=LoraConfig( + r=8, + lora_alpha=16, + lora_dropout=0.1, + bias="none", + ), +) +``` ### Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset From 56e4e537f0e2591098320981312f749e97da184f Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 19:09:19 +0100 Subject: [PATCH 08/48] update prerequisites Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index b64e765b7b..d1754b290d 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -12,11 +12,13 @@ it. You need to install the following Katib components to run code in this guide: - Katib control plane [install](/docs/components/katib/installation/#installing-control-plane). +- Kubeflow Training SDK [install](https://github.com/kubeflow/training-operator/tree/1dfa40c12516fc9eb2ce12c5ef52da7d46670457/sdk/python) - Katib Python SDK [install](/docs/components/katib/installation/#installing-python-sdk). 
Additionally install following python packages to run the example: - Transformers from Hugging Face [install](https://pypi.org/project/transformers/). +- Peft from Hugging Face [install](https://pypi.org/project/peft/) ## Load Model and Dataset From 6f36f1eba3a3fe8bb17b1f97bcd97616750b032f Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 19:25:35 +0100 Subject: [PATCH 09/48] update code example Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index d1754b290d..0f02c46d96 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -1,10 +1,10 @@ +++ -title = "How to Optimize LLM Hyperparameters" +title = "How to Optimize Language Models Hyperparameters" description = "API description" weight = 20 +++ -This page describes LLM hyperparameter (HP) optimization Python API that Katib supports and how to configure +This page describes Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure it. ## Prerequisites @@ -163,6 +163,74 @@ trainer_params = HuggingFaceTrainerParams( ) ``` +## Finetune Language Models + +In the context of fine-tuning large language models (LLMs) like GPT, BERT, or similar transformer-based models, it is crucial to optimize various hyperparameters to improve model performance. This sub-section covers the key parameters used in tuning LLMs via a `tune` function, specifically using tools like Katib for automated hyperparameter optimization in Kubernetes environments. + +### Key Parameters for LLM Hyperparameter Tuning + +| **Parameter** | **Description** | **Required** | +|----------------------------------|---------------------------------------------------------------------------------|--------------| +| `name` | Name of the experiment. | Required | +| `model_provider_parameters` | Parameters for the model provider, such as model type and configuration. | Required | +| `dataset_provider_parameters` | Parameters for the dataset provider, such as dataset configuration. | Required | +| `trainer_parameters` | Configuration for the trainer, including hyperparameters for model training. | Required | +| `storage_config` | Configuration for storage, like PVC size and storage class. | Optional | +| `objective` | Objective function for training and optimization. | Optional | +| `base_image` | Base image for executing the objective function. | Optional | +| `parameters` | Hyperparameters for tuning the experiment. | Optional | +| `namespace` | Kubernetes namespace for the experiment. | Optional | +| `env_per_trial` | Environment variables for each trial. | Optional | +| `algorithm_name` | Algorithm used for the hyperparameter search. | Required | +| `algorithm_settings` | Settings for the search algorithm. | Optional | +| `objective_metric_name` | Name of the objective metric for optimization. | Required | +| `additional_metric_names` | List of additional metrics to collect from the objective function. | Optional | +| `objective_type` | Type of optimization for the objective metric (minimize or maximize). | Required | +| `objective_goal` | The target value for the objective to succeed. 
| Optional | +| `max_trial_count` | Maximum number of trials to run. | Optional | +| `parallel_trial_count` | Number of trials to run in parallel. | Optional | +| `max_failed_trial_count` | Maximum number of failed trials allowed. | Optional | +| `resources_per_trial` | Resource requirements per trial, including CPU, memory, and GPU. | Optional | +| `retain_trials` | Whether to retain resources from completed trials. | Optional | +| `packages_to_install` | List of additional Python packages to install. | Optional | +| `pip_index_url` | The PyPI URL from which to install Python packages. | Optional | +| `metrics_collector_config` | Configuration for the metrics collector. | Optional | + + +### Example: + +```python +from kubeflow.katib import KatibClient + +cl = KatibClient(namespace="kubeflow") +cl.tune( + name="LLM-Hyperparameter-Tuning", + model_provider_parameters=HuggingFaceModelParams(model_name="bert-base-uncased"), + dataset_provider_parameters=HuggingFaceDatasetParams(dataset_name="imdb"), + trainer_parameters=HuggingFaceTrainerParams( + training_parameters=transformers.TrainingArguments( + learning_rate=katib.search.double(min=1e-5, max=5e-5), + per_device_train_batch_size=katib.search.choice([16, 32, 64]), + num_train_epochs=3 + ) + ), + # Optional when model is used + # objective=lambda hp: train_model(hp['lr'], hp['per_device_train_batch_size']), + # parameters={ + # "lr": katib.search.double(min=1e-5, max=5e-5), + # "per_device_train_batch_size": katib.search.choice([16, 32, 64]) + # }, + objective_metric_name="eval_loss", + objective_type="minimize", + max_trial_count=50, + parallel_trial_count=4, + resources_per_trial={"cpu": "4", "gpu": "2", "memory": "10Gi"}, + # Optional + # packages_to_install=["transformers", "datasets"], + # metrics_collector_config={"kind": "Push"} +) +``` + ### Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. From 41efc881fa3c0b199ce12c1240c4b2c4fe16d247 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 11 Jan 2025 19:38:35 +0100 Subject: [PATCH 10/48] add sections Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 0f02c46d96..6d25f9b3c1 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -7,6 +7,12 @@ weight = 20 This page describes Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure it. 
+## Sections +- [Prerequisites](#Prerequisites) +- [Load Model and Dataset](#Load-Model-and-Dataset) +- [Finetune](#Finetune-Language-Models) +- [Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset](#example-fine-tuning-llama-32-for-binary-classification-on-imdb-dataset) + ## Prerequisites You need to install the following Katib components to run code in this guide: From 856f8223aacc1efa312d3b6c1fb8e9c6cd0cd06f Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 10:49:03 +0100 Subject: [PATCH 11/48] replace langauge models with large language models Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 6d25f9b3c1..0085658d91 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -1,10 +1,10 @@ +++ -title = "How to Optimize Language Models Hyperparameters" +title = "How to Optimize Large Language Models Hyperparameters" description = "API description" weight = 20 +++ -This page describes Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure +This page describes Large Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure it. ## Sections From 9e7982004de0f2849437c4a729b201ebd67e2b7e Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 10:58:37 +0100 Subject: [PATCH 12/48] improve prerequisites Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 0085658d91..36fc6b6af7 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -18,14 +18,18 @@ it. You need to install the following Katib components to run code in this guide: - Katib control plane [install](/docs/components/katib/installation/#installing-control-plane). -- Kubeflow Training SDK [install](https://github.com/kubeflow/training-operator/tree/1dfa40c12516fc9eb2ce12c5ef52da7d46670457/sdk/python) -- Katib Python SDK [install](/docs/components/katib/installation/#installing-python-sdk). +- Katib Python SDK with LLM hyperparameter optimization support (`pip install -U kubeflow-katib[huggingface]`) or [install](/docs/components/katib/installation/#installing-python-sdk). -Additionally install following python packages to run the example: +The following Python packages are automatically installed through the `extra_requires` section of the API, so you don't need to install them manually: - Transformers from Hugging Face [install](https://pypi.org/project/transformers/). - Peft from Hugging Face [install](https://pypi.org/project/peft/) +This API supports both non-distributed training and distributed training using **PyTorchJob**. +If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. 
+ +- [Training Operator control plane](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) + ## Load Model and Dataset To fine-tune a pre-trained model, it is essential to load the model and dataset from a provider. Currently, this can be done using external platforms like **Hugging Face** and **S3-compatible object storage** (e.g., Amazon S3) through the `storage_initializer` API from Kubeflow. From db9cabf36e36b7280e819cc5f5b9f924d9677cb0 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:01:46 +0100 Subject: [PATCH 13/48] algorithm_name is optional Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 36fc6b6af7..110721e057 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -191,7 +191,7 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `parameters` | Hyperparameters for tuning the experiment. | Optional | | `namespace` | Kubernetes namespace for the experiment. | Optional | | `env_per_trial` | Environment variables for each trial. | Optional | -| `algorithm_name` | Algorithm used for the hyperparameter search. | Required | +| `algorithm_name` | Algorithm used for the hyperparameter search. | Optional | | `algorithm_settings` | Settings for the search algorithm. | Optional | | `objective_metric_name` | Name of the objective metric for optimization. | Required | | `additional_metric_names` | List of additional metrics to collect from the objective function. | Optional | From 0b9d9bd93d5c1f02b0684e7fec5bf45a4aa1857c Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:02:23 +0100 Subject: [PATCH 14/48] objective_type is optional Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 110721e057..7faeb51bb5 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -195,7 +195,7 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `algorithm_settings` | Settings for the search algorithm. | Optional | | `objective_metric_name` | Name of the objective metric for optimization. | Required | | `additional_metric_names` | List of additional metrics to collect from the objective function. | Optional | -| `objective_type` | Type of optimization for the objective metric (minimize or maximize). | Required | +| `objective_type` | Type of optimization for the objective metric (minimize or maximize). | Optional | | `objective_goal` | The target value for the objective to succeed. | Optional | | `max_trial_count` | Maximum number of trials to run. | Optional | | `parallel_trial_count` | Number of trials to run in parallel. 
| Optional | From 6996c4558ba43a3afd2818252b31ff0e24f3f1f2 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:03:42 +0100 Subject: [PATCH 15/48] objective_metric_name is optional Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 7faeb51bb5..5f556fd76e 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -193,7 +193,7 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `env_per_trial` | Environment variables for each trial. | Optional | | `algorithm_name` | Algorithm used for the hyperparameter search. | Optional | | `algorithm_settings` | Settings for the search algorithm. | Optional | -| `objective_metric_name` | Name of the objective metric for optimization. | Required | +| `objective_metric_name` | Name of the objective metric for optimization. | Optional | | `additional_metric_names` | List of additional metrics to collect from the objective function. | Optional | | `objective_type` | Type of optimization for the objective metric (minimize or maximize). | Optional | | `objective_goal` | The target value for the objective to succeed. | Optional | From 87e0a1b089988df1fa8f0fdc6a174772175852ae Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:08:04 +0100 Subject: [PATCH 16/48] remove redundant example Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 5f556fd76e..e1002e22bb 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -206,41 +206,6 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `pip_index_url` | The PyPI URL from which to install Python packages. | Optional | | `metrics_collector_config` | Configuration for the metrics collector. 
| Optional | - -### Example: - -```python -from kubeflow.katib import KatibClient - -cl = KatibClient(namespace="kubeflow") -cl.tune( - name="LLM-Hyperparameter-Tuning", - model_provider_parameters=HuggingFaceModelParams(model_name="bert-base-uncased"), - dataset_provider_parameters=HuggingFaceDatasetParams(dataset_name="imdb"), - trainer_parameters=HuggingFaceTrainerParams( - training_parameters=transformers.TrainingArguments( - learning_rate=katib.search.double(min=1e-5, max=5e-5), - per_device_train_batch_size=katib.search.choice([16, 32, 64]), - num_train_epochs=3 - ) - ), - # Optional when model is used - # objective=lambda hp: train_model(hp['lr'], hp['per_device_train_batch_size']), - # parameters={ - # "lr": katib.search.double(min=1e-5, max=5e-5), - # "per_device_train_batch_size": katib.search.choice([16, 32, 64]) - # }, - objective_metric_name="eval_loss", - objective_type="minimize", - max_trial_count=50, - parallel_trial_count=4, - resources_per_trial={"cpu": "4", "gpu": "2", "memory": "10Gi"}, - # Optional - # packages_to_install=["transformers", "datasets"], - # metrics_collector_config={"kind": "Push"} -) -``` - ### Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. From e1b7b3513c8a9e17bd0315c2e00728c8ce985a47 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:13:01 +0100 Subject: [PATCH 17/48] change tune args to optional Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index e1002e22bb..c72b97835e 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -182,9 +182,9 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | **Parameter** | **Description** | **Required** | |----------------------------------|---------------------------------------------------------------------------------|--------------| | `name` | Name of the experiment. | Required | -| `model_provider_parameters` | Parameters for the model provider, such as model type and configuration. | Required | -| `dataset_provider_parameters` | Parameters for the dataset provider, such as dataset configuration. | Required | -| `trainer_parameters` | Configuration for the trainer, including hyperparameters for model training. | Required | +| `model_provider_parameters` | Parameters for the model provider, such as model type and configuration. | Optional | +| `dataset_provider_parameters` | Parameters for the dataset provider, such as dataset configuration. | Optional | +| `trainer_parameters` | Configuration for the trainer, including hyperparameters for model training. 
| Optional | | `storage_config` | Configuration for storage, like PVC size and storage class. | Optional | | `objective` | Objective function for training and optimization. | Optional | | `base_image` | Base image for executing the objective function. | Optional | From 4addd74d58f9ef18469e54778d2d089d3d39e79d Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 11:46:04 +0100 Subject: [PATCH 18/48] add search api Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index c72b97835e..49660c2725 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -147,20 +147,39 @@ The `HuggingFaceTrainerParams` class is used to define parameters for the traini | `training_parameters` | `transformers.TrainingArguments` | Contains the training arguments like learning rate, epochs, batch size, etc. | | `lora_config` | `LoraConfig` | LoRA configuration to reduce the number of trainable parameters in the model. | + + +#### Katib Search API for Defining Hyperparameter Search Space + +The **Katib Search API** allows users to define the search space for hyperparameters during model tuning. This API supports continuous, discrete, and categorical parameter sampling, enabling flexible and efficient hyperparameter optimization. + +Below are the available methods for defining hyperparameter search spaces: + +| **Function** | **Description** | **Parameter Type** | **Arguments** | +|------------------|----------------------------------------------------------|--------------------|---------------------------------------------------| +| `double()` | Samples a continuous float value within a specified range. | `double` | `min` (float, required), `max` (float, required), `step` (float, optional) | +| `int()` | Samples an integer value within a specified range. | `int` | `min` (int, required), `max` (int, required), `step` (int, optional) | +| `categorical()` | Samples a value from a predefined list of categories. | `categorical` | `list` (List, required) | + + ### Example Usage This is an **example** of how to use the `HuggingFaceTrainerParams` class to define the training and LoRA parameters. 
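Before the full `HuggingFaceTrainerParams` block below, this short sketch shows the three search helpers on their own; the variable names and value ranges are purely illustrative.

```python
import kubeflow.katib as katib

# Continuous float sampled between 1e-05 and 5e-05.
learning_rate = katib.search.double(min=1e-05, max=5e-05)

# Integer sampled between 8 and 32, e.g. a LoRA rank.
lora_rank = katib.search.int(min=8, max=32)

# One value chosen from a fixed list of candidates.
batch_size = katib.search.categorical([8, 16, 32])
```

Any of these can be assigned directly to a trainer or LoRA argument, as the complete example that follows does for the learning rate and the LoRA rank.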
```python +import kubeflow.katib as katib +from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams + from transformers import TrainingArguments from peft import LoraConfig -from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams # Set up training and LoRA configuration trainer_params = HuggingFaceTrainerParams( training_parameters=TrainingArguments( output_dir="results", - learning_rate=1e-5, + # Using katib search api to define a search space for the parameter + # learning_rate=1e-5, + learning_rate = katib.search.double(min=1e-05, max=5e-05), num_train_epochs=3, per_device_train_batch_size=8, ), From d0d3c92aba31ec2e8219fcb947dcc876538a16d9 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 13:48:31 +0100 Subject: [PATCH 19/48] update link title Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 49660c2725..0c6103187b 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -22,8 +22,8 @@ You need to install the following Katib components to run code in this guide: The following Python packages are automatically installed through the `extra_requires` section of the API, so you don't need to install them manually: -- Transformers from Hugging Face [install](https://pypi.org/project/transformers/). -- Peft from Hugging Face [install](https://pypi.org/project/peft/) +- Transformers from Hugging Face [pypi](https://pypi.org/project/transformers/). +- Peft from Hugging Face [pypi](https://pypi.org/project/peft/) This API supports both non-distributed training and distributed training using **PyTorchJob**. If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. From 2eda5a5b1b70f441ebffe70cbbab337f71c8538c Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 13:59:09 +0100 Subject: [PATCH 20/48] add two scenarios for tune function with custom objective or loading model and parameters from hugging face Signed-off-by: mahdikhashan --- .../user-guides/hp-tuning/llm-hp-optimization.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 0c6103187b..cc66aee38f 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -225,7 +225,20 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `pip_index_url` | The PyPI URL from which to install Python packages. | Optional | | `metrics_collector_config` | Configuration for the metrics collector. | Optional | -### Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset +### **Parameter Flexibility in the `tune` Function** + +The parameters `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters` are **optional** and default to `None` if not specified. 
This design offers users flexibility in configuring their hyperparameter optimization process. Users can choose between the following approaches: + +- **Use Predefined Models and Datasets:** + Import models and datasets from external platforms by providing values for `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters`. + +- **Define a Custom Objective Function:** + Customize the training process by specifying `objective`, `base_image`, and `parameters` to define a fully custom objective function. + +Although these parameters are optional, the API internally checks their existence to ensure consistency and proper configuration. + + +## Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. From d308043f2dde487e12b7c692a5288dad8dc076a2 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 14:01:42 +0100 Subject: [PATCH 21/48] add link for custom objective function example Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index cc66aee38f..312a875278 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -235,6 +235,8 @@ The parameters `model_provider_parameters`, `dataset_provider_parameters`, and ` - **Define a Custom Objective Function:** Customize the training process by specifying `objective`, `base_image`, and `parameters` to define a fully custom objective function. +For more information on creating custom objective functions, visit the [Katib Custom Objective Functions Guide](https://www.kubeflow.org/docs/components/katib/getting-started/). + Although these parameters are optional, the API internally checks their existence to ensure consistency and proper configuration. From f4165814018a70aba8221477f5acd82f73ea042d Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sun, 12 Jan 2025 14:04:35 +0100 Subject: [PATCH 22/48] improve tune section Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 312a875278..af3bbb450d 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -225,6 +225,31 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `pip_index_url` | The PyPI URL from which to install Python packages. | Optional | | `metrics_collector_config` | Configuration for the metrics collector. 
| Optional | +### Considerations for Hyperparameter Optimization + +Before exploring custom objective functions, consider the following important points: + +1. **Supported Objective Metric for LLMs** + Currently, for large language model (LLM) hyperparameter optimization, only `train_loss` is supported as the objective metric. This is because `train_loss` is the default metric produced by the `trainer.train()` function in Hugging Face, which our trainer utilizes. We plan to expand support for additional metrics in future updates. + +2. **Enabling Distributed Training with PyTorchJob** + To leverage PyTorchJob for distributed training during hyperparameter optimization, users can define a `types.TrainerResources` object for the `resources_per_trial` parameter. This allows precise allocation of computational resources for each trial. + + **Example Configuration:** + ```python + resources_per_trial = types.TrainerResources( + num_workers=4, # Number of distributed workers + num_procs_per_worker=2, # Processes per worker + resources_per_worker={ # Resource allocation per worker + "gpu": 2, # Number of GPUs + "cpu": 5, # Number of CPUs + "memory": "10G", # Memory allocation + }, + ) + ``` + +--- + ### **Parameter Flexibility in the `tune` Function** The parameters `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters` are **optional** and default to `None` if not specified. This design offers users flexibility in configuring their hyperparameter optimization process. Users can choose between the following approaches: @@ -235,10 +260,9 @@ The parameters `model_provider_parameters`, `dataset_provider_parameters`, and ` - **Define a Custom Objective Function:** Customize the training process by specifying `objective`, `base_image`, and `parameters` to define a fully custom objective function. -For more information on creating custom objective functions, visit the [Katib Custom Objective Functions Guide](https://www.kubeflow.org/docs/components/katib/getting-started/). - Although these parameters are optional, the API internally checks their existence to ensure consistency and proper configuration. +For more information on creating custom objective functions, visit the [Katib Custom Objective Functions Guide](https://www.kubeflow.org/docs/components/katib/getting-started/). 
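As a brief illustration of the custom objective route, the sketch below follows the getting-started pattern rather than an LLM training loop: the objective function is a stand-in, the experiment name and search ranges are arbitrary, and the metric is reported by printing it in `name=value` form so Katib's metrics collector can pick it up.

```python
import kubeflow.katib as katib
from kubeflow.katib import KatibClient

# Stand-in objective: report the metric by printing "<name>=<value>".
def objective(parameters):
    result = 4 * int(parameters["a"]) - float(parameters["b"]) ** 2
    print(f"result={result}")

cl = KatibClient(namespace="kubeflow")

cl.tune(
    name="custom-objective-demo",
    objective=objective,
    parameters={
        "a": katib.search.int(min=10, max=20),
        "b": katib.search.double(min=0.1, max=0.2),
    },
    objective_metric_name="result",
    objective_type="maximize",
    max_trial_count=12,
    resources_per_trial={"cpu": "1", "memory": "2G"},
)
```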
## Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset From 4a178388284aac906d7c98cc09e2d0ead478d534 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:17:51 +0100 Subject: [PATCH 23/48] improve title Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index af3bbb450d..b688996905 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -1,5 +1,5 @@ +++ -title = "How to Optimize Large Language Models Hyperparameters" +title = "How to Optimize Hyperparameters of LLMs with Kubeflow" description = "API description" weight = 20 +++ From 7298213a2605b1eebbfe6d07c6300ce63b8ca7e0 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:31:50 +0100 Subject: [PATCH 24/48] fix failing ci Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index b688996905..64fb9a3ede 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -8,9 +8,9 @@ This page describes Large Language Models hyperparameter (HP) optimization Pytho it. ## Sections -- [Prerequisites](#Prerequisites) -- [Load Model and Dataset](#Load-Model-and-Dataset) -- [Finetune](#Finetune-Language-Models) +- [Prerequisites](#prerequisites) +- [Load Model and Dataset](#load-model-and-dataset) +- [Finetune](#finetune-language-models) - [Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset](#example-fine-tuning-llama-32-for-binary-classification-on-imdb-dataset) ## Prerequisites From 372cce441459c7ea3bdbb061db49f043e64e9c34 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:32:11 +0100 Subject: [PATCH 25/48] add warning of alpha api Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 64fb9a3ede..280c27d3e0 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -4,6 +4,12 @@ description = "API description" weight = 20 +++ +{{% alert title="Warning" color="warning" %}} +This feature is in **alpha** stage and the Kubeflow community is looking for your feedback. Please +share your experience using the [#kubeflow-katib Slack channel](https://cloud-native.slack.com/archives/C0742LDFZ4K) +or the [Kubeflow Katib GitHub](https://github.com/kubeflow/katib/issues/new). +{{% /alert %}} + This page describes Large Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure it. 
From a582176141b65584d8928782547375e1fef3204f Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:33:40 +0100 Subject: [PATCH 26/48] improve links Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 280c27d3e0..37233db450 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -6,8 +6,8 @@ weight = 20 {{% alert title="Warning" color="warning" %}} This feature is in **alpha** stage and the Kubeflow community is looking for your feedback. Please -share your experience using the [#kubeflow-katib Slack channel](https://cloud-native.slack.com/archives/C0742LDFZ4K) -or the [Kubeflow Katib GitHub](https://github.com/kubeflow/katib/issues/new). +share your experience using the [#kubeflow-katib Slack channel](https://cloud-native.slack.com/archives/C073N7AS48P) +or the [Kubeflow Katib GitHub](https://github.com/kubeflow/katib/issues). {{% /alert %}} This page describes Large Language Models hyperparameter (HP) optimization Python API that Katib supports and how to configure From 90df939db6e0fa8930c9e90cd4bac33154e8d196 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:42:53 +0100 Subject: [PATCH 27/48] improve python code consistency Signed-off-by: mahdikhashan --- .../user-guides/hp-tuning/llm-hp-optimization.md | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 37233db450..17a7470f90 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -185,7 +185,7 @@ trainer_params = HuggingFaceTrainerParams( output_dir="results", # Using katib search api to define a search space for the parameter # learning_rate=1e-5, - learning_rate = katib.search.double(min=1e-05, max=5e-05), + learning_rate=katib.search.double(min=1e-05, max=5e-05), num_train_epochs=3, per_device_train_batch_size=8, ), @@ -274,16 +274,6 @@ For more information on creating custom objective functions, visit the [Katib Cu This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. 
-#### Model: -- [**Llama-3.2** from Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B) - -#### Dataset: -- [**IMDB movie reviews**](https://huggingface.co/datasets/stanfordnlp/imdb) (1000 samples for training) - -#### Training: -- Fine-tuning for binary classification -- Hyperparameter tuning with Katib - ### Katib Configuration The following table outlines the Katib configuration used for hyperparameter tuning in the fine-tuning process: @@ -301,6 +291,8 @@ The following table outlines the Katib configuration used for hyperparameter tun | `parallel_trial_count` | Number of trials to run in parallel, set to `2`. | | `resources_per_trial` | Resources allocated for each trial: 2 GPUs, 4 CPUs, 10GB memory. | +### Code Example + ```python import kubeflow.katib as katib from kubeflow.katib import KatibClient From 46f6e7cbc73e46d5af24ae59ca11dc89d9a47c69 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:43:29 +0100 Subject: [PATCH 28/48] define search space for r in LoraConfig Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 17a7470f90..81864d8780 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -190,7 +190,7 @@ trainer_params = HuggingFaceTrainerParams( per_device_train_batch_size=8, ), lora_config=LoraConfig( - r=8, + r=katib.search.int(min=8, max=32), lora_alpha=16, lora_dropout=0.1, bias="none", From 47a50b2a9a211de9ee9197248dbf0804eb974b64 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Mon, 13 Jan 2025 11:44:59 +0100 Subject: [PATCH 29/48] remove redundant line Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 81864d8780..5f5d8c9c2c 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -184,7 +184,6 @@ trainer_params = HuggingFaceTrainerParams( training_parameters=TrainingArguments( output_dir="results", # Using katib search api to define a search space for the parameter - # learning_rate=1e-5, learning_rate=katib.search.double(min=1e-05, max=5e-05), num_train_epochs=3, per_device_train_batch_size=8, From 7e4520f09640aaae6ec3afe1106a81bafcf324a8 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 11:34:01 +0100 Subject: [PATCH 30/48] make sure imports are all consistent in snippets Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 5f5d8c9c2c..ee0f3b6e45 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -87,6 +87,9 @@ The `HuggingFaceModelParams` dataclass holds configuration parameters for initia ```python 
from transformers import AutoModelForSequenceClassification +from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams + + params = HuggingFaceModelParams( model_uri="bert-base-uncased", transformer_type=AutoModelForSequenceClassification, @@ -123,6 +126,9 @@ The `S3DatasetParams` class is used for loading datasets from S3-compatible obje ##### Hugging Face ```python +from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams + + dataset_params = HuggingFaceDatasetParams( repo_id="imdb", # Public dataset repository ID on Hugging Face split="train", # Dataset split to load @@ -133,6 +139,9 @@ dataset_params = HuggingFaceDatasetParams( ##### S3 ```python +from kubeflow.storage_initializer.s3 import S3DatasetParams + + s3_params = S3DatasetParams( endpoint_url="https://s3.amazonaws.com", bucket_name="my-dataset-bucket", From 29e789ed8c788df01201052c7dd486f02925e8e0 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 12:17:30 +0100 Subject: [PATCH 31/48] improve link Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index ee0f3b6e45..1e3ac1e327 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -25,16 +25,13 @@ You need to install the following Katib components to run code in this guide: - Katib control plane [install](/docs/components/katib/installation/#installing-control-plane). - Katib Python SDK with LLM hyperparameter optimization support (`pip install -U kubeflow-katib[huggingface]`) or [install](/docs/components/katib/installation/#installing-python-sdk). - -The following Python packages are automatically installed through the `extra_requires` section of the API, so you don't need to install them manually: - - Transformers from Hugging Face [pypi](https://pypi.org/project/transformers/). - Peft from Hugging Face [pypi](https://pypi.org/project/peft/) This API supports both non-distributed training and distributed training using **PyTorchJob**. If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. 
-- [Training Operator control plane](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) +- Training Operator control plane [install](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) ## Load Model and Dataset From 72149051607c7835c96e1c9c4b0bef5d853061ae Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 12:36:14 +0100 Subject: [PATCH 32/48] improve fine-tune section Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 1e3ac1e327..794381abb6 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -236,18 +236,15 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si | `pip_index_url` | The PyPI URL from which to install Python packages. | Optional | | `metrics_collector_config` | Configuration for the metrics collector. | Optional | -### Considerations for Hyperparameter Optimization - -Before exploring custom objective functions, consider the following important points: - 1. **Supported Objective Metric for LLMs** Currently, for large language model (LLM) hyperparameter optimization, only `train_loss` is supported as the objective metric. This is because `train_loss` is the default metric produced by the `trainer.train()` function in Hugging Face, which our trainer utilizes. We plan to expand support for additional metrics in future updates. + 2. **Enabling Distributed Training with PyTorchJob** - To leverage PyTorchJob for distributed training during hyperparameter optimization, users can define a `types.TrainerResources` object for the `resources_per_trial` parameter. This allows precise allocation of computational resources for each trial. + To leverage PyTorchJob for distributed training during hyperparameter optimization, users can define a `types.TrainerResources` object for the `resources_per_trial` parameter. This allows precise allocation of computational resources for each trial. - **Example Configuration:** - ```python + **Example Configuration:** + ```python resources_per_trial = types.TrainerResources( num_workers=4, # Number of distributed workers num_procs_per_worker=2, # Processes per worker @@ -257,23 +254,12 @@ Before exploring custom objective functions, consider the following important po "memory": "10G", # Memory allocation }, ) - ``` - ---- - -### **Parameter Flexibility in the `tune` Function** - -The parameters `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters` are **optional** and default to `None` if not specified. This design offers users flexibility in configuring their hyperparameter optimization process. Users can choose between the following approaches: - -- **Use Predefined Models and Datasets:** - Import models and datasets from external platforms by providing values for `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters`. - -- **Define a Custom Objective Function:** - Customize the training process by specifying `objective`, `base_image`, and `parameters` to define a fully custom objective function. 
+ ``` -Although these parameters are optional, the API internally checks their existence to ensure consistency and proper configuration. +3. **Defining a Custom Objective Function** + In addition to importing models and datasets from external platforms using `model_provider_parameters`, `dataset_provider_parameters`, and `trainer_parameters`, users also have the flexibility to define a custom objective function, along with a custom image and parameters for hyperparameter optimization. -For more information on creating custom objective functions, visit the [Katib Custom Objective Functions Guide](https://www.kubeflow.org/docs/components/katib/getting-started/). + For detailed instructions, refer to [this guide](https://www.kubeflow.org/docs/components/katib/getting-started/). ## Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset From 05bf68896d64cabbb4a39317324a7956dc91163c Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 13:07:01 +0100 Subject: [PATCH 33/48] improve links in prerequisites Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 794381abb6..866b8d18a6 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -23,15 +23,13 @@ it. You need to install the following Katib components to run code in this guide: -- Katib control plane [install](/docs/components/katib/installation/#installing-control-plane). -- Katib Python SDK with LLM hyperparameter optimization support (`pip install -U kubeflow-katib[huggingface]`) or [install](/docs/components/katib/installation/#installing-python-sdk). -- Transformers from Hugging Face [pypi](https://pypi.org/project/transformers/). -- Peft from Hugging Face [pypi](https://pypi.org/project/peft/) +- Katib control plane – [install](/docs/components/katib/installation/#installing-control-plane). +- Katib Python SDK with LLM Hyperparameter Optimization Support – [install](https://www.kubeflow.org/docs/components/katib/installation/#installing-python-sdk) -This API supports both non-distributed training and distributed training using **PyTorchJob**. +Additionally, this API supports both non-distributed training and distributed training using **PyTorchJob**. If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. 
-- Training Operator control plane [install](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) +- Training Operator control plane – [install](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) ## Load Model and Dataset From 68aa6b80f9f96a4a163d84917e6e8a3d2a335865 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 13:21:49 +0100 Subject: [PATCH 34/48] improve structure of integrations section Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 94 ++++++++++--------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 866b8d18a6..6463f0f3aa 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -47,15 +47,9 @@ from kubeflow.storage_initializer.hugging_face import ( ) ``` -### S3-Compatible Object Storage Integration +### HuggingFaceModelParams -In addition to Hugging Face, you can integrate with S3-compatible object storage platforms to load datasets. To work with S3, use the `S3DatasetParams` class to define your dataset parameters. - -```python -from kubeflow.storage_initializer.s3 import S3DatasetParams -``` - -### HuggingFaceModelParams Description +#### Description The `HuggingFaceModelParams` dataclass holds configuration parameters for initializing Hugging Face models with validation checks. @@ -66,7 +60,7 @@ The `HuggingFaceModelParams` dataclass holds configuration parameters for initia | `access_token` | `Optional[str]` (default: `None`) | Token for accessing private models on Hugging Face. | | `num_labels` | `Optional[int]` (default: `None`) | Number of output labels (used for classification tasks). | -### Supported Transformer Types (`TRANSFORMER_TYPES`) +##### Supported Transformer Types (`TRANSFORMER_TYPES`) | **Model Type** | **Task** | |------------------------------------------------|-----------------------------| @@ -93,7 +87,9 @@ params = HuggingFaceModelParams( ) ``` -### HuggingFaceDatasetParams Description +### HuggingFaceDatasetParams + +#### Description The `HuggingFaceDatasetParams` class holds configuration parameters for loading datasets from Hugging Face with validation checks. @@ -103,23 +99,8 @@ The `HuggingFaceDatasetParams` class holds configuration parameters for loading | `access_token` | `Optional[str]` (default: `None`) | Token for accessing private datasets on Hugging Face. | | `split` | `Optional[str]` (default: `None`) | Dataset split to load (e.g., `"train"`, `"test"`). | -### S3DatasetParams Description - -The `S3DatasetParams` class is used for loading datasets from S3-compatible object storage. The parameters are defined as follows: - -| **Parameter** | **Type** | **Description** | -|-------------------|--------------------|-------------------------------------------------------------------| -| `endpoint_url` | `str` | URL of the S3-compatible storage service. | -| `bucket_name` | `str` | Name of the S3 bucket containing the dataset. | -| `file_key` | `str` | Key (path) to the dataset file within the bucket. | -| `region_name` | `str`, optional | The AWS region of the S3 bucket (optional). | -| `access_key` | `str`, optional | The access key for authentication with S3 (optional). 
| -| `secret_key` | `str`, optional | The secret key for authentication with S3 (optional). | - #### Example Usage -##### Hugging Face - ```python from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams @@ -131,24 +112,9 @@ dataset_params = HuggingFaceDatasetParams( ) ``` -##### S3 - -```python -from kubeflow.storage_initializer.s3 import S3DatasetParams - - -s3_params = S3DatasetParams( - endpoint_url="https://s3.amazonaws.com", - bucket_name="my-dataset-bucket", - file_key="datasets/train.csv", - region_name="us-west-2", - access_key="YOUR_ACCESS_KEY", - secret_key="YOUR_SECRET_KEY" -) -``` - +### HuggingFaceTrainerParams -### HuggingFaceTrainerParams Description +#### Description The `HuggingFaceTrainerParams` class is used to define parameters for the training process in the Hugging Face framework. It includes the training arguments and LoRA configuration to optimize model training. @@ -157,8 +123,6 @@ The `HuggingFaceTrainerParams` class is used to define parameters for the traini | `training_parameters` | `transformers.TrainingArguments` | Contains the training arguments like learning rate, epochs, batch size, etc. | | `lora_config` | `LoraConfig` | LoRA configuration to reduce the number of trainable parameters in the model. | - - #### Katib Search API for Defining Hyperparameter Search Space The **Katib Search API** allows users to define the search space for hyperparameters during model tuning. This API supports continuous, discrete, and categorical parameter sampling, enabling flexible and efficient hyperparameter optimization. @@ -172,7 +136,7 @@ Below are the available methods for defining hyperparameter search spaces: | `categorical()` | Samples a value from a predefined list of categories. | `categorical` | `list` (List, required) | -### Example Usage +#### Example Usage This is an **example** of how to use the `HuggingFaceTrainerParams` class to define the training and LoRA parameters. @@ -201,6 +165,44 @@ trainer_params = HuggingFaceTrainerParams( ) ``` +## S3-Compatible Object Storage Integration + +In addition to Hugging Face, you can integrate with S3-compatible object storage platforms to load datasets. To work with S3, use the `S3DatasetParams` class to define your dataset parameters. + +```python +from kubeflow.storage_initializer.s3 import S3DatasetParams +``` + +### S3DatasetParams + +#### Description + +The `S3DatasetParams` class is used for loading datasets from S3-compatible object storage. The parameters are defined as follows: + +| **Parameter** | **Type** | **Description** | +|-------------------|--------------------|-------------------------------------------------------------------| +| `endpoint_url` | `str` | URL of the S3-compatible storage service. | +| `bucket_name` | `str` | Name of the S3 bucket containing the dataset. | +| `file_key` | `str` | Key (path) to the dataset file within the bucket. | +| `region_name` | `str`, optional | The AWS region of the S3 bucket (optional). | +| `access_key` | `str`, optional | The access key for authentication with S3 (optional). | +| `secret_key` | `str`, optional | The secret key for authentication with S3 (optional). 
| + +#### Example Usage + +```python +from kubeflow.storage_initializer.s3 import S3DatasetParams + + +s3_params = S3DatasetParams( + endpoint_url="https://s3.amazonaws.com", + bucket_name="my-dataset-bucket", + file_key="datasets/train.csv", + region_name="us-west-2", + access_key="YOUR_ACCESS_KEY", + secret_key="YOUR_SECRET_KEY" +) +``` ## Finetune Language Models In the context of fine-tuning large language models (LLMs) like GPT, BERT, or similar transformer-based models, it is crucial to optimize various hyperparameters to improve model performance. This sub-section covers the key parameters used in tuning LLMs via a `tune` function, specifically using tools like Katib for automated hyperparameter optimization in Kubernetes environments. @@ -243,7 +245,7 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si **Example Configuration:** ```python - resources_per_trial = types.TrainerResources( + resources_per_trial = types.TrainerResources( num_workers=4, # Number of distributed workers num_procs_per_worker=2, # Processes per worker resources_per_worker={ # Resource allocation per worker From a5b84739dff83d0b22606a526382f295c6512980 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 13:22:21 +0100 Subject: [PATCH 35/48] add missing import Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 6463f0f3aa..c9cdb3b1e8 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -245,7 +245,10 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si **Example Configuration:** ```python - resources_per_trial = types.TrainerResources( + from kubeflow.katib.types import TrainerResources + + + resources_per_trial = TrainerResources( num_workers=4, # Number of distributed workers num_procs_per_worker=2, # Processes per worker resources_per_worker={ # Resource allocation per worker From 27f81895cd2d3b41a1ec79a141dc98b6e8cc1106 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Tue, 14 Jan 2025 13:35:57 +0100 Subject: [PATCH 36/48] replace local address instead of hardcoded link to website Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index c9cdb3b1e8..ff7d38b442 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -24,12 +24,12 @@ it. You need to install the following Katib components to run code in this guide: - Katib control plane – [install](/docs/components/katib/installation/#installing-control-plane). 
-- Katib Python SDK with LLM Hyperparameter Optimization Support – [install](https://www.kubeflow.org/docs/components/katib/installation/#installing-python-sdk) +- Katib Python SDK with LLM Hyperparameter Optimization Support – [install](/docs/components/katib/installation/#installing-python-sdk) Additionally, this API supports both non-distributed training and distributed training using **PyTorchJob**. If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. -- Training Operator control plane – [install](https://www.kubeflow.org/docs/components/training/installation/#installing-the-control-plane) +- Training Operator control plane – [install](/docs/components/training/installation/#installing-the-control-plane) ## Load Model and Dataset From b72a7231a43cadf9f96241a9ec1688b2f3f36c3c Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Thu, 16 Jan 2025 14:40:06 +0100 Subject: [PATCH 37/48] fix import Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index ff7d38b442..36e6ff4089 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -245,7 +245,7 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si **Example Configuration:** ```python - from kubeflow.katib.types import TrainerResources + from kubeflow.katib.types.trainer_resources import TrainerResources resources_per_trial = TrainerResources( From 47659c591942a5befcaf43956ca354fe73ad3d09 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 15:20:09 +0100 Subject: [PATCH 38/48] use hyperparameter optimization instead of fine-tune Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 36e6ff4089..63f4d608aa 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -16,8 +16,8 @@ it. ## Sections - [Prerequisites](#prerequisites) - [Load Model and Dataset](#load-model-and-dataset) -- [Finetune](#finetune-language-models) -- [Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset](#example-fine-tuning-llama-32-for-binary-classification-on-imdb-dataset) +- [Optimizing Hyperparameters of Large Language Models](#optimizing-hyperparameters-of-large-language-models) +- [Example: Optimizing Hyperparameters of Llama-3.2 for Binary Classification on IMDB Dataset](#example-optimizing-hyperparameters-of-llama-32-for-binary-classification-on-imdb-dataset) ## Prerequisites @@ -33,7 +33,7 @@ If you want to use **distributed training**, make sure to install the **Training ## Load Model and Dataset -To fine-tune a pre-trained model, it is essential to load the model and dataset from a provider. 
Currently, this can be done using external platforms like **Hugging Face** and **S3-compatible object storage** (e.g., Amazon S3) through the `storage_initializer` API from Kubeflow. +To optimize hyperparameters of a pre-trained model, it is essential to load the model and dataset from a provider. Currently, this can be done using external platforms like **Hugging Face** and **S3-compatible object storage** (e.g., Amazon S3) through the `storage_initializer` API from Kubeflow. ### Hugging Face Integration @@ -203,9 +203,9 @@ s3_params = S3DatasetParams( secret_key="YOUR_SECRET_KEY" ) ``` -## Finetune Language Models +## Optimizing Hyperparameters of Large Language Models -In the context of fine-tuning large language models (LLMs) like GPT, BERT, or similar transformer-based models, it is crucial to optimize various hyperparameters to improve model performance. This sub-section covers the key parameters used in tuning LLMs via a `tune` function, specifically using tools like Katib for automated hyperparameter optimization in Kubernetes environments. +In the context of optimizing hyperparameters of large language models (LLMs) like GPT, BERT, or similar transformer-based models, it is crucial to optimize various hyperparameters to improve model performance. This sub-section covers the key parameters used in tuning LLMs via a `tune` function, specifically using tools like Katib for automated hyperparameter optimization in Kubernetes environments. ### Key Parameters for LLM Hyperparameter Tuning @@ -264,26 +264,26 @@ In the context of fine-tuning large language models (LLMs) like GPT, BERT, or si For detailed instructions, refer to [this guide](https://www.kubeflow.org/docs/components/katib/getting-started/). -## Example: Fine-Tuning Llama-3.2 for Binary Classification on IMDB Dataset +## Example: Optimizing Hyperparameters of Llama-3.2 for Binary Classification on IMDB Dataset -This code provides an example of fine-tuning the [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. +This code provides an example of hyperparameter optimized [**Llama-3.2 model**](https://huggingface.co/meta-llama/Llama-3.2-1B) for a **binary classification** task on the [**IMDB movie reviews dataset**](https://huggingface.co/datasets/stanfordnlp/imdb). The **Llama-3.2 model** is fine-tuned using **LoRA** (Low-Rank Adaptation) to reduce the number of trainable parameters. The dataset used in this example consists of 1000 movie reviews from the **IMDB** dataset, and the training process is optimized through **Katib** to find the best hyperparameters. ### Katib Configuration -The following table outlines the Katib configuration used for hyperparameter tuning in the fine-tuning process: - -| **Parameter** | **Description** | -|----------------------------|-----------------------------------------------------------------------| -| `exp_name` | Name of the experiment (`Llama-3.2-fine-tune`). | -| `model_provider_parameters`| Parameters for the Hugging Face model (Llama-3.2). | -| `dataset_provider_parameters`| Parameters for the IMDB dataset (1000 movie reviews). 
| -| `trainer_parameters` | Parameters for the Hugging Face trainer, including LoRA settings. | -| `objective_metric_name` | The objective metric to minimize, in this case, `"train_loss"`. | -| `objective_type` | Type of optimization: `"minimize"` for training loss. | -| `algorithm_name` | The optimization algorithm used, set to `"random"` for random search.| -| `max_trial_count` | Maximum number of trials to run, set to `10`. | -| `parallel_trial_count` | Number of trials to run in parallel, set to `2`. | -| `resources_per_trial` | Resources allocated for each trial: 2 GPUs, 4 CPUs, 10GB memory. | +The following table outlines the Katib configuration used for hyperparameter optimization process: + +| **Parameter** | **Description** | +|----------------------------|----------------------------------------------------------------| +| `exp_name` | Name of the experiment (`llama`). | +| `model_provider_parameters`| Parameters for the Hugging Face model (Llama-3.2). | +| `dataset_provider_parameters`| Parameters for the IMDB dataset (1000 movie reviews). | +| `trainer_parameters` | Parameters for the Hugging Face trainer, including LoRA settings. | +| `objective_metric_name` | The objective metric to minimize, in this case, `"train_loss"`. | +| `objective_type` | Type of optimization: `"minimize"` for training loss. | +| `algorithm_name` | The optimization algorithm used, set to `"random"` for random search. | +| `max_trial_count` | Maximum number of trials to run, set to `10`. | +| `parallel_trial_count` | Number of trials to run in parallel, set to `2`. | +| `resources_per_trial` | Resources allocated for each trial: 2 GPUs, 4 CPUs, 10GB memory. | ### Code Example @@ -330,8 +330,8 @@ hf_tuning_parameters = HuggingFaceTrainerParams( cl = KatibClient(namespace="kubeflow") -# Fine-tuning for Binary Classification -exp_name = "Llama-3.2-fine-tune" +# Optimizing Hyperparameters for Binary Classification +exp_name = "llamafinetune" cl.tune( name = exp_name, model_provider_parameters = hf_model, From 85f2283871e442196c897dac75d3b480b42d7913 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 15:23:02 +0100 Subject: [PATCH 39/48] fix header levels Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 63f4d608aa..e34b79b405 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -47,9 +47,9 @@ from kubeflow.storage_initializer.hugging_face import ( ) ``` -### HuggingFaceModelParams +#### HuggingFaceModelParams -#### Description +##### Description The `HuggingFaceModelParams` dataclass holds configuration parameters for initializing Hugging Face models with validation checks. @@ -60,7 +60,7 @@ The `HuggingFaceModelParams` dataclass holds configuration parameters for initia | `access_token` | `Optional[str]` (default: `None`) | Token for accessing private models on Hugging Face. | | `num_labels` | `Optional[int]` (default: `None`) | Number of output labels (used for classification tasks). 
| -##### Supported Transformer Types (`TRANSFORMER_TYPES`) +###### Supported Transformer Types (`TRANSFORMER_TYPES`) | **Model Type** | **Task** | |------------------------------------------------|-----------------------------| @@ -71,7 +71,7 @@ The `HuggingFaceModelParams` dataclass holds configuration parameters for initia | `AutoModelForMaskedLM` | Masked language modeling | | `AutoModelForImageClassification` | Image classification | -#### Example Usage +##### Example Usage ```python from transformers import AutoModelForSequenceClassification @@ -87,9 +87,9 @@ params = HuggingFaceModelParams( ) ``` -### HuggingFaceDatasetParams +#### HuggingFaceDatasetParams -#### Description +##### Description The `HuggingFaceDatasetParams` class holds configuration parameters for loading datasets from Hugging Face with validation checks. @@ -99,7 +99,7 @@ The `HuggingFaceDatasetParams` class holds configuration parameters for loading | `access_token` | `Optional[str]` (default: `None`) | Token for accessing private datasets on Hugging Face. | | `split` | `Optional[str]` (default: `None`) | Dataset split to load (e.g., `"train"`, `"test"`). | -#### Example Usage +##### Example Usage ```python from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams @@ -112,9 +112,9 @@ dataset_params = HuggingFaceDatasetParams( ) ``` -### HuggingFaceTrainerParams +#### HuggingFaceTrainerParams -#### Description +##### Description The `HuggingFaceTrainerParams` class is used to define parameters for the training process in the Hugging Face framework. It includes the training arguments and LoRA configuration to optimize model training. @@ -123,7 +123,7 @@ The `HuggingFaceTrainerParams` class is used to define parameters for the traini | `training_parameters` | `transformers.TrainingArguments` | Contains the training arguments like learning rate, epochs, batch size, etc. | | `lora_config` | `LoraConfig` | LoRA configuration to reduce the number of trainable parameters in the model. | -#### Katib Search API for Defining Hyperparameter Search Space +##### Katib Search API for Defining Hyperparameter Search Space The **Katib Search API** allows users to define the search space for hyperparameters during model tuning. This API supports continuous, discrete, and categorical parameter sampling, enabling flexible and efficient hyperparameter optimization. @@ -136,7 +136,7 @@ Below are the available methods for defining hyperparameter search spaces: | `categorical()` | Samples a value from a predefined list of categories. | `categorical` | `list` (List, required) | -#### Example Usage +##### Example Usage This is an **example** of how to use the `HuggingFaceTrainerParams` class to define the training and LoRA parameters. @@ -165,7 +165,7 @@ trainer_params = HuggingFaceTrainerParams( ) ``` -## S3-Compatible Object Storage Integration +### S3-Compatible Object Storage Integration In addition to Hugging Face, you can integrate with S3-compatible object storage platforms to load datasets. To work with S3, use the `S3DatasetParams` class to define your dataset parameters. @@ -173,9 +173,9 @@ In addition to Hugging Face, you can integrate with S3-compatible object storage from kubeflow.storage_initializer.s3 import S3DatasetParams ``` -### S3DatasetParams +#### S3DatasetParams -#### Description +##### Description The `S3DatasetParams` class is used for loading datasets from S3-compatible object storage. 
The parameters are defined as follows: From 6a2e59ad7f73d752365f224ef1e3d912dc713bc5 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 15:26:10 +0100 Subject: [PATCH 40/48] replace code import Signed-off-by: mahdikhashan --- .../hp-tuning/llm-hp-optimization.md | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index e34b79b405..ef5a083db4 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -245,18 +245,13 @@ In the context of optimizing hyperparameters of large language models (LLMs) lik **Example Configuration:** ```python - from kubeflow.katib.types.trainer_resources import TrainerResources - - - resources_per_trial = TrainerResources( - num_workers=4, # Number of distributed workers - num_procs_per_worker=2, # Processes per worker - resources_per_worker={ # Resource allocation per worker - "gpu": 2, # Number of GPUs - "cpu": 5, # Number of CPUs - "memory": "10G", # Memory allocation - }, - ) + import kubeflow.katib as katib + + resources_per_trial=katib.TrainerResources( + num_workers=1, + num_procs_per_worker=1, + resources_per_worker={"gpu": 0, "cpu": 1, "memory": "10G",}, + ) ``` 3. **Defining a Custom Objective Function** From ef11299d409e27bee232f97e0b6b6f6872a79215 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 15:27:03 +0100 Subject: [PATCH 41/48] replace name Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index ef5a083db4..c00649664c 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -326,7 +326,7 @@ hf_tuning_parameters = HuggingFaceTrainerParams( cl = KatibClient(namespace="kubeflow") # Optimizing Hyperparameters for Binary Classification -exp_name = "llamafinetune" +exp_name = "llama" cl.tune( name = exp_name, model_provider_parameters = hf_model, From 37fbca2910770cd4c0b6f0d604b25a4584cb61fc Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 15:41:56 +0100 Subject: [PATCH 42/48] replace definition of distributed training Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index c00649664c..79dd5a247c 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -241,7 +241,7 @@ In the context of optimizing hyperparameters of large language models (LLMs) lik 2. **Enabling Distributed Training with PyTorchJob** - To leverage PyTorchJob for distributed training during hyperparameter optimization, users can define a `types.TrainerResources` object for the `resources_per_trial` parameter. This allows precise allocation of computational resources for each trial. 
+ To enable PyTorchJob for distributed training during hyperparameter optimization, you must define `resources_per_trial` using the `TrainerResources` object when importing models and datasets from external platforms. This requirement applies regardless of whether you choose to use distributed training for LLM hyperparameter optimization. If you prefer not to use distributed training, simply set `num_workers=1`. **Example Configuration:** ```python From e4e190c6003acdae19d728570dba1d742a236994 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 18:38:37 +0100 Subject: [PATCH 43/48] decrease header level Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 79dd5a247c..139933060b 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -188,7 +188,7 @@ The `S3DatasetParams` class is used for loading datasets from S3-compatible obje | `access_key` | `str`, optional | The access key for authentication with S3 (optional). | | `secret_key` | `str`, optional | The secret key for authentication with S3 (optional). | -#### Example Usage +##### Example Usage ```python from kubeflow.storage_initializer.s3 import S3DatasetParams From c59f268cc9405ecd366e083efad58a0dea35b2ca Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 18:40:07 +0100 Subject: [PATCH 44/48] decrease header level Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 139933060b..74eeda54f1 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -123,7 +123,7 @@ The `HuggingFaceTrainerParams` class is used to define parameters for the traini | `training_parameters` | `transformers.TrainingArguments` | Contains the training arguments like learning rate, epochs, batch size, etc. | | `lora_config` | `LoraConfig` | LoRA configuration to reduce the number of trainable parameters in the model. | -##### Katib Search API for Defining Hyperparameter Search Space +###### Katib Search API for Defining Hyperparameter Search Space The **Katib Search API** allows users to define the search space for hyperparameters during model tuning. This API supports continuous, discrete, and categorical parameter sampling, enabling flexible and efficient hyperparameter optimization. 
From 0447f73675f89451da927cbf84a830b29e50a38e Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Fri, 17 Jan 2025 18:46:44 +0100 Subject: [PATCH 45/48] update configuration for `resource_per_trial` Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 74eeda54f1..fe1d9330c0 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -240,8 +240,8 @@ In the context of optimizing hyperparameters of large language models (LLMs) lik Currently, for large language model (LLM) hyperparameter optimization, only `train_loss` is supported as the objective metric. This is because `train_loss` is the default metric produced by the `trainer.train()` function in Hugging Face, which our trainer utilizes. We plan to expand support for additional metrics in future updates. -2. **Enabling Distributed Training with PyTorchJob** - To enable PyTorchJob for distributed training during hyperparameter optimization, you must define `resources_per_trial` using the `TrainerResources` object when importing models and datasets from external platforms. This requirement applies regardless of whether you choose to use distributed training for LLM hyperparameter optimization. If you prefer not to use distributed training, simply set `num_workers=1`. +2. **Configuring resources_per_trial** + When importing models and datasets from external platforms, you are required to define `resources_per_trial` using the `TrainerResources` object. Setting `num_workers` to a value greater than 1 enables distributed training through PyTorchJob. To disable distributed training, simply set `num_workers=1`. **Example Configuration:** ```python From 94240ac495eebd637a94b2e7d4b29a091c822562 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 18 Jan 2025 13:09:54 +0100 Subject: [PATCH 46/48] update header title Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index fe1d9330c0..8327612b50 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -240,7 +240,7 @@ In the context of optimizing hyperparameters of large language models (LLMs) lik Currently, for large language model (LLM) hyperparameter optimization, only `train_loss` is supported as the objective metric. This is because `train_loss` is the default metric produced by the `trainer.train()` function in Hugging Face, which our trainer utilizes. We plan to expand support for additional metrics in future updates. -2. **Configuring resources_per_trial** +2. **Configuring `resources_per_trial`** When importing models and datasets from external platforms, you are required to define `resources_per_trial` using the `TrainerResources` object. Setting `num_workers` to a value greater than 1 enables distributed training through PyTorchJob. To disable distributed training, simply set `num_workers=1`. 
**Example Configuration:** From 23904d90b5e48a06cf9bf01d3a749878456e5ac1 Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 18 Jan 2025 13:19:02 +0100 Subject: [PATCH 47/48] update training operator control plane Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index 8327612b50..e595867f1d 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -26,8 +26,9 @@ You need to install the following Katib components to run code in this guide: - Katib control plane – [install](/docs/components/katib/installation/#installing-control-plane). - Katib Python SDK with LLM Hyperparameter Optimization Support – [install](/docs/components/katib/installation/#installing-python-sdk) -Additionally, this API supports both non-distributed training and distributed training using **PyTorchJob**. -If you want to use **distributed training**, make sure to install the **Training Operator** control plane in addition to the packages mentioned above. +Additionally, this API supports both non-distributed and distributed training using PyTorchJob. If you want to enable distributed training, you must install the Training Operator control plane in addition to the required packages mentioned above. + +Furthermore, when importing models and datasets from external platforms, you are required to define `resources_per_trial` using the `TrainerResources` object. This setup automatically utilizes PyTorchJob for training, making the installation of the Training Operator control plane **mandatory** in this case. Please ensure this setup is properly configured. - Training Operator control plane – [install](/docs/components/training/installation/#installing-the-control-plane) From e0ef3377e6d71f12b34a729b84540c2c3181769e Mon Sep 17 00:00:00 2001 From: mahdikhashan Date: Sat, 18 Jan 2025 20:39:36 +0100 Subject: [PATCH 48/48] improve prerequisites Signed-off-by: mahdikhashan --- .../katib/user-guides/hp-tuning/llm-hp-optimization.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md index e595867f1d..64c67e312f 100644 --- a/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md +++ b/content/en/docs/components/katib/user-guides/hp-tuning/llm-hp-optimization.md @@ -23,14 +23,11 @@ it. You need to install the following Katib components to run code in this guide: -- Katib control plane – [install](/docs/components/katib/installation/#installing-control-plane). +- Katib control plane – [install](/docs/components/katib/installation/#installing-control-plane) +- Training Operator control plane – [install](/docs/components/training/installation/#installing-the-control-plane) - Katib Python SDK with LLM Hyperparameter Optimization Support – [install](/docs/components/katib/installation/#installing-python-sdk) -Additionally, this API supports both non-distributed and distributed training using PyTorchJob. If you want to enable distributed training, you must install the Training Operator control plane in addition to the required packages mentioned above. 
- -Furthermore, when importing models and datasets from external platforms, you are required to define `resources_per_trial` using the `TrainerResources` object. This setup automatically utilizes PyTorchJob for training, making the installation of the Training Operator control plane **mandatory** in this case. Please ensure this setup is properly configured. - -- Training Operator control plane – [install](/docs/components/training/installation/#installing-the-control-plane) +**Note:** If you choose to define your own custom objective function and optimize parameters within it, distributed training is currently not supported. In this case, installing the Training Operator control plane is not necessary. For detailed instructions, please refer to [this guide](/docs/components/katib/getting-started). ## Load Model and Dataset