Skip to content

Commit

Permalink
Hardcode number of batches
Browse files (browse the repository at this point in the history)
Signed-off-by: sailesh duddupudi <[email protected]>
  • Loading branch information
saileshd1402 committed Jan 16, 2025
1 parent c80db23 commit 343090c
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ def accuracy(params, batch):
train_images, train_labels, test_images, test_labels = datasets.mnist()
num_train = train_images.shape[0]
num_complete_batches, leftover = divmod(num_train, batch_size)
- num_batches = num_complete_batches + bool(leftover)

+ # Increasing number of batches requires more resources.
+ num_batches = 10

def data_stream():
rng = npr.RandomState(0)
Expand Down Expand Up @@ -155,7 +157,6 @@ def replicate_array(x):

for epoch in range(num_epochs):
start_time = time.time()
- num_batches = 5
for _ in range(num_batches):
replicated_params = spmd_update(replicated_params, next(batches))
epoch_time = time.time() - start_time
Expand Down
6 changes: 3 additions & 3 deletions sdk/python/test/e2e/test_e2e_jaxjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def test_sdk_e2e_with_gang_scheduling(job_namespace):
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
- utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=9000)
+ utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
Expand Down Expand Up @@ -123,7 +123,7 @@ def test_sdk_e2e(job_namespace):
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
- utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=9000)
+ utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
Expand Down Expand Up @@ -156,5 +156,5 @@ def generate_container() -> V1Container:
return V1Container(
name=CONTAINER_NAME,
image=os.getenv("JAX_JOB_IMAGE", "docker.io/kubeflow/jaxjob-dist-spmd-mnist:latest"),
- # resources=V1ResourceRequirements(limits={"memory": "4Gi", "cpu": "1.6"}),
+ resources=V1ResourceRequirements(limits={"memory": "3Gi", "cpu": "1.2"}),
)

0 comments on commit 343090c

Please sign in to comment.