fix smoke tests bug #4492

Merged: 30 commits merged into master from dev/zeping/fix_smoke_tests on Jan 14, 2025
Changes from 21 commits (30 commits total)

Commits:
33bd131 fix release smoke tests (zpoint, Dec 20, 2024)
d787e21 robust backward compatibility (zpoint, Dec 20, 2024)
49beeec bug fix (zpoint, Dec 20, 2024)
1f98e11 Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Dec 25, 2024)
b8c86ec support gke mark (zpoint, Dec 25, 2024)
28ce35d update retry field (zpoint, Dec 27, 2024)
1aa9e9d Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Dec 27, 2024)
38a26bb fix (zpoint, Dec 27, 2024)
e117e5e fix smoke tests issue (zpoint, Dec 30, 2024)
caa0b86 update command of GCP (zpoint, Dec 31, 2024)
1a878bd comment for azure fix (zpoint, Dec 31, 2024)
6fd05bb merge master (zpoint, Jan 1, 2025)
30548d0 clear resources (zpoint, Jan 1, 2025)
5357036 remove legacy_credentials (zpoint, Jan 2, 2025)
3a51d3a Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Jan 2, 2025)
9ac96b1 bug fix (zpoint, Jan 2, 2025)
81ec3b2 resolve PR comment (zpoint, Jan 6, 2025)
168093a update comment (zpoint, Jan 6, 2025)
d53b849 text param for subprocess (zpoint, Jan 6, 2025)
1844001 longer timeout (zpoint, Jan 8, 2025)
758c31a Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Jan 8, 2025)
3ead41b rename to is_on_gcp (zpoint, Jan 9, 2025)
a85077d merge master (zpoint, Jan 9, 2025)
744163f revert initial_delay (zpoint, Jan 9, 2025)
017477d cache based on boot time (zpoint, Jan 9, 2025)
e701aed fix command (zpoint, Jan 9, 2025)
76dec93 Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Jan 9, 2025)
56392fc Merge branch 'master' into dev/zeping/fix_smoke_tests (zpoint, Jan 10, 2025)
ddcfb0a fix bug (zpoint, Jan 10, 2025)
7aac868 add TODO (zpoint, Jan 14, 2025)
23 changes: 18 additions & 5 deletions .buildkite/generate_pipeline.py
@@ -36,7 +36,7 @@
 QUEUE_GENERIC_CLOUD = 'generic_cloud'
 QUEUE_GENERIC_CLOUD_SERVE = 'generic_cloud_serve'
 QUEUE_KUBERNETES = 'kubernetes'
-QUEUE_KUBERNETES_SERVE = 'kubernetes_serve'
+QUEUE_GKE = 'gke'
 # Only aws, gcp, azure, and kubernetes are supported for now.
 # Other clouds do not have credentials.
 CLOUD_QUEUE_MAP = {
@@ -52,7 +52,9 @@
     'aws': QUEUE_GENERIC_CLOUD_SERVE,
     'gcp': QUEUE_GENERIC_CLOUD_SERVE,
     'azure': QUEUE_GENERIC_CLOUD_SERVE,
-    'kubernetes': QUEUE_KUBERNETES_SERVE
+    # Now we run kubernetes on a local cluster, so it should be fine if we run
+    # serve tests on the same queue as kubernetes.
+    'kubernetes': QUEUE_KUBERNETES
 }
 
 GENERATED_FILE_HEAD = ('# This is an auto-generated Buildkite pipeline by '
@@ -103,6 +105,7 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]:
         clouds_to_include = []
         clouds_to_exclude = []
         is_serve_test = False
+        run_on_gke = False
         for decorator in node.decorator_list:
             if isinstance(decorator, ast.Call):
                 # We only need to consider the decorator with no arguments
@@ -118,6 +121,9 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]:
                 if suffix == 'serve':
                     is_serve_test = True
                     continue
+                elif suffix == 'requires_gke':
+                    run_on_gke = True
+                    continue
                 if suffix not in PYTEST_TO_CLOUD_KEYWORD:
                     # This mark does not specify a cloud, so we skip it.
                     continue
@@ -150,12 +156,14 @@ def _extract_marked_tests(file_path: str) -> Dict[str, List[str]]:
         function_name = (f'{class_name}::{node.name}'
                          if class_name else node.name)
         function_cloud_map[function_name] = (final_clouds_to_include, [
-            cloud_queue_map[cloud] for cloud in final_clouds_to_include
+            QUEUE_GKE if run_on_gke else cloud_queue_map[cloud]
+            for cloud in final_clouds_to_include
         ])
     return function_cloud_map
 
 
-def _generate_pipeline(test_file: str) -> Dict[str, Any]:
+def _generate_pipeline(test_file: str,
+                       auto_retry: bool = False) -> Dict[str, Any]:
     """Generate a Buildkite pipeline from test files."""
     steps = []
     function_cloud_map = _extract_marked_tests(test_file)
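For illustration, a minimal sketch of the mapping this produces for a test carrying both @pytest.mark.kubernetes and @pytest.mark.requires_gke (the test name here is hypothetical):

# Each entry maps a test to (clouds, queues). A requires_gke mark forces
# the queue to QUEUE_GKE ('gke') instead of cloud_queue_map[cloud].
function_cloud_map = {
    'test_example_on_gke': (['kubernetes'], ['gke']),
}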
@@ -172,6 +180,11 @@ def _generate_pipeline(test_file: str) -> Dict[str, Any]:
                 },
                 'if': f'build.env("{cloud}") == "1"'
             }
+            if auto_retry:
+                step['retry'] = {
+                    # Automatically retry 2 times on any failure by default.
+                    'automatic': True
+                }
             steps.append(step)
     return {'steps': steps}

@@ -199,7 +212,7 @@ def _convert_release(test_files: List[str]):
     output_file_pipelines = []
     for test_file in test_files:
         print(f'Converting {test_file} to {yaml_file_path}')
-        pipeline = _generate_pipeline(test_file)
+        pipeline = _generate_pipeline(test_file, auto_retry=True)
         output_file_pipelines.append(pipeline)
         print(f'Converted {test_file} to {yaml_file_path}\n\n')
     # Enable all clouds by default for release pipeline.
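For reference, a sketch of the step dictionary _generate_pipeline now emits when auto_retry=True; the label, command, and queue values are illustrative, not taken from this diff:

step = {
    'label': 'test_minimal on aws',  # illustrative
    'command': 'pytest tests/smoke_tests/test_basic.py::test_minimal --aws',
    'agents': {'queue': 'generic_cloud'},  # illustrative queue name
    'if': 'build.env("aws") == "1"',
    # Added when auto_retry=True; Buildkite's automatic retry defaults
    # to retrying a failed job up to 2 times.
    'retry': {'automatic': True},
}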
6 changes: 6 additions & 0 deletions sky/data/mounting_utils.py
@@ -2,6 +2,7 @@
 import random
 import shlex
 import textwrap
+import time
 from typing import Optional
 
 from sky import exceptions
@@ -122,6 +123,11 @@ def get_az_mount_cmd(container_name: str,
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,
         container_name=container_name)
+    # The line below ensures the cache directory is new before mounting to
+    # avoid the "config error in file_cache [temp directory not empty]" error,
+    # which can occur after stopping and starting the same cluster on Azure.
+    # This helps ensure a clean state for blobfuse2 operations.
+    cache_path += f'_{int(time.time())}'
Collaborator: Does this cause the same issue for launch on an existing UP cluster, where we create a new folder every time?

     if _bucket_sub_path is None:
         bucket_sub_path_arg = ''
     else:
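As a concrete illustration of the timestamp suffix (the base path here is hypothetical, since _BLOBFUSE_CACHE_DIR itself is not shown in this diff):

import time

# Hypothetical resolved base path for account 'myaccount', container 'data':
cache_path = '~/.sky/blobfuse2_cache/myaccount_data'
cache_path += f'_{int(time.time())}'
# e.g. '~/.sky/blobfuse2_cache/myaccount_data_1736812800'
# A stopped-and-restarted cluster mounts with a new timestamp, so blobfuse2
# never sees a leftover, non-empty cache directory from the previous boot.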
11 changes: 8 additions & 3 deletions tests/backward_compatibility_tests.sh
@@ -50,6 +50,14 @@ uv pip install --prerelease=allow "azure-cli>=2.65.0"
 uv pip install -e ".[all]"
 
 
+clear_resources() {
+    sky down ${CLUSTER_NAME}* -y
+    sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
+}
+
+# Set trap to call cleanup on script exit
+trap clear_resources EXIT
+
 # exec + launch
 if [ "$start_from" -le 1 ]; then
   conda activate sky-back-compat-master
@@ -193,6 +201,3 @@ echo "$s"
   echo "$s" | grep "SUCCEEDED" | wc -l | grep 2 || exit 1
   echo "$s" | grep "CANCELLING\|CANCELLED" | wc -l | grep 1 || exit 1
 fi
-
-sky down ${CLUSTER_NAME}* -y
-sky jobs cancel -n ${MANAGED_JOB_JOB_NAME}* -y
2 changes: 1 addition & 1 deletion tests/skyserve/update/new_autoscaler_before.yaml
@@ -1,7 +1,7 @@
 service:
   readiness_probe:
     path: /health
-    initial_delay_seconds: 20
+    initial_delay_seconds: 50
   replicas: 2
 
 resources:
2 changes: 1 addition & 1 deletion tests/smoke_tests/test_basic.py
@@ -571,7 +571,7 @@ def test_kubernetes_context_failover():
         'kubectl get namespaces --context kind-skypilot | grep test-namespace || '
         '{ echo "Should set the namespace to test-namespace for kind-skypilot. Check the instructions in '
         'tests/test_smoke.py::test_kubernetes_context_failover." && exit 1; }',
-        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 3, 4, 5, 6, 7, 8"',
+        'sky show-gpus --cloud kubernetes --region kind-skypilot | grep H100 | grep "1, 2, 4, 8"',
         # Get contexts and set current context to the other cluster that is not kind-skypilot
         f'kubectl config use-context {context}',
         # H100 should not be in the current context
7 changes: 5 additions & 2 deletions tests/smoke_tests/test_cluster_job.py
@@ -567,6 +567,7 @@ def test_tpu_vm_pod():
 
 
 # ---------- TPU Pod Slice on GKE. ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_tpu_pod_slice_gke():
     name = smoke_tests_utils.get_cluster_name()
@@ -876,6 +877,7 @@ def test_add_and_remove_pod_annotations_with_autostop():
 
 
 # ---------- Container logs from task on Kubernetes ----------
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 def test_container_logs_multinode_kubernetes():
     name = smoke_tests_utils.get_cluster_name()
@@ -1429,6 +1431,7 @@ def test_aws_custom_image():
     smoke_tests_utils.run_one_test(test)
 
 
+@pytest.mark.requires_gke
 @pytest.mark.kubernetes
 @pytest.mark.parametrize(
     'image_id',
@@ -1570,7 +1573,7 @@ def test_azure_disk_tier():
     name = smoke_tests_utils.get_cluster_name() + '-' + disk_tier.value
     name_on_cloud = common_utils.make_cluster_name_on_cloud(
         name, sky.Azure.max_cluster_name_length())
-    region = 'westus2'
+    region = 'eastus2'
     test = smoke_tests_utils.Test(
         'azure-disk-tier-' + disk_tier.value,
         [
@@ -1592,7 +1595,7 @@ def test_azure_best_tier_failover():
     name = smoke_tests_utils.get_cluster_name()
     name_on_cloud = common_utils.make_cluster_name_on_cloud(
         name, sky.Azure.max_cluster_name_length())
-    region = 'westus2'
+    region = 'eastus2'
     test = smoke_tests_utils.Test(
         'azure-best-tier-failover',
         [
18 changes: 18 additions & 0 deletions tests/smoke_tests/test_images.py
@@ -19,6 +19,9 @@
 # Change cloud for generic tests to aws
 # > pytest tests/smoke_tests/test_images.py --generic-cloud aws
 
+import os
+import subprocess
+
 import pytest
 from smoke_tests import smoke_tests_utils
 
@@ -345,6 +348,21 @@ def test_gcp_mig():
 @pytest.mark.gcp
 def test_gcp_force_enable_external_ips():
     name = smoke_tests_utils.get_cluster_name()
+
+    # Command to check if the instance is on GCP
+    is_gcp_command = (
+        'curl -s -H "Metadata-Flavor: Google" '
+        '"http://metadata.google.internal/computeMetadata/v1/instance/name"')
+
+    # Run the GCP check
+    is_gcp = subprocess.run(f'{is_gcp_command}',
+                            shell=True,
+                            check=True,
+                            text=True,
+                            capture_output=True).stdout.strip()
+    if not is_gcp:
+        pytest.skip('Not on GCP, skipping test')
+
     test_commands = [
         f'sky launch -y -c {name} --cloud gcp --cpus 2 tests/test_yamls/minimal.yaml',
         # Check network of vm is "default"
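One caveat on the check above: with check=True, a failing curl on a non-GCP host (for example, a DNS failure resolving metadata.google.internal) would raise subprocess.CalledProcessError before pytest.skip is reached. A more defensive variant, sketched here rather than taken from this PR, inspects the return code instead:

import subprocess

import pytest

is_gcp_command = (
    'curl -s -H "Metadata-Flavor: Google" '
    '"http://metadata.google.internal/computeMetadata/v1/instance/name"')
# check=False: a failing curl yields a non-zero returncode instead of raising.
result = subprocess.run(is_gcp_command,
                        shell=True,
                        check=False,
                        text=True,
                        capture_output=True)
if result.returncode != 0 or not result.stdout.strip():
    pytest.skip('Not on GCP, skipping test')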
2 changes: 1 addition & 1 deletion tests/smoke_tests/test_managed_job.py
@@ -698,7 +698,7 @@ def test_managed_jobs_storage(generic_cloud: str):
             storage_lib.StoreType.GCS, output_storage_name, 'output.txt')
         output_check_cmd = f'{gcs_check_file_count} | grep 1'
     elif generic_cloud == 'azure':
-        region = 'westus2'
+        region = 'centralus'
         region_flag = f' --region {region}'
         storage_account_name = (
             storage_lib.AzureBlobStore.get_default_storage_account_name(region))
4 changes: 2 additions & 2 deletions tests/smoke_tests/test_mount_and_storage.py
@@ -1605,8 +1605,8 @@ def test_aws_regions(self, tmp_local_storage_obj, region):
         'europe-west8', 'europe-west9', 'europe-west10', 'europe-west12',
         'asia-east1', 'asia-east2', 'asia-northeast1', 'asia-northeast2',
         'asia-northeast3', 'asia-southeast1', 'asia-south1', 'asia-south2',
-        'asia-southeast2', 'me-central1', 'me-central2', 'me-west1',
-        'australia-southeast1', 'australia-southeast2', 'africa-south1'
+        'asia-southeast2', 'me-central1', 'me-west1', 'australia-southeast1',
+        'australia-southeast2', 'africa-south1'
     ])
     def test_gcs_regions(self, tmp_local_storage_obj, region):
         # This tests creation and upload to bucket in all GCS regions
1 change: 1 addition & 0 deletions tests/smoke_tests/test_sky_serve.py
@@ -180,6 +180,7 @@ def test_skyserve_azure_http():
 
 
 @pytest.mark.kubernetes
 @pytest.mark.serve
+@pytest.mark.requires_gke
 def test_skyserve_kubernetes_http():
     """Test skyserve on Kubernetes"""
     name = _get_service_name()