Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tests] Fix SkyServe Smoke Test #4566

Merged
merged 9 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/skyserve/auto_restart.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 1


Expand Down
5 changes: 4 additions & 1 deletion tests/skyserve/llm/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ setup: |
fi

# Install dependencies
pip install "fschat[model_worker,webui]==0.2.24"
# TODO(tian): Pinning transformers<4.48.0 is a temporary workaround for a
# breaking change in transformers 4.48.0. Update to the latest version once
# the issue is fixed. Ref: https://github.com/huggingface/transformers/issues/35639
pip install "fschat[model_worker,webui]==0.2.24" "transformers<4.48.0"
pip install sentencepiece protobuf

run: |
Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/restart/user_bug.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
replicas: 1


Expand Down
3 changes: 2 additions & 1 deletion tests/skyserve/spot/base_ondemand_fallback.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ resources:
cpus: 2+
use_spot: true

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py

# Use port 8080 to test that the jupyter service is terminated
run: python3 server.py --port 8080
2 changes: 1 addition & 1 deletion tests/skyserve/update/bump_version_after.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ service:
replicas: 3

resources:
ports: 8080
ports: 8081
cpus: 2+

setup: |
Expand Down
2 changes: 1 addition & 1 deletion tests/skyserve/update/bump_version_before.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ service:
replicas: 2

resources:
ports: 8080
ports: 8081
cpus: 2+

setup: |
Expand Down
3 changes: 2 additions & 1 deletion tests/skyserve/update/new_autoscaler_after.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ resources:
use_spot: true
cpus: 2+

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py

run: |
if [ $SKYPILOT_SERVE_REPLICA_ID -eq 7 ]; then
Expand Down
5 changes: 3 additions & 2 deletions tests/skyserve/update/new_autoscaler_before.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
service:
readiness_probe:
path: /health
initial_delay_seconds: 20
initial_delay_seconds: 60
cblmemo marked this conversation as resolved.
Show resolved Hide resolved
replicas: 2

resources:
ports: 8081
cpus: 2+

workdir: examples/serve/http_server
setup: |
wget https://raw.githubusercontent.com/skypilot-org/skypilot/refs/heads/master/examples/serve/http_server/server.py

run: python3 server.py --port 8081
49 changes: 43 additions & 6 deletions tests/smoke_tests/test_sky_serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,50 @@ def _get_service_name() -> str:
_SERVE_ENDPOINT_WAIT = (
'export ORIGIN_SKYPILOT_DEBUG=$SKYPILOT_DEBUG; export SKYPILOT_DEBUG=0; '
'endpoint=$(sky serve status --endpoint {name}); '
'until ! echo "$endpoint" | grep "Controller is initializing"; '
'until ! echo "$endpoint" | grep -qE "Controller is initializing|^-$"; '
'do echo "Waiting for serve endpoint to be ready..."; '
'sleep 5; endpoint=$(sky serve status --endpoint {name}); done; '
'export SKYPILOT_DEBUG=$ORIGIN_SKYPILOT_DEBUG; echo "$endpoint"')

_SERVE_STATUS_WAIT = ('s=$(sky serve status {name}); '
'until ! echo "$s" | grep "Controller is initializing."; '
'do echo "Waiting for serve status to be ready..."; '
'sleep 5; s=$(sky serve status {name}); done; echo "$s"')
# Shell snippet template; fill in with `.format(name=<service name>)`.
# It polls `sky serve status {name}` every 5 seconds until the output no
# longer contains "Controller is initializing.", then prints the final status.
# The status text is left in the shell variable `$s`, so commands appended
# after this snippet can inspect it without querying again.
_SERVE_STATUS_WAIT = (
's=$(sky serve status {name}); '
# Wait for "Controller is initializing." to disappear from the status output.
'until ! echo "$s" | grep "Controller is initializing."; '
'do '
' echo "Waiting for serve status to be ready..."; '
' sleep 5; '
' s=$(sky serve status {name}); '
'done; '
'echo "$s"')

_WAIT_PROVISION_REPR = (
# Shell snippet template (fill in with `.format(name=<service name>)`) that
# is run after `_SERVE_STATUS_WAIT`: it first reads the status already stored
# in the shell variable `$s`, then re-polls `sky serve status {name}` every
# 2 seconds while it waits.
#
# Why this wait exists: `_check_replica_in_status` counts the occurrences of
# `vCPU=2` in the `sky serve status` output and uses that as the number of
# replicas. A replica in the PROVISIONING state may show its resources as
# `-`, because the desired `launched_resources` has not been decided yet,
# which would make that count wrong. We therefore wait until the number of
# PROVISIONING lines that contain `vCPU=2` equals the total number of
# PROVISIONING lines.
#
# NOTE(tian): This assumes the replica will not do failover, as the requested
# resources are only 2 vCPUs and are likely to be immediately available in
# every region, hence no failover. If a replica does go through failover,
# this assumption may not hold.
# See #4565 for more information.
#
# Count the PROVISIONING replicas, and those of them already showing vCPU=2.
'num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); '
'num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); '
'until [ "$num_provisioning" -eq "$num_vcpu_in_provision" ]; '
'do '
' echo "Waiting for provisioning resource repr ready..."; '
' echo "PROVISIONING: $num_provisioning, vCPU: $num_vcpu_in_provision"; '
' sleep 2; '
' s=$(sky serve status {name}); '
' num_provisioning=$(echo "$s" | grep "PROVISIONING" | wc -l); '
' num_vcpu_in_provision=$(echo "$s" | grep "PROVISIONING" | grep "vCPU=2" | wc -l); '
'done; '
# The loop exits once both counts match, i.e. every replica still in
# PROVISIONING shows `vCPU=2` (this also holds when no replica is in
# PROVISIONING at all). `$s` holds the latest status at this point.
'echo "Provisioning complete. PROVISIONING: $num_provisioning, vCPU=2: $num_vcpu_in_provision"'
)


def _get_replica_ip(name: str, replica_id: int) -> str:
Expand Down Expand Up @@ -141,7 +176,9 @@ def _check_replica_in_status(name: str, check_tuples: List[Tuple[int, bool,
resource_str = f'({spot_str}vCPU=2)'
check_cmd += (f' echo "$s" | grep "{resource_str}" | '
f'grep "{status}" | wc -l | grep {count} || exit 1;')
return (f'{_SERVE_STATUS_WAIT.format(name=name)}; echo "$s"; ' + check_cmd)
return (f'{_SERVE_STATUS_WAIT.format(name=name)}; '
f'{_WAIT_PROVISION_REPR.format(name=name)}; '
f'echo "$s"; {check_cmd}')


def _check_service_version(service_name: str, version: str) -> str:
Expand Down
Loading