Skip to content

Commit

Permalink
add pulumi
Browse files Browse the repository at this point in the history
  • Loading branch information
calufa committed May 15, 2023
1 parent cf1d67a commit 535d370
Show file tree
Hide file tree
Showing 12 changed files with 536 additions and 0 deletions.
3 changes: 3 additions & 0 deletions infra/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gcp.json
app
Pulumi.dev.yaml
8 changes: 8 additions & 0 deletions infra/.sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
export NAME=stablediffusion
export PROJECT={PROJECT} # <-- replace
export REGION={REGION} # <-- replace
export NODE_COUNT={NODE_COUNT} # <-- replace
export MACHINE_TYPE={MACHINE_TYPE} # <-- replace
export REPLICAS={REPLICAS} # <-- replace
export PULUMI_CONFIG_PASSPHRASE={PULUMI_CONFIG_PASSPHRASE} # <-- replace
export GOOGLE_APPLICATION_CREDENTIALS=./gcp.json
39 changes: 39 additions & 0 deletions infra/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel

# System packages: git for VCS pip dependencies, ffmpeg/libsm6/libxext6 for
# OpenCV, wget for downloading model checkpoints. Clean the apt lists in the
# same layer so they don't bloat the image.
RUN apt update && \
    apt install -y \
        git \
        ffmpeg \
        libsm6 \
        libxext6 \
        wget && \
    rm -rf /var/lib/apt/lists/*

# Install dependencies (copied separately so this layer caches as long as
# the dependency files are unchanged)
WORKDIR /app
COPY ./app/requirements.txt /app/requirements.txt
COPY ./app/environment.yaml /app/environment.yaml
COPY ./app/setup.py /app/setup.py
RUN conda env create -f environment.yaml

# Make RUN commands use the new environment:
SHELL ["conda", "run", "-n", "ldm", "/bin/bash", "-c"]

# Install xformers for memory efficient flash attention
RUN conda install xformers -c xformers/label/dev

# Activate the env for interactive shells too (single layer)
RUN conda init bash && \
    echo "conda activate ldm" >> $HOME/.bashrc

# Install server dependencies (no pip cache inside the image)
RUN pip install --no-cache-dir \
    flask==2.3.2 \
    triton==2.0.0.post1

# Copy files into container (last, so code edits don't bust earlier layers)
COPY ./app /app
COPY ./server.py /app/server.py
COPY ./cmd.sh /app/cmd.sh

# Start server
EXPOSE 80
CMD ["bash", "cmd.sh"]
2 changes: 2 additions & 0 deletions infra/Pulumi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
name: stablediffusion
runtime: python
318 changes: 318 additions & 0 deletions infra/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
import pulumi
from pulumi_gcp import projects, container, config
from pulumi_docker import Image
from pulumi_kubernetes import Provider
from pulumi_kubernetes.core.v1 import Service
from pulumi_kubernetes.apps.v1 import Deployment
import google.auth
from google.auth.transport.requests import Request
from pulumi_kubernetes.apps.v1 import DaemonSet


# Stack configuration (values come from Pulumi config / .sample.env).
# Named `cfg` rather than `config` so it does not shadow the `config`
# module imported from pulumi_gcp above.
cfg = pulumi.Config()
name = cfg.require("name")                  # base name used for every resource
project = cfg.require("project")            # GCP project id
location = cfg.require("region")            # GCP region for the cluster
node_count = cfg.require_int("node_count")  # nodes in the default node pool
machine_type = cfg.require("machine_type")  # GCE machine type for each node
replicas = cfg.require_int("replicas")      # model-server pod replicas


# Fetch an OAuth2 access token from Application Default Credentials. Used as
# the registry password when pushing the image to GCR (user "oauth2accesstoken").
def get_access_token():
    """Return a valid OAuth2 access token for the cloud-platform scope.

    Refreshes the credentials when they have no token OR when the token has
    expired — the previous `not creds.token` check would keep handing out a
    stale token once the first one expired.
    """
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    creds, _ = google.auth.default(scopes=scopes)

    # `valid` is False both when no token has been fetched yet and when the
    # existing token is expired.
    if not creds.valid:
        creds.refresh(Request())

    return creds.token


# Google APIs that must be active in the project before we can push images
# to GCR and create GKE clusters.
def _enable_api(api):
    # Both the Pulumi resource name and the `service` argument are the
    # API's domain name, matching the original one-resource-per-API layout.
    return projects.Service(api, service=api, project=project)


container_api = _enable_api("container.googleapis.com")
cloud_resource_manager_api = _enable_api("cloudresourcemanager.googleapis.com")

# Build the Docker image locally and push it to Google Container Registry.
image = Image(
    name,
    image_name=f"gcr.io/{project}/{name}",
    build={
        "context": ".",
        # GKE nodes are amd64; force the platform for arm64 build hosts.
        "platform": "linux/amd64",
    },
    registry={
        "server": "gcr.io",
        # GCR accepts any OAuth2 access token with this literal username.
        "username": "oauth2accesstoken",
        # Wrap the token in Output.secret so it is encrypted in the Pulumi
        # state file instead of being stored in plain text.
        "password": pulumi.Output.secret(get_access_token()),
    },
    # Both APIs must be enabled before the push (and the cluster below).
    opts=pulumi.ResourceOptions(depends_on=[container_api, cloud_resource_manager_api]),
)

# Look up the GKE engine versions available in this project/location. The
# lookup is chained off the image digest purely so it (and everything that
# consumes it) runs after the image push; the digest value itself is unused.
def get_engine_versions(_digest):
    return container.get_engine_versions(project=project, location=location)


engine_versions = pulumi.Output.all([image.repo_digest]).apply(get_engine_versions)

# Create the GKE cluster that will run the model server.
# NOTE(review): guest_accelerator requests one nvidia-tesla-a100 per node;
# A100s are only offered on a2-* machine types — confirm the configured
# machine_type is compatible, or cluster creation will fail.
cluster = container.Cluster(
    name,
    project=project,
    location=location,
    initial_node_count=node_count,
    # Pin both masters and nodes to the newest version GKE offers here.
    min_master_version=engine_versions.latest_master_version,
    node_version=engine_versions.latest_master_version,
    node_config={
        "machine_type": machine_type,
        # OAuth scopes granted to the node service account.
        "oauth_scopes": [
            "https://www.googleapis.com/auth/compute",
            "https://www.googleapis.com/auth/devstorage.read_only",
            "https://www.googleapis.com/auth/logging.write",
            "https://www.googleapis.com/auth/monitoring",
        ],
        "image_type": "COS_CONTAINERD",
        # One GPU attached to every node in the default pool.
        "guest_accelerator": [
            {
                "type": "nvidia-tesla-a100",
                "count": 1,
            }
        ],
    },
    # The image must exist in GCR before the cluster's workloads pull it.
    opts=pulumi.ResourceOptions(depends_on=[image]),
)


def generate_kubeconfig(name, endpoint, master_auth, gcp_project=None, gcp_location=None):
    """Render a kubeconfig for a GKE cluster using gke-gcloud-auth-plugin.

    Args:
        name: Cluster name.
        endpoint: Cluster master endpoint (host/IP, no scheme).
        master_auth: Mapping with a ``cluster_ca_certificate`` entry
            (base64-encoded CA certificate).
        gcp_project: GCP project id; defaults to the stack's configured
            project when omitted (backward compatible with old callers).
        gcp_location: Cluster location; defaults to the stack's configured
            region when omitted.

    Returns:
        The kubeconfig as a YAML string.
    """
    # Fall back to the module-level stack config only when the caller did
    # not supply explicit values; the parameters make the function reusable
    # and testable in isolation.
    proj = project if gcp_project is None else gcp_project
    loc = location if gcp_location is None else gcp_location
    context = f"{proj}_{loc}_{name}"
    return f"""apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: {master_auth['cluster_ca_certificate']}
    server: https://{endpoint}
  name: {context}
contexts:
- context:
    cluster: {context}
    user: {context}
  name: {context}
current-context: {context}
kind: Config
preferences: {{}}
users:
- name: {context}
  user:
    exec:
      apiVersion: client.authentication.k8s.io/v1beta1
      command: gke-gcloud-auth-plugin
      installHint: Install gke-gcloud-auth-plugin for use with kubectl by following
        https://cloud.google.com/blog/products/containers-kubernetes/kubectl-auth-changes-in-gke
      provideClusterInfo: true
"""


# Materialize the kubeconfig string once the cluster's outputs resolve, then
# point a Kubernetes provider at the new cluster so the resources below are
# deployed into it rather than into the ambient kubectl context.
kubeconfig = pulumi.Output.all(
    cluster.name, cluster.endpoint, cluster.master_auth
).apply(lambda args: generate_kubeconfig(args[0], args[1], args[2]))

cluster_provider = Provider(name, kubeconfig=kubeconfig)

# Deploy Google's NVIDIA driver-installer DaemonSet (a Python translation of
# GKE's daemonset-preloaded manifest) so every GPU node builds and loads the
# NVIDIA kernel driver before GPU workloads are scheduled onto it.
nvidia_gpu_device_plugin = DaemonSet(
    "nvidia-gpu-device-plugin",
    metadata={
        "name": "nvidia-driver-installer",
        "namespace": "kube-system",
        "labels": {"k8s-app": "nvidia-driver-installer"},
    },
    spec={
        "selector": {"matchLabels": {"k8s-app": "nvidia-driver-installer"}},
        "updateStrategy": {"type": "RollingUpdate"},
        "template": {
            "metadata": {
                "labels": {
                    "name": "nvidia-driver-installer",
                    "k8s-app": "nvidia-driver-installer",
                }
            },
            "spec": {
                "priorityClassName": "system-node-critical",
                # Run only on nodes that have a GPU attached AND do not
                # already have a pre-installed GPU driver version.
                "affinity": {
                    "nodeAffinity": {
                        "requiredDuringSchedulingIgnoredDuringExecution": {
                            "nodeSelectorTerms": [
                                {
                                    "matchExpressions": [
                                        {
                                            "key": "cloud.google.com/gke-accelerator",
                                            "operator": "Exists",
                                        },
                                        {
                                            "key": "cloud.google.com/gke-gpu-driver-version",
                                            "operator": "DoesNotExist",
                                        },
                                    ]
                                }
                            ]
                        }
                    }
                },
                # Tolerate every taint so the installer also lands on
                # tainted GPU nodes.
                "tolerations": [{"operator": "Exists"}],
                # The installer operates directly on the host OS.
                "hostNetwork": True,
                "hostPID": True,
                # Host directories mounted into the init containers below.
                "volumes": [
                    {"name": "dev", "hostPath": {"path": "/dev"}},
                    {
                        "name": "vulkan-icd-mount",
                        "hostPath": {
                            "path": "/home/kubernetes/bin/nvidia/vulkan/icd.d"
                        },
                    },
                    {
                        "name": "nvidia-install-dir-host",
                        "hostPath": {"path": "/home/kubernetes/bin/nvidia"},
                    },
                    {"name": "root-mount", "hostPath": {"path": "/"}},
                    {"name": "cos-tools", "hostPath": {"path": "/var/lib/cos-tools"}},
                    {"name": "nvidia-config", "hostPath": {"path": "/etc/nvidia"}},
                ],
                "initContainers": [
                    # Step 1: install the NVIDIA driver onto the COS host.
                    # NOTE(review): `cos-nvidia-installer:fixed` with pull
                    # policy Never presumably relies on an image preloaded on
                    # GKE COS nodes — confirm before changing node image type.
                    {
                        "image": "cos-nvidia-installer:fixed",
                        "imagePullPolicy": "Never",
                        "name": "nvidia-driver-installer",
                        "resources": {"requests": {"cpu": "150m"}},
                        # Privileged: the installer loads kernel modules.
                        "securityContext": {"privileged": True},
                        # Host/container path mapping consumed by the
                        # installer script.
                        "env": [
                            {
                                "name": "NVIDIA_INSTALL_DIR_HOST",
                                "value": "/home/kubernetes/bin/nvidia",
                            },
                            {
                                "name": "NVIDIA_INSTALL_DIR_CONTAINER",
                                "value": "/usr/local/nvidia",
                            },
                            {
                                "name": "VULKAN_ICD_DIR_HOST",
                                "value": "/home/kubernetes/bin/nvidia/vulkan/icd.d",
                            },
                            {
                                "name": "VULKAN_ICD_DIR_CONTAINER",
                                "value": "/etc/vulkan/icd.d",
                            },
                            {"name": "ROOT_MOUNT_DIR", "value": "/root"},
                            {
                                "name": "COS_TOOLS_DIR_HOST",
                                "value": "/var/lib/cos-tools",
                            },
                            {
                                "name": "COS_TOOLS_DIR_CONTAINER",
                                "value": "/build/cos-tools",
                            },
                        ],
                        "volumeMounts": [
                            {
                                "name": "nvidia-install-dir-host",
                                "mountPath": "/usr/local/nvidia",
                            },
                            {
                                "name": "vulkan-icd-mount",
                                "mountPath": "/etc/vulkan/icd.d",
                            },
                            {"name": "dev", "mountPath": "/dev"},
                            {"name": "root-mount", "mountPath": "/root"},
                            {"name": "cos-tools", "mountPath": "/build/cos-tools"},
                        ],
                    },
                    # Step 2: partition GPUs using the freshly installed
                    # driver libraries (Google's nvidia-partition-gpu image,
                    # pinned by digest).
                    {
                        "image": "gcr.io/gke-release/nvidia-partition-gpu@sha256:c54fd003948fac687c2a93a55ea6e4d47ffbd641278a9191e75e822fe72471c2",
                        "name": "partition-gpus",
                        "env": [
                            {
                                "name": "LD_LIBRARY_PATH",
                                "value": "/usr/local/nvidia/lib64",
                            }
                        ],
                        "resources": {"requests": {"cpu": "150m"}},
                        "securityContext": {"privileged": True},
                        "volumeMounts": [
                            {
                                "name": "nvidia-install-dir-host",
                                "mountPath": "/usr/local/nvidia",
                            },
                            {"name": "dev", "mountPath": "/dev"},
                            {"name": "nvidia-config", "mountPath": "/etc/nvidia"},
                        ],
                    },
                ],
                # All real work happens in the init containers; "pause" just
                # keeps the pod alive so the DaemonSet reports Ready.
                "containers": [
                    {"image": "gcr.io/google-containers/pause:2.0", "name": "pause"}
                ],
            },
        },
    },
    opts=pulumi.ResourceOptions(provider=cluster_provider),
)


# Create the Kubernetes deployment that runs the model-server image.
deployment = Deployment(
    name,
    metadata={"name": name},
    spec={
        # Recreate (not RollingUpdate) — presumably because each pod claims
        # a whole GPU, leaving no headroom to run old and new pods side by
        # side during an update; confirm before changing.
        "strategy": {
            "type": "Recreate",
        },
        "replicas": replicas,
        "selector": {"matchLabels": {"app": name}},
        "template": {
            "metadata": {"labels": {"app": name}},
            "spec": {
                "containers": [
                    {
                        "name": name,
                        # Pin to the pushed image digest so pods are replaced
                        # whenever the image is rebuilt.
                        "image": image.repo_digest,
                        # Claim one GPU per pod.
                        "resources": {"limits": {"nvidia.com/gpu": 1}},
                        "ports": [{"containerPort": 80}],
                    },
                ],
            },
        },
    },
    # Wait for the driver-installer DaemonSet so GPUs are usable on nodes.
    opts=pulumi.ResourceOptions(
        provider=cluster_provider, depends_on=[nvidia_gpu_device_plugin]
    ),
)

# Expose the deployment publicly on port 80 through a cloud load balancer.
_http_port = {
    "protocol": "TCP",
    "port": 80,
    "targetPort": 80,
}
service = Service(
    name,
    spec={
        "type": "LoadBalancer",
        "selector": {"app": name},
        "ports": [_http_port],
    },
    opts=pulumi.ResourceOptions(provider=cluster_provider, depends_on=[deployment]),
)

# Publish the load balancer's external IP as a stack output once GCP
# assigns it to the service.
pulumi.export(
    "load_balancer_ip",
    service.status.apply(lambda s: s.load_balancer.ingress[0].ip),
)
6 changes: 6 additions & 0 deletions infra/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Container entrypoint: fetch the Stable Diffusion checkpoint (if missing),
# then start the Flask server.
set -euo pipefail

# -p: don't fail when the directory survives a container restart.
mkdir -p checkpoints
cd checkpoints
# Skip the multi-GB download when the checkpoint already exists.
if [ ! -f v2-1_768-ema-pruned.ckpt ]; then
    wget https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.ckpt
fi
cd ..
# Output directory for generated images served by the gallery page.
mkdir -p static
python server.py
2 changes: 2 additions & 0 deletions infra/destroy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/usr/bin/env bash
# Tear down the dev stack. Requires a .env file (see .sample.env) providing
# PULUMI_CONFIG_PASSPHRASE, GOOGLE_APPLICATION_CREDENTIALS, etc.
set -e

source .env
pulumi destroy --yes --stack dev
11 changes: 11 additions & 0 deletions infra/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<!-- Jinja2 template: renders every image URL in `images` as a simple gallery. -->
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Gallery</title>
</head>
<body>
    {% for image in images %}
        <img src="{{ image }}" alt="Image">
    {% endfor %}
</body>
</html>
Loading

0 comments on commit 535d370

Please sign in to comment.