Skip to content

Commit

Permalink
bursting working on compute engine! (#194)
Browse files Browse the repository at this point in the history
* bursting working on compute engine!

Signed-off-by: vsoch <[email protected]>
  • Loading branch information
vsoch authored Jul 10, 2023
1 parent ef47268 commit f5c6b31
Show file tree
Hide file tree
Showing 16 changed files with 658 additions and 25 deletions.
15 changes: 15 additions & 0 deletions api/v1alpha1/minicluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,17 @@ type Bursting struct {
//+optional
LeadBroker FluxBroker `json:"leadBroker"`

// Hostlist is a custom hostlist for the broker.toml
// that includes the local plus bursted cluster. This
// is typically used for bursting to another resource
// type, where we can predict the hostnames but they
// don't follow the same convention as the Flux Operator
//+optional
Hostlist string `json:"hostlist"`

// External clusters to burst to. Each external
// cluster must share the same listing to align ranks
//+optional
Clusters []BurstedCluster `json:"clusters"`
}

Expand Down Expand Up @@ -770,6 +779,12 @@ func (f *MiniCluster) Validate() bool {
f.Spec.Flux.Bursting.LeadBroker.Port = 8050
}

// A custom hostlist and bursted clusters are mutually exclusive: reject specs that set both
if f.Spec.Flux.Bursting.Hostlist != "" && len(f.Spec.Flux.Bursting.Clusters) > 0 {
fmt.Printf("😥️ A custom hostlist cannot be provided with a bursting spec, choose one or the other!\n")
return false
}

// Set default port if unset
for b, bursted := range f.Spec.Flux.Bursting.Clusters {

Expand Down
8 changes: 5 additions & 3 deletions api/v1alpha1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@
"Bursting": {
"description": "Bursting Config For simplicity, we internally handle the name of the job (hostnames)",
"type": "object",
"required": [
"clusters"
],
"properties": {
"clusters": {
"description": "External clusters to burst to. Each external cluster must share the same listing to align ranks",
Expand All @@ -37,6 +34,11 @@
"$ref": "#/definitions/BurstedCluster"
}
},
"hostlist": {
"description": "Hostlist is a custom hostlist for the broker.toml that includes the local plus bursted cluster. This is typically used for bursting to another resource type, where we can predict the hostnames but they don't follow the same convention as the Flux Operator",
"type": "string",
"default": ""
},
"leadBroker": {
"description": "The lead broker ip address to join to. E.g., if we burst to cluster 2, this is the address to connect to cluster 1 For the first cluster, this should not be defined",
"default": {},
Expand Down
9 changes: 8 additions & 1 deletion api/v1alpha1/zz_generated.openapi.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 7 additions & 2 deletions chart/templates/minicluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,13 @@ spec:
type: integer
type: object
type: array
hostlist:
description: Hostlist is a custom hostlist for the broker.toml
that includes the local plus bursted cluster. This is typically
used for bursting to another resource type, where we can predict
the hostnames but they don't follow the same convention as
the Flux Operator
type: string
leadBroker:
description: The lead broker ip address to join to. E.g., if
we burst to cluster 2, this is the address to connect to cluster
Expand Down Expand Up @@ -329,8 +336,6 @@ spec:
- name
- size
type: object
required:
- clusters
type: object
connectTimeout:
default: 5s
Expand Down
9 changes: 7 additions & 2 deletions config/crd/bases/flux-framework.org_miniclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,13 @@ spec:
type: integer
type: object
type: array
hostlist:
description: Hostlist is a custom hostlist for the broker.toml
that includes the local plus bursted cluster. This is typically
used for bursting to another resource type, where we can
predict the hostnames but they don't follow the same convention
as the Flux Operator
type: string
leadBroker:
description: The lead broker ip address to join to. E.g.,
if we burst to cluster 2, this is the address to connect
Expand Down Expand Up @@ -331,8 +338,6 @@ spec:
- name
- size
type: object
required:
- clusters
type: object
connectTimeout:
default: 5s
Expand Down
23 changes: 17 additions & 6 deletions controllers/flux/minicluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,17 @@ func (r *MiniClusterReconciler) getConfigMap(
// generateHostlist builds the hostlist string for the given cluster and size
func generateHostlist(cluster *api.MiniCluster, size int32) string {

// If we don't have a leadbroker address, we are at the root
var hosts string
if cluster.Spec.Flux.Bursting.LeadBroker.Address == "" {
if cluster.Spec.Flux.Bursting.Hostlist != "" {

// Case 1: we are given a custom hostlist.
// This is usually the case when we are bursting to a different resource type,
// where the hostnames do not follow the Flux Operator naming convention.
hosts = cluster.Spec.Flux.Bursting.Hostlist

} else if cluster.Spec.Flux.Bursting.LeadBroker.Address == "" {

// If we don't have a leadbroker address, we are at the root
hosts = fmt.Sprintf("%s-[%s]", cluster.Name, generateRange(size, 0))

} else {
Expand All @@ -419,12 +427,15 @@ func generateHostlist(cluster *api.MiniCluster, size int32) string {
)
}

// Now regardless of where we are, we add the bursted jobs in the same order.
// For cases where the Flux Operator determines the hostlist, we need to
// add the bursted jobs in the same order.
// Any cluster with bursting must share all the bursted hosts across clusters;
// this ensures that the ranks line up.
for _, bursted := range cluster.Spec.Flux.Bursting.Clusters {
burstedHosts := fmt.Sprintf("%s-[%s]", bursted.Name, generateRange(bursted.Size, 0))
hosts = fmt.Sprintf("%s,%s", hosts, burstedHosts)
if cluster.Spec.Flux.Bursting.Hostlist == "" {
for _, bursted := range cluster.Spec.Flux.Bursting.Clusters {
burstedHosts := fmt.Sprintf("%s-[%s]", bursted.Name, generateRange(bursted.Size, 0))
hosts = fmt.Sprintf("%s,%s", hosts, burstedHosts)
}
}
return hosts
}
Expand Down
23 changes: 21 additions & 2 deletions docs/getting_started/custom-resource-definition.md
Original file line number Diff line number Diff line change
Expand Up @@ -536,8 +536,27 @@ Using the above, both the main and bursted to cluster will have almost the same
and broker.toml (config). The main difference will be that the bursted cluster knows about the first one via
its IP address or hostname, and not, for example, `flux-sample-0`. Also note that when bursting, you don't
explicitly give a command to the bursted cluster - the jobs are launched on the main cluster and sent
to these external resources when they come up and are available (and needed). For a full example,
see [the bursting](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting)
to these external resources when they come up and are available (and needed).

Finally, for advanced bursting cases where the pattern of hostnames does not match the convention
used by the Flux Operator, we allow the CRD to define a custom hostlist. As an example, here is how
we might burst to Compute Engine:

```yaml
flux:
leadBroker:
# This is the name of the first minicluster.yaml spec
name: flux-sample
# In a cloud environment this would be a NodePort
address: 24.123.50.123
port: 30093
hostlist: "flux-sample-[0-3],gffw-compute-a-[001-003]"
```

In the above case, the `clusters` field is not used; a custom `hostlist` and `clusters` cannot be provided together.
The bursting plugin you use will determine how the hostnames and address are provided to the remote (second) cluster.

For full examples, see [the bursting](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting)
examples directory.


Expand Down
1 change: 1 addition & 0 deletions docs/tutorials/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ The following tutorials are provided from their respective directories (and are

- [Bursting to GKE](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting/broker-gke) from a local broker to an external Google Kubernetes Engine cluster.
- [Bursting to EKS](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting/broker-eks) from a local broker to an external Amazon Elastic Kubernetes Service cluster.
- [Bursting to Compute Engine](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting/broker-compute-engine) from a GKE broker to an external Compute Engine cluster.
- [Bursting (nginx service)](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/bursting/nginx) a design that uses a central router for bursting.

#### Nested
Expand Down
9 changes: 7 additions & 2 deletions examples/dist/flux-operator-arm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,13 @@ spec:
type: integer
type: object
type: array
hostlist:
description: Hostlist is a custom hostlist for the broker.toml
that includes the local plus bursted cluster. This is typically
used for bursting to another resource type, where we can
predict the hostnames but they don't follow the same convention
as the Flux Operator
type: string
leadBroker:
description: The lead broker ip address to join to. E.g.,
if we burst to cluster 2, this is the address to connect
Expand Down Expand Up @@ -337,8 +344,6 @@ spec:
- name
- size
type: object
required:
- clusters
type: object
connectTimeout:
default: 5s
Expand Down
9 changes: 7 additions & 2 deletions examples/dist/flux-operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,13 @@ spec:
type: integer
type: object
type: array
hostlist:
description: Hostlist is a custom hostlist for the broker.toml
that includes the local plus bursted cluster. This is typically
used for bursting to another resource type, where we can
predict the hostnames but they don't follow the same convention
as the Flux Operator
type: string
leadBroker:
description: The lead broker ip address to join to. E.g.,
if we burst to cluster 2, this is the address to connect
Expand Down Expand Up @@ -337,8 +344,6 @@ spec:
- name
- size
type: object
required:
- clusters
type: object
connectTimeout:
default: 5s
Expand Down
Loading

0 comments on commit f5c6b31

Please sign in to comment.