diff --git a/.activate.sh b/.activate.sh index 5d40028f79..5f6cfd1bc1 120000 --- a/.activate.sh +++ b/.activate.sh @@ -1 +1 @@ -.tox/py37-linux/bin/activate \ No newline at end of file +.tox/py38-linux/bin/activate \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1e1e843e68..3732c0107b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,10 @@ jobs: fail-fast: false matrix: toxenv: - - py38-linux,docs,mypy,tests + - py38-linux + - docs + - mypy + - tests - general_itests env: DOCKER_REGISTRY: "" diff --git a/.gitignore b/.gitignore index 660fe9e3e0..44cc926814 100644 --- a/.gitignore +++ b/.gitignore @@ -43,12 +43,14 @@ paasta_itests/fake_etc_paasta/marathon.json yelp_package/gopath .mypy_cache/ .pytest_cache/ +.hypothesis/ debian/.debhelper example_cluster/paasta/docker_registry.json general_itests/fake_etc_paasta/clusters.json pip-wheel-metadata debian/debhelper-build-stamp unique-run +.vault-token # Coverage artifacts .coverage diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000000..dc3c22889d --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# RTD defaults as of 2023-11-08 +build: + os: ubuntu-22.04 + tools: + python: "3.8" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/source/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: requirements-docs.txt diff --git a/Makefile b/Makefile index c79ca995ab..cb9ded6341 100644 --- a/Makefile +++ b/Makefile @@ -23,8 +23,10 @@ endif ifeq ($(PAASTA_ENV),YELP) export DOCKER_REGISTRY ?= docker-dev.yelpcorp.com/ + export DOCKER_OPT_ARGS ?= else - export DOCKER_REGISTRY ?= "" + export DOCKER_REGISTRY ?= docker.io/ + export DOCKER_OPT_ARGS ?= --user `id -u`:`id -g` export INDEX_URL_BUILD_ARG ?= PIP_INDEX_URL endif @@ -50,7 +52,7 @@ test-not-yelpy: .paasta/bin/activate .paasta/bin/tox -e tests quick-test: .tox/py38-linux - TZ=UTC .tox/py38-linux/bin/py.test --last-failed -x -- tests + TZ=UTC .tox/py38-linux/bin/py.test --failed-first -x --disable-warnings -- tests .tox/py38-linux: .paasta/bin/activate .paasta/bin/tox @@ -115,8 +117,8 @@ k8s_clean: .paasta/bin/activate # in paasta repo: java -jar ~/openapi-generator/modules/openapi-generator-cli/target/openapi-generator-cli.jar openapi-codegen: rm -rf paasta_tools/paastaapi - docker run --rm -i --user `id -u`:`id -g` -v `pwd`:/src -w /src \ - yelp/openapi-generator-cli:20201026 \ + docker run --rm -i ${DOCKER_OPT_ARGS} -v `pwd`:/src -w /src \ + ${DOCKER_REGISTRY}yelp/openapi-generator-cli:20201026 \ generate \ -i paasta_tools/api/api_docs/oapi.yaml \ -g python-experimental \ @@ -127,8 +129,8 @@ openapi-codegen: rm -rf temp-openapi-client swagger-validate: - docker run --rm -i --user `id -u`:`id -g` -v 
`pwd`:/src -w /src \ - yelp/openapi-generator-cli:20201026 \ + docker run --rm -i ${DOCKER_OPT_ARGS} -v `pwd`:/src -w /src \ + ${DOCKER_REGISTRY}yelp/openapi-generator-cli:20201026 \ validate \ -i paasta_tools/api/api_docs/swagger.json @@ -158,6 +160,13 @@ setup-kubernetes-job: k8s_fake_cluster generate_deployments_for_service export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\ .tox/py38-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground --shuffle --group-lines 1 | xargs --no-run-if-empty .tox/py38-linux/bin/python -m paasta_tools.setup_kubernetes_job -d ./soa_config_playground -c kind-${USER}-k8s-test +.PHONY: cleanup-kubernetes-jobs +cleanup-kubernetes-jobs: + export KUBECONFIG=./k8s_itests/kubeconfig;\ + export PAASTA_SYSTEM_CONFIG_DIR=./etc_paasta_playground/;\ + export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\ + .tox/py38-linux/bin/python -m paasta_tools.cleanup_kubernetes_jobs -d ./soa_config_playground -c kind-${USER}-k8s-test --force + .PHONY: paasta-secrets-sync paasta-secrets-sync: setup-kubernetes-job .vault-token export KUBECONFIG=./k8s_itests/kubeconfig;\ @@ -165,6 +174,19 @@ paasta-secrets-sync: setup-kubernetes-job .vault-token export PAASTA_TEST_CLUSTER=kind-${USER}-k8s-test;\ { .tox/py38-linux/bin/python -m paasta_tools.list_kubernetes_service_instances -d ./soa_config_playground ; echo -n \ _shared; } | cut -f1 -d"." | uniq | shuf | xargs .tox/py38-linux/bin/python -m paasta_tools.kubernetes.bin.paasta_secrets_sync -v -d ./soa_config_playground -t ./.vault-token +define ANNOUNCE_CRONS_BODY +The following PaaSTA cron jobs will run on an infinite loop using the PaaSTA Playground k8s cluster: +- setup-kubernetes-job +- cleanup-kubernetes-jobs +- paasta-secrets-sync +- generate_deployments_for_service +endef +export ANNOUNCE_CRONS_BODY +.PHONY: paasta-cronjobs +paasta-cronjobs: + @echo "$$ANNOUNCE_CRONS_BODY" + while true; do make paasta-secrets-sync && make cleanup-kubernetes-jobs; sleep 5; done + .vault-token: export VAULT_ADDR=https://vault-devc.yelpcorp.com:8200 ;\ export VAULT_SKIP_VERIFY=true ;\ diff --git a/README.md b/README.md index ee19b25396..eb5b5b58d5 100644 --- a/README.md +++ b/README.md @@ -5,17 +5,23 @@ ![PaaSTA Logo](http://engineeringblog.yelp.com/images/previews/paasta_preview.png) PaaSTA is a highly-available, distributed system for building, deploying, and -running services using containers and Apache Mesos! +running services using containers and Kubernetes. + +PaaSTA has been running production services at Yelp since 2016. It was +originally designed to run on top of Apache Mesos but has subsequently been +updated to use Kubernetes. Over time the features and functionality that +PaaSTA provides have increased but the principal design remains the same. + +PaaSTA aims to take a declarative description of the services that teams need +to run and then ensures that those services are deployed safely, efficiently, +and in a manner that is easy for the teams to maintain. Rather than managing +Kubernetes YAML files, PaaSTA provides a simplified schema to describe your service +and in addition to configuring Kubernetes it can also configure other infrastructure +tools to provide monitoring, logging, cost management, etc. Want to know more about the opinions behind what makes PaaSTA special? Check out the [PaaSTA Principles](http://paasta.readthedocs.io/en/latest/about/paasta_principles.html). 
-*Note*: PaaSTA has been running in production at Yelp for years, -and has a number of "Yelpisms" still lingering in the codebase. We have made -efforts to excise them, but there are bound to be lingering issues. Please help us -by opening an [issue](https://github.com/Yelp/paasta/issues/new) or -better yet a [pull request](https://github.com/Yelp/paasta/pulls). - ## Components *Note*: PaaSTA is an opinionated platform that uses a few un-opinionated @@ -23,27 +29,37 @@ tools. It requires a non-trivial amount of infrastructure to be in place before it works completely: * [Docker](http://www.docker.com/) for code delivery and containment - * [Mesos](http://mesos.apache.org/) / [Kubernetes](https://kubernetes.io/) for code execution and scheduling (runs Docker containers) - * [Marathon](https://mesosphere.github.io/marathon/) for managing long-running services + * [Kubernetes](https://kubernetes.io/) for code execution and scheduling (runs Docker containers) * [Tron](https://tron.readthedocs.io/en/latest/) for running things on a timer (nightly batches) - * [SmartStack](http://nerds.airbnb.com/smartstack-service-discovery-cloud/) / [Envoy](https://www.envoyproxy.io/) for service registration and discovery - * [Sensu](https://sensuapp.org/) for monitoring/alerting + * [SmartStack](http://nerds.airbnb.com/smartstack-service-discovery-cloud/) and [Envoy](https://www.envoyproxy.io/) for service registration and discovery + * [Sensu](https://sensu.io/) for monitoring/alerting * [Jenkins](https://jenkins-ci.org/) (optionally) for continuous deployment + * [Prometheus](https://prometheus.io/) and [HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) for autoscaling services + +One advantage to having a PaaS composed of components like these is you +get to reuse them for other purposes. For example, at Yelp Sensu is not just for +PaaSTA, it can be used to monitor all sorts of things. We also use Kubernetes to run +other more complex workloads like [Jolt](https://dcos.io/events/2017/jolt-distributed-fault-tolerant-tests-at-scale-on-mesos/) and [Cassandra](https://engineeringblog.yelp.com/2020/11/orchestrating-cassandra-on-kubernetes-with-operators.html). Our service mesh, which +is a heavily customised version of SmartStack and Envoy, allows many systems at Yelp +to communicate with PaaSTA services and each other. + +On the other hand, requiring lots of components means lots of infrastructure to +set up before PaaSTA can work effectively! Realistically, running PaaSTA outside of Yelp +would not be sensible, because in addition to the integrations mentioned above we also +have strong opinions encoded in other tooling that you would need to replicate. Nevertheless, +we code PaaSTA in the open because we think it is useful to share our approach and hope that +the code can at least help others understand or solve similar problems. -The main advantage to having a PaaS composed of components like these is you -get to reuse them for other purposes. For example at Yelp Sensu is not just for -PaaSTA, it can be used to monitor all sorts of things. Also Mesos can be -re-used for things like custom frameworks. For example at Yelp we use the Mesos -infrastructure to run our large-scale testing framework: -[Jolt](https://dcos.io/events/2017/jolt-distributed-fault-tolerant-tests-at-scale-on-mesos/). -SmartStack is used at Yelp for service discovery for Non-PaaSTA things as well, -like databases, legacy apps, and Puppet-defined apps. 
Most PaaS's do not -allow for this type of component re-use. - -On the other hand, requiring lots of components means lots of infrastructure to -setup before PaaSTA is fully baked. If you are looking for a project that -doesn't require external components, we encourage you to look at the doc -[comparing PaaSTA to other tools](https://github.com/Yelp/paasta/blob/master/comparison.md). +## Integrations and Features + +In addition to the direct integrations above PaaSTA also relies on other components +to provide PaaSTA users with other features and to manage compute capacity at Yelp. + +* We use [Karpenter](https://karpenter.sh/) to autoscale pools of EC2 instances to run PaaSTA. Formerly we used our own autoscaler [Clusterman](https://engineeringblog.yelp.com/2019/11/open-source-clusterman.html) +* We bake AMIs using [Packer](https://www.packer.io/) +* We collect logs from services and send them via [Monk](https://engineeringblog.yelp.com/2020/01/streams-and-monk-how-yelp-approaches-kafka-in-2020.html) to [Kafka](https://kafka.apache.org/) +* We use [StatefulSets](https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/) to run a few stateful PaaSTA services +* We autotune the resources needed by each service by monitoring usage (similar to [VPA](https://github.com/kubernetes/autoscaler/tree/master/vertical-pod-autoscaler)) ## Design Goals @@ -54,16 +70,11 @@ doesn't require external components, we encourage you to look at the doc * No single points of failure * Pleasant interface -PaaSTA is an opinionated platform, and it is not designed to interoperate with -every possible backend service out there. - -Think of it as an example of how we have integrated these technologies together -to build a cohesive PaaS. It is not a turn-key PaaS solution. - ## Getting Started See the [getting started](http://paasta.readthedocs.io/en/latest/installation/getting_started.html) -documentation for how to deploy PaaSTA. +documentation for how to deploy PaaSTA. This reference is intended to help understand how PaaSTA +works but we don't advise that you use PaaSTA in production. ## Debugging PaaSTA (in VS Code) diff --git a/contrib/python-k8s-client.diff b/contrib/python-k8s-client.diff new file mode 100644 index 0000000000..1ee45ccca7 --- /dev/null +++ b/contrib/python-k8s-client.diff @@ -0,0 +1,11 @@ +--- a/debian/paasta/opt/venvs/paasta/lib/python3.8/site-packages/kubernetes/client/api_client.py ++++ b/debian/paasta/opt/venvs/paasta/lib/python3.8/site-packages/kubernetes/client/api_client.py +@@ -629,7 +629,7 @@ + 'get_real_child_model'): + return data + +- kwargs = {} ++ kwargs = {"local_vars_configuration": self.configuration} + if (data is not None and + klass.openapi_types is not None and + isinstance(data, (list, dict))): diff --git a/debian/changelog b/debian/changelog index 6fd31a4a94..40ba0a47fa 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,698 @@ +paasta-tools (0.218.6) xenial; urgency=medium + + * 0.218.6 tagged with 'make release' + Commit: Speed up secret syncing (#3803) It turns out that the + slowness here was due to our hardcoded .3s sleep - we"re now syncing + a lot more secrets (and generally doing more work) because of our + namespace sharding, so this sleep ends up taking quite a bit of + time. We generally haven"t had to throttle much (outside of Pod + creation, but that"s generally due to other constraints: e.g., + cluster autoscaling and whatnot), so it"s probably fine to remove + this sleep entirely for the time being. 
That said, in the interest + of safety, I"ve made this delay configurable - so if this ends up + causing issues after release (or in the far future), we can easily + tweak the delay. + + -- Luis Perez Tue, 20 Feb 2024 11:59:32 -0800 + +paasta-tools (0.218.5) xenial; urgency=medium + + * 0.218.5 tagged with 'make release' + Commit: PAASTA-18005: Add default timeout to paasta-api (#3800) + + -- Jon Lee Thu, 15 Feb 2024 06:09:53 -0800 + +paasta-tools (0.218.4) xenial; urgency=medium + + * 0.218.4 tagged with 'make release' + Commit: Scale with ready pods count rather current replicas for + uwsgi (#3802) We had some surprising behavior recently where an + autoscaled app was crashlooping and the HPA did not keep the desired + scale as-is - we believe that this is due to a mismatch in how we + calculate the value we send to k8s: The existing PromQL divides by + current replica to turn things into a utilization percentage such + that when the HPA multiplies later on by the number of current + replicas, we reach our target replicas. However, the HPA will + actually multiply by the number of ready replicas, leading to a + mismatch. (h/t to @krall for wording things much better than I + could) + + -- Luis Perez Wed, 14 Feb 2024 10:06:51 -0800 + +paasta-tools (0.218.3) xenial; urgency=medium + + * 0.218.3 tagged with 'make release' + Commit: Fixes local tests and openapi-codegen (#3801) + + -- Jon Lee Tue, 13 Feb 2024 13:49:03 -0800 + +paasta-tools (0.218.2) xenial; urgency=medium + + * 0.218.2 tagged with 'make release' + Commit: Remove spark mesos logic (#3799) + + -- Chi Chang Thu, 08 Feb 2024 09:07:55 -0800 + +paasta-tools (0.218.1) xenial; urgency=medium + + * 0.218.1 tagged with 'make release' + Commit: Merge branch "u/mpiano/SECNESS-1911" + + -- Matteo Piano Wed, 07 Feb 2024 02:03:23 -0800 + +paasta-tools (0.218.0) xenial; urgency=medium + + * 0.218.0 tagged with 'make release' + Commit: Call ensure_namespace() in paasta_secrets_sync (#3792) At + the moment, you cannot pre-create secrets on a new dual-name cluster + since the namespaces for services will not yet have been created by + setup_kubernetes_job (and they may not have been created by Flux yet + either). We can fix this by calling ensure_namespace() in the + secret sync code - the caching we have on + ensure_namespace()/get_all_namespaces() means that this *should* be + pretty fast as it should be grabbing cached data most of the time. 
+ + -- Luis Perez Tue, 06 Feb 2024 09:25:50 -0800 + +paasta-tools (0.217.3) xenial; urgency=medium + + * 0.217.3 tagged with 'make release' + Commit: Displays EKS in PaaSTA Status output (#3788) + + -- Jon Lee Tue, 30 Jan 2024 11:11:19 -0800 + +paasta-tools (0.217.2) xenial; urgency=medium + + * 0.217.2 tagged with 'make release' + Commit: Merge pull request #3784 from + Yelp/jfong/fix_noconfiguration_race_condition Catch NoConfiguration + exceptions in bounce_status + + -- Jen Patague Thu, 25 Jan 2024 12:59:59 -0800 + +paasta-tools (0.217.1) xenial; urgency=medium + + * 0.217.1 tagged with 'make release' + Commit: Merge pull request #3782 from Yelp/u/vit/SEC-18515-bump- + boto3 SEC-18515: Bump boto3 to enable IMDSv2 usage + + -- Vincent Thibault Wed, 24 Jan 2024 13:02:25 -0800 + +paasta-tools (0.217.0) xenial; urgency=medium + + * 0.217.0 tagged with 'make release' + Commit: Merge pull request #3758 from Yelp/u/gonabavi/DREIMP- + 10204_vitess_deployment_poc Setup scaffolding for Vitess deployment + PoC + + -- Luis Perez Tue, 23 Jan 2024 09:08:49 -0800 + +paasta-tools (0.216.0) xenial; urgency=medium + + * 0.216.0 tagged with 'make release' + Commit: Merge pull request #3776 from + Yelp/u/jfong/remove_uwsgi_sidecar Stop using uwsgi sidecars and + supporting custom stats port (Again) + + -- Jen Patague Tue, 16 Jan 2024 14:00:16 -0800 + +paasta-tools (0.215.1) xenial; urgency=medium + + * 0.215.1 tagged with 'make release' + Commit: Add placeholder value for non-existent boto_keys (#3773) + Previously, using an invalid value here would cause the service to + fail to launch. Now that we"re migrating away from using AWS users, + people mess this up more often, and it may be desirable to let the + pod launch with placeholder values. Placeholder values instead just + ignoring them and not placing keys was much simpler to implement, + and only very marginally less desirable IMO. We still get a log + line that we can alert on when this happens. + + -- Luis Perez Tue, 16 Jan 2024 09:56:23 -0800 + +paasta-tools (0.215.0) xenial; urgency=medium + + * 0.215.0 tagged with 'make release' + Commit: Make get_running_task_allocation include all namespaces by + default (#3774) For the most part, we"re not really excluding many + namespaces (most of our namespaces start with paasta). That said, + now that we allow folks to use any arbitrary namespace, we can run + into issues where a namespace is not in our allowlist and then + autotune silently stops working. I don"t foresee this adding too + much additional data (and if it does, we can filter out noisy + namespaces). 
Co-authored-by: tzhu + + -- Luis Perez Fri, 12 Jan 2024 08:18:55 -0800 + +paasta-tools (0.214.0) xenial; urgency=medium + + * 0.214.0 tagged with 'make release' + Commit: Merge pull request #3761 from Yelp/u/cuza/patching-python- + k8s-client Patching python-k8s-client at build time + + -- Luis Perez Wed, 10 Jan 2024 12:24:51 -0800 + +paasta-tools (0.213.1) xenial; urgency=medium + + * 0.213.1 tagged with 'make release' + Commit: Merge pull request #3768 from Yelp/luisp/i-am-very-sorry + Load eks- and kubernetes- files in autotune merging + + -- Jen Patague Mon, 08 Jan 2024 17:04:39 -0800 + +paasta-tools (0.213.0) xenial; urgency=medium + + * 0.213.0 tagged with 'make release' + Commit: Remove paasta performance-check (#3750) This was never + really utilized as intended and the necessary infra to run this was + torn down long ago - let"s delete this until someone has the desire + and will to bring it back :) + + -- Luis Perez Mon, 08 Jan 2024 11:43:25 -0800 + +paasta-tools (0.212.1) xenial; urgency=medium + + * 0.212.1 tagged with 'make release' + Commit: Merge pull request #3767 from rockdog/master Fix + NO_DESCRIPTION_MESSAGE used in paasta info cli command + + -- Luis Perez Mon, 08 Jan 2024 08:08:15 -0800 + +paasta-tools (0.212.0) xenial; urgency=medium + + * 0.212.0 tagged with 'make release' + Commit: Merge pull request #3763 from + Yelp/u/jfong/no_include_smartstack Remove include_smartstack + + -- Jen Patague Fri, 05 Jan 2024 09:40:59 -0800 + +paasta-tools (0.211.2) xenial; urgency=medium + + * 0.211.2 tagged with 'make release' + Commit: Guess the correct context (or allow override) in + paasta_habitat_fixer (#3766) Turns out that I didn"t read the admin + kubeconf closely enough and missed that the context names there are + prefixed with kubernetes-admin@ + + -- Luis Perez Thu, 04 Jan 2024 15:46:52 -0800 + +paasta-tools (0.211.1) xenial; urgency=medium + + * 0.211.1 tagged with 'make release' + Commit: Fix debian link (#3765) This is what I get for naming the + file differently from the command - `paasta_habitat_fixer` does not + work as bin/ does not contain a paasta_habitat_fixer.py + + -- Luis Perez Thu, 04 Jan 2024 13:49:39 -0800 + +paasta-tools (0.211.0) xenial; urgency=medium + + * 0.211.0 tagged with 'make release' + Commit: Add workaround script to fix habitat labels (#3762) We"re + seeing some weirdness with Karpenter where one of our custom labels + (yelp.com/habitat) is sometimes being set as a random integer. + While we debug this, we have enough information to write a script to + fix these. 
+ + -- Luis Perez Thu, 04 Jan 2024 10:18:00 -0800 + +paasta-tools (0.210.0) xenial; urgency=medium + + * 0.210.0 tagged with 'make release' + Commit: Merge branch "u/krall/fix_local_run_docker_hostname" + + -- Evan Krall Wed, 03 Jan 2024 11:14:38 -0800 + +paasta-tools (0.209.3) xenial; urgency=medium + + * 0.209.3 tagged with 'make release' + Commit: Merge branch "u/krall/optimize_paasta_validate" + + -- Evan Krall Fri, 15 Dec 2023 13:36:26 -0800 + +paasta-tools (0.209.2) xenial; urgency=medium + + * 0.209.2 tagged with 'make release' + Commit: remove use_k8s deprecated flag (#3749) + + -- Jon Lee Fri, 15 Dec 2023 05:53:59 -0800 + +paasta-tools (0.209.1) xenial; urgency=medium + + * 0.209.1 tagged with 'make release' + Commit: Merge pull request #3751 from + Yelp/fix_autoscaler_check_symlink Add missing symlink for + check_autoscaler_max_instances + + -- Jen Patague Thu, 14 Dec 2023 16:31:41 -0800 + +paasta-tools (0.209.0) xenial; urgency=medium + + * 0.209.0 tagged with 'make release' + Commit: Merge branch "u/krall/podtopologyspread" + + -- Evan Krall Thu, 14 Dec 2023 13:08:44 -0800 + +paasta-tools (0.208.0) xenial; urgency=medium + + * 0.208.0 tagged with 'make release' + Commit: Add support for setting lower/upper-bounds for autotuned + resources (#3744) At the moment this only supports a specific + subset of top-level resource keys: * cpus * mem * disk Supporting + {min,max}_instances should be pretty trivial (but I"m not quite sure + when we"d want to use that yet, so I left those out of the + allowlist). I"m explicitly not supporting these values for sidecars + since: a) it seemed like it would complicate things a bit and, b) + I"m not sure we have a usecase for this :p + + -- Luis Perez Thu, 14 Dec 2023 11:46:59 -0800 + +paasta-tools (0.207.12) xenial; urgency=medium + + * 0.207.12 tagged with 'make release' + Commit: Merge pull request #3746 from Yelp/jfong/PAASTA-18122 Stop + deleting brutal deployments + + -- Jen Patague Thu, 14 Dec 2023 10:52:01 -0800 + +paasta-tools (0.207.11) xenial; urgency=medium + + * 0.207.11 tagged with 'make release' + Commit: Merge pull request #3743 from Yelp/u/vit/PAASTA-18111-fix- + secret-validation Secret validation now take into account service: + override. 
+ + -- Vincent Thibault Thu, 30 Nov 2023 06:30:56 -0800 + +paasta-tools (0.207.10) xenial; urgency=medium + + * 0.207.10 tagged with 'make release' + Commit: MLCOMPUTE-1035 Update dnsPolicy (#3742) + + -- Li Rong Fri, 24 Nov 2023 09:17:41 -0800 + +paasta-tools (0.207.9) xenial; urgency=medium + + * 0.207.9 tagged with 'make release' + Commit: Merge pull request #3741 from Yelp/MLCOMPUTE- + 1098_fix_parsing_error MLCOMPUTE-1098 | fix error in parsing user + spark args + + -- Sameer Sharma Wed, 22 Nov 2023 10:12:30 -0800 + +paasta-tools (0.207.8) xenial; urgency=medium + + * 0.207.8 tagged with 'make release' + Commit: Merge branch "u/mpiano/SECNESS-1815" + + -- Matteo Piano Mon, 20 Nov 2023 01:22:55 -0800 + +paasta-tools (0.207.7) xenial; urgency=medium + + * 0.207.7 tagged with 'make release' + Commit: Released 0.207.6 via make release + + -- Luis Perez Thu, 16 Nov 2023 12:23:29 -0800 + +paasta-tools (0.207.6) xenial; urgency=medium + + * 0.207.6 tagged with 'make release' + Commit: Merge pull request #3738 from + Yelp/jfong/rollback_conditions_fix Fix conditions for displaying + disable auto rollbacks button + + -- Jen Patague Thu, 16 Nov 2023 12:02:54 -0800 + +paasta-tools (0.207.5) xenial; urgency=medium + + * 0.207.5 tagged with 'make release' + Commit: Merge pull request #3737 from + Yelp/u/jfong/fix_paasta_secrets_eks Fix paasta secrets sync + namespace checking for eks instances + + -- Jen Patague Tue, 31 Oct 2023 10:45:47 -0700 + +paasta-tools (0.207.4) xenial; urgency=medium + + * 0.207.4 tagged with 'make release' + Commit: Merge pull request #3736 from + Yelp/d/manpreet/limit_spark_app_name_to_63_characters Upgrade + service-configuration-lib to limit character count in spark a… + + -- Manpreet Singh Tue, 31 Oct 2023 09:21:36 -0700 + +paasta-tools (0.207.3) xenial; urgency=medium + + * 0.207.3 tagged with 'make release' + Commit: Merge remote-tracking branch "origin/u/mpiano/SECNESS-1787" + + -- Matteo Piano Tue, 31 Oct 2023 06:48:08 -0700 + +paasta-tools (0.207.2) xenial; urgency=medium + + * 0.207.2 tagged with 'make release' + Commit: Merge branch "u/krall/limitrange_paastasvc-" + + -- Evan Krall Fri, 27 Oct 2023 11:29:04 -0700 + +paasta-tools (0.207.1) xenial; urgency=medium + + * 0.207.1 tagged with 'make release' + Commit: Merge pull request #3733 from + Yelp/jfong/bounce_status_relay_404 Catch statefulset missing as 404 + in API bounce_status + + -- Jen Patague Fri, 27 Oct 2023 09:49:30 -0700 + +paasta-tools (0.207.0) xenial; urgency=medium + + * 0.207.0 tagged with 'make release' + Commit: Merge branch "u/krall/allow_non_paastasvc" + + -- Evan Krall Thu, 26 Oct 2023 11:18:27 -0700 + +paasta-tools (0.206.0) xenial; urgency=medium + + * 0.206.0 tagged with 'make release' + Commit: Merge pull request #3729 from Yelp/u/cuza/making-skj-and-ckj- + aware-of-downthenup-bounces-across-namespaces making skj and ckj + aware of downthenup bounces across namespaces + + -- Dave Cuza Wed, 25 Oct 2023 10:18:25 -0700 + +paasta-tools (0.205.1) xenial; urgency=medium + + * 0.205.1 tagged with 'make release' + Commit: Merge pull request #3730 from + Yelp/u/jfong/fix_spark_run_docker_reg Use the correct registry_uri + to check if we need to sudo + + -- Jen Patague Mon, 23 Oct 2023 17:08:53 -0700 + +paasta-tools (0.205.0) xenial; urgency=medium + + * 0.205.0 tagged with 'make release' + Commit: Respect a service"s docker_registry for adhoc spark-runs + (#3728) There are some services where we want to ensure that a + specialized docker registry is always used - even for adhoc + development 
runs. This change is loosely based on what we do in + push-to-registry, where we read service.yaml to see if a specialized + registry needs to be used. + + -- Luis Perez Mon, 23 Oct 2023 13:45:05 -0700 + +paasta-tools (0.204.2) xenial; urgency=medium + + * 0.204.2 tagged with 'make release' + Commit: Bump service-configuration-lib to v2.18.10 (#3725) + + -- Chi Chang Fri, 20 Oct 2023 03:20:32 -0700 + +paasta-tools (0.204.1) xenial; urgency=medium + + * 0.204.1 tagged with 'make release' + Commit: Merge pull request #3724 from Yelp/sina/try-remote-branch- + first DREIMP-10150: Base new commits on remote branch, if it exists + + -- Sina Siadat Mon, 16 Oct 2023 03:21:50 -0700 + +paasta-tools (0.204.0) xenial; urgency=medium + + * 0.204.0 tagged with 'make release' + Commit: Add support for autotuned type aliases (#3693) For some + instance types that are autotuned (e.g., `kubernetes`), we may have + a largely-similar instance type (e.g., `eks`) and the ability to + trivially move instances back and forth between these. For these, + there"s a couple different options for how to handle autotune: 1) + have tooling/documentation to migrate data between + `autotuned_defaults/` files on instance moves + teach the autotune + machinery what the correct cluster/filename to update should be (as + well as teach autotune how to gather data correctly) 2) have + tooling/documentation to temporarily pin the new instance type at + the old autotuned request until autotune has updated with the new + instance type data 3) add some form of aliasing and pretend (at the + autotune level) that there"s a single instance type for these + largely-similar instance types This PR goes for option 3 as it + is the least complex and has the fewest moving parts/places for + things to horribly blow up. + + -- Luis Perez Thu, 12 Oct 2023 12:18:40 -0700 + +paasta-tools (0.203.0) xenial; urgency=medium + + * 0.203.0 tagged with 'make release' + Commit: Merge branch "u/krall/PAASTA-18016_change_default_namespace" + + -- Evan Krall Wed, 11 Oct 2023 10:45:59 -0700 + +paasta-tools (0.202.4) xenial; urgency=medium + + * 0.202.4 tagged with 'make release' + Commit: Sync paasta secrets for eks instances as well (#3718) We + could also teach this to only sync secrets for self-managed OR EKS + clusters individually - but that seems like more effort (and there"s + no real harm to syncing these to both: in fact, it makes rolling + back even easier since there"s no need to wait for secrets to re- + sync) + + -- Luis Perez Tue, 10 Oct 2023 14:24:18 -0700 + +paasta-tools (0.202.3) xenial; urgency=medium + + * 0.202.3 tagged with 'make release' + Commit: [COREJAVA-998] Use registration name for paasta_instance + name if a single custom registration is used (#3717) * [COREJAVA- + 998] Use registration name for paasta_instance name if a single + custom registration is used * Add rules to active-requests validate + * Add comment inline * Update + paasta_tools/setup_prometheus_adapter_config.py Simplify + envoy_filter_terms Co-authored-by: Luis Pérez * + Update tests and add registration length check * Update + tests/test_setup_prometheus_adapter_config.py Remove debug print. 
+ Co-authored-by: Luis Pérez --------- Co-authored- + by: Luis Pérez + + -- Charan Gangaraju Mon, 09 Oct 2023 10:13:40 -0700 + +paasta-tools (0.202.2) xenial; urgency=medium + + * 0.202.2 tagged with 'make release' + Commit: [COREJAVA-995] Refactor to pass instance_config to all + scaling rules (#3716) * [COREJAVA-995] Refactor to pass + instance_config to all scaling rules * Update the type of + instance_config * Remove namespace as parameter for active-requests + scaling * remove default value as get_namespace() will always + return a value + + -- Charan Gangaraju Fri, 06 Oct 2023 02:16:10 -0700 + +paasta-tools (0.202.1) xenial; urgency=medium + + * 0.202.1 tagged with 'make release' + Commit: Merge pull request #3715 from + Yelp/jfong/uppercase_iam_support Support uppercase IAM role names + + -- Jen Patague Tue, 03 Oct 2023 14:45:46 -0700 + +paasta-tools (0.202.0) xenial; urgency=medium + + * 0.202.0 tagged with 'make release' + Commit: Merge pull request #3714 from Yelp/jfong/PAASTA-18066 + Validate iam_role and stop supporting kiam + + -- Jen Patague Mon, 02 Oct 2023 10:04:47 -0700 + +paasta-tools (0.201.5) xenial; urgency=medium + + * 0.201.5 tagged with 'make release' + Commit: Remove sfx autoscaling link (#3710) We haven"t used SFX for + autoscaling in quite some time and this confuses people that see it + referenced in the output of `paasta status` At some point we"ll + want to replace this with a dashboard that visualizes how the + Prometheus-based autoscaling works - but that"s for future-us to do + :) + + -- Luis Perez Wed, 27 Sep 2023 14:37:27 -0700 + +paasta-tools (0.201.4) xenial; urgency=medium + + * 0.201.4 tagged with 'make release' + Commit: Ensure secret_volume volumes are not optional (#3708) We + noticed some logging from a newly added use of secret_volume that + makes us think that the default for these is that + V1SecretVolumeSources are optional - let"s clear up any ambiguity + and explicitly set a value for this NOTE: I"m not quite sure if + this will require a big bounce of anything using secret_volume - but + I guess that"s also less of a concern than a big bounce of + *everything* + + -- Luis Perez Mon, 25 Sep 2023 13:01:01 -0700 + +paasta-tools (0.201.3) xenial; urgency=medium + + * 0.201.3 tagged with 'make release' + Commit: Merge pull request #3709 from + Yelp/u/msurnin/add_cassandracluster_crd_autotune_spec_schema + COMPINFRA-3079: Add cassandracluster crd autotune spec schema + + -- Mark Surnin Mon, 25 Sep 2023 02:58:12 -0700 + +paasta-tools (0.201.2) xenial; urgency=medium + + * 0.201.2 tagged with 'make release' + Commit: Merge pull request #3703 from siadat/sina/add- + cassandracluster-to-known-config-types COMPINFRA-3079: Add + cassandracluster to known autotuned config types + + -- Sina Siadat Tue, 19 Sep 2023 08:34:48 -0700 + +paasta-tools (0.201.1) xenial; urgency=medium + + * 0.201.1 tagged with 'make release' + Commit: [COREJAVA-901] Remove calculation of missing instances as it + can"t be calculated with data in HPA (#3706) * [COREJAVA-901] + Remove calculation of missing instances as it can"t be calculated + with data in HPA * Remove unused ready pods calculation + + -- Charan Gangaraju Fri, 15 Sep 2023 07:44:20 -0700 + +paasta-tools (0.201.0) xenial; urgency=medium + + * 0.201.0 tagged with 'make release' + Commit: Merge pull request #3702 from Yelp/u/emanelsabban/PAASTA- + 17988 Add eks support for cleanup kubernetes job - PAASTA-17988 + + -- Eman Elsabban Fri, 15 Sep 2023 06:12:01 -0700 + +paasta-tools (0.200.4) xenial; urgency=medium + + * 
0.200.4 tagged with 'make release' + Commit: [COREJAVA-897] Avoid using label_replace in series query + (#3704) + + -- Charan Gangaraju Wed, 13 Sep 2023 13:40:45 -0700 + +paasta-tools (0.200.3) xenial; urgency=medium + + * 0.200.3 tagged with 'make release' + Commit: [COREJAVA-888] Add kube_deployment and kube_namespace to + series and metrics query for hpa (#3701) * [COREJAVA-888] Add + kube_deployment and kube_namespace to series and metrics query for + hpa * add a comment * Format * PR Feedback - rename variable and + minify promql + + -- Charan Gangaraju Tue, 12 Sep 2023 06:53:28 -0700 + +paasta-tools (0.200.2) xenial; urgency=medium + + * 0.200.2 tagged with 'make release' + Commit: Add quotes around new label name in label_replace (#3700) * + [COREJAVA-888] Add kube deployment label to envoy metrics + aggregation * Format * Add missing quotes * PR Feedback - update + regex in label_replace and add comment * Add quotes around new + label name in label_replace + + -- Charan Gangaraju Mon, 11 Sep 2023 15:12:29 -0700 + +paasta-tools (0.200.1) xenial; urgency=medium + + * 0.200.1 tagged with 'make release' + Commit: [COREJAVA-888] Add kube deployment label to envoy metrics + aggregation (#3699) * [COREJAVA-888] Add kube deployment label to + envoy metrics aggregation * Format * Add missing quotes * PR + Feedback - update regex in label_replace and add comment + + -- Charan Gangaraju Mon, 11 Sep 2023 13:54:00 -0700 + +paasta-tools (0.200.0) xenial; urgency=medium + + * 0.200.0 tagged with 'make release' + Commit: Merge pull request #3696 from Yelp/u/emanelsabban/PAASTA- + 17987 Adding support for SKJ on EKS - PAASTA-17987 + + -- Eman Elsabban Mon, 11 Sep 2023 07:40:31 -0700 + +paasta-tools (0.199.1) xenial; urgency=medium + + * 0.199.1 tagged with 'make release' + Commit: [COREJAVA-883] Update labels in filter terms to match k8 + labels (#3697) * [COREJAVA-883] Update labels in filter terms to + match k8 labels * Update the metric according to the label change + * Remove accidental push of test file + + -- Charan Gangaraju Fri, 08 Sep 2023 06:44:06 -0700 + +paasta-tools (0.199.0) xenial; urgency=medium + + * 0.199.0 tagged with 'make release' + Commit: Add API support for paasta ing eks-* instances (#3685) + We do a tiny bit of lying here in order to keep the same PaaSTA + cluster name in the soaconfigs filenames (and CLI), but introduce + the concept of an API cluster so that we can direct queries for + things running from eks-* files to the correct PaaSTA API + + -- Luis Perez Thu, 07 Sep 2023 10:00:44 -0700 + +paasta-tools (0.198.3) xenial; urgency=medium + + * 0.198.3 tagged with 'make release' + Commit: Merge pull request #3694 from Yelp/u/vit/fix-local-run- + secret PAASTA-18017: Read secret where instance service is defined, + rather than service defined in CLI args. 
+ + -- Vincent Thibault Wed, 06 Sep 2023 10:25:37 -0700 + +paasta-tools (0.198.2) xenial; urgency=medium + + * 0.198.2 tagged with 'make release' + Commit: MLCOMPUTE-1008 | update regex for Spark volume names to be + alphanumeric (#3690) * MLCOMPUTE-1008 | update regex for Spark + volume names to be alphanumeric * MLCOMPUTE-1008 | update regex + according to k8s standard, fail fast on incorrect volume names * + MLCOMPUTE-1008 | fix mypy tests * MLCOMPUTE-1008 | add unit tests - + -------- Co-authored-by: Sameer Sharma + + -- Sameer Sharma Wed, 06 Sep 2023 09:18:30 -0700 + +paasta-tools (0.198.1) xenial; urgency=medium + + * 0.198.1 tagged with 'make release' + Commit: [COREJAVA-869] Add active requests to kubernetes schema + (#3692) + + -- Charan Gangaraju Tue, 05 Sep 2023 07:11:23 -0700 + +paasta-tools (0.198.0) xenial; urgency=medium + + * 0.198.0 tagged with 'make release' + Commit: [COREJAVA-800] Implement active requests autoscaler (#3688) + * [COREJAVA-800] Implement active requests autoscaler * Update test + * Reformat code * Add validation for active-requests * Add tests + for autoscaling config validate * Incorporate PR feedback * Move + default threshold to a shared variable and update schema * Update + paasta_tools/cli/cmds/validate.py Co-authored-by: Luis Pérez + * Setup HPA for active-requests --------- Co- + authored-by: Luis Pérez + + -- Charan Gangaraju Fri, 01 Sep 2023 09:11:07 -0700 + +paasta-tools (0.197.1) xenial; urgency=medium + + * 0.197.1 tagged with 'make release' + Commit: Bump service-configuration-lib to v2.18.6 (#3682) * Bump + service-configuration-lib to v2.18.5 * Add an option for explicitly + specifying spark app id + + -- Chi Chang Thu, 24 Aug 2023 03:46:20 -0700 + +paasta-tools (0.197.0) xenial; urgency=medium + + * 0.197.0 tagged with 'make release' + Commit: Merge pull request #3680 from Yelp/u/emanelsabban/PAASTA- + 17985 Setup eks-$clustername schema files - PAASTA-17985 + + -- Eman Elsabban Tue, 22 Aug 2023 05:20:27 -0700 + paasta-tools (0.196.0) xenial; urgency=medium * 0.196.0 tagged with 'make release' diff --git a/debian/paasta-tools.links b/debian/paasta-tools.links index 4f2e1769ff..42e04acb7f 100644 --- a/debian/paasta-tools.links +++ b/debian/paasta-tools.links @@ -8,6 +8,7 @@ opt/venvs/paasta-tools/bin/check_marathon_has_apps.py usr/bin/check_marathon_has opt/venvs/paasta-tools/bin/check_marathon_services_frontends.py usr/bin/check_marathon_services_frontends opt/venvs/paasta-tools/bin/check_kubernetes_api.py usr/bin/check_kubernetes_api opt/venvs/paasta-tools/bin/check_kubernetes_services_replication.py usr/bin/check_kubernetes_services_replication +opt/venvs/paasta-tools/bin/check_autoscaler_max_instances.py usr/bin/check_autoscaler_max_instances opt/venvs/paasta-tools/bin/check_mesos_active_frameworks.py usr/bin/check_mesos_active_frameworks opt/venvs/paasta-tools/bin/check_mesos_duplicate_frameworks.py usr/bin/check_mesos_duplicate_frameworks opt/venvs/paasta-tools/bin/check_mesos_quorum.py usr/bin/check_mesos_quorum @@ -56,3 +57,4 @@ opt/venvs/paasta-tools/bin/setup_kubernetes_cr.py usr/bin/setup_kubernetes_cr opt/venvs/paasta-tools/bin/setup_prometheus_adapter_config.py usr/bin/setup_prometheus_adapter_config opt/venvs/paasta-tools/bin/synapse_srv_namespaces_fact.py usr/bin/synapse_srv_namespaces_fact opt/venvs/paasta-tools/bin/paasta_update_soa_memcpu.py usr/bin/paasta_update_soa_memcpu +opt/venvs/paasta-tools/bin/habitat_fixer.py usr/bin/paasta_habitat_fixer diff --git a/debian/rules b/debian/rules index ceb60ed69d..609fb0aff1 100755 
--- a/debian/rules +++ b/debian/rules @@ -24,3 +24,5 @@ override_dh_virtualenv: --preinstall no-manylinux1 \ --preinstall=-rrequirements-bootstrap.txt cp yelp_package/gopath/paasta_go $(DH_VENV_DIR)/bin/paasta_go + @echo patching k8s client lib + patch $(DH_VENV_DIR)/lib/python3.8/site-packages/kubernetes/client/api_client.py contrib/python-k8s-client.diff diff --git a/docs/source/autoscaling.rst b/docs/source/autoscaling.rst index c3f6865e5f..82d0da8577 100644 --- a/docs/source/autoscaling.rst +++ b/docs/source/autoscaling.rst @@ -65,20 +65,13 @@ The currently available metrics providers are: Measures the CPU usage of your service's container. :uwsgi: - With the ``uwsgi`` metrics provider, Paasta will configure your pods to run an additional container with the `uwsgi_exporter `_ image. - This sidecar will listen on port 9117, and will request metrics from your uWSGI master via its `stats server `_. - The uwsgi_exporter container needs to know what port your uWSGI master's stats server is on - you can configure this with the ``uwsgi_stats_port`` key in the ``autoscaling`` dictionary. - ``uwsgi_exporter`` will translate the uWSGI stats into Prometheus format, which Prometheus will scrape. + With the ``uwsgi`` metrics provider, Paasta will configure your pods to be scraped from your uWSGI master via its `stats server `_. + We currently only support uwsgi stats on port 8889, and Prometheus will attempt to scrape that port. .. note:: - If you have configured your service to use a non-default stats port (8889), you need to explicity set ``uwsgi_stats_port`` in your autoscaling config with the same value to ensure that metrics are being exported. + If you have configured your service to use a non-default stats port (8889), PaaSTA will not scale your service correctly! - Extra parameters: - - :uwsgi_stats_port: - the port that your uWSGI master process will respond to with stats. - Defaults to 8889. :gunicorn: With the ``gunicorn`` metrics provider, Paasta will configure your pods to run an additional container with the `statsd_exporter `_ image. @@ -100,6 +93,17 @@ The currently available decicion policies are: :offset: Float between 0.0 and 1.0, representing expected baseline load for each container. Defaults to 0.0. + + **DEPRECATED** - while it was previously more complicated, offset is now simply subtracted from your setpoint. + For example, ``setpoint: 0.6`` with ``offset: 0.25`` is equivalent to ``setpoint: 0.35`` with no ``offset``. + We recommend you just lower your setpoint by the same amount and remove the ``offset``. + + Previously, offset was used to counteract the fake utilization that would be seen by our old uWSGI metrics provider. + Under the old system, the uWSGI metrics provider would always see 1 extra worker busy, because the metrics query was proxied through the actual uWSGI workers. + Having the autoscaler understand how much load was fake and how much was real helped it converge faster to your target load. + Nowadays, we measure uWSGI utilization in a different way that does not use a uWSGI worker, so this is no longer necessary. + Support for ``offset`` was only retained to provide a smooth transition from the old system to the new system. + :good_enough_window: **Not currently supported** An array of two utilization values [low, high]. @@ -133,3 +137,18 @@ of instances PaaSTA thinks your service should have. 
Finally, remember to set the ``decision_policy`` of the ``autoscaling`` parameter for each service instance to ``"bespoke"`` or else PaaSTA will attempt to autoscale your service with the default autoscaling method. + + +``max_instances`` alerting +-------------------------- + +In order to make you aware of when your ``max_instances`` may be too low, causing issues with your service, paasta will send you alerts if all of the following conditions are true: + + * The autoscaler has scaled your service to ``max_instances``. + + * The load on your service (as measured by the ``metrics_provider`` you specified, e.g. your worker utilization or CPU utilization) is above ``max_instances_alert_threshold``. + +The default value for ``max_instances_alert_threshold`` is whatever your ``setpoint`` is. +This means by default the alert will trigger when the autoscaler wants to scale up but is prevented from doing so by your ``max_instances`` setting. +If this alert is noisy, you can try setting ``max_instances_alert_threshold`` to something a little higher than your ``setpoint``. +Setting a very high value (a utilization value your metrics_provider would never measure) will effectively disable this alert. diff --git a/docs/source/deploy_groups.rst b/docs/source/deploy_groups.rst index 9d6d28d80a..3783c9e0a0 100644 --- a/docs/source/deploy_groups.rst +++ b/docs/source/deploy_groups.rst @@ -20,7 +20,6 @@ As an example, consider a service with the following deploy.yaml: - step: itest - step: security-check - step: push-to-registry - - step: performance-check - step: dev-stage.everything trigger_next_step_manually: true - step: prod.canary @@ -29,7 +28,7 @@ As an example, consider a service with the following deploy.yaml: This pipeline will: -1. Run ``itest``, ``security-check``, ``push-to-registry``, and ``performance-check`` steps, which are build and testing steps. +1. Run ``itest``, ``security-check``, and ``push-to-registry`` steps, which are build and testing steps. During ``itest`` phase, a new container image is built (per the `Paasta Contract `_). This image is pushed to Paasta's Docker registry in the ``push-to-registry`` step. 2. Deploy the new container image to all instances with ``deploy_group: dev-stage.everything``, and wait for someone to click a button in Jenkins before continuing. @@ -125,3 +124,23 @@ String interpolation -------------------- Deploy groups support string interpolation for the following variables: ``cluster``, ``instance`` and ``service``. String interpolation works by surrounding the variable's name with braces (``{}``) in the ``deploy_group`` field -- this is python's ``str.format`` syntax. E.g. ``deploy_group: '{cluster}.all'``. You must still specify explicit deploy groups in your ``deploy.yaml`` however. + +Parallel steps +-------------------- + +Parallel steps are supported in ``deploy.yaml`` to allow steps that aren't reliant on each other to be executed at the same time. The parallel block also supports waiting before moving on to the next step. + +As an example the following deploy.yaml will execute steps ``security-check`` & ``command-test`` together. It will then wait for user input before moving on to the ``performance-check`` step. + +.. 
sourcecode:: yaml + + --- + pipeline: + - parallel: + - step: security-check + - step: command-test + trigger_next_step_manually: true + - step: performance-check + - step: prod.canary + trigger_next_step_manually: true + - step: prod.non_canary diff --git a/docs/source/generated/paasta_tools.check_autoscaler_max_instances.rst b/docs/source/generated/paasta_tools.check_autoscaler_max_instances.rst new file mode 100644 index 0000000000..cb6b39db8d --- /dev/null +++ b/docs/source/generated/paasta_tools.check_autoscaler_max_instances.rst @@ -0,0 +1,7 @@ +paasta\_tools.check\_autoscaler\_max\_instances module +====================================================== + +.. automodule:: paasta_tools.check_autoscaler_max_instances + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/generated/paasta_tools.cli.cmds.performance_check.rst b/docs/source/generated/paasta_tools.cli.cmds.performance_check.rst deleted file mode 100644 index 23a860c6f1..0000000000 --- a/docs/source/generated/paasta_tools.cli.cmds.performance_check.rst +++ /dev/null @@ -1,7 +0,0 @@ -paasta\_tools.cli.cmds.performance\_check module -================================================ - -.. automodule:: paasta_tools.cli.cmds.performance_check - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/generated/paasta_tools.cli.cmds.rst b/docs/source/generated/paasta_tools.cli.cmds.rst index f475fd99a6..e676079277 100644 --- a/docs/source/generated/paasta_tools.cli.cmds.rst +++ b/docs/source/generated/paasta_tools.cli.cmds.rst @@ -24,7 +24,6 @@ Submodules paasta_tools.cli.cmds.mesh_status paasta_tools.cli.cmds.metastatus paasta_tools.cli.cmds.pause_service_autoscaler - paasta_tools.cli.cmds.performance_check paasta_tools.cli.cmds.push_to_registry paasta_tools.cli.cmds.remote_run paasta_tools.cli.cmds.rollback diff --git a/docs/source/generated/paasta_tools.contrib.habitat_fixer.rst b/docs/source/generated/paasta_tools.contrib.habitat_fixer.rst new file mode 100644 index 0000000000..953e064283 --- /dev/null +++ b/docs/source/generated/paasta_tools.contrib.habitat_fixer.rst @@ -0,0 +1,7 @@ +paasta\_tools.contrib.habitat\_fixer module +=========================================== + +.. 
automodule:: paasta_tools.contrib.habitat_fixer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/generated/paasta_tools.contrib.rst b/docs/source/generated/paasta_tools.contrib.rst index d2193d5888..b80741b8ba 100644 --- a/docs/source/generated/paasta_tools.contrib.rst +++ b/docs/source/generated/paasta_tools.contrib.rst @@ -14,6 +14,7 @@ Submodules paasta_tools.contrib.emit_allocated_cpu_metrics paasta_tools.contrib.get_running_task_allocation paasta_tools.contrib.graceful_container_drain + paasta_tools.contrib.habitat_fixer paasta_tools.contrib.ide_helper paasta_tools.contrib.is_pod_healthy_in_proxy paasta_tools.contrib.is_pod_healthy_in_smartstack diff --git a/docs/source/generated/paasta_tools.rst b/docs/source/generated/paasta_tools.rst index 63273645b5..e5bbb977e0 100644 --- a/docs/source/generated/paasta_tools.rst +++ b/docs/source/generated/paasta_tools.rst @@ -33,6 +33,7 @@ Submodules paasta_tools.bounce_lib paasta_tools.broadcast_log_to_services paasta_tools.cassandracluster_tools + paasta_tools.check_autoscaler_max_instances paasta_tools.check_cassandracluster_services_replication paasta_tools.check_flink_services_health paasta_tools.check_kubernetes_api @@ -107,7 +108,7 @@ Submodules paasta_tools.synapse_srv_namespaces_fact paasta_tools.tron_tools paasta_tools.utils - paasta_tools.vitess_tools + paasta_tools.vitesscluster_tools Module contents --------------- diff --git a/docs/source/generated/paasta_tools.vitess_tools.rst b/docs/source/generated/paasta_tools.vitess_tools.rst deleted file mode 100644 index af379915df..0000000000 --- a/docs/source/generated/paasta_tools.vitess_tools.rst +++ /dev/null @@ -1,7 +0,0 @@ -paasta\_tools.vitess\_tools module -================================== - -.. automodule:: paasta_tools.vitess_tools - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/generated/paasta_tools.vitesscluster_tools.rst b/docs/source/generated/paasta_tools.vitesscluster_tools.rst new file mode 100644 index 0000000000..9567a92252 --- /dev/null +++ b/docs/source/generated/paasta_tools.vitesscluster_tools.rst @@ -0,0 +1,7 @@ +paasta\_tools.vitesscluster\_tools module +========================================= + +.. automodule:: paasta_tools.vitesscluster_tools + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/index.rst b/docs/source/index.rst index 7911b2da28..2227f36658 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -16,6 +16,7 @@ Directions (For PaaSTA Users) autoscaling hpa deploy_groups + persistent_volumes About PaaSTA's Architecture --------------------------- diff --git a/docs/source/persistent_volumes.rst b/docs/source/persistent_volumes.rst new file mode 100644 index 0000000000..f728283a6a --- /dev/null +++ b/docs/source/persistent_volumes.rst @@ -0,0 +1,36 @@ +================== +Persistent Volumes +================== + +What are Persistent Volumes? +---------------------------- +Persistent Volumes are a Kubernetes feature that allows you to attach stateful storage (like an EBS volume) to Pods (i.e., your PaaSTA instance). + +Caveats +------- +In general, we discourage the use of Persistent Volumes in favor of totally stateless services (i.e., where the state is separated from the service itself in a DB, S3, etc.) 
+ +That said, there are several things to keep in mind before deciding to use Persistent Volumes: + - PaaSTA does not provide monitoring for Persistent Volumes - you are responsible for staying on top of your usage (i.e., there is no alerting for full or almost full volumes) + - Persistent Volumes cannot be resized online - if you run out of space and need to expand your volume: there *will* be downtime + - Additionally, PaaSTA cannot automatically handle this resize: you *will* need to find an engineer on the Compute Infrastructure team to do this + - If you need to delete the Persistent Volume for your service for whatever reason, you will need an engineer on the Compute Infrastructure team to do this for you + - As of this writing (2024-02-16), we've occasionally noticed some large (double-digit minute) delays where Kubernetes is unable to attach the EBS volume backing a Persistent Volume because of AWS errors. + - In other words, until the cause of this issue is identified, it's possible that a Spot interruption (or other sort of disruption) could potentially cause downtime for your service + +How do I use Persistent Volumes in PaaSTA? +------------------------------------------ +If the above is acceptable, adding a block like: + +.. sourcecode:: yaml + + persistent_volumes: + - container_path: /path/to/mount + # if you're a power-user, know what you're doing, and need something more specific than a bog-standard GP3 EBS volume - come talk to us in #paasta + storage_class_name: ebs-retain-gp3 + size: 10 # in GB + mode: RW # unless you're populating the EBS volume externally, you likely want to be able to write to the volume :) + +to your instance config will attach a Persistent Volume with 10GB of storage to every replica of your PaaSTA instance at ``/path/to/mount``. + +NOTE: a Persistent Volume will be created *per-replica* - they are *not* shared between replicas. diff --git a/docs/source/yelpsoa_configs.rst b/docs/source/yelpsoa_configs.rst index c1457dbbc1..83dab2c0bb 100644 --- a/docs/source/yelpsoa_configs.rst +++ b/docs/source/yelpsoa_configs.rst @@ -392,6 +392,13 @@ instance MAY have: * ``decision_policy``: Which method the autoscaler will use to determine when to autoscale a service. Should be ``proportional`` or ``bespoke``. + * ``setpoint``: The target utilization (as measured by your ``metrics_provider``) that the autoscaler will try to achieve. + Default value is 0.8. + + * ``max_instances_alert_threshold``: If the autoscaler has scaled your service to ``max_instances``, + and the service's utilization (as measured by your ``metrics_provider``) is above this value, you'll get an alert. + The default is the same as your ``setpoint``. + * ``deploy_group``: A string identifying what deploy group this instance belongs to. The ``step`` parameter in ``deploy.yaml`` references this value to determine the order in which to build & deploy deploy groups. Defaults to @@ -483,8 +490,7 @@ instance MAY have: * ``namespace``: **Currently in development, do not use.** The Kubernetes namespace where Paasta will create objects related to this service. - Defaults to ``paasta``. - Currently, only ``paasta`` and namespaces starting with ``paastasvc-`` are permitted. + Defaults to ``paastasvc-service--name`` (that is, the service name will have underscores replaced with ``--``.) **Note**: Although many of these settings are inherited from ``smartstack.yaml``, their thresholds are not the same. 
The reason for this has to do with control diff --git a/general_itests/fake_soa_configs/fake_deployments_json_service/eks-test-cluster.yaml b/general_itests/fake_soa_configs/fake_deployments_json_service/eks-test-cluster.yaml new file mode 100644 index 0000000000..d23e8270fa --- /dev/null +++ b/general_itests/fake_soa_configs/fake_deployments_json_service/eks-test-cluster.yaml @@ -0,0 +1,11 @@ +--- +test_instance: + cpus: 0.1 + ram: 100 + disk: 512.3 + +test_instance_2: + cpus: 0.1 + ram: 250 + disk: 256.7 + deploy_group: test-cluster.test_instance diff --git a/general_itests/fake_soa_configs_validate/fake_invalid_service/marathon-test-cluster.yaml b/general_itests/fake_soa_configs_validate/fake_invalid_service/kubernetes-test-cluster.yaml similarity index 100% rename from general_itests/fake_soa_configs_validate/fake_invalid_service/marathon-test-cluster.yaml rename to general_itests/fake_soa_configs_validate/fake_invalid_service/kubernetes-test-cluster.yaml diff --git a/general_itests/fake_soa_configs_validate/fake_valid_service/marathon-test-cluster.yaml b/general_itests/fake_soa_configs_validate/fake_valid_service/marathon-test-cluster.yaml deleted file mode 100644 index cacffd9021..0000000000 --- a/general_itests/fake_soa_configs_validate/fake_valid_service/marathon-test-cluster.yaml +++ /dev/null @@ -1,9 +0,0 @@ ---- -main: - cpus: .1 - mem: 100 - disk: 200.0 - instances: 1 - env: - FOO: BAR - deploy_group: fake_deploy_group diff --git a/general_itests/steps/paasta_execute_docker_command.py b/general_itests/steps/paasta_execute_docker_command.py index f39c0e8eda..92fd6e4190 100644 --- a/general_itests/steps/paasta_execute_docker_command.py +++ b/general_itests/steps/paasta_execute_docker_command.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os + from behave import given from behave import then from behave import when @@ -30,6 +32,7 @@ def docker_is_available(context): @given("a running docker container with task id {task_id} and image {image_name}") def create_docker_container(context, task_id, image_name): container_name = "paasta-itest-execute-in-containers" + image_name = os.getenv("DOCKER_REGISTRY", "docker-dev.yelpcorp.com/") + image_name try: context.docker_client.remove_container(container_name, force=True) except APIError: diff --git a/paasta_tools/__init__.py b/paasta_tools/__init__.py index a3cf5a0046..883099aa33 100644 --- a/paasta_tools/__init__.py +++ b/paasta_tools/__init__.py @@ -17,4 +17,4 @@ # setup phase, the dependencies may not exist on disk yet. # # Don't bump version manually. 
See `make release` docs in ./Makefile -__version__ = "0.196.0" +__version__ = "0.218.6" diff --git a/paasta_tools/api/api_docs/oapi.yaml b/paasta_tools/api/api_docs/oapi.yaml index 58ad9f55e7..e0d064a555 100644 --- a/paasta_tools/api/api_docs/oapi.yaml +++ b/paasta_tools/api/api_docs/oapi.yaml @@ -1771,12 +1771,6 @@ paths: schema: format: int32 type: integer - - description: Include Smartstack information - in: query - name: include_smartstack - required: false - schema: - type: boolean - description: Include Envoy information in: query name: include_envoy @@ -1825,13 +1819,6 @@ paths: required: true schema: type: string - - description: Include Smartstack information - in: query - name: include_smartstack - required: false - schema: - type: boolean - default: true - description: Include Envoy information in: query name: include_envoy diff --git a/paasta_tools/api/client.py b/paasta_tools/api/client.py index 2708db508f..d7391a1fe9 100644 --- a/paasta_tools/api/client.py +++ b/paasta_tools/api/client.py @@ -60,6 +60,10 @@ def get_paasta_oapi_client_by_url( config.ssl_ca_cert = ssl_ca_cert client = paastaapi.ApiClient(configuration=config) + # PAASTA-18005: Adds default timeout to paastaapi client + client.rest_client.pool_manager.connection_pool_kw[ + "timeout" + ] = load_system_paasta_config().get_api_client_timeout() return PaastaOApiClient( autoscaler=paastaapis.AutoscalerApi(client), default=paastaapis.DefaultApi(client), diff --git a/paasta_tools/api/views/instance.py b/paasta_tools/api/views/instance.py index 9783f0d648..34edfc4724 100644 --- a/paasta_tools/api/views/instance.py +++ b/paasta_tools/api/views/instance.py @@ -75,6 +75,7 @@ from paasta_tools.utils import get_image_version_from_dockerurl from paasta_tools.utils import NoConfigurationForServiceError from paasta_tools.utils import NoDockerImageError +from paasta_tools.utils import PAASTA_K8S_INSTANCE_TYPES from paasta_tools.utils import TimeoutError from paasta_tools.utils import validate_service_instance @@ -150,7 +151,6 @@ def marathon_instance_status( service: str, instance: str, verbose: int, - include_smartstack: bool, include_envoy: bool, include_mesos: bool, ) -> Mapping[str, Any]: @@ -174,7 +174,7 @@ def marathon_instance_status( ) ) - if include_smartstack or include_envoy: + if include_envoy: service_namespace_config = marathon_tools.load_service_namespace_config( service=service, namespace=job_config.get_nerve_namespace(), @@ -184,26 +184,15 @@ def marathon_instance_status( tasks = [ task for app, _ in matching_apps_with_clients for task in app.tasks ] - if include_smartstack: - mstatus["smartstack"] = marathon_service_mesh_status( - service, - pik.ServiceMesh.SMARTSTACK, - instance, - job_config, - service_namespace_config, - tasks, - should_return_individual_backends=verbose > 0, - ) - if include_envoy: - mstatus["envoy"] = marathon_service_mesh_status( - service, - pik.ServiceMesh.ENVOY, - instance, - job_config, - service_namespace_config, - tasks, - should_return_individual_backends=verbose > 0, - ) + mstatus["envoy"] = marathon_service_mesh_status( + service, + pik.ServiceMesh.ENVOY, + instance, + job_config, + service_namespace_config, + tasks, + should_return_individual_backends=verbose > 0, + ) if include_mesos: mstatus["mesos"] = marathon_mesos_status(service, instance, verbose) @@ -640,9 +629,6 @@ def instance_status(request): instance = request.swagger_data.get("instance") verbose = request.swagger_data.get("verbose") or 0 use_new = request.swagger_data.get("new") or False - include_smartstack = 
request.swagger_data.get("include_smartstack") - if include_smartstack is None: - include_smartstack = True include_envoy = request.swagger_data.get("include_envoy") if include_envoy is None: include_envoy = True @@ -698,7 +684,6 @@ def instance_status(request): service, instance, verbose, - include_smartstack=include_smartstack, include_envoy=include_envoy, include_mesos=include_mesos, ) @@ -712,7 +697,6 @@ def instance_status(request): service=service, instance=instance, verbose=verbose, - include_smartstack=include_smartstack, include_envoy=include_envoy, use_new=use_new, instance_type=instance_type, @@ -864,7 +848,7 @@ def bounce_status(request): error_message = traceback.format_exc() raise ApiFailure(error_message, 500) - if instance_type != "kubernetes": + if instance_type not in PAASTA_K8S_INSTANCE_TYPES: # We are using HTTP 204 to indicate that the instance exists but has # no bounce status to be returned. The client should just mark the # instance as bounced. @@ -873,13 +857,28 @@ def bounce_status(request): return response try: - return pik.bounce_status(service, instance, settings) + return pik.bounce_status( + service, instance, settings, is_eks=(instance_type == "eks") + ) + except NoConfigurationForServiceError: + # Handle race condition where instance has been removed since the above validation + error_message = no_configuration_for_service_message( + settings.cluster, + service, + instance, + ) + raise ApiFailure(error_message, 404) except asyncio.TimeoutError: raise ApiFailure( "Temporary issue fetching bounce status. Please try again.", 599 ) - except Exception: + except Exception as e: error_message = traceback.format_exc() + if getattr(e, "status", None) == 404: + # some bounces delete the app & recreate + # in this case, we relay the 404 and cli handles gracefully + raise ApiFailure(error_message, 404) + # for all others, treat as a 500 raise ApiFailure(error_message, 500) @@ -925,7 +924,6 @@ def get_deployment_version( def instance_mesh_status(request): service = request.swagger_data.get("service") instance = request.swagger_data.get("instance") - include_smartstack = request.swagger_data.get("include_smartstack") include_envoy = request.swagger_data.get("include_envoy") instance_mesh: Dict[str, Any] = {} @@ -953,7 +951,6 @@ def instance_mesh_status(request): instance=instance, instance_type=instance_type, settings=settings, - include_smartstack=include_smartstack, include_envoy=include_envoy, ) ) diff --git a/paasta_tools/check_autoscaler_max_instances.py b/paasta_tools/check_autoscaler_max_instances.py new file mode 100755 index 0000000000..e1cd4c8c0f --- /dev/null +++ b/paasta_tools/check_autoscaler_max_instances.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +import argparse +import asyncio +import logging +from typing import Type + +import pysensu_yelp + +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.instance import kubernetes as pik +from paasta_tools.kubernetes_tools import get_kubernetes_app_name +from paasta_tools.kubernetes_tools import KubeClient +from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig +from paasta_tools.metrics.metastatus_lib import suffixed_number_value +from paasta_tools.monitoring_tools import send_event +from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader +from paasta_tools.utils import DEFAULT_SOA_DIR +from paasta_tools.utils import list_services +from paasta_tools.utils import load_system_paasta_config +from paasta_tools.utils import SystemPaastaConfig + +log = 
logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + "Check all autoscaled services to see if they're at their max_instances. If" + " so, send an alert if their utilization is above" + " max_instances_alert_threshold." + ) + ) + parser.add_argument( + "-d", + "--soa-dir", + dest="soa_dir", + default=DEFAULT_SOA_DIR, + help="Use a different soa config directory", + ) + parser.add_argument( + "--dry-run", + dest="dry_run", + action="store_true", + help="Print Sensu alert events instead of sending them", + ) + return parser.parse_args() + + +async def check_max_instances( + soa_dir: str, + cluster: str, + instance_type_class: Type[KubernetesDeploymentConfig], + system_paasta_config: SystemPaastaConfig, + dry_run: bool = False, +): + kube_client = KubeClient() + for service in list_services(soa_dir=soa_dir): + service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir) + for job_config in service_config.instance_configs( + cluster=cluster, instance_type_class=instance_type_class + ): + instance = job_config.get_instance() + if not job_config.get_autoscaling_metric_spec( + name=get_kubernetes_app_name(service, instance), + cluster=cluster, + kube_client=kube_client, + namespace=job_config.get_namespace(), + ): + # Not an instance that uses HPA, don't check. + # TODO: should we send status=0 here, in case someone disables autoscaling for their service / changes + # to bespoke autoscaler? + continue + + if not job_config.get_docker_image(): + # skip services that haven't been marked for deployment yet. + continue + + autoscaling_status = await pik.autoscaling_status( + kube_client=kube_client, + job_config=job_config, + namespace=job_config.get_namespace(), + ) + if autoscaling_status["min_instances"] == -1: + log.warning( + f"HPA {job_config.get_sanitised_deployment_name()} not found." + ) + continue + + if ( + autoscaling_status["min_instances"] + == autoscaling_status["max_instances"] + ) and "canary" in instance: + status = pysensu_yelp.Status.OK + output = ( + f"Not checking {service}.{instance} as the instance name contains" + ' "canary" and min_instances == max_instances.' + ) + elif ( + autoscaling_status["desired_replicas"] + >= autoscaling_status["max_instances"] + ): + threshold = job_config.get_autoscaling_max_instances_alert_threshold() + setpoint = job_config.get_autoscaling_params()["setpoint"] + metric_threshold_target_ratio = threshold / setpoint + + status = pysensu_yelp.Status.UNKNOWN + output = "how are there no metrics for this thing?" + for metric in autoscaling_status["metrics"]: + current_value = suffixed_number_value(metric["current_value"]) + target_value = suffixed_number_value(metric["target_value"]) + + if current_value / target_value > metric_threshold_target_ratio: + status = pysensu_yelp.Status.CRITICAL + output = ( + f"{service}.{instance}: Service is at max_instances, and" + " ratio of current value to target value" + f" ({current_value} / {target_value}) is greater than the" + " ratio of max_instances_alert_threshold to setpoint" + f" ({threshold} / {setpoint})" + ) + else: + status = pysensu_yelp.Status.OK + output = ( + f"{service}.{instance}: Service is at max_instances, but" + " ratio of current value to target value" + f" ({current_value} / {target_value}) is below the ratio of" + f" max_instances_alert_threshold to setpoint ({threshold} /" + f" {setpoint})" + ) + else: + status = pysensu_yelp.Status.OK + output = f"{service}.{instance} is below max_instances." 
+ + monitoring_overrides = job_config.get_monitoring() + monitoring_overrides.update( + { + "page": False, # TODO: remove this line once this alert has been deployed for a little while. + "runbook": "y/check-autoscaler-max-instances", + "tip": ( + "The autoscaler wants to scale up to handle additional load" + " because your service is overloaded, but cannot scale any" + " higher because of max_instances. You may want to bump" + " max_instances. To make this alert quieter, adjust" + " autoscaling.max_instances_alert_threshold in yelpsoa-configs." + ), + } + ) + send_event( + service, + check_name=f"check_autoscaler_max_instances.{service}.{instance}", + overrides=monitoring_overrides, + status=status, + output=output, + soa_dir=soa_dir, + ttl=None, + cluster=cluster, + system_paasta_config=system_paasta_config, + dry_run=dry_run, + ) + + +def main(): + args = parse_args() + system_paasta_config = load_system_paasta_config() + + for instance_type_class in [KubernetesDeploymentConfig, EksDeploymentConfig]: + asyncio.run( + check_max_instances( + soa_dir=args.soa_dir, + cluster=system_paasta_config.get_cluster(), + instance_type_class=instance_type_class, + system_paasta_config=system_paasta_config, + dry_run=args.dry_run, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/paasta_tools/check_kubernetes_services_replication.py b/paasta_tools/check_kubernetes_services_replication.py index 9605751587..f239b5d9f2 100755 --- a/paasta_tools/check_kubernetes_services_replication.py +++ b/paasta_tools/check_kubernetes_services_replication.py @@ -33,10 +33,14 @@ import logging from typing import Optional from typing import Sequence +from typing import Union +from paasta_tools import eks_tools from paasta_tools import kubernetes_tools from paasta_tools import monitoring_tools from paasta_tools.check_services_replication_tools import main +from paasta_tools.check_services_replication_tools import parse_args +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes_tools import filter_pods_by_service_instance from paasta_tools.kubernetes_tools import is_pod_ready from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig @@ -50,7 +54,7 @@ def check_healthy_kubernetes_tasks_for_service_instance( - instance_config: KubernetesDeploymentConfig, + instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig], expected_count: int, all_pods: Sequence[V1Pod], dry_run: bool = False, @@ -73,7 +77,7 @@ def check_healthy_kubernetes_tasks_for_service_instance( def check_kubernetes_pod_replication( - instance_config: KubernetesDeploymentConfig, + instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig], all_tasks_or_pods: Sequence[V1Pod], replication_checker: KubeSmartstackEnvoyReplicationChecker, dry_run: bool = False, @@ -81,7 +85,7 @@ def check_kubernetes_pod_replication( """Checks a service's replication levels based on how the service's replication should be monitored. 
(smartstack/envoy or k8s) - :param instance_config: an instance of KubernetesDeploymentConfig + :param instance_config: an instance of KubernetesDeploymentConfig or EksDeploymentConfig :param replication_checker: an instance of KubeSmartstackEnvoyReplicationChecker """ default_alert_after = DEFAULT_ALERT_AFTER @@ -129,7 +133,10 @@ def check_kubernetes_pod_replication( if __name__ == "__main__": + args = parse_args() main( - instance_type_class=kubernetes_tools.KubernetesDeploymentConfig, + instance_type_class=eks_tools.EksDeploymentConfig + if args.eks + else kubernetes_tools.KubernetesDeploymentConfig, check_service_replication=check_kubernetes_pod_replication, ) diff --git a/paasta_tools/check_services_replication_tools.py b/paasta_tools/check_services_replication_tools.py index 27c286a94a..14f352bd26 100644 --- a/paasta_tools/check_services_replication_tools.py +++ b/paasta_tools/check_services_replication_tools.py @@ -30,10 +30,9 @@ from mypy_extensions import Arg from mypy_extensions import NamedArg -from paasta_tools.kubernetes_tools import get_all_namespaces +from paasta_tools.kubernetes_tools import get_all_managed_namespaces from paasta_tools.kubernetes_tools import get_all_nodes from paasta_tools.kubernetes_tools import get_all_pods -from paasta_tools.kubernetes_tools import get_matching_namespaces from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.kubernetes_tools import V1Node from paasta_tools.kubernetes_tools import V1Pod @@ -110,13 +109,6 @@ def parse_args() -> argparse.Namespace: dest="dry_run", help="Print Sensu alert events and metrics instead of sending them", ) - parser.add_argument( - "--namespace-prefix", - help="prefix of the namespace to check services replication for" - "Used only when service is kubernetes", - dest="namespace_prefix", - default="paastasvc-", - ) parser.add_argument( "--additional-namespaces", help="full names of namespaces to check services replication for that don't match --namespace-prefix" @@ -127,6 +119,13 @@ def parse_args() -> argparse.Namespace: # to avoid having two cron jobs running with two different namespace-prefix default=["paasta"], ) + parser.add_argument( + "--eks", + help="This flag checks k8 services running on EKS", + dest="eks", + action="store_true", + default=False, + ) options = parser.parse_args() return options @@ -209,9 +208,6 @@ def main( replication_checker: ReplicationChecker if namespace: - # Note: we will have by default namespace_prefix always set to paastasvc - # which means we could have namespace and namespace_prefix set at the same time - # what differentiate between which one we will use, will be this if statement tasks_or_pods, nodes = get_kubernetes_pods_and_nodes(namespace=namespace) replication_checker = KubeSmartstackEnvoyReplicationChecker( nodes=nodes, @@ -219,7 +215,6 @@ def main( ) else: tasks_or_pods, nodes = get_kubernetes_pods_and_nodes( - namespace_prefix=args.namespace_prefix, additional_namespaces=args.additional_namespaces, ) replication_checker = KubeSmartstackEnvoyReplicationChecker( @@ -273,7 +268,6 @@ def get_mesos_tasks_and_slaves( def get_kubernetes_pods_and_nodes( - namespace_prefix: Optional[str] = None, namespace: Optional[str] = None, additional_namespaces: Optional[Container[str]] = None, ) -> Tuple[List[V1Pod], List[V1Node]]: @@ -283,12 +277,10 @@ def get_kubernetes_pods_and_nodes( if namespace: all_pods = get_all_pods(kube_client=kube_client, namespace=namespace) else: - all_namespaces = get_all_namespaces(kube_client) - for matching_namespace in 
get_matching_namespaces( - all_namespaces, namespace_prefix, additional_namespaces - ): + all_managed_namespaces = get_all_managed_namespaces(kube_client) + for managed_namespace in all_managed_namespaces: all_pods.extend( - get_all_pods(kube_client=kube_client, namespace=matching_namespace) + get_all_pods(kube_client=kube_client, namespace=managed_namespace) ) all_nodes = get_all_nodes(kube_client) diff --git a/paasta_tools/cleanup_kubernetes_jobs.py b/paasta_tools/cleanup_kubernetes_jobs.py index 715224ed20..15e42b3a44 100644 --- a/paasta_tools/cleanup_kubernetes_jobs.py +++ b/paasta_tools/cleanup_kubernetes_jobs.py @@ -30,6 +30,8 @@ - -t , --kill-threshold: The decimal fraction of apps we think is sane to kill when this job runs - -f, --force: Force the killing of apps if we breach the threshold +- -c, --cluster: Specifies the paasta cluster to check +- --eks: This flag cleans up only k8 services that shouldn't be running on EKS leaving instances from eks-*.yaml files """ import argparse import logging @@ -41,11 +43,14 @@ from typing import List from typing import Set from typing import Tuple +from typing import Union from kubernetes.client import V1Deployment from kubernetes.client import V1StatefulSet from pysensu_yelp import Status +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.eks_tools import load_eks_service_config from paasta_tools.kubernetes.application.controller_wrappers import DeploymentWrapper from paasta_tools.kubernetes.application.controller_wrappers import StatefulSetWrapper from paasta_tools.kubernetes.application.tools import Application @@ -106,12 +111,12 @@ def alert_state_change(application: Application, cluster: str) -> Generator: def instance_is_not_bouncing( - instance_config: KubernetesDeploymentConfig, + instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig], applications: List[Application], ) -> bool: """ - :param instance_config: a KubernetesDeploymentConfig with the configuration of the instance + :param instance_config: a KubernetesDeploymentConfig or an EksDeploymentConfig with the configuration of the instance :param applications: a list of all deployments or stateful sets on the cluster that match the service and instance of provided instance_config """ @@ -119,10 +124,16 @@ def instance_is_not_bouncing( if isinstance(application, DeploymentWrapper): existing_app = application.item if ( - existing_app.metadata.namespace == instance_config.get_namespace() - and ( - instance_config.get_instances() - <= (existing_app.status.ready_replicas or 0) + ( + existing_app.metadata.namespace != instance_config.get_namespace() + and (instance_config.get_bounce_method() == "downthenup") + ) + or ( + existing_app.metadata.namespace == instance_config.get_namespace() + and ( + instance_config.get_instances() + <= (existing_app.status.ready_replicas or 0) + ) ) ) or instance_config.get_desired_state() == "stop": return True @@ -144,6 +155,7 @@ def get_applications_to_kill( cluster: str, valid_services: Set[Tuple[str, str]], soa_dir: str, + eks: bool = False, ) -> List[Application]: """ @@ -161,9 +173,21 @@ def get_applications_to_kill( if (service, instance) not in valid_services: applications_to_kill.extend(applications) else: - instance_config = load_kubernetes_service_config( - cluster=cluster, service=service, instance=instance, soa_dir=soa_dir - ) + instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig] + if eks: + instance_config = load_eks_service_config( + cluster=cluster, + service=service, + 
instance=instance, + soa_dir=soa_dir, + ) + else: + instance_config = load_kubernetes_service_config( + cluster=cluster, + service=service, + instance=instance, + soa_dir=soa_dir, + ) try: not_bouncing = instance_is_not_bouncing( instance_config, applications @@ -200,6 +224,7 @@ def cleanup_unused_apps( cluster: str, kill_threshold: float = 0.5, force: bool = False, + eks: bool = False, ) -> None: """Clean up old or invalid jobs/apps from kubernetes. Retrieves both a list of apps currently in kubernetes and a list of valid @@ -217,11 +242,13 @@ def cleanup_unused_apps( applications_dict = list_all_applications(kube_client, APPLICATION_TYPES) log.info("Retrieving valid apps from yelpsoa_configs") valid_services = set( - get_services_for_cluster(instance_type="kubernetes", soa_dir=soa_dir) + get_services_for_cluster( + instance_type="eks" if eks else "kubernetes", soa_dir=soa_dir + ) ) applications_to_kill: List[Application] = get_applications_to_kill( - applications_dict, cluster, valid_services, soa_dir + applications_dict, cluster, valid_services, soa_dir, eks ) log.debug("Running apps: %s" % list(applications_dict)) @@ -280,6 +307,13 @@ def parse_args(argv): default=False, help="Force the cleanup if we are above the " "kill_threshold", ) + parser.add_argument( + "--eks", + help="This flag cleans up only k8 services that shouldn't be running on EKS leaving instances from eks-*.yaml files", + dest="eks", + action="store_true", + default=False, + ) return parser.parse_args(argv) @@ -289,13 +323,18 @@ def main(argv=None) -> None: kill_threshold = args.kill_threshold force = args.force cluster = args.cluster + eks = args.eks if args.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.WARNING) try: cleanup_unused_apps( - soa_dir, cluster=cluster, kill_threshold=kill_threshold, force=force + soa_dir, + cluster=cluster, + kill_threshold=kill_threshold, + force=force, + eks=eks, ) except DontKillEverythingError: sys.exit(1) diff --git a/paasta_tools/cli/cli.py b/paasta_tools/cli/cli.py index 47b675fa11..46d99cbffc 100755 --- a/paasta_tools/cli/cli.py +++ b/paasta_tools/cli/cli.py @@ -118,7 +118,6 @@ def add_subparser(command, subparsers): "mesh-status": "mesh_status", "metastatus": "metastatus", "pause_service_autoscaler": "pause_service_autoscaler", - "performance-check": "performance_check", "push-to-registry": "push_to_registry", "remote-run": "remote_run", "rollback": "rollback", diff --git a/paasta_tools/cli/cmds/autoscale.py b/paasta_tools/cli/cmds/autoscale.py index de45236375..4f01d8e336 100644 --- a/paasta_tools/cli/cmds/autoscale.py +++ b/paasta_tools/cli/cmds/autoscale.py @@ -17,9 +17,13 @@ import paasta_tools.paastaapi.models as paastamodels from paasta_tools.api import client from paasta_tools.cli.utils import figure_out_service_name +from paasta_tools.cli.utils import get_instance_configs_for_service +from paasta_tools.cli.utils import get_paasta_oapi_api_clustername from paasta_tools.cli.utils import lazy_choices_completer from paasta_tools.cli.utils import list_instances +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.utils import _log_audit +from paasta_tools.utils import DEFAULT_SOA_DIR from paasta_tools.utils import list_clusters from paasta_tools.utils import list_services from paasta_tools.utils import PaastaColors @@ -52,13 +56,42 @@ def add_subparser(subparsers): autoscale_parser.add_argument( "--set", help="Set the number to scale to. 
Must be an Int.", type=int ) + autoscale_parser.add_argument( + "-d", + "--soa-dir", + dest="soa_dir", + metavar="SOA_DIR", + default=DEFAULT_SOA_DIR, + help="define a different soa config directory", + ) autoscale_parser.set_defaults(command=paasta_autoscale) def paasta_autoscale(args): log.setLevel(logging.DEBUG) service = figure_out_service_name(args) - api = client.get_paasta_oapi_client(cluster=args.cluster, http_res=True) + instance_config = next( + get_instance_configs_for_service( + service=service, + soa_dir=args.soa_dir, + clusters=[args.cluster], + instances=[args.instance], + ), + None, + ) + if not instance_config: + print( + "Could not find config files for this service instance in soaconfigs. Maybe you mispelled an argument?" + ) + return 1 + + api = client.get_paasta_oapi_client( + cluster=get_paasta_oapi_api_clustername( + cluster=args.cluster, + is_eks=(instance_config.__class__ == EksDeploymentConfig), + ), + http_res=True, + ) if not api: print("Could not connect to paasta api. Maybe you misspelled the cluster?") return 1 diff --git a/paasta_tools/cli/cmds/info.py b/paasta_tools/cli/cmds/info.py index e61fe8ab5d..e612ca032b 100644 --- a/paasta_tools/cli/cmds/info.py +++ b/paasta_tools/cli/cmds/info.py @@ -30,7 +30,7 @@ from paasta_tools.utils import NoDeploymentsAvailable from paasta_tools.utils import PaastaColors -NO_DESCRIPTION_MESSAGE = "No 'description' entry in service.yaml. Please a one line sentence that describes this service" +NO_DESCRIPTION_MESSAGE = "No 'description' entry in service.yaml. Please add a one line sentence that describes this service" NO_EXTERNAL_LINK_MESSAGE = ( "No 'external_link' entry in service.yaml. " "Please add one that points to a reference doc for your service" diff --git a/paasta_tools/cli/cmds/local_run.py b/paasta_tools/cli/cmds/local_run.py index 979b96765e..dde20796bf 100755 --- a/paasta_tools/cli/cmds/local_run.py +++ b/paasta_tools/cli/cmds/local_run.py @@ -25,6 +25,7 @@ import uuid from os import execlpe from random import randint +from typing import Optional from urllib.parse import urlparse import boto3 @@ -472,6 +473,12 @@ def add_subparser(subparsers): action="store_true", default=False, ) + list_parser.add_argument( + "--assume-role-aws-account", + "--aws-account", + "-a", + help="Specify AWS account from which to source credentials", + ) list_parser.add_argument( "--assume-role-arn", help=( @@ -688,12 +695,31 @@ def check_if_port_free(port): return True +def resolve_aws_account_from_runtimeenv() -> str: + try: + with open("/nail/etc/runtimeenv") as runtimeenv_file: + runtimeenv = runtimeenv_file.read() + except FileNotFoundError: + print( + "Unable to determine environment for AWS account name. 
Using 'dev'", + file=sys.stderr, + ) + runtimeenv = "dev" + + runtimeenv_to_account_overrides = { + "stage": "dev", + "corp": "corpprod", + } + return runtimeenv_to_account_overrides.get(runtimeenv, runtimeenv) + + def assume_aws_role( instance_config: InstanceConfig, service: str, assume_role_arn: str, assume_pod_identity: bool, use_okta_role: bool, + aws_account: str, ) -> AWSSessionCreds: """Runs AWS cli to assume into the correct role, then extract and return the ENV variables from that session""" pod_identity = instance_config.get_iam_role() @@ -705,20 +731,7 @@ def assume_aws_role( file=sys.stderr, ) sys.exit(1) - try: - with open("/nail/etc/runtimeenv") as runtimeenv_file: - aws_account = runtimeenv_file.read() - # Map runtimeenv in special cases to proper aws account name - if aws_account == "stage": - aws_account = "dev" - elif aws_account == "corp": - aws_account = "corpprod" - except FileNotFoundError: - print( - "Unable to determine environment for AWS account name. Using 'dev'", - file=sys.stderr, - ) - aws_account = "dev" + if pod_identity and (assume_pod_identity or assume_role_arn): print( "Calling aws-okta to assume role {} using account {}".format( @@ -727,10 +740,26 @@ def assume_aws_role( ) elif use_okta_role: print(f"Calling aws-okta using account {aws_account}") + elif "AWS_ROLE_ARN" in os.environ and "AWS_WEB_IDENTITY_TOKEN_FILE" in os.environ: + # Get a session using the current pod identity + print( + f"Found Pod Identity token in env. Assuming into role {os.environ['AWS_ROLE_ARN']}." + ) + boto_session = boto3.Session() + credentials = boto_session.get_credentials() + assumed_creds_dict: AWSSessionCreds = { + "AWS_ACCESS_KEY_ID": credentials.access_key, + "AWS_SECRET_ACCESS_KEY": credentials.secret_key, + "AWS_SESSION_TOKEN": credentials.token, + "AWS_SECURITY_TOKEN": credentials.token, + } + return assumed_creds_dict else: - # use_okta_role, assume_pod_identity, and assume_role are all empty. This shouldn't happen + # use_okta_role, assume_pod_identity, and assume_role are all empty, and there's no + # pod identity (web identity token) in the env. 
This shouldn't happen print( - "Error: assume_aws_role called without required arguments", file=sys.stderr + "Error: assume_aws_role called without required arguments and no pod identity env", + file=sys.stderr, ) sys.exit(1) # local-run will sometimes run as root - make sure that we get the actual @@ -803,6 +832,7 @@ def run_docker_container( assume_pod_identity=False, assume_role_arn="", use_okta_role=False, + assume_role_aws_account: Optional[str] = None, ): """docker-py has issues running a container with a TTY attached, so for consistency we execute 'docker run' directly in both interactive and @@ -826,7 +856,7 @@ def run_docker_container( else: chosen_port = pick_random_port(service) environment = instance_config.get_env_dictionary() - secret_volumes = {} + secret_volumes = {} # type: ignore if not skip_secrets: # if secrets_for_owner_team enabled in yelpsoa for service if is_secrets_for_teams_enabled(service, soa_dir): @@ -857,7 +887,7 @@ def run_docker_container( secret_provider_name=secret_provider_name, environment=environment, soa_dir=soa_dir, - service_name=service, + service_name=instance_config.get_service(), cluster_name=instance_config.cluster, secret_provider_kwargs=secret_provider_kwargs, ) @@ -865,7 +895,7 @@ def run_docker_container( secret_provider_name=secret_provider_name, secret_volumes_config=instance_config.get_secret_volumes(), soa_dir=soa_dir, - service_name=service, + service_name=instance_config.get_service(), cluster_name=instance_config.cluster, secret_provider_kwargs=secret_provider_kwargs, ) @@ -876,13 +906,19 @@ def run_docker_container( ) sys.exit(1) environment.update(secret_environment) - if assume_role_arn or assume_pod_identity or use_okta_role: + if ( + assume_role_arn + or assume_pod_identity + or use_okta_role + or "AWS_WEB_IDENTITY_TOKEN_FILE" in os.environ + ): aws_creds = assume_aws_role( instance_config, service, assume_role_arn, assume_pod_identity, use_okta_role, + assume_role_aws_account, ) environment.update(aws_creds) @@ -929,8 +965,8 @@ def run_docker_container( except TypeError: # If that fails, try to write it as bytes # This is for binary files like TLS keys - with open(temp_secret_filename, "wb") as f: - f.write(secret_content) + with open(temp_secret_filename, "wb") as fb: + fb.write(secret_content) # Append this to the list of volumes passed to docker run volumes.append(f"{temp_secret_filename}:{container_mount_path}:ro") @@ -1029,8 +1065,10 @@ def _output_exit_code(): running = docker_client.inspect_container(container_id)["State"]["Running"] if running: print("Your service is now running! 
Tailing stdout and stderr:") - for line in docker_client.attach( - container_id, stderr=True, stream=True, logs=True + for line in docker_client.logs( + container_id, + stderr=True, + stream=True, ): # writing to sys.stdout.buffer lets us write the raw bytes we # get from the docker client without having to convert them to @@ -1072,6 +1110,7 @@ def configure_and_run_docker_container( cluster, system_paasta_config, args, + assume_role_aws_account, pull_image=False, dry_run=False, ): @@ -1226,6 +1265,7 @@ def configure_and_run_docker_container( skip_secrets=args.skip_secrets, assume_pod_identity=args.assume_pod_identity, assume_role_arn=args.assume_role_arn, + assume_role_aws_account=assume_role_aws_account, use_okta_role=args.use_okta_role, ) @@ -1270,6 +1310,7 @@ def paasta_local_run(args): local_run_config = system_paasta_config.get_local_run_config() service = figure_out_service_name(args, soa_dir=args.yelpsoa_config_root) + if args.cluster: cluster = args.cluster else: @@ -1285,6 +1326,12 @@ def paasta_local_run(args): file=sys.stderr, ) return 1 + assume_role_aws_account = args.assume_role_aws_account or ( + system_paasta_config.get_kube_clusters() + .get(cluster, {}) + .get("aws_account", resolve_aws_account_from_runtimeenv()) + ) + instance = args.instance docker_client = get_docker_client() @@ -1322,6 +1369,7 @@ def paasta_local_run(args): pull_image=pull_image, system_paasta_config=system_paasta_config, dry_run=args.action == "dry_run", + assume_role_aws_account=assume_role_aws_account, ) except errors.APIError as e: print("Can't run Docker container. Error: %s" % str(e), file=sys.stderr) diff --git a/paasta_tools/cli/cmds/logs.py b/paasta_tools/cli/cmds/logs.py index 663283c07a..abb316e7cf 100644 --- a/paasta_tools/cli/cmds/logs.py +++ b/paasta_tools/cli/cmds/logs.py @@ -637,6 +637,15 @@ def __init__(self, cluster_map: Mapping[str, Any]) -> None: ) self.cluster_map = cluster_map + def get_scribereader_selector(self, scribe_env: str) -> str: + # this is kinda silly, but until the scribereader cli becomes more ergonomic + # we'll need to do a little bit of string munging to let humans use scribereader + # in the same way we are (tl;dr: scribereader has sorta confusing behavior between + # what can be use for --ecosystem, --region, and --superregion and the fastest/least + # hacky thing to figure out which we wanna use is that any env with a - in it is a region + # and any without one is an ecosystem) + return "-e" if "-" in scribe_env else "-r" + def run_code_over_scribe_envs( self, clusters: Sequence[str], @@ -745,8 +754,10 @@ def callback( else: kw["stream_name"] = stream_info.stream_name_fn(service) log.debug( - "Running the equivalent of 'scribereader -e {} {}'".format( - scribe_env, kw["stream_name"] + "Running the equivalent of 'scribereader {} {} {}'".format( + self.get_scribereader_selector(scribe_env), + scribe_env, + kw["stream_name"], ) ) process = Process(target=self.scribe_tail, kwargs=kw) @@ -1036,8 +1047,14 @@ def scribe_get_from_time( end_date_yst = end_time.astimezone(pytz.timezone("America/Los_Angeles")).date() log.debug( - "Running the equivalent of 'scribereader -e %s %s --min-date %s --max-date %s" - % (scribe_env, stream_name, start_date_yst, end_date_yst) + "Running the equivalent of 'scribereader %s %s %s --min-date %s --max-date %s" + % ( + self.get_scribereader_selector(scribe_env), + scribe_env, + stream_name, + start_date_yst, + end_date_yst, + ) ) return scribereader.get_stream_reader( stream_name=stream_name, @@ -1064,7 +1081,7 @@ def 
scribe_get_last_n_lines( @contextmanager def fake_context(): log.debug( - f"Running the equivalent of 'scribereader -e {scribe_env} {stream_name}'" + f"Running the equivalent of 'scribereader -n {line_count} {self.get_scribereader_selector(scribe_env)} {scribe_env} {stream_name}'" ) yield scribereader.get_stream_tailer( stream_name=stream_name, diff --git a/paasta_tools/cli/cmds/mark_for_deployment.py b/paasta_tools/cli/cmds/mark_for_deployment.py index 8c7b2df2ed..58b7f2cc68 100644 --- a/paasta_tools/cli/cmds/mark_for_deployment.py +++ b/paasta_tools/cli/cmds/mark_for_deployment.py @@ -59,6 +59,7 @@ from paasta_tools.cli.cmds.status import get_version_table_entry from paasta_tools.cli.cmds.status import recent_container_restart from paasta_tools.cli.utils import get_jenkins_build_output_url +from paasta_tools.cli.utils import get_paasta_oapi_api_clustername from paasta_tools.cli.utils import lazy_choices_completer from paasta_tools.cli.utils import list_deploy_groups from paasta_tools.cli.utils import trigger_deploys @@ -68,6 +69,7 @@ from paasta_tools.cli.utils import validate_short_git_sha from paasta_tools.deployment_utils import get_currently_deployed_sha from paasta_tools.deployment_utils import get_currently_deployed_version +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig from paasta_tools.long_running_service_tools import LongRunningServiceConfig from paasta_tools.marathon_tools import MarathonServiceConfig @@ -984,8 +986,7 @@ def valid_transitions(self) -> Iterator[state_machine.TransitionDefinition]: "dest": None, # Don't actually change state, just call the before function. "trigger": "disable_auto_rollbacks_button_clicked", "conditions": [ - self.any_slo_failing, - self.any_metric_failing, + self.any_rollback_condition_failing, self.auto_rollbacks_enabled, ], "before": self.disable_auto_rollbacks, @@ -1496,12 +1497,16 @@ def diagnose_why_instance_is_stuck( should_ping_for_unhealthy_pods: bool, notify_fn: Optional[Callable[[str], None]] = None, ) -> None: - api = client.get_paasta_oapi_client(cluster=cluster) + api = client.get_paasta_oapi_client( + cluster=get_paasta_oapi_api_clustername( + cluster=cluster, + is_eks=(instance_config.get_instance_type() == "eks"), + ), + ) try: status = api.service.status_instance( service=service, instance=instance, - include_smartstack=False, include_envoy=False, include_mesos=False, new=True, @@ -1622,7 +1627,12 @@ def check_if_instance_is_done( api: Optional[client.PaastaOApiClient] = None, ) -> bool: if api is None: - api = client.get_paasta_oapi_client(cluster=cluster) + api = client.get_paasta_oapi_client( + cluster=get_paasta_oapi_api_clustername( + cluster=cluster, + is_eks=(instance_config.get_instance_type() == "eks"), + ), + ) if not api: log.warning( "Couldn't reach the PaaSTA api for {}! 
Assuming it is not " @@ -1732,6 +1742,7 @@ def check_if_instance_is_done( WAIT_FOR_INSTANCE_CLASSES = [ MarathonServiceConfig, KubernetesDeploymentConfig, + EksDeploymentConfig, CassandraClusterDeploymentConfig, ] diff --git a/paasta_tools/cli/cmds/mesh_status.py b/paasta_tools/cli/cmds/mesh_status.py index 0d26c1cd56..3ddecd5eb3 100644 --- a/paasta_tools/cli/cmds/mesh_status.py +++ b/paasta_tools/cli/cmds/mesh_status.py @@ -19,8 +19,11 @@ from paasta_tools.cli.cmds.status import get_envoy_status_human from paasta_tools.cli.cmds.status import get_smartstack_status_human from paasta_tools.cli.utils import figure_out_service_name +from paasta_tools.cli.utils import get_instance_configs_for_service +from paasta_tools.cli.utils import get_paasta_oapi_api_clustername from paasta_tools.cli.utils import lazy_choices_completer from paasta_tools.cli.utils import verify_instances +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.utils import DEFAULT_SOA_DIR from paasta_tools.utils import list_clusters from paasta_tools.utils import list_services @@ -75,8 +78,30 @@ def paasta_mesh_status_on_api_endpoint( service: str, instance: str, system_paasta_config: SystemPaastaConfig, + soa_dir: str = DEFAULT_SOA_DIR, ) -> Tuple[int, List[str]]: - client = get_paasta_oapi_client(cluster, system_paasta_config) + instance_config = next( + get_instance_configs_for_service( + service=service, + soa_dir=soa_dir, + clusters=[cluster], + instances=[instance], + ), + None, + ) + if not instance_config: + print( + "ERROR: Could not find config files for this service instance in soaconfigs. Maybe you mispelled an argument?" + ) + exit(1) + + client = get_paasta_oapi_client( + cluster=get_paasta_oapi_api_clustername( + cluster, + is_eks=(instance_config.__class__ == EksDeploymentConfig), + ), + system_paasta_config=system_paasta_config, + ) if not client: print("ERROR: Cannot get a paasta-api client") exit(1) @@ -85,7 +110,6 @@ def paasta_mesh_status_on_api_endpoint( mesh_status = client.service.mesh_instance( service=service, instance=instance, - include_smartstack=False, ) except client.api_error as exc: # 405 (method not allowed) is returned for instances that are not configured @@ -128,7 +152,7 @@ def paasta_mesh_status(args) -> int: # validate args, funcs have their own error output service = figure_out_service_name(args, args.soa_dir) - if verify_instances(args.instance, service, [args.cluster]): + if verify_instances(args.instance, service, [args.cluster], args.soa_dir): return 1 return_code, mesh_output = paasta_mesh_status_on_api_endpoint( @@ -136,6 +160,7 @@ def paasta_mesh_status(args) -> int: service=service, instance=args.instance, system_paasta_config=system_paasta_config, + soa_dir=args.soa_dir, ) output = [ diff --git a/paasta_tools/cli/cmds/performance_check.py b/paasta_tools/cli/cmds/performance_check.py deleted file mode 100644 index a70e7b24ee..0000000000 --- a/paasta_tools/cli/cmds/performance_check.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - -import requests -from service_configuration_lib import read_extra_service_information - -from paasta_tools.cli.utils import validate_service_name -from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import timeout - - -def add_subparser(subparsers): - list_parser = subparsers.add_parser( - "performance-check", - description="Performs a performance check", - help="Performs a performance check", - ) - list_parser.add_argument( - "-s", - "--service", - help='Name of service for which you wish to check. Leading "services-", as included in a ' - "Jenkins job name, will be stripped.", - ) - list_parser.add_argument("-k", "--commit", help=argparse.SUPPRESS) - list_parser.add_argument( - "-d", - "--soa-dir", - dest="soa_dir", - metavar="SOA_DIR", - default=DEFAULT_SOA_DIR, - help="Define a different soa config directory", - ) - list_parser.set_defaults(command=perform_performance_check) - - -def load_performance_check_config(service, soa_dir): - return read_extra_service_information( - service_name=service, extra_info="performance-check", soa_dir=soa_dir - ) - - -def submit_performance_check_job(service, soa_dir): - performance_check_config = load_performance_check_config(service, soa_dir) - - if not performance_check_config: - print("No performance-check.yaml. Skipping performance-check.") - return - - endpoint = performance_check_config.pop("endpoint") - r = requests.post(url=endpoint, params=performance_check_config) - r.raise_for_status() - print("Posted a submission to the PaaSTA performance-check service.") - print(f"Endpoint: {endpoint}") - print(f"Parameters: {performance_check_config}") - - -@timeout() -def perform_performance_check(args): - service = args.service - if service.startswith("services-"): - service = service.split("services-", 1)[1] - validate_service_name(service, args.soa_dir) - - try: - submit_performance_check_job(service=service, soa_dir=args.soa_dir) - except Exception as e: - print( - "Something went wrong with the performance check. Safely bailing. No need to panic." 
- ) - print("Here was the error:") - print(str(e)) diff --git a/paasta_tools/cli/cmds/secret.py b/paasta_tools/cli/cmds/secret.py index 408d7b8e37..7ada4a4325 100644 --- a/paasta_tools/cli/cmds/secret.py +++ b/paasta_tools/cli/cmds/secret.py @@ -423,7 +423,8 @@ def paasta_secret(args): get_secret( kube_client, get_paasta_secret_name(namespace, service, args.secret_name), - namespace, + key_name=args.secret_name, + namespace=namespace, ) ) # fallback to default in case mapping fails @@ -432,7 +433,8 @@ def paasta_secret(args): get_secret( kube_client, get_paasta_secret_name("paasta", service, args.secret_name), - "paasta", + key_name=args.secret_name, + namespace="paasta", ) ) return diff --git a/paasta_tools/cli/cmds/spark_run.py b/paasta_tools/cli/cmds/spark_run.py index db89624ab8..7f233091ba 100644 --- a/paasta_tools/cli/cmds/spark_run.py +++ b/paasta_tools/cli/cmds/spark_run.py @@ -16,14 +16,14 @@ from typing import Union import yaml -from boto3.exceptions import Boto3Error +from service_configuration_lib import read_service_configuration +from service_configuration_lib import read_yaml_file from service_configuration_lib import spark_config from service_configuration_lib.spark_config import get_aws_credentials from service_configuration_lib.spark_config import get_grafana_url from service_configuration_lib.spark_config import get_resources_requested from service_configuration_lib.spark_config import get_signalfx_url from service_configuration_lib.spark_config import get_spark_hourly_cost -from service_configuration_lib.spark_config import send_and_calculate_resources_cost from service_configuration_lib.spark_config import UnsupportedClusterManagerException from paasta_tools.cli.cmds.check import makefile_responds_to @@ -35,12 +35,11 @@ from paasta_tools.kubernetes_tools import limit_size_with_hash from paasta_tools.spark_tools import DEFAULT_SPARK_SERVICE from paasta_tools.spark_tools import get_volumes_from_spark_k8s_configs -from paasta_tools.spark_tools import get_volumes_from_spark_mesos_configs from paasta_tools.spark_tools import get_webui_url from paasta_tools.spark_tools import inject_spark_conf_str from paasta_tools.utils import _run from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import get_docker_client +from paasta_tools.utils import filter_templates_from_config from paasta_tools.utils import get_possible_launched_by_user_variable_from_env from paasta_tools.utils import get_username from paasta_tools.utils import InstanceConfig @@ -60,10 +59,10 @@ DEFAULT_SPARK_DOCKER_REGISTRY = "docker-dev.yelpcorp.com" SENSITIVE_ENV = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN"] clusterman_metrics, CLUSTERMAN_YAML_FILE_PATH = get_clusterman_metrics() -CLUSTER_MANAGER_MESOS = "mesos" CLUSTER_MANAGER_K8S = "kubernetes" CLUSTER_MANAGER_LOCAL = "local" -CLUSTER_MANAGERS = {CLUSTER_MANAGER_MESOS, CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL} +CLUSTER_MANAGERS = {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL} +DEFAULT_DOCKER_SHM_SIZE = "64m" # Reference: https://spark.apache.org/docs/latest/configuration.html#application-properties DEFAULT_DRIVER_CORES_BY_SPARK = 1 DEFAULT_DRIVER_MEMORY_BY_SPARK = "1g" @@ -73,6 +72,7 @@ POD_TEMPLATE_DIR = "/nail/tmp" POD_TEMPLATE_PATH = "/nail/tmp/spark-pt-{file_uuid}.yaml" DEFAULT_RUNTIME_TIMEOUT = "12h" +DEFAILT_AWS_PROFILE = "default" POD_TEMPLATE = """ apiVersion: v1 @@ -81,6 +81,7 @@ labels: spark: {spark_pod_label} spec: + dnsPolicy: Default affinity: podAffinity: preferredDuringSchedulingIgnoredDuringExecution: 
@@ -173,7 +174,7 @@ def add_subparser(subparsers): "--docker-memory-limit", help=( "Set docker memory limit. Should be greater than driver memory. Defaults to 2x spark.driver.memory. Example: 2g, 500m, Max: 64g" - "Note: If memory limit provided is greater than associated with the batch instance, it will default to max memory of the box." + " Note: If memory limit provided is greater than associated with the batch instance, it will default to max memory of the box." ), default=None, ) @@ -181,7 +182,27 @@ def add_subparser(subparsers): "--docker-cpu-limit", help=( "Set docker cpus limit. Should be greater than driver cores. Defaults to 1x spark.driver.cores." - "Note: The job will fail if the limit provided is greater than number of cores present on batch box (8 for production batch boxes)." + " Note: The job will fail if the limit provided is greater than number of cores present on batch box (8 for production batch boxes)." + ), + default=None, + ) + + list_parser.add_argument( + "--docker-shm-size", + help=( + "Set docker shared memory size limit for the driver's container. This is the same as setting docker run --shm-size and the shared" + " memory is mounted to /dev/shm in the container. Anything written to the shared memory mount point counts towards the docker memory" + " limit for the driver's container. Therefore, this should be less than --docker-memory-limit." + f" Defaults to {DEFAULT_DOCKER_SHM_SIZE}. Example: 8g, 256m" + " Note: this option is mainly useful when training TensorFlow models in the driver, with multiple GPUs using NCCL. The shared memory" + f" space is used to sync gradient updates between GPUs during training. The default value of {DEFAULT_DOCKER_SHM_SIZE} is typically not large enough for" + " this inter-gpu communication to run efficiently. We recommend a starting value of 8g to ensure that the entire set of model parameters" + " can fit in the shared memory. This can be less if you are training a smaller model (<1g parameters) or more if you are using a larger model (>2.5g parameters)" + " If you are observing low, average GPU utilization during epoch training (<65-70 percent) you can also try increasing this value; you may be" + " resource constrained when GPUs sync training weights between mini-batches (there are other potential bottlenecks that could cause this as well)." + " A tool such as nvidia-smi can be use to check GPU utilization." + " This option also adds the --ulimit memlock=-1 to the docker run command since this is recommended for TensorFlow applications that use NCCL." + " Please refer to docker run documentation for more details on --shm-size and --ulimit memlock=-1." ), default=None, ) @@ -197,7 +218,7 @@ def add_subparser(subparsers): list_parser.add_argument( "--docker-registry", help="Docker registry to push the Spark image built.", - default=DEFAULT_SPARK_DOCKER_REGISTRY, + default=None, ) list_parser.add_argument( @@ -282,9 +303,8 @@ def add_subparser(subparsers): list_parser.add_argument( "--spark-args", - help="Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html. " - r'For example, --spark-args "spark.mesos.constraints=pool:default\;instance_type:m4.10xlarge ' - 'spark.executor.cores=4".', + help="Spark configurations documented in https://spark.apache.org/docs/latest/configuration.html, separated by space. 
" + 'For example, --spark-args "spark.executor.cores=1 spark.executor.memory=7g spark.executor.instances=2".', ) list_parser.add_argument( @@ -319,6 +339,20 @@ def add_subparser(subparsers): default=False, ) + list_parser.add_argument( + "--tronfig", + help="Load the Tron config yaml. Use with --job-id.", + type=str, + default=None, + ) + + list_parser.add_argument( + "--job-id", + help="Tron job id . in the Tronfig to run. Use wuth --tronfig.", + type=str, + default=None, + ) + k8s_target_cluster_type_group = list_parser.add_mutually_exclusive_group() k8s_target_cluster_type_group.add_argument( "--force-use-eks", @@ -339,14 +373,6 @@ def add_subparser(subparsers): default=None, ) - if clusterman_metrics: - list_parser.add_argument( - "--suppress-clusterman-metrics-errors", - help="Continue even if sending resource requirements to Clusterman fails. This may result in the job " - "failing to acquire resources.", - action="store_true", - ) - list_parser.add_argument( "-j", "--jars", help=argparse.SUPPRESS, action=DeprecatedAction ) @@ -397,7 +423,7 @@ def add_subparser(subparsers): "--aws-credentials-yaml is not specified and --service is either " "not specified or the service does not have credentials in " "/etc/boto_cfg", - default="default", + default=DEFAILT_AWS_PROFILE, ) aws_group.add_argument( @@ -508,13 +534,18 @@ def get_docker_run_cmd( docker_cmd, nvidia, docker_memory_limit, + docker_shm_size, docker_cpu_limit, ): print( - f"Setting docker memory and cpu limits as {docker_memory_limit}, {docker_cpu_limit} core(s) respectively." + f"Setting docker memory, shared memory, and cpu limits as {docker_memory_limit}, {docker_shm_size}, and {docker_cpu_limit} core(s) respectively." ) cmd = ["paasta_docker_wrapper", "run"] cmd.append(f"--memory={docker_memory_limit}") + if docker_shm_size is not None: + cmd.append(f"--shm-size={docker_shm_size}") + cmd.append("--ulimit") + cmd.append("memlock=-1") cmd.append(f"--cpus={docker_cpu_limit}") cmd.append("--rm") cmd.append("--net=host") @@ -671,21 +702,20 @@ def _parse_user_spark_args( enable_compact_bin_packing: bool = False, enable_spark_dra: bool = False, ) -> Dict[str, str]: - if not spark_args: - return {} user_spark_opts = {} - for spark_arg in spark_args.split(): - fields = spark_arg.split("=", 1) - if len(fields) != 2: - print( - PaastaColors.red( - "Spark option %s is not in format option=value." % spark_arg - ), - file=sys.stderr, - ) - sys.exit(1) - user_spark_opts[fields[0]] = fields[1] + if spark_args: + for spark_arg in spark_args.split(): + fields = spark_arg.split("=", 1) + if len(fields) != 2: + print( + PaastaColors.red( + "Spark option %s is not in format option=value." % spark_arg + ), + file=sys.stderr, + ) + sys.exit(1) + user_spark_opts[fields[0]] = fields[1] if enable_compact_bin_packing: user_spark_opts["spark.kubernetes.executor.podTemplateFile"] = pod_template_path @@ -738,6 +768,7 @@ def run_docker_container( dry_run, nvidia, docker_memory_limit, + docker_shm_size, docker_cpu_limit, ) -> int: @@ -749,6 +780,7 @@ def run_docker_container( docker_cmd=docker_cmd, nvidia=nvidia, docker_memory_limit=docker_memory_limit, + docker_shm_size=docker_shm_size, docker_cpu_limit=docker_cpu_limit, ) docker_run_cmd = get_docker_run_cmd(**docker_run_args) @@ -819,6 +851,17 @@ def _calculate_docker_memory_limit( return docker_memory_limit +def _calculate_docker_shared_memory_size(shm_size: Optional[str]) -> str: + """In Order of preference: + 1. Argument: --docker-shm-size + 3. 
Default + """ + if shm_size: + return shm_size + + return DEFAULT_DOCKER_SHM_SIZE + + def _calculate_docker_cpu_limit( spark_conf: Mapping[str, str], cpu_limit: Optional[str] ) -> str: @@ -843,18 +886,18 @@ def configure_and_run_docker_container( aws_creds: Tuple[Optional[str], Optional[str], Optional[str]], cluster_manager: str, pod_template_path: str, + extra_driver_envs: Dict[str, str] = dict(), ) -> int: docker_memory_limit = _calculate_docker_memory_limit( spark_conf, args.docker_memory_limit ) + docker_shm_size = _calculate_docker_shared_memory_size(args.docker_shm_size) docker_cpu_limit = _calculate_docker_cpu_limit( spark_conf, args.docker_cpu_limit, ) - if cluster_manager == CLUSTER_MANAGER_MESOS: - volumes = get_volumes_from_spark_mesos_configs(spark_conf) - elif cluster_manager in {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}: + if cluster_manager in {CLUSTER_MANAGER_K8S, CLUSTER_MANAGER_LOCAL}: # service_configuration_lib puts volumes into the k8s # configs for local mode volumes = get_volumes_from_spark_k8s_configs(spark_conf) @@ -883,6 +926,7 @@ def configure_and_run_docker_container( system_paasta_config=system_paasta_config, ) ) # type:ignore + environment.update(extra_driver_envs) webui_url = get_webui_url(spark_conf["spark.ui.port"]) webui_url_msg = PaastaColors.green(f"\nSpark monitoring URL: ") + f"{webui_url}\n" @@ -915,40 +959,21 @@ def configure_and_run_docker_container( print(f"Selected cluster manager: {cluster_manager}\n") if clusterman_metrics and _should_get_resource_requirements(docker_cmd, args.mrjob): - try: - if cluster_manager == CLUSTER_MANAGER_MESOS: - print("Sending resource request metrics to Clusterman") - hourly_cost, resources = send_and_calculate_resources_cost( - clusterman_metrics, spark_conf, webui_url, args.pool - ) - else: - resources = get_resources_requested(spark_conf) - hourly_cost = get_spark_hourly_cost( - clusterman_metrics, - resources, - spark_conf["spark.executorEnv.PAASTA_CLUSTER"], - args.pool, - ) - message = ( - f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)" - f" is estimated to cost ${hourly_cost} per hour" - ) - if clusterman_metrics.util.costs.should_warn(hourly_cost): - print(PaastaColors.red(f"WARNING: {message}")) - else: - print(message) - except Boto3Error as e: - print( - PaastaColors.red( - f"Encountered {e} while attempting to send resource requirements to Clusterman." 
- ) - ) - if args.suppress_clusterman_metrics_errors: - print( - "Continuing anyway since --suppress-clusterman-metrics-errors was passed" - ) - else: - raise + resources = get_resources_requested(spark_conf) + hourly_cost = get_spark_hourly_cost( + clusterman_metrics, + resources, + spark_conf["spark.executorEnv.PAASTA_CLUSTER"], + args.pool, + ) + message = ( + f"Resource request ({resources['cpus']} cpus and {resources['mem']} MB memory total)" + f" is estimated to cost ${hourly_cost} per hour" + ) + if clusterman_metrics.util.costs.should_warn(hourly_cost): + print(PaastaColors.red(f"WARNING: {message}")) + else: + print(message) return run_docker_container( container_name=spark_conf["spark.app.name"], @@ -959,6 +984,7 @@ def configure_and_run_docker_container( dry_run=args.dry_run, nvidia=args.nvidia, docker_memory_limit=docker_memory_limit, + docker_shm_size=docker_shm_size, docker_cpu_limit=docker_cpu_limit, ) @@ -995,6 +1021,14 @@ def get_docker_cmd( return inject_spark_conf_str(original_docker_cmd, spark_conf_str) +def _get_adhoc_docker_registry(service: str, soa_dir: str = DEFAULT_SOA_DIR) -> str: + if service is None: + raise NotImplementedError('"None" is not a valid service') + + service_configuration = read_service_configuration(service, soa_dir) + return service_configuration.get("docker_registry", DEFAULT_SPARK_DOCKER_REGISTRY) + + def build_and_push_docker_image(args: argparse.Namespace) -> Optional[str]: """ Build an image if the default Spark service image is not preferred. @@ -1018,24 +1052,61 @@ def build_and_push_docker_image(args: argparse.Namespace) -> Optional[str]: if cook_return != 0: return None - docker_url = f"{args.docker_registry}/{docker_tag}" + registry_uri = args.docker_registry or _get_adhoc_docker_registry( + service=args.service, + soa_dir=args.yelpsoa_config_root, + ) + + docker_url = f"{registry_uri}/{docker_tag}" command = f"docker tag {docker_tag} {docker_url}" print(PaastaColors.grey(command)) retcode, _ = _run(command, stream=True) if retcode != 0: return None - if args.docker_registry != DEFAULT_SPARK_DOCKER_REGISTRY: + if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY: command = "sudo -H docker push %s" % docker_url else: command = "docker push %s" % docker_url print(PaastaColors.grey(command)) - retcode, _ = _run(command, stream=True) + retcode, output = _run(command, stream=False) if retcode != 0: return None - return docker_url + # With unprivileged docker, the digest on the remote registry may not match the digest + # in the local environment. 
Because of this, we have to parse the digest message from the + # server response and use downstream when launching spark executors + + # Output from `docker push` with unprivileged docker looks like + # Using default tag: latest + # The push refers to repository [docker-dev.yelpcorp.com/paasta-spark-run-dpopes:latest] + # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557 + + # With privileged docker, the last line has an extra "size: 123" + # latest: digest: sha256:0a43aa65174a400bd280d48d460b73eb49b0ded4072c9e173f919543bf693557 size: 52 + + digest_line = output.split("\n")[-1] + digest_match = re.match(r"[^:]*: [^:]*: (?P<digest>[^\s]*)", digest_line) + if not digest_match: + raise ValueError(f"Could not determine digest from output: {output}") + digest = digest_match.group("digest") + + image_url = f"{docker_url}@{digest}" + + # If the local digest doesn't match the remote digest AND the registry is + # non-default (which requires authentication, and consequently sudo), + # downstream `docker run` commands will fail trying to authenticate. + # To work around this, we can proactively `sudo docker pull` here so that + # the image exists locally and can be `docker run` without sudo + if registry_uri != DEFAULT_SPARK_DOCKER_REGISTRY: + command = f"sudo -H docker pull {image_url}" + print(PaastaColors.grey(command)) + retcode, output = _run(command, stream=False) + if retcode != 0: + raise NoDockerImageError(f"Could not pull {image_url}: {output}") + + return image_url def validate_work_dir(s): @@ -1129,7 +1200,110 @@ def _get_k8s_url_for_cluster(cluster: str) -> Optional[str]: ) -def paasta_spark_run(args): +def parse_tronfig(tronfig_path: str, job_id: str) -> Optional[Dict[str, Any]]: + splitted = job_id.split(".") + if len(splitted) != 2: + return None + job_name, action_name = splitted + + file_content = read_yaml_file(tronfig_path) + jobs = filter_templates_from_config(file_content) + if job_name not in jobs or action_name not in jobs[job_name].get("actions", {}): + return None + return jobs[job_name]["actions"][action_name] + + +def update_args_from_tronfig(args: argparse.Namespace) -> Optional[Dict[str, str]]: + """ + Load and check the following config fields from the provided Tronfig. + - executor + - pool + - iam_role + - iam_role_provider + - command + - env + - spark_args + + Returns: environment variables dictionary or None if failed.
+ """ + action_dict = parse_tronfig(args.tronfig, args.job_id) + if action_dict is None: + print( + PaastaColors.red(f"Unable to get configs from job-id: {args.job_id}"), + file=sys.stderr, + ) + return None + + # executor === spark + if action_dict.get("executor", "") != "spark": + print( + PaastaColors.red("Invalid Tronfig: executor should be 'spark'"), + file=sys.stderr, + ) + return None + + # iam_role / aws_profile + if "iam_role" in action_dict and action_dict.get("iam_role_provider", "") != "aws": + print( + PaastaColors.red("Invalid Tronfig: iam_role_provider should be 'aws'"), + file=sys.stderr, + ) + return None + + # Other args + fields_to_args = { + "pool": "pool", + "iam_role": "assume_aws_role", + "command": "cmd", + "spark_args": "spark_args", + } + for field_name, arg_name in fields_to_args.items(): + if field_name in action_dict: + value = action_dict[field_name] + + # Convert spark_args values from dict to a string "k1=v1 k2=v2" + if field_name == "spark_args": + value = " ".join([f"{k}={v}" for k, v in dict(value).items()]) + + # Befutify for printing + arg_name_str = (f"--{arg_name.replace('_', '-')}").ljust(20, " ") + field_name_str = field_name.ljust(12) + + # Only load iam_role value if --aws-profile is not set + if field_name == "iam_role" and args.aws_profile != DEFAILT_AWS_PROFILE: + print( + PaastaColors.yellow( + f"Overwriting args with Tronfig: {arg_name_str} => {field_name_str} : IGNORE, " + "since --aws-profile is provided" + ), + ) + continue + + if hasattr(args, arg_name): + print( + PaastaColors.yellow( + f"Overwriting args with Tronfig: {arg_name_str} => {field_name_str} : {value}" + ), + ) + setattr(args, arg_name, value) + + # env (currently paasta spark-run does not support Spark driver secrets environment variables) + return action_dict.get("env", dict()) + + +def paasta_spark_run(args: argparse.Namespace) -> int: + driver_envs_from_tronfig: Dict[str, str] = dict() + if args.tronfig is not None: + if args.job_id is None: + print( + PaastaColors.red("Missing --job-id when --tronfig is provided"), + file=sys.stderr, + ) + return False + driver_envs_from_tronfig = update_args_from_tronfig(args) + if driver_envs_from_tronfig is None: + return False + # argparse does not work as expected with both default and # type=validate_work_dir. 
validate_work_dir(args.work_dir) @@ -1204,18 +1378,10 @@ def paasta_spark_run(args): assume_aws_role_arn=args.assume_aws_role, session_duration=args.aws_role_duration, ) - docker_image = get_docker_image(args, instance_config) - if docker_image is None: + docker_image_digest = get_docker_image(args, instance_config) + if docker_image_digest is None: return 1 - # Get image digest - docker_client = get_docker_client() - image_details = docker_client.inspect_image(docker_image) - if len(image_details["RepoDigests"]) < 1: - print("Failed to get docker image digest", file=sys.stderr) - return None - docker_image_digest = image_details["RepoDigests"][0] - pod_template_path = generate_pod_template_path() args.enable_compact_bin_packing = should_enable_compact_bin_packing( args.disable_compact_bin_packing, args.cluster_manager @@ -1294,4 +1460,5 @@ def paasta_spark_run(args): aws_creds=aws_creds, cluster_manager=args.cluster_manager, pod_template_path=pod_template_path, + extra_driver_envs=driver_envs_from_tronfig, ) diff --git a/paasta_tools/cli/cmds/status.py b/paasta_tools/cli/cmds/status.py index 8da41723fb..88870b0864 100644 --- a/paasta_tools/cli/cmds/status.py +++ b/paasta_tools/cli/cmds/status.py @@ -52,11 +52,13 @@ from paasta_tools.cassandracluster_tools import CassandraClusterDeploymentConfig from paasta_tools.cli.utils import figure_out_service_name from paasta_tools.cli.utils import get_instance_configs_for_service +from paasta_tools.cli.utils import get_paasta_oapi_api_clustername from paasta_tools.cli.utils import lazy_choices_completer from paasta_tools.cli.utils import list_deploy_groups from paasta_tools.cli.utils import NoSuchService from paasta_tools.cli.utils import validate_service_name from paasta_tools.cli.utils import verify_instances +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.flink_tools import FlinkDeploymentConfig from paasta_tools.flink_tools import get_flink_config_from_paasta_api_client from paasta_tools.flink_tools import get_flink_jobs_from_paasta_api_client @@ -101,6 +103,7 @@ CassandraClusterDeploymentConfig, KafkaClusterDeploymentConfig, KubernetesDeploymentConfig, + EksDeploymentConfig, AdhocJobConfig, MarathonServiceConfig, TronActionConfig, @@ -112,6 +115,7 @@ CassandraClusterDeploymentConfig, KafkaClusterDeploymentConfig, KubernetesDeploymentConfig, + EksDeploymentConfig, AdhocJobConfig, MarathonServiceConfig, ] @@ -278,9 +282,16 @@ def paasta_status_on_api_endpoint( lock: Lock, verbose: int, new: bool = False, + is_eks: bool = False, ) -> int: - output = ["", f"\n{service}.{PaastaColors.cyan(instance)} in {cluster}"] - client = get_paasta_oapi_client(cluster, system_paasta_config) + output = [ + "", + f"\n{service}.{PaastaColors.cyan(instance)} in {cluster}{' (EKS)' if is_eks else ''}", + ] + client = get_paasta_oapi_client( + cluster=get_paasta_oapi_api_clustername(cluster=cluster, is_eks=is_eks), + system_paasta_config=system_paasta_config, + ) if not client: print("Cannot get a paasta-api client") exit(1) @@ -290,7 +301,6 @@ def paasta_status_on_api_endpoint( instance=instance, verbose=verbose, new=new, - include_smartstack=False, ) except client.api_error as exc: output.append(PaastaColors.red(exc.reason)) @@ -1716,7 +1726,6 @@ def get_autoscaling_table( f" Desired instances: {autoscaling_status['desired_replicas']}" ) table.append(f" Last scale time: {autoscaling_status['last_scale_time']}") - table.append(f" Dashboard: y/sfx-autoscaling") NA = PaastaColors.red("N/A") if len(autoscaling_status["metrics"]) > 0: 
table.append(f" Metrics:") @@ -2138,7 +2147,7 @@ def report_status_for_cluster( output = ["", "service: %s" % service, "cluster: %s" % cluster] deployed_instances = [] instances = [ - instance + (instance, instance_config_class) for instance, instance_config_class in instance_whitelist.items() if instance_config_class in ALLOWED_INSTANCE_CONFIG ] @@ -2175,7 +2184,7 @@ def report_status_for_cluster( return_code = 0 return_codes = [] - for deployed_instance in instances: + for deployed_instance, instance_config_class in instances: return_codes.append( paasta_status_on_api_endpoint( cluster=cluster, @@ -2185,6 +2194,7 @@ def report_status_for_cluster( lock=lock, verbose=verbose, new=new, + is_eks=(instance_config_class == EksDeploymentConfig), ) ) @@ -2192,7 +2202,11 @@ def report_status_for_cluster( return_code = 1 output.append( - report_invalid_whitelist_values(instances, seen_instances, "instance") + report_invalid_whitelist_values( + whitelist=[instance[0] for instance in instances], + items=seen_instances, + item_type="instance", + ) ) return return_code, output @@ -2569,6 +2583,7 @@ def _use_new_paasta_status(args, system_paasta_config) -> bool: marathon=print_marathon_status, kubernetes=print_kubernetes_status, kubernetes_v2=print_kubernetes_status_v2, + eks=print_kubernetes_status, tron=print_tron_status, adhoc=print_adhoc_status, flink=print_flink_status, diff --git a/paasta_tools/cli/cmds/validate.py b/paasta_tools/cli/cmds/validate.py index b591ccf930..f13b8eb28b 100644 --- a/paasta_tools/cli/cmds/validate.py +++ b/paasta_tools/cli/cmds/validate.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import functools import json import os import pkgutil @@ -26,6 +27,7 @@ from typing import Dict from typing import List from typing import Optional +from typing import Tuple from typing import Union import pytz @@ -49,6 +51,9 @@ from paasta_tools.cli.utils import PaastaColors from paasta_tools.cli.utils import success from paasta_tools.kubernetes_tools import sanitise_kubernetes_name +from paasta_tools.long_running_service_tools import ( + DEFAULT_DESIRED_ACTIVE_REQUESTS_PER_REPLICA, +) from paasta_tools.secret_tools import get_secret_name_from_ref from paasta_tools.secret_tools import is_secret_ref from paasta_tools.secret_tools import is_shared_secret @@ -58,6 +63,8 @@ from paasta_tools.tron_tools import TronJobConfig from paasta_tools.tron_tools import validate_complete_config from paasta_tools.utils import get_service_instance_list +from paasta_tools.utils import InstanceConfig +from paasta_tools.utils import InstanceConfigDict from paasta_tools.utils import list_all_instances_for_service from paasta_tools.utils import list_clusters from paasta_tools.utils import list_services @@ -96,10 +103,11 @@ SCHEMA_TYPES = { "adhoc", "kubernetes", # long-running services - "marathon", # long-running services on mesos - no longer used "rollback", # automatic rollbacks during deployments "tron", # batch workloads "eks", # eks workloads + "autotuned_defaults/kubernetes", + "autotuned_defaults/cassandracluster", } # we expect a comment that looks like # override-cpu-setting PROJ-1234 # but we don't have a $ anchor in case users want to add an additional @@ -115,6 +123,8 @@ # this to the autotune cap (i.e., 1) CPU_BURST_THRESHOLD = 2 +K8S_TYPES = {"eks", "kubernetes"} + class ConditionConfig(TypedDict, total=False): """ @@ -131,6 +141,26 @@ class ConditionConfig(TypedDict, total=False): dry_run: bool +@functools.lru_cache() +def load_all_instance_configs_for_service( + service: str, cluster: str, soa_dir: str +) -> Tuple[Tuple[str, InstanceConfig], ...]: + ret = [] + for instance in list_all_instances_for_service( + service=service, clusters=[cluster], soa_dir=soa_dir + ): + instance_config = get_instance_config( + service=service, + instance=instance, + cluster=cluster, + load_deployments=False, + soa_dir=soa_dir, + ) + ret.append((instance, instance_config)) + + return tuple(ret) + + def invalid_tron_namespace(cluster, output, filename): return failure( "%s is invalid:\n %s\n " "More info:" % (filename, output), @@ -283,7 +313,7 @@ def validate_schema(file_path: str, file_type: str) -> bool: config_file_object = get_config_file_dict(file_path) try: validator.validate(config_file_object) - if file_type == "kubernetes" and not validate_instance_names( + if file_type in K8S_TYPES and not validate_instance_names( config_file_object, file_path ): return False @@ -315,15 +345,16 @@ def validate_all_schemas(service_path: str) -> bool: :param service_path: path to location of configuration files """ - path = os.path.join(service_path, "*.yaml") + path = os.path.join(service_path, "**/*.yaml") returncode = True - for file_name in glob(path): + for file_name in glob(path, recursive=True): if os.path.islink(file_name): continue - basename = os.path.basename(file_name) + + filename_without_service_path = os.path.relpath(file_name, start=service_path) for file_type in SCHEMA_TYPES: - if basename.startswith(file_type): + if filename_without_service_path.startswith(file_type): if not validate_schema(file_name, file_type): returncode = False return returncode @@ -466,16 +497,9 @@ def 
validate_paasta_objects(service_path): returncode = True messages = [] for cluster in list_clusters(service, soa_dir): - for instance in list_all_instances_for_service( - service=service, clusters=[cluster], soa_dir=soa_dir + for instance, instance_config in load_all_instance_configs_for_service( + service=service, cluster=cluster, soa_dir=soa_dir ): - instance_config = get_instance_config( - service=service, - instance=instance, - cluster=cluster, - load_deployments=False, - soa_dir=soa_dir, - ) messages.extend(instance_config.validate()) returncode = len(messages) == 0 @@ -560,26 +584,46 @@ def validate_autoscaling_configs(service_path): ) for cluster in list_clusters(service, soa_dir): - for instance in list_all_instances_for_service( - service=service, clusters=[cluster], soa_dir=soa_dir + for instance, instance_config in load_all_instance_configs_for_service( + service=service, cluster=cluster, soa_dir=soa_dir ): - instance_config = get_instance_config( - service=service, - instance=instance, - cluster=cluster, - load_deployments=False, - soa_dir=soa_dir, - ) if ( - instance_config.get_instance_type() == "kubernetes" + instance_config.get_instance_type() in K8S_TYPES and instance_config.is_autoscaling_enabled() # we should eventually make the python templates add the override comment # to the correspoding YAML line, but until then we just opt these out of that validation - and __is_templated(service, soa_dir, cluster, workload="kubernetes") + and __is_templated( + service, + soa_dir, + cluster, + workload=instance_config.get_instance_type(), + ) is False ): autoscaling_params = instance_config.get_autoscaling_params() + if autoscaling_params["metrics_provider"] == "active-requests": + desired_active_requests_per_replica = autoscaling_params.get( + "desired_active_requests_per_replica", + DEFAULT_DESIRED_ACTIVE_REQUESTS_PER_REPLICA, + ) + if desired_active_requests_per_replica <= 0: + returncode = False + print( + failure( + msg="Autoscaling configuration is invalid: desired_active_requests_per_replica must be " + "greater than zero", + link="", + ) + ) + if len(instance_config.get_registrations()) > 1: + returncode = False + print( + failure( + msg="Autoscaling configuration is invalid: active-requests autoscaler doesn't support instances with multiple registrations.", + link="", + ) + ) if autoscaling_params["metrics_provider"] in { "uwsgi", "piscina", @@ -611,7 +655,11 @@ def validate_autoscaling_configs(service_path): # we need access to the comments, so we need to read the config with ruamel to be able # to actually get them in a "nice" automated fashion config = get_config_file_dict( - os.path.join(soa_dir, service, f"kubernetes-{cluster}.yaml"), + os.path.join( + soa_dir, + service, + f"{instance_config.get_instance_type()}-{cluster}.yaml", + ), use_ruamel=True, ) if config[instance].get("cpus") is None: @@ -649,16 +697,9 @@ def validate_min_max_instances(service_path): returncode = True for cluster in list_clusters(service, soa_dir): - for instance in list_all_instances_for_service( - service=service, clusters=[cluster], soa_dir=soa_dir + for instance, instance_config in load_all_instance_configs_for_service( + service=service, cluster=cluster, soa_dir=soa_dir ): - instance_config = get_instance_config( - service=service, - instance=instance, - cluster=cluster, - load_deployments=False, - soa_dir=soa_dir, - ) if instance_config.get_instance_type() != "tron": min_instances = instance_config.get_min_instances() max_instances = instance_config.get_max_instances() @@ -676,15 
+717,21 @@ def validate_min_max_instances(service_path): return returncode -def check_secrets_for_instance(instance_config_dict, soa_dir, service_path, vault_env): +def check_secrets_for_instance( + instance_config_dict: InstanceConfigDict, soa_dir: str, service: str, vault_env: str +) -> bool: return_value = True + # If the service: directive is used, look for the secret there, rather than where the instance config is defined. + service_containing_secret = instance_config_dict.get("service", service) for env_value in instance_config_dict.get("env", {}).values(): if is_secret_ref(env_value): secret_name = get_secret_name_from_ref(env_value) if is_shared_secret(env_value): secret_file_name = f"{soa_dir}/_shared/secrets/{secret_name}.json" else: - secret_file_name = f"{service_path}/secrets/{secret_name}.json" + secret_file_name = ( + f"{soa_dir}/{service_containing_secret}/secrets/{secret_name}.json" + ) if os.path.isfile(secret_file_name): secret_json = get_config_file_dict(secret_file_name) if "ciphertext" not in secret_json["environments"].get(vault_env, {}): @@ -720,18 +767,11 @@ def validate_secrets(service_path): return_value = False continue - for instance in list_all_instances_for_service( - service=service, clusters=[cluster], soa_dir=soa_dir + for instance, instance_config in load_all_instance_configs_for_service( + service=service, cluster=cluster, soa_dir=soa_dir ): - instance_config = get_instance_config( - service=service, - instance=instance, - cluster=cluster, - load_deployments=False, - soa_dir=soa_dir, - ) if not check_secrets_for_instance( - instance_config.config_dict, soa_dir, service_path, vault_env + instance_config.config_dict, soa_dir, service, vault_env ): return_value = False if return_value: @@ -747,27 +787,29 @@ def validate_cpu_burst(service_path: str) -> bool: returncode = True for cluster in list_clusters(service, soa_dir): - if __is_templated(service, soa_dir, cluster, workload="kubernetes"): + if __is_templated( + service, soa_dir, cluster, workload="kubernetes" + ) or __is_templated(service, soa_dir, cluster, workload="eks"): # we should eventually make the python templates add the override comment # to the correspoding YAML line, but until then we just opt these out of that validation continue - for instance in list_all_instances_for_service( - service=service, clusters=[cluster], soa_dir=soa_dir + for instance, instance_config in load_all_instance_configs_for_service( + service=service, cluster=cluster, soa_dir=soa_dir ): - instance_config = get_instance_config( - service=service, - instance=instance, - cluster=cluster, - load_deployments=False, - soa_dir=soa_dir, + is_k8s_service = ( + instance_config.get_instance_type() == "kubernetes" + or instance_config.get_instance_type() == "eks" ) - is_k8s_service = instance_config.get_instance_type() == "kubernetes" should_skip_cpu_burst_validation = service in skip_cpu_burst_validation_list if is_k8s_service and not should_skip_cpu_burst_validation: # we need access to the comments, so we need to read the config with ruamel to be able # to actually get them in a "nice" automated fashion config = get_config_file_dict( - os.path.join(soa_dir, service, f"kubernetes-{cluster}.yaml"), + os.path.join( + soa_dir, + service, + f"{instance_config.get_instance_type()}-{cluster}.yaml", + ), use_ruamel=True, ) diff --git a/paasta_tools/cli/schemas/autotuned_defaults/marathon_schema.json b/paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json similarity index 62% rename from 
paasta_tools/cli/schemas/autotuned_defaults/marathon_schema.json rename to paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json index 89464e4f92..0725f3e3b3 100644 --- a/paasta_tools/cli/schemas/autotuned_defaults/marathon_schema.json +++ b/paasta_tools/cli/schemas/autotuned_defaults/cassandracluster_schema.json @@ -1,6 +1,6 @@ { "$schema": "http://json-schema.org/draft-04/schema#", - "description": "Properties that can be set by automated processes for http://paasta.readthedocs.io/en/latest/yelpsoa_configs.html#marathon-clustername-yaml", + "description": "Properties that can be set by automated processes for cassandracluster spec files", "type": "object", "additionalProperties": false, "minProperties": 1, @@ -15,29 +15,20 @@ "minimum": 0, "exclusiveMinimum": true }, - "cpu_burst_add": { + "cpu_burst_percent": { "type": "number", - "minimum": 0.0, - "exclusiveMinimum": false - }, - "disk": { - "type": "number", - "minimum": 128, - "exclusiveMinimum": true - }, - "min_instances": { - "type": "integer", "minimum": 0, "exclusiveMinimum": false }, - "max_instances": { - "type": "integer", - "minimum": 0, - "exclusiveMinimum": false + "disk": { + "type": "string" }, "mem": { + "type": "string" + }, + "replicas": { "type": "number", - "minimum": 32, + "minimum": 0, "exclusiveMinimum": true } } diff --git a/paasta_tools/cli/schemas/eks_schema.json b/paasta_tools/cli/schemas/eks_schema.json index 80d8fdfd37..b2ce87c6b5 120000 --- a/paasta_tools/cli/schemas/eks_schema.json +++ b/paasta_tools/cli/schemas/eks_schema.json @@ -1 +1 @@ -paasta_tools/cli/schemas/kubernetes_schema.json \ No newline at end of file +kubernetes_schema.json \ No newline at end of file diff --git a/paasta_tools/cli/schemas/kubernetes_schema.json b/paasta_tools/cli/schemas/kubernetes_schema.json index a2f4174cdc..005add2832 100644 --- a/paasta_tools/cli/schemas/kubernetes_schema.json +++ b/paasta_tools/cli/schemas/kubernetes_schema.json @@ -198,12 +198,16 @@ "cpu", "piscina", "gunicorn", - "arbitrary_promql" + "arbitrary_promql", + "active-requests" ] }, "decision_policy": { "type": "string" }, + "desired_active_requests_per_replica": { + "type": "number" + }, "setpoint": { "type": "number" }, @@ -225,9 +229,6 @@ "use_resource_metrics": { "type": "boolean" }, - "uwsgi_stats_port": { - "type": "integer" - }, "scaledown_policies": { "type": "object" }, @@ -273,6 +274,9 @@ "required": [ "metricsQuery" ] + }, + "max_instances_alert_threshold": { + "type": "number" } }, "allOf": [ @@ -287,6 +291,7 @@ "cpu", "piscina", "gunicorn", + "active-requests", "if metrics_provider is arbitrary_promql, the prometheus_adapter_config parameter is required" ] } @@ -424,6 +429,101 @@ } } }, + "node_selectors_preferred": { + "type": "array", + "items": { + "type": "object", + "properties": { + "weight": { + "type": "integer" + }, + "preferences": { + "type": "object", + "additionalProperties": false, + "patternProperties": { + "^[a-zA-Z0-9]+[a-zA-Z0-9-_./]*[a-zA-Z0-9]+$": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "additionalProperties": false, + "properties": { + "operator": { + "enum": [ + "In", + "NotIn" + ] + }, + "values": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + } + }, + "required": [ + "operator", + "values" + ] + }, + { + "type": "object", + "additionalProperties": false, + "properties": { + "operator": { + "enum": [ + 
"Exists", + "DoesNotExist" + ] + } + }, + "required": [ + "operator" + ] + }, + { + "type": "object", + "additionalProperties": false, + "properties": { + "operator": { + "enum": [ + "Gt", + "Lt" + ] + }, + "value": { + "type": "integer" + } + }, + "required": [ + "operator", + "value" + ] + } + ] + } + } + ] + } + } + } + } + } + }, "pool": { "type": "string" }, @@ -623,11 +723,12 @@ "type": "array" }, "iam_role": { - "type": "string" + "type": "string", + "pattern": "^arn:aws:iam::[0-9]+:role/[a-zA-Z0-9+=,.@_-]+$", + "$comment": "This should be a valid AWS IAM role ARN, see https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_iam-quotas.html#reference_iam-quotas-names" }, "iam_role_provider": { "enum": [ - "kiam", "aws" ] }, @@ -858,7 +959,81 @@ }, "namespace": { "type": "string", - "pattern": "^(paasta|paastasvc-.*)$" + "pattern": "^[a-z0-9]([-a-z0-9]*[a-z0-9])?$", + "maxLength": 63 + }, + "autotune_limits": { + "type": "object", + "properties": { + "cpus": { + "type": "object", + "properties": { + "min": { + "type": "number", + "minimum": 0, + "exclusiveMinimum": true + }, + "max": { + "type": "number", + "minimum": 0, + "exclusiveMinimum": true + } + } + }, + "mem": { + "type": "object", + "properties": { + "min": { + "type": "integer", + "minimum": 32, + "exclusiveMinimum": false + }, + "max": { + "type": "integer", + "minimum": 32, + "exclusiveMinimum": false + } + } + }, + "disk": { + "type": "object", + "properties": { + "min": { + "type": "integer", + "minimum": 128, + "exclusiveMinimum": false + }, + "max": { + "type": "integer", + "minimum": 128, + "exclusiveMinimum": false + } + } + } + } + }, + "topology_spread_constraints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "topology_key": { + "type": "string" + }, + "when_unsatisfiable": { + "type": "string", + "enum": [ + "ScheduleAnyway", + "DoNotSchedule" + ] + }, + "max_skew": { + "type": "integer" + } + }, + "required": [] + }, + "uniqueItems": true } } } diff --git a/paasta_tools/cli/schemas/marathon_schema.json b/paasta_tools/cli/schemas/marathon_schema.json deleted file mode 100644 index b8ccaa820c..0000000000 --- a/paasta_tools/cli/schemas/marathon_schema.json +++ /dev/null @@ -1,371 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-04/schema#", - "description": "http://paasta.readthedocs.io/en/latest/yelpsoa_configs.html#marathon-clustername-yaml", - "type": "object", - "additionalProperties": false, - "minProperties": 1, - "patternProperties": { - "^_.*$": { - "type": "object", - "additionalProperties": true - }, - "^([a-z0-9]|[a-z0-9][a-z0-9_-]*[a-z0-9])*$": { - "type": "object", - "additionalProperties": false, - "minProperties": 1, - "allOf": [ - { - "oneOf": [ - { - "properties": { - "healthcheck_mode": { - "enum": [ - "tcp", - "http", - "https" - ] - } - } - }, - { - "properties": { - "healthcheck_mode": { - "enum": [ - "cmd" - ] - }, - "healthcheck_cmd": { - "type": "string" - } - }, - "required": [ - "healthcheck_cmd" - ] - } - ] - }, - { - "oneOf": [ - { - "properties": { - "drain_method": { - "enum": [ - "noop", - "hacheck", - "test" - ] - } - } - }, - { - "properties": { - "drain_method": { - "enum": [ - "http" - ] - }, - "drain_method_params": { - "type": "object", - "properties": { - "drain": { - "type": "object" - }, - "stop_draining": { - "type": "object" - }, - "is_draining": { - "type": "object" - }, - "is_safe_to_kill": { - "type": "object" - } - }, - "required": [ - "drain", - "stop_draining", - "is_draining", - "is_safe_to_kill" - ] - } - }, - "required": [ - 
"drain_method_params" - ] - } - ] - } - ], - "properties": { - "cpus": { - "type": "number", - "minimum": 0, - "exclusiveMinimum": true, - "default": 0.25 - }, - "mem": { - "type": "number", - "minimum": 32, - "exclusiveMinimum": true, - "default": 1024 - }, - "disk": { - "type": "number", - "minimum": 0, - "exclusiveMinimum": true, - "default": 1024 - }, - "gpus": { - "type": "integer", - "minimum": 0, - "exclusiveMinimum": false - }, - "instances": { - "type": "integer", - "minimum": 0, - "exclusiveMinimum": false - }, - "min_instances": { - "type": "integer", - "minimum": 0, - "exclusiveMinimum": false - }, - "max_instances": { - "type": "integer", - "minimum": 0, - "exclusiveMinimum": false - }, - "backoff_factor": { - "type": "integer", - "default": 2 - }, - "max_launch_delay_seconds": { - "type": "integer", - "default": 300 - }, - "registrations": { - "type": "array", - "items": { - "type": "string" - }, - "uniqueItems": true - }, - "bounce_method": { - "type": "string" - }, - "bounce_health_params": { - "type": "object", - "properties": { - "check_haproxy": { - "type": "boolean", - "default": true - }, - "min_task_uptime": { - "type": "number" - }, - "haproxy_min_fraction_up": { - "type": "number", - "minimum": 0.0, - "maximum": 1.0, - "exclusiveMinimum": true, - "exclusiveMaximum": false - } - } - }, - "bounce_margin_factor": { - "type": "number", - "default": 1, - "minimum": 0, - "maximum": 1, - "exclusiveMinimum": true, - "exclusiveMaximum": false - }, - "bounce_priority": { - "type": "integer" - }, - "bounce_start_deadline": { - "type": "number" - }, - "deploy_group": { - "type": "string" - }, - "autoscaling": { - "type": "object" - }, - "sfn_autoscaling": { - "type": "object" - }, - "drain_method": { - "enum": [ - "noop", - "hacheck", - "http", - "test" - ], - "default": "noop" - }, - "drain_method_params": { - "type": "object" - }, - "constraints": { - "type": "array", - "items": { - "type": "array" - }, - "uniqueItems": true - }, - "extra_constraints": { - "type": "array", - "items": { - "type": "array" - }, - "uniqueItems": true - }, - "pool": { - "type": "string" - }, - "cmd": { - "type": "string" - }, - "args": { - "type": "array", - "items": { - "type": "string" - } - }, - "env": { - "type": "object", - "patternProperties": { - "^[a-zA-Z_]+[a-zA-Z0-9_]*$": { - "type": "string" - } - }, - "additionalProperties": false - }, - "extra_volumes": { - "type": "array", - "items": { - "type": "object" - }, - "uniqueItems": true - }, - "monitoring": { - "type": "object", - "properties": { - "team": { - "type": "string" - }, - "page": { - "type": "boolean" - } - }, - "additionalProperties": true - }, - "net": { - "type": "string" - }, - "container_port": { - "type": "number" - }, - "deploy_blacklist": { - "type": "array" - }, - "deploy_whitelist": { - "type": "array" - }, - "healthcheck_mode": { - "enum": [ - "cmd", - "tcp", - "http", - "https" - ] - }, - "healthcheck_cmd": { - "type": "string", - "default": "/bin/true" - }, - "healthcheck_grace_period_seconds": { - "type": "number", - "default": 60 - }, - "healthcheck_interval_seconds": { - "type": "number", - "default": 10 - }, - "healthcheck_timeout_seconds": { - "type": "number", - "default": 10 - }, - "healthcheck_max_consecutive_failures": { - "type": "integer", - "default": 30 - }, - "healthcheck_uri": { - "type": "string", - "default": "/status" - }, - "marathon_shard": { - "type": "integer", - "minimum": 0 - }, - "previous_marathon_shards": { - "type": "array" - }, - "replication_threshold": { - "type": "integer", - 
"minimum": 0 - }, - "cap_add": { - "type": "array", - "items": { - "type": "string" - } - }, - "cfs_period_us": { - "type": "integer", - "minimum": 1000, - "maximum": 1000000, - "exclusiveMinimum": false - }, - "cpu_burst_add": { - "type": "number", - "minimum": 0.0, - "exclusiveMinimum": false - }, - "host_port": { - "type": "integer", - "default": 0, - "minimum": 0, - "maximum": 65535, - "exclusiveMinimum": false - }, - "dependencies_reference": { - "type": "string" - }, - "extra_docker_args": { - "type": "object", - "additionalProperties": { - "type": "string" - } - }, - "security": { - "type": "object", - "properties": { - "inbound_firewall": { - "enum": [ - "accept", - "reject" - ] - }, - "outbound_firewall": { - "enum": [ - "block", - "monitor" - ] - } - } - } - } - } - } -} diff --git a/paasta_tools/cli/schemas/tron_schema.json b/paasta_tools/cli/schemas/tron_schema.json index 9134bfc934..3f1ad3095c 100644 --- a/paasta_tools/cli/schemas/tron_schema.json +++ b/paasta_tools/cli/schemas/tron_schema.json @@ -101,7 +101,9 @@ "type": "string" }, "iam_role": { - "type": "string" + "type": "string", + "pattern": "^arn:aws:iam::[0-9]+:role/[a-zA-Z0-9+=,.@_-]+$", + "$comment": "This should be a valid AWS IAM role ARN, see https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_iam-quotas.html#reference_iam-quotas-names" }, "iam_role_provider": { "enum": [ @@ -467,9 +469,6 @@ ], "additionalProperties": false, "properties": { - "use_k8s": { - "type": "boolean" - }, "name": { "$ref": "#definitions/name" }, diff --git a/paasta_tools/cli/utils.py b/paasta_tools/cli/utils.py index b5af5762eb..9cbca4bcf3 100644 --- a/paasta_tools/cli/utils.py +++ b/paasta_tools/cli/utils.py @@ -26,6 +26,7 @@ from shlex import quote from typing import Callable from typing import Collection +from typing import Generator from typing import Iterable from typing import List from typing import Mapping @@ -41,6 +42,8 @@ from paasta_tools import remote_git from paasta_tools.adhoc_tools import load_adhoc_job_config from paasta_tools.cassandracluster_tools import load_cassandracluster_instance_config +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.eks_tools import load_eks_service_config from paasta_tools.flink_tools import load_flink_instance_config from paasta_tools.kafkacluster_tools import load_kafkacluster_instance_config from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig @@ -65,9 +68,11 @@ from paasta_tools.utils import list_clusters from paasta_tools.utils import list_services from paasta_tools.utils import load_system_paasta_config +from paasta_tools.utils import PAASTA_K8S_INSTANCE_TYPES from paasta_tools.utils import PaastaColors from paasta_tools.utils import SystemPaastaConfig from paasta_tools.utils import validate_service_instance +from paasta_tools.vitesscluster_tools import load_vitess_instance_config log = logging.getLogger(__name__) @@ -772,6 +777,7 @@ class LongRunningInstanceTypeHandler(NamedTuple): kubernetes=InstanceTypeHandler( get_service_instance_list, load_kubernetes_service_config ), + eks=InstanceTypeHandler(get_service_instance_list, load_eks_service_config), tron=InstanceTypeHandler(get_service_instance_list, load_tron_instance_config), flink=InstanceTypeHandler(get_service_instance_list, load_flink_instance_config), cassandracluster=InstanceTypeHandler( @@ -780,6 +786,9 @@ class LongRunningInstanceTypeHandler(NamedTuple): kafkacluster=InstanceTypeHandler( get_service_instance_list, load_kafkacluster_instance_config ), + 
vitesscluster=InstanceTypeHandler( + get_service_instance_list, load_vitess_instance_config + ), nrtsearchservice=InstanceTypeHandler( get_service_instance_list, load_nrtsearchservice_instance_config ), @@ -807,12 +816,18 @@ class LongRunningInstanceTypeHandler(NamedTuple): kafkacluster=LongRunningInstanceTypeHandler( get_service_instance_list, load_kafkacluster_instance_config ), + vitesscluster=LongRunningInstanceTypeHandler( + get_service_instance_list, load_vitess_instance_config + ), nrtsearchservice=LongRunningInstanceTypeHandler( get_service_instance_list, load_nrtsearchservice_instance_config ), monkrelays=LongRunningInstanceTypeHandler( get_service_instance_list, load_monkrelaycluster_instance_config ), + eks=LongRunningInstanceTypeHandler( + get_service_instance_list, load_eks_service_config + ), ) @@ -852,11 +867,16 @@ def get_namespaces_for_secret( ) -> Set[str]: secret_to_k8s_namespace = set() + k8s_instance_type_classes = { + "kubernetes": KubernetesDeploymentConfig, + "eks": EksDeploymentConfig, + } for instance_type in INSTANCE_TYPES: - if instance_type == "kubernetes": + if instance_type in PAASTA_K8S_INSTANCE_TYPES: config_loader = PaastaServiceConfigLoader(service, soa_dir) for service_instance_config in config_loader.instance_configs( - cluster=cluster, instance_type_class=KubernetesDeploymentConfig + cluster=cluster, + instance_type_class=k8s_instance_type_classes[instance_type], ): secret_to_k8s_namespace.add(service_instance_config.get_namespace()) else: @@ -1046,14 +1066,14 @@ def get_instance_configs_for_service( type_filter: Optional[Iterable[str]] = None, clusters: Optional[Sequence[str]] = None, instances: Optional[Sequence[str]] = None, -) -> Iterable[InstanceConfig]: +) -> Generator[InstanceConfig, None, None]: if not clusters: clusters = list_clusters(service=service, soa_dir=soa_dir) if type_filter is None: type_filter = INSTANCE_TYPE_HANDLERS.keys() - for cluster in list_clusters(service=service, soa_dir=soa_dir): + for cluster in clusters: for instance_type, instance_handlers in INSTANCE_TYPE_HANDLERS.items(): if instance_type not in type_filter: continue @@ -1178,3 +1198,15 @@ def verify_instances( print(" %s" % instance) return misspelled_instances + + +def get_paasta_oapi_api_clustername(cluster: str, is_eks: bool) -> str: + """ + We'll be doing a tiny bit of lying while we have both EKS and non-EKS + clusters: these will generally share the same PaaSTA name (i.e., the + soaconfigs suffix will stay the same) - but we'll need a way to route API + requests to the correct place. 
To do so, we'll add "fake" entries to our + api_endpoints SystemPaastaConfig that are the PaaSTA clustername with an + "eks-" prefix + """ + return f"eks-{cluster}" if is_eks else cluster diff --git a/paasta_tools/config_utils.py b/paasta_tools/config_utils.py index ad6ead5f0c..4e43d05280 100644 --- a/paasta_tools/config_utils.py +++ b/paasta_tools/config_utils.py @@ -1,3 +1,4 @@ +import copy import logging import os import subprocess @@ -7,17 +8,30 @@ from typing import List from typing import Optional from typing import Set +from typing import Tuple import ruamel.yaml as yaml from paasta_tools.cli.cmds.validate import validate_schema +from paasta_tools.utils import AUTO_SOACONFIG_SUBDIR from paasta_tools.utils import DEFAULT_SOA_DIR log = logging.getLogger(__name__) # Must have a schema defined -KNOWN_CONFIG_TYPES = ("marathon", "kubernetes", "deploy", "smartstack") +KNOWN_CONFIG_TYPES = ( + "marathon", + "kubernetes", + "deploy", + "smartstack", + "cassandracluster", +) + +# this could use a better name - but basically, this is for pairs of instance types +# where you generally want to check both types (i.e.,g a status-quo and migration +# instance type) +INSTANCE_TYPE_COUNTERPARTS = {"eks": "kubernetes", "kubernetes": "eks"} def my_represent_none(self, data): @@ -174,7 +188,13 @@ def __enter__(self): self.pwd = os.getcwd() os.chdir(self.working_dir) if self.branch != "master": - subprocess.check_call(["git", "checkout", "-b", self.branch]) + if self._remote_branch_exists(): + subprocess.check_call(["git", "fetch", "origin", self.branch]) + subprocess.check_call( + ["git", "checkout", "-b", self.branch, f"origin/{self.branch}"] + ) + else: + subprocess.check_call(["git", "checkout", "-b", self.branch]) return self def __exit__(self, type, value, traceback): @@ -182,6 +202,14 @@ def __exit__(self, type, value, traceback): if self.tmp_dir: self.tmp_dir.cleanup() + def _remote_branch_exists(self) -> bool: + return ( + subprocess.run( + ["git", "ls-remote", "--exit-code", "--heads", "origin", self.branch], + ).returncode + == 0 + ) + def write_configs( self, service: str, @@ -239,3 +267,106 @@ def commit_to_remote(self, extra_message: str = ""): _push_to_remote(self.branch) else: log.info("No files changed, no push required.") + + def _clamp_recommendations( + self, merged_recommendation: Dict[str, Any], config: Dict[str, Any] + ) -> Dict[str, Any]: + clamped_recomendation = copy.deepcopy(merged_recommendation) + for limit_type, limits in config.get("autotune_limits", {}).items(): + log.debug(f"Processing {limit_type} autotune limits...") + min_value = limits.get("min") + max_value = limits.get("max") + unclamped_resource_value = clamped_recomendation.get(limit_type) + + # no autotune data present, but min value present + if not unclamped_resource_value and min_value: + # use the min value since this is likely an autogenerated service where we know we have a given minimum CPU + # that we'd like to allocate + log.debug( + f"No {limit_type} autotune data found, using autotune limit lower bound ({min_value})." + ) + clamped_recomendation[limit_type] = min_value + + # otherwise, we can do some pretty rote clamping of resource values + elif unclamped_resource_value is not None: + if min_value and unclamped_resource_value < min_value: + log.debug( + f"{limit_type} autotune config under configured limit ({min_value}), using autotune limit lower bound." 
+ ) + clamped_recomendation[limit_type] = min_value + if max_value and unclamped_resource_value > max_value: + log.debug( + f"{limit_type} autotune config over configured limit ({max_value}), using autotune limit upper bound." + ) + clamped_recomendation[limit_type] = max_value + else: + log.debug( + f"No {limit_type} autotune data or limits found, will continue using PaaSTA defaults." + ) + + return clamped_recomendation + + def merge_recommendations( + self, recs: Dict[Tuple[str, str], Dict[str, Any]] + ) -> Dict[Tuple[str, str], Dict[str, Any]]: + """ + :param recs: Dictionary of (service, instance_type_cluster) -> recommendations. + NOTE: instance_type_cluster is something like "kubernetes-pnw-prod". + :returns: a dictionary of the same format, with the previous recommendations merged in and autotune_limits applied. + """ + merged_recs = {} + for ( + service, + instance_type_cluster, + ), recommendations_by_instance in recs.items(): + log.info(f"Starting to process {service}/{instance_type_cluster}.yaml...") + log.debug( + f"Getting current autotune configs for {service}/{instance_type_cluster}.yaml" + ) + existing_recommendations = self.get_existing_configs( + service=service, + file_name=instance_type_cluster, + sub_dir=AUTO_SOACONFIG_SUBDIR, + ) + + log.debug( + f"Getting current configs for {service}/{instance_type_cluster}.yaml..." + ) + # i'm so sorry. + # basically, we need to make sure that for every autotuned service, we load both kubernetes- + # and eks- files for the existing configs, as there are services that at any given time will + # only exist on one of these or may have a mix (and the csv file that we get fakes the current + # cluster type) + # NOTE: if an instance appears in both files, the counterpart will always "win" - this + # should only be possible while an instance is being migrated from one instance type to + # another + instance_type, _ = instance_type_cluster.split("-", maxsplit=1) + existing_configs = { + # if we upgrade to py3.9 before getting rid of this code, this should use PEP-584-style dict merging + **self.get_existing_configs( + service=service, + file_name=instance_type_cluster, + ), + **self.get_existing_configs( + service=service, + file_name=instance_type_cluster.replace( + instance_type, INSTANCE_TYPE_COUNTERPARTS.get(instance_type, "") + ), + ), + } + + for instance_name, recommendation in recommendations_by_instance.items(): + log.debug( + f"Merging recommendations for {instance_name} in {service}/{AUTO_SOACONFIG_SUBDIR}/{instance_type_cluster}.yaml..."
+ ) + existing_recommendations.setdefault(instance_name, {}) + existing_recommendations[instance_name].update(recommendation) + + existing_recommendations[instance_name] = self._clamp_recommendations( + merged_recommendation=existing_recommendations[instance_name], + config=existing_configs[instance_name], + ) + merged_recs[(service, instance_type_cluster)] = existing_recommendations + log.info(f"Done processing {service}/{instance_type_cluster}.yaml.") + + return merged_recs diff --git a/paasta_tools/contrib/create_paasta_playground.py b/paasta_tools/contrib/create_paasta_playground.py index 660d3a7c68..9672e80d5c 100644 --- a/paasta_tools/contrib/create_paasta_playground.py +++ b/paasta_tools/contrib/create_paasta_playground.py @@ -46,6 +46,7 @@ def main(): src="./k8s_itests/deployments/paasta/fake_soa_config", dst="soa_config_playground", values=values_path, + overwrite=False, ) diff --git a/paasta_tools/contrib/get_running_task_allocation.py b/paasta_tools/contrib/get_running_task_allocation.py index be9854aadd..f776ae9e67 100644 --- a/paasta_tools/contrib/get_running_task_allocation.py +++ b/paasta_tools/contrib/get_running_task_allocation.py @@ -275,54 +275,55 @@ def parse_args() -> argparse.Namespace: "--scheduler", help="Scheduler to get task info from", dest="scheduler", - default="mesos", + default="kubernetes", choices=["mesos", "kubernetes"], ) + parser.add_argument( + "--additional-namespaces-exclude", + help="full names of namespaces to exclude when fetching allocation info", + dest="additional_namespaces_exclude", + nargs="+", + default=[], + ) parser.add_argument( "--namespace-prefix", - help="prefix of the namespace to fetch the logs for" - "Used only when scheduler is kubernetes", + help=argparse.SUPPRESS, dest="namespace_prefix", default="paasta", ) parser.add_argument( "--additional-namespaces", - help="full names of namespaces to fetch allocation info for that don't match --namespace-prefix" - "Used only when scheduler is kubernetes", + help=argparse.SUPPRESS, dest="additional_namespaces", nargs="+", # we default this to tron since this is really the only non-paasta-prefix namespaced that is part of paasta # and we'd like to not run two cronjobs to get this information :p default=["tron"], ) - return parser.parse_args() + args = parser.parse_args() + + args.additional_namespaces_exclude = set(args.additional_namespaces_exclude) + return args -def get_matching_namespaces( - namespaces: List[str], namespace_prefix: str, additional_namespaces: List[str] + +def get_unexcluded_namespaces( + namespaces: List[str], excluded_namespaces: List[str] ) -> List[str]: - return [ - n - for n in namespaces - if n.startswith(namespace_prefix) or n in additional_namespaces - ] + return [n for n in namespaces if n not in excluded_namespaces] def main(args: argparse.Namespace) -> None: cluster = load_system_paasta_config().get_cluster() - if args.scheduler == "mesos": + kube_client = KubeClient() + all_namespaces = kubernetes_tools.get_all_namespaces(kube_client) + for matching_namespace in get_unexcluded_namespaces( + all_namespaces, + args.additional_namespaces_exclude, + ): display_task_allocation_info( - cluster, args.scheduler, args.namespace_prefix, kube_client=None + cluster, args.scheduler, matching_namespace, kube_client ) - else: - kube_client = KubeClient() - all_namespaces = kubernetes_tools.get_all_namespaces(kube_client) - for matching_namespace in get_matching_namespaces( - all_namespaces, args.namespace_prefix, 
args.additional_namespaces - ): - display_task_allocation_info( - cluster, args.scheduler, matching_namespace, kube_client - ) def display_task_allocation_info( diff --git a/paasta_tools/contrib/habitat_fixer.py b/paasta_tools/contrib/habitat_fixer.py new file mode 100755 index 0000000000..63fdee22c3 --- /dev/null +++ b/paasta_tools/contrib/habitat_fixer.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +import argparse +from pathlib import Path + +from kubernetes.client import V1Node + +from paasta_tools.kubernetes_tools import KUBE_CONFIG_PATH +from paasta_tools.kubernetes_tools import KUBE_CONFIG_USER_PATH +from paasta_tools.kubernetes_tools import KubeClient + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Karpenter Habitat Corruption Workaround" + ) + parser.add_argument("-c", "--cluster", required=True) + parser.add_argument( + "-k", + "--kubeconfig", + default=KUBE_CONFIG_PATH + if Path(KUBE_CONFIG_PATH).exists() + else KUBE_CONFIG_USER_PATH, + ) + parser.add_argument( + "-t", "--context", default=None # -c is taken, so lets use the last letter :p + ) + parser.add_argument( + "--for-real", + action="store_true", + ) + parsed_args = parser.parse_args() + + if not parsed_args.context: + if parsed_args.kubeconfig == KUBE_CONFIG_USER_PATH: + # in the user kubeconfig, context names are just the cluster names + parsed_args.context = parsed_args.cluster + else: + print( + f"NOTE: no context specified - will use the current context selected in {parsed_args.kubeconfig} " + "(or the KUBECONTEXT environment variable if set)." + ) + + return parsed_args + + +def is_affected_node(node: V1Node) -> bool: + try: + int(node.metadata.labels["yelp.com/habitat"]) + return True + except ValueError: + return False + + +def get_desired_habitat(node: V1Node) -> str: + zone = node.metadata.labels["topology.kubernetes.io/zone"].replace("-", "") + ecosystem = node.metadata.labels["yelp.com/ecosystem"] + + return f"{zone}{ecosystem}" + + +def main(): + args = parse_args() + client = KubeClient(config_file=args.kubeconfig, context=args.context) + for node in client.core.list_node().items: + if not is_affected_node(node): + continue + + if args.for_real: + client.core.patch_node( + name=node.metadata.name, + body={ + "metadata": { + "labels": { + "yelp.com/habitat": get_desired_habitat(node), + }, + } + }, + ) + else: + print( + f"Would have edited {node.metadata.name} in pool={node.metadata.labels['yelp.com/pool']} to have habitat={get_desired_habitat(node)} (from {node.metadata.labels['yelp.com/habitat']})", + ) + + +if __name__ == "__main__": + main() diff --git a/paasta_tools/contrib/render_template.py b/paasta_tools/contrib/render_template.py index 5c9ddf4cf6..dde7fea595 100755 --- a/paasta_tools/contrib/render_template.py +++ b/paasta_tools/contrib/render_template.py @@ -31,15 +31,17 @@ def render_file(src, dst, values): new.write(replace(old.read(), values)) -def render(src, dst, values={}, exclude={}): +def render(src, dst, values={}, exclude={}, overwrite=True): if os.path.isfile(src): - render_file(src, dst, values) + if overwrite: + render_file(src, dst, values) return for f in os.scandir(src): if f.name.startswith(".") or f.path in exclude: continue if os.path.isfile(f.path): - render_file(f.path, dst, values) + if overwrite: + render_file(f.path, dst, values) else: new_dst = replace(f"{dst}/{f.name}", values) try: @@ -47,7 +49,7 @@ def render(src, dst, values={}, exclude={}): except OSError as e: if e.errno != os.errno.EEXIST: raise - render(f.path, new_dst, 
values, exclude) + render(f.path, new_dst, values, exclude, overwrite) def parse_args(): @@ -82,7 +84,7 @@ def parse_args(): return args -def render_values(src: str, dst: str, values: str) -> None: +def render_values(src: str, dst: str, values: str, overwrite=True) -> None: if values is not None: values = os.path.abspath(values) # Validate src and values. Dst needs to be a directory. src can be either a valid folder of directory. values need to be valid file if provided. @@ -108,7 +110,7 @@ def render_values(src: str, dst: str, values: str) -> None: ), v, ) - render(src, dst, config_dict, {values}) + render(src, dst, config_dict, {values}, overwrite) def main(): diff --git a/paasta_tools/contrib/rightsizer_soaconfigs_update.py b/paasta_tools/contrib/rightsizer_soaconfigs_update.py index 594f800037..b30a80ce36 100644 --- a/paasta_tools/contrib/rightsizer_soaconfigs_update.py +++ b/paasta_tools/contrib/rightsizer_soaconfigs_update.py @@ -1,15 +1,27 @@ import argparse import logging from collections import defaultdict +from typing import Any +from typing import cast +from typing import Dict +from typing import List +from typing import Literal +from typing import Optional from typing import Set +from typing import TypedDict +from typing import Union from paasta_tools.config_utils import AutoConfigUpdater from paasta_tools.contrib.paasta_update_soa_memcpu import get_report_from_splunk +from paasta_tools.kubernetes_tools import SidecarResourceRequirements from paasta_tools.utils import AUTO_SOACONFIG_SUBDIR from paasta_tools.utils import DEFAULT_SOA_CONFIGS_GIT_URL from paasta_tools.utils import format_git_url from paasta_tools.utils import load_system_paasta_config + +log = logging.getLogger(__name__) + NULL = "null" SUPPORTED_CSV_KEYS = ( "cpus", @@ -136,10 +148,66 @@ def get_default_git_remote(): return default_git_remote -def get_recommendation_from_result(result, keys_to_apply): - rec = {} +SupportedInstanceType = Literal["kubernetes", "eks", "cassandracluster"] + + +class CassandraRightsizerResult(TypedDict): + current_cpus: str + suggested_cpus: str + + current_disk: str + suggested_disk: str + + current_mem: str + suggested_mem: str + + current_replicas: str + suggested_replicas: str + + +class CassandraRecommendation(TypedDict, total=False): + disk: str + mem: str + cpus: float + replicas: int + cpu_burst_percent: float + + +class KubernetesRightsizerResult(TypedDict): + current_cpus: str + suggested_cpus: str + + current_disk: str + suggested_disk: str + + current_mem: str + suggested_mem: str + + suggested_hacheck_cpus: float + + suggested_cpu_burst_add: float + + suggested_min_instances: int + + suggested_max_instances: int + + +class KubernetesRecommendation(TypedDict, total=False): + disk: float + mem: float + cpus: float + cpu_burst_add: float + max_instances: int + min_instances: int + sidecar_resource_requirements: Dict[str, SidecarResourceRequirements] + + +def get_kubernetes_recommendation_from_result( + result: KubernetesRightsizerResult, keys_to_apply: List[str] +) -> KubernetesRecommendation: + rec: KubernetesRecommendation = {} for key in keys_to_apply: - val = result.get(key) + val: Optional[str] = cast(Optional[str], result.get(key)) if not val or val == NULL: continue if key == "cpus": @@ -169,12 +237,33 @@ def get_recommendation_from_result(result, keys_to_apply): return rec +def get_cassandra_recommendation_from_result( + result: CassandraRightsizerResult, keys_to_apply: List[str] +) -> CassandraRecommendation: + rec: CassandraRecommendation = {} + for key in 
keys_to_apply: + val: Optional[str] = cast(Optional[str], result.get(key)) + if not val or val == NULL: + continue + if key == "cpus": + rec["cpus"] = float(val) + elif key == "cpu_burst_percent": + rec["cpu_burst_percent"] = float(val) + elif key == "mem": + rec["mem"] = val + elif key == "disk": + rec["disk"] = val + elif key == "replicas": + rec["replicas"] = int(val) + return rec + + def get_recommendations_by_service_file( results, keys_to_apply, exclude_clusters: Set[str], ): - results_by_service_file = defaultdict(dict) + results_by_service_file: Dict[tuple, Dict[str, Any]] = defaultdict(dict) for result in results.values(): # we occasionally want to disable autotune for a cluster (or set of clusters) # to do so, we can simply skip getting recommendations for any (service, cluster) @@ -189,7 +278,12 @@ def get_recommendations_by_service_file( result["service"], result["cluster"], ) # e.g. (foo, marathon-norcal-stagef) - rec = get_recommendation_from_result(result, keys_to_apply) + instance_type = result["cluster"].split("-", 1)[0] + rec: Union[KubernetesRecommendation, CassandraRecommendation] = {} + if instance_type == "cassandracluster": + rec = get_cassandra_recommendation_from_result(result, keys_to_apply) + elif instance_type == "kubernetes": + rec = get_kubernetes_recommendation_from_result(result, keys_to_apply) if not rec: continue results_by_service_file[key][result["instance"]] = rec @@ -226,17 +320,17 @@ def main(args): validation_schema_path=AUTO_SOACONFIG_SUBDIR, ) with updater: - for (service, extra_info), instance_recommendations in results.items(): - existing_recommendations = updater.get_existing_configs( - service, extra_info, AUTO_SOACONFIG_SUBDIR + for ( + service, + instance_type_cluster, + ), instance_recommendations in updater.merge_recommendations(results).items(): + log.info( + f"Writing configs for {service} to {AUTO_SOACONFIG_SUBDIR}/{instance_type_cluster}.yaml..." 
) - for instance_name, recommendation in instance_recommendations.items(): - existing_recommendations.setdefault(instance_name, {}) - existing_recommendations[instance_name].update(recommendation) updater.write_configs( service, - extra_info, - existing_recommendations, + instance_type_cluster, + instance_recommendations, AUTO_SOACONFIG_SUBDIR, HEADER_COMMENT, ) diff --git a/paasta_tools/delete_kubernetes_deployments.py b/paasta_tools/delete_kubernetes_deployments.py index 944954fd70..b04574b317 100755 --- a/paasta_tools/delete_kubernetes_deployments.py +++ b/paasta_tools/delete_kubernetes_deployments.py @@ -73,7 +73,11 @@ def main(args=None) -> None: for deployment_name in deployment_names: try: log.debug(f"Deleting {deployment_name}") - delete_deployment(kube_client=kube_client, deployment_name=deployment_name) + delete_deployment( + kube_client=kube_client, + deployment_name=deployment_name, + namespace="paasta", + ) except Exception as err: log.error(f"Unable to delete {deployment_name}: {err}") sys.exit(1) diff --git a/paasta_tools/docker_wrapper.py b/paasta_tools/docker_wrapper.py index f7867e8277..72f9901bf0 100755 --- a/paasta_tools/docker_wrapper.py +++ b/paasta_tools/docker_wrapper.py @@ -340,15 +340,14 @@ def main(argv=None): # Marathon sets MESOS_TASK_ID mesos_task_id = env_args.get("MESOS_TASK_ID") - hostname = socket.getfqdn() + fqdn = socket.getfqdn() + hostname = fqdn.partition(".")[0] if mesos_task_id and can_add_hostname(argv): - argv = add_argument(argv, f"-e=PAASTA_HOST={hostname}") - hostname_task_id = generate_hostname_task_id( - hostname.partition(".")[0], mesos_task_id - ) + argv = add_argument(argv, f"-e=PAASTA_HOST={fqdn}") + hostname_task_id = generate_hostname_task_id(hostname, mesos_task_id) argv = add_argument(argv, f"--hostname={hostname_task_id }") elif can_add_hostname(argv): - argv = add_argument(argv, f"-e=PAASTA_HOST={hostname}") + argv = add_argument(argv, f"-e=PAASTA_HOST={fqdn}") argv = add_argument(argv, f"--hostname={hostname}") paasta_firewall = env_args.get("PAASTA_FIREWALL") diff --git a/paasta_tools/instance/kubernetes.py b/paasta_tools/instance/kubernetes.py index 28f37a2604..fa825a676e 100644 --- a/paasta_tools/instance/kubernetes.py +++ b/paasta_tools/instance/kubernetes.py @@ -12,6 +12,7 @@ from typing import Sequence from typing import Set from typing import Tuple +from typing import Union import a_sync import pytz @@ -24,6 +25,7 @@ from mypy_extensions import TypedDict from paasta_tools import cassandracluster_tools +from paasta_tools import eks_tools from paasta_tools import envoy_tools from paasta_tools import flink_tools from paasta_tools import kafkacluster_tools @@ -32,6 +34,7 @@ from paasta_tools import monkrelaycluster_tools from paasta_tools import nrtsearchservice_tools from paasta_tools import smartstack_tools +from paasta_tools import vitesscluster_tools from paasta_tools.cli.utils import LONG_RUNNING_INSTANCE_TYPE_HANDLERS from paasta_tools.instance.hpa_metrics_parser import HPAMetricsDict from paasta_tools.instance.hpa_metrics_parser import HPAMetricsParser @@ -46,8 +49,12 @@ from paasta_tools.utils import calculate_tail_lines -INSTANCE_TYPES_CR = {"flink", "cassandracluster", "kafkacluster"} -INSTANCE_TYPES_K8S = {"kubernetes", "cassandracluster"} +INSTANCE_TYPES_CR = {"flink", "cassandracluster", "kafkacluster", "vitesscluster"} +INSTANCE_TYPES_K8S = { + "cassandracluster", + "eks", + "kubernetes", +} INSTANCE_TYPES = INSTANCE_TYPES_K8S.union(INSTANCE_TYPES_CR) INSTANCE_TYPES_WITH_SET_STATE = {"flink"} @@ -55,6 +62,7 @@ 
flink=flink_tools.cr_id, cassandracluster=cassandracluster_tools.cr_id, kafkacluster=kafkacluster_tools.cr_id, + vitesscluster=vitesscluster_tools.cr_id, nrtsearchservice=nrtsearchservice_tools.cr_id, monkrelaycluster=monkrelaycluster_tools.cr_id, ) @@ -215,6 +223,8 @@ async def pod_info( } +# TODO: Cleanup +# Only used in old kubernetes_status async def job_status( kstatus: MutableMapping[str, Any], client: kubernetes_tools.KubeClient, @@ -459,18 +469,29 @@ def filter_actually_running_replicasets( def bounce_status( - service: str, - instance: str, - settings: Any, + service: str, instance: str, settings: Any, is_eks: bool = False ) -> Dict[str, Any]: status: Dict[str, Any] = {} - job_config = kubernetes_tools.load_kubernetes_service_config( - service=service, - instance=instance, - cluster=settings.cluster, - soa_dir=settings.soa_dir, - load_deployments=True, - ) + # this should be the only place where it matters that we use eks_tools. + # apart from loading config files, we should be using kubernetes_tools + # everywhere. + job_config: Union[KubernetesDeploymentConfig, eks_tools.EksDeploymentConfig] + if is_eks: + job_config = eks_tools.load_eks_service_config( + service=service, + instance=instance, + cluster=settings.cluster, + soa_dir=settings.soa_dir, + load_deployments=True, + ) + else: + job_config = kubernetes_tools.load_kubernetes_service_config( + service=service, + instance=instance, + cluster=settings.cluster, + soa_dir=settings.soa_dir, + load_deployments=True, + ) expected_instance_count = job_config.get_instances() status["expected_instance_count"] = expected_instance_count desired_state = job_config.get_desired_state() @@ -574,7 +595,6 @@ async def kubernetes_status_v2( service: str, instance: str, verbose: int, - include_smartstack: bool, include_envoy: bool, instance_type: str, settings: Any, @@ -1091,12 +1111,12 @@ async def get_version_for_controller_revision( } +# TODO: Cleanup old kubernetes status @a_sync.to_blocking async def kubernetes_status( service: str, instance: str, verbose: int, - include_smartstack: bool, include_envoy: bool, instance_type: str, settings: Any, @@ -1184,35 +1204,23 @@ async def kubernetes_status( evicted_count += 1 kstatus["evicted_count"] = evicted_count - if include_smartstack or include_envoy: + if include_envoy: service_namespace_config = kubernetes_tools.load_service_namespace_config( service=service, namespace=job_config.get_nerve_namespace(), soa_dir=settings.soa_dir, ) if "proxy_port" in service_namespace_config: - if include_smartstack: - kstatus["smartstack"] = await mesh_status( - service=service, - service_mesh=ServiceMesh.SMARTSTACK, - instance=job_config.get_nerve_namespace(), - job_config=job_config, - service_namespace_config=service_namespace_config, - pods_task=pods_task, - should_return_individual_backends=verbose > 0, - settings=settings, - ) - if include_envoy: - kstatus["envoy"] = await mesh_status( - service=service, - service_mesh=ServiceMesh.ENVOY, - instance=job_config.get_nerve_namespace(), - job_config=job_config, - service_namespace_config=service_namespace_config, - pods_task=pods_task, - should_return_individual_backends=verbose > 0, - settings=settings, - ) + kstatus["envoy"] = await mesh_status( + service=service, + service_mesh=ServiceMesh.ENVOY, + instance=job_config.get_nerve_namespace(), + job_config=job_config, + service_namespace_config=service_namespace_config, + pods_task=pods_task, + should_return_individual_backends=verbose > 0, + settings=settings, + ) return kstatus @@ -1220,7 +1228,6 @@ def 
instance_status( service: str, instance: str, verbose: int, - include_smartstack: bool, include_envoy: bool, use_new: bool, instance_type: str, @@ -1250,7 +1257,6 @@ def instance_status( instance=instance, instance_type=instance_type, verbose=verbose, - include_smartstack=include_smartstack, include_envoy=include_envoy, settings=settings, ) @@ -1260,7 +1266,6 @@ def instance_status( instance=instance, instance_type=instance_type, verbose=verbose, - include_smartstack=include_smartstack, include_envoy=include_envoy, settings=settings, ) @@ -1285,11 +1290,10 @@ async def kubernetes_mesh_status( instance: str, instance_type: str, settings: Any, - include_smartstack: bool = True, include_envoy: bool = True, ) -> Mapping[str, Any]: - if not include_smartstack and not include_envoy: + if not include_envoy: raise RuntimeError("No mesh types specified when requesting mesh status") if instance_type not in LONG_RUNNING_INSTANCE_TYPE_HANDLERS: raise RuntimeError( @@ -1334,11 +1338,6 @@ async def kubernetes_mesh_status( should_return_individual_backends=True, settings=settings, ) - if include_smartstack: - kmesh["smartstack"] = await mesh_status( - service_mesh=ServiceMesh.SMARTSTACK, - **mesh_status_kwargs, - ) if include_envoy: kmesh["envoy"] = await mesh_status( service_mesh=ServiceMesh.ENVOY, diff --git a/paasta_tools/kubernetes/application/controller_wrappers.py b/paasta_tools/kubernetes/application/controller_wrappers.py index 83d025931a..ac9f2fa433 100644 --- a/paasta_tools/kubernetes/application/controller_wrappers.py +++ b/paasta_tools/kubernetes/application/controller_wrappers.py @@ -1,8 +1,6 @@ import logging -import threading from abc import ABC from abc import abstractmethod -from time import sleep from typing import Optional from typing import Union @@ -13,14 +11,13 @@ from kubernetes.client.rest import ApiException from paasta_tools.autoscaling.autoscaling_service_lib import autoscaling_is_paused +from paasta_tools.eks_tools import load_eks_service_config_no_cache from paasta_tools.kubernetes_tools import create_deployment from paasta_tools.kubernetes_tools import create_pod_disruption_budget from paasta_tools.kubernetes_tools import create_stateful_set -from paasta_tools.kubernetes_tools import force_delete_pods from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.kubernetes_tools import KubeDeployment from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig -from paasta_tools.kubernetes_tools import list_all_deployments from paasta_tools.kubernetes_tools import load_kubernetes_service_config_no_cache from paasta_tools.kubernetes_tools import paasta_prefixed from paasta_tools.kubernetes_tools import pod_disruption_budget_for_service_instance @@ -68,15 +65,23 @@ def __init__( self.logging = logging def load_local_config( - self, soa_dir: str, cluster: str + self, soa_dir: str, cluster: str, eks: bool = False ) -> Optional[KubernetesDeploymentConfig]: if not self.soa_config: - self.soa_config = load_kubernetes_service_config_no_cache( - service=self.kube_deployment.service, - instance=self.kube_deployment.instance, - cluster=cluster, - soa_dir=soa_dir, - ) + if eks: + self.soa_config = load_eks_service_config_no_cache( + service=self.kube_deployment.service, + instance=self.kube_deployment.instance, + cluster=cluster, + soa_dir=soa_dir, + ) + else: + self.soa_config = load_kubernetes_service_config_no_cache( + service=self.kube_deployment.service, + instance=self.kube_deployment.instance, + cluster=cluster, + soa_dir=soa_dir, + ) return self.soa_config def 
__str__(self): @@ -141,7 +146,7 @@ def delete_pod_disruption_budget(self, kube_client: KubeClient) -> None: ) def ensure_pod_disruption_budget( - self, kube_client: KubeClient, namespace: str = "paasta" + self, kube_client: KubeClient, namespace: str ) -> V1beta1PodDisruptionBudget: max_unavailable: Union[str, int] if "bounce_margin_factor" in self.soa_config.config_dict: @@ -239,65 +244,9 @@ def create(self, kube_client: KubeClient) -> None: self.ensure_pod_disruption_budget(kube_client, self.soa_config.get_namespace()) self.sync_horizontal_pod_autoscaler(kube_client) - def deep_delete_and_create(self, kube_client: KubeClient) -> None: - self.deep_delete(kube_client) - timer = 0 - while ( - self.kube_deployment - in set(list_all_deployments(kube_client, self.soa_config.get_namespace())) - and timer < 60 - ): - sleep(1) - timer += 1 - - if timer >= 60 and self.kube_deployment in set( - list_all_deployments(kube_client, self.soa_config.get_namespace()) - ): - # When deleting then immediately creating, we need to use Background - # deletion to ensure we can create the deployment immediately - self.deep_delete(kube_client, propagation_policy="Background") - - try: - force_delete_pods( - self.item.metadata.name, - self.kube_deployment.service, - self.kube_deployment.instance, - self.item.metadata.namespace, - kube_client, - ) - except ApiException as e: - if e.status == 404: - # Pod(s) may have been deleted by GC before we got to it - # We can consider this a success - self.logging.debug( - "pods already deleted for {} from namespace/{}. Continuing.".format( - self.kube_deployment.service, self.item.metadata.namespace - ) - ) - else: - raise - - if self.kube_deployment in set( - list_all_deployments(kube_client, self.soa_config.get_namespace()) - ): - # deployment deletion failed, we cannot continue - raise Exception(f"Could not delete deployment {self.item.metadata.name}") - else: - self.logging.info( - "deleted deploy/{} from namespace/{}".format( - self.kube_deployment.service, self.item.metadata.namespace - ) - ) - self.create(kube_client=kube_client) - def update(self, kube_client: KubeClient) -> None: # If HPA is enabled, do not update replicas. 
# In all other cases, replica is set to max(instances, min_instances) - if self.soa_config.config_dict.get("bounce_method", "") == "brutal": - threading.Thread( - target=self.deep_delete_and_create, args=[KubeClient()] - ).start() - return update_deployment( kube_client=kube_client, formatted_deployment=self.item, diff --git a/paasta_tools/kubernetes/bin/paasta_secrets_sync.py b/paasta_tools/kubernetes/bin/paasta_secrets_sync.py index 58b5016224..ef6b8a4939 100755 --- a/paasta_tools/kubernetes/bin/paasta_secrets_sync.py +++ b/paasta_tools/kubernetes/bin/paasta_secrets_sync.py @@ -35,8 +35,10 @@ from kubernetes.client.rest import ApiException from typing_extensions import Literal +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes_tools import create_secret from paasta_tools.kubernetes_tools import create_secret_signature +from paasta_tools.kubernetes_tools import ensure_namespace from paasta_tools.kubernetes_tools import get_paasta_secret_name from paasta_tools.kubernetes_tools import get_paasta_secret_signature_name from paasta_tools.kubernetes_tools import get_secret_signature @@ -55,11 +57,18 @@ from paasta_tools.utils import INSTANCE_TYPE_TO_K8S_NAMESPACE from paasta_tools.utils import INSTANCE_TYPES from paasta_tools.utils import load_system_paasta_config +from paasta_tools.utils import PAASTA_K8S_INSTANCE_TYPES from paasta_tools.utils import SHARED_SECRETS_K8S_NAMESPACES log = logging.getLogger(__name__) +K8S_INSTANCE_TYPE_CLASSES = ( + KubernetesDeploymentConfig, + EksDeploymentConfig, +) + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Sync paasta secrets into k8s") parser.add_argument( @@ -212,31 +221,32 @@ def get_services_to_k8s_namespaces_to_allowlist( continue config_loader = PaastaServiceConfigLoader(service, soa_dir) - for service_instance_config in config_loader.instance_configs( - cluster=cluster, instance_type_class=KubernetesDeploymentConfig - ): - secrets_used, shared_secrets_used = get_secrets_used_by_instance( - service_instance_config - ) - allowlist = services_to_k8s_namespaces_to_allowlist[service].setdefault( - service_instance_config.get_namespace(), - set(), - ) - if allowlist is not None: - allowlist.update(secrets_used) - - if "_shared" in service_list: - shared_allowlist = services_to_k8s_namespaces_to_allowlist[ - "_shared" - ].setdefault( + for instance_type_class in K8S_INSTANCE_TYPE_CLASSES: + for service_instance_config in config_loader.instance_configs( + cluster=cluster, instance_type_class=instance_type_class + ): + secrets_used, shared_secrets_used = get_secrets_used_by_instance( + service_instance_config + ) + allowlist = services_to_k8s_namespaces_to_allowlist[service].setdefault( service_instance_config.get_namespace(), set(), ) - if shared_allowlist is not None: - shared_allowlist.update(shared_secrets_used) + if allowlist is not None: + allowlist.update(secrets_used) + + if "_shared" in service_list: + shared_allowlist = services_to_k8s_namespaces_to_allowlist[ + "_shared" + ].setdefault( + service_instance_config.get_namespace(), + set(), + ) + if shared_allowlist is not None: + shared_allowlist.update(shared_secrets_used) for instance_type in INSTANCE_TYPES: - if instance_type == "kubernetes": + if instance_type in PAASTA_K8S_INSTANCE_TYPES: continue # handled above. 
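# Illustrative sketch, not part of this diff (names below are assumptions): the loops above now run once
# per config class in K8S_INSTANCE_TYPE_CLASSES so that EKS instances get their secrets synced exactly
# like plain Kubernetes ones. The shape of that iteration, assuming only imports already present in this file:
from paasta_tools.eks_tools import EksDeploymentConfig
from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig

K8S_INSTANCE_TYPE_CLASSES = (KubernetesDeploymentConfig, EksDeploymentConfig)

def iter_k8s_like_instance_configs(config_loader, cluster):
    # yields the merged stream of Kubernetes and EKS instance configs for one cluster
    for instance_type_class in K8S_INSTANCE_TYPE_CLASSES:
        yield from config_loader.instance_configs(
            cluster=cluster, instance_type_class=instance_type_class
        )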
instances = get_service_instance_list( @@ -309,6 +319,7 @@ def sync_all_secrets( else namespaces_to_allowlist.get(overwrite_namespace, set()), } for namespace, secret_allowlist in namespaces_to_allowlist.items(): + ensure_namespace(kube_client, namespace) sync_service_secrets["paasta-secret"].append( partial( sync_secrets, @@ -463,64 +474,67 @@ def sync_datastore_credentials( system_paasta_config.get_datastore_credentials_vault_overrides() ) - for instance_config in config_loader.instance_configs( - cluster=cluster, instance_type_class=KubernetesDeploymentConfig - ): - namespace = ( - overwrite_namespace - if overwrite_namespace is not None - else instance_config.get_namespace() - ) - datastore_credentials = instance_config.get_datastore_credentials() - with set_temporary_environment_variables(datastore_credentials_vault_overrides): - # expects VAULT_ADDR_OVERRIDE, VAULT_CA_OVERRIDE, and VAULT_TOKEN_OVERRIDE to be set - # in order to use a custom vault shard. overriden temporarily in this context - provider = get_secret_provider( - secret_provider_name=secret_provider_name, - soa_dir=soa_dir, - service_name=service, - cluster_names=[cluster], - # overridden by env variables but still needed here for spec validation - secret_provider_kwargs={ - "vault_cluster_config": vault_cluster_config, - "vault_auth_method": "token", - "vault_token_file": vault_token_file, - }, + for instance_type_class in K8S_INSTANCE_TYPE_CLASSES: + for instance_config in config_loader.instance_configs( + cluster=cluster, instance_type_class=instance_type_class + ): + namespace = ( + overwrite_namespace + if overwrite_namespace is not None + else instance_config.get_namespace() ) + datastore_credentials = instance_config.get_datastore_credentials() + with set_temporary_environment_variables( + datastore_credentials_vault_overrides + ): + # expects VAULT_ADDR_OVERRIDE, VAULT_CA_OVERRIDE, and VAULT_TOKEN_OVERRIDE to be set + # in order to use a custom vault shard. overriden temporarily in this context + provider = get_secret_provider( + secret_provider_name=secret_provider_name, + soa_dir=soa_dir, + service_name=service, + cluster_names=[cluster], + # overridden by env variables but still needed here for spec validation + secret_provider_kwargs={ + "vault_cluster_config": vault_cluster_config, + "vault_auth_method": "token", + "vault_token_file": vault_token_file, + }, + ) - secret_data = {} - for datastore, credentials in datastore_credentials.items(): - # mypy loses type hints on '.items' and throws false positives. unfortunately have to type: ignore - # https://github.com/python/mypy/issues/7178 - for credential in credentials: # type: ignore - vault_path = f"secrets/datastore/{datastore}/{credential}" - secrets = provider.get_data_from_vault_path(vault_path) - if not secrets: - # no secrets found at this path. skip syncing - log.debug( - f"Warning: no secrets found at requested path {vault_path}." - ) - continue + secret_data = {} + for datastore, credentials in datastore_credentials.items(): + # mypy loses type hints on '.items' and throws false positives. unfortunately have to type: ignore + # https://github.com/python/mypy/issues/7178 + for credential in credentials: # type: ignore + vault_path = f"secrets/datastore/{datastore}/{credential}" + secrets = provider.get_data_from_vault_path(vault_path) + if not secrets: + # no secrets found at this path. skip syncing + log.debug( + f"Warning: no secrets found at requested path {vault_path}." 
+ ) + continue + + # decrypt and save in secret_data + vault_key_path = get_vault_key_secret_name(vault_path) + + # kubernetes expects data to be base64 encoded binary in utf-8 when put into secret maps + # may look like: + # {'master': {'passwd': '****', 'user': 'v-approle-mysql-serv-nVcYexH95A2'}, 'reporting': {'passwd': '****', 'user': 'v-approle-mysql-serv-GgCpRIh9Ut7'}, 'slave': {'passwd': '****', 'user': 'v-approle-mysql-serv-PzjPwqNMbqu'} + secret_data[vault_key_path] = base64.b64encode( + json.dumps(secrets).encode("utf-8") + ).decode("utf-8") - # decrypt and save in secret_data - vault_key_path = get_vault_key_secret_name(vault_path) - - # kubernetes expects data to be base64 encoded binary in utf-8 when put into secret maps - # may look like: - # {'master': {'passwd': '****', 'user': 'v-approle-mysql-serv-nVcYexH95A2'}, 'reporting': {'passwd': '****', 'user': 'v-approle-mysql-serv-GgCpRIh9Ut7'}, 'slave': {'passwd': '****', 'user': 'v-approle-mysql-serv-PzjPwqNMbqu'} - secret_data[vault_key_path] = base64.b64encode( - json.dumps(secrets).encode("utf-8") - ).decode("utf-8") - - create_or_update_k8s_secret( - service=service, - signature_name=instance_config.get_datastore_credentials_signature_name(), - secret_name=instance_config.get_datastore_credentials_secret_name(), - get_secret_data=(lambda: secret_data), - secret_signature=_get_dict_signature(secret_data), - kube_client=kube_client, - namespace=namespace, - ) + create_or_update_k8s_secret( + service=service, + signature_name=instance_config.get_datastore_credentials_signature_name(), + secret_name=instance_config.get_datastore_credentials_secret_name(), + get_secret_data=(lambda: secret_data), + secret_signature=_get_dict_signature(secret_data), + kube_client=kube_client, + namespace=namespace, + ) return True @@ -543,50 +557,51 @@ def sync_crypto_secrets( So each replica of a service instance gets the same key, thereby reducing requests to Vault API as we only talk to vault during secret syncing """ config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir) - for instance_config in config_loader.instance_configs( - cluster=cluster, instance_type_class=KubernetesDeploymentConfig - ): - crypto_keys = instance_config.get_crypto_keys_from_config() - if not crypto_keys: - continue - secret_data = {} - provider = get_secret_provider( - secret_provider_name=secret_provider_name, - soa_dir=soa_dir, - service_name=service, - cluster_names=[cluster], - secret_provider_kwargs={ - "vault_cluster_config": vault_cluster_config, - "vault_auth_method": "token", - "vault_token_file": vault_token_file, - }, - ) - for key in crypto_keys: - key_versions = provider.get_key_versions(key) - if not key_versions: - log.error( - f"No key versions found for {key} on {instance_config.get_sanitised_deployment_name()}" - ) + for instance_type_class in K8S_INSTANCE_TYPE_CLASSES: + for instance_config in config_loader.instance_configs( + cluster=cluster, instance_type_class=instance_type_class + ): + crypto_keys = instance_config.get_crypto_keys_from_config() + if not crypto_keys: continue + secret_data = {} + provider = get_secret_provider( + secret_provider_name=secret_provider_name, + soa_dir=soa_dir, + service_name=service, + cluster_names=[cluster], + secret_provider_kwargs={ + "vault_cluster_config": vault_cluster_config, + "vault_auth_method": "token", + "vault_token_file": vault_token_file, + }, + ) + for key in crypto_keys: + key_versions = provider.get_key_versions(key) + if not key_versions: + log.error( + f"No key versions 
found for {key} on {instance_config.get_sanitised_deployment_name()}" + ) + continue - secret_data[get_vault_key_secret_name(key)] = base64.b64encode( - json.dumps(key_versions).encode("utf-8") - ).decode("utf-8") + secret_data[get_vault_key_secret_name(key)] = base64.b64encode( + json.dumps(key_versions).encode("utf-8") + ).decode("utf-8") - if not secret_data: - continue + if not secret_data: + continue - create_or_update_k8s_secret( - service=service, - signature_name=instance_config.get_crypto_secret_signature_name(), - # the secret name here must match the secret name given in the secret volume config, - # i.e. `kubernetes.client.V1SecretVolumeSource`'s `secret_name` must match below - secret_name=instance_config.get_crypto_secret_name(), - get_secret_data=(lambda: secret_data), - secret_signature=_get_dict_signature(secret_data), - kube_client=kube_client, - namespace=instance_config.get_namespace(), - ) + create_or_update_k8s_secret( + service=service, + signature_name=instance_config.get_crypto_secret_signature_name(), + # the secret name here must match the secret name given in the secret volume config, + # i.e. `kubernetes.client.V1SecretVolumeSource`'s `secret_name` must match below + secret_name=instance_config.get_crypto_secret_name(), + get_secret_data=(lambda: secret_data), + secret_signature=_get_dict_signature(secret_data), + kube_client=kube_client, + namespace=instance_config.get_namespace(), + ) return True @@ -598,40 +613,46 @@ def sync_boto_secrets( soa_dir: str, ) -> bool: config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir) - for instance_config in config_loader.instance_configs( - cluster=cluster, instance_type_class=KubernetesDeploymentConfig - ): - boto_keys = instance_config.config_dict.get("boto_keys", []) - if not boto_keys: - continue - boto_keys.sort() - secret_data = {} - for key in boto_keys: - for filetype in ["sh", "yaml", "json", "cfg"]: - this_key = key + "." + filetype - sanitised_key = this_key.replace(".", "-").replace("_", "--") - try: - with open(f"/etc/boto_cfg_private/{this_key}") as f: + for instance_type_class in K8S_INSTANCE_TYPE_CLASSES: + for instance_config in config_loader.instance_configs( + cluster=cluster, instance_type_class=instance_type_class + ): + boto_keys = instance_config.config_dict.get("boto_keys", []) + if not boto_keys: + continue + boto_keys.sort() + secret_data = {} + for key in boto_keys: + for filetype in ["sh", "yaml", "json", "cfg"]: + this_key = key + "." + filetype + sanitised_key = this_key.replace(".", "-").replace("_", "--") + try: + with open(f"/etc/boto_cfg_private/{this_key}") as f: + secret_data[sanitised_key] = base64.b64encode( + f.read().encode("utf-8") + ).decode("utf-8") + except IOError: + log.warning( + f"Boto key {this_key} required for {service} could not be found." + ) secret_data[sanitised_key] = base64.b64encode( - f.read().encode("utf-8") + "This user no longer exists. Remove it from boto_keys.".encode( + "utf-8" + ) ).decode("utf-8") - except IOError: - log.warning( - f"Boto key {this_key} required for {service} could not be found." 
- ) - if not secret_data: - continue + if not secret_data: + continue - create_or_update_k8s_secret( - service=service, - signature_name=instance_config.get_boto_secret_signature_name(), - secret_name=instance_config.get_boto_secret_name(), - get_secret_data=(lambda: secret_data), - secret_signature=_get_dict_signature(secret_data), - kube_client=kube_client, - namespace=instance_config.get_namespace(), - ) + create_or_update_k8s_secret( + service=service, + signature_name=instance_config.get_boto_secret_signature_name(), + secret_name=instance_config.get_boto_secret_name(), + get_secret_data=(lambda: secret_data), + secret_signature=_get_dict_signature(secret_data), + kube_client=kube_client, + namespace=instance_config.get_namespace(), + ) return True @@ -654,7 +675,9 @@ def create_or_update_k8s_secret( :param get_secret_data: is a function to postpone fetching data in order to reduce service load, e.g. Vault API """ # In order to prevent slamming the k8s API, add some artificial delay here - time.sleep(0.3) + delay = load_system_paasta_config().get_secret_sync_delay_seconds() + if delay: + time.sleep(delay) kubernetes_signature = get_secret_signature( kube_client=kube_client, diff --git a/paasta_tools/kubernetes_tools.py b/paasta_tools/kubernetes_tools.py index 0be1c92101..ba34e37c68 100644 --- a/paasta_tools/kubernetes_tools.py +++ b/paasta_tools/kubernetes_tools.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import base64 +import functools import hashlib import itertools import json @@ -23,11 +24,13 @@ from inspect import currentframe from pathlib import Path from typing import Any +from typing import cast from typing import Collection from typing import Container from typing import Dict from typing import Iterable from typing import List +from typing import Literal from typing import Mapping from typing import MutableMapping from typing import NamedTuple @@ -72,6 +75,9 @@ from kubernetes.client import V1KeyToPath from kubernetes.client import V1LabelSelector from kubernetes.client import V1Lifecycle +from kubernetes.client import V1LimitRange +from kubernetes.client import V1LimitRangeItem +from kubernetes.client import V1LimitRangeSpec from kubernetes.client import V1Namespace from kubernetes.client import V1Node from kubernetes.client import V1NodeAffinity @@ -89,6 +95,7 @@ from kubernetes.client import V1PodSecurityContext from kubernetes.client import V1PodSpec from kubernetes.client import V1PodTemplateSpec +from kubernetes.client import V1PreferredSchedulingTerm from kubernetes.client import V1Probe from kubernetes.client import V1ReplicaSet from kubernetes.client import V1ResourceRequirements @@ -158,6 +165,7 @@ from paasta_tools.utils import SecretVolume from paasta_tools.utils import SystemPaastaConfig from paasta_tools.utils import time_cache +from paasta_tools.utils import TopologySpreadConstraintDict from paasta_tools.utils import VolumeWithMode @@ -173,11 +181,9 @@ "brutal": "RollingUpdate", } HACHECK_POD_NAME = "hacheck" -UWSGI_EXPORTER_POD_NAME = "uwsgi--exporter" GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter" SIDECAR_CONTAINER_NAMES = [ HACHECK_POD_NAME, - UWSGI_EXPORTER_POD_NAME, GUNICORN_EXPORTER_POD_NAME, ] KUBERNETES_NAMESPACE = "paasta" @@ -333,7 +339,6 @@ def _set_disrupted_pods(self: Any, disrupted_pods: Mapping[str, datetime]) -> No "paasta.yelp.com/image_version": str, "paasta.yelp.com/instance": str, "paasta.yelp.com/prometheus_shard": str, - 
"paasta.yelp.com/scrape_uwsgi_prometheus": str, "paasta.yelp.com/scrape_piscina_prometheus": str, "paasta.yelp.com/scrape_gunicorn_prometheus": str, "paasta.yelp.com/service": str, @@ -357,11 +362,45 @@ class CryptoKeyConfig(TypedDict): decrypt: List[str] +class NodeSelectorInNotIn(TypedDict): + operator: Literal["In", "NotIn"] + values: List[str] + + +class NodeSelectorExistsDoesNotExist(TypedDict): + operator: Literal["Exists", "DoesNotExist"] + + +class NodeSelectorGtLt(TypedDict): + operator: Literal["Gt", "Lt"] + value: int + + +NodeSelectorOperator = Union[ + NodeSelectorInNotIn, + NodeSelectorExistsDoesNotExist, + NodeSelectorGtLt, +] + + +NodeSelectorConfig = Union[ + str, + List[str], + List[NodeSelectorOperator], +] + + +class NodeSelectorsPreferredConfigDict(TypedDict): + weight: int + preferences: Dict[str, NodeSelectorConfig] + + class KubernetesDeploymentConfigDict(LongRunningServiceConfigDict, total=False): bounce_method: str bounce_health_params: Dict[str, Any] service_account_name: str - node_selectors: Dict[str, Union[str, Dict[str, Any]]] + node_selectors: Dict[str, NodeSelectorConfig] + node_selectors_preferred: List[NodeSelectorsPreferredConfigDict] sidecar_resource_requirements: Dict[str, SidecarResourceRequirements] lifecycle: KubeLifecycleDict anti_affinity: Union[KubeAffinityCondition, List[KubeAffinityCondition]] @@ -377,6 +416,7 @@ class KubernetesDeploymentConfigDict(LongRunningServiceConfigDict, total=False): boto_keys: List[str] crypto_keys: CryptoKeyConfig datastore_credentials: DatastoreCredentialsConfig + topology_spread_constraints: List[TopologySpreadConstraintDict] def load_kubernetes_service_config_no_cache( @@ -496,6 +536,19 @@ def __init__(self, exception: Exception, service: str, instance: str) -> None: class KubeClient: + @functools.lru_cache() # type: ignore + def __new__( + cls, + component: Optional[str] = None, + config_file: Optional[str] = None, + context: Optional[str] = None, + ) -> "KubeClient": + """By @lru_cache'ing this function, repeated instantiations of KubeClient with the same arguments will return the + exact same object. This makes it possible to effectively cache function calls that take a KubeClient as an + argument.""" + return super().__new__(cls) + + @functools.lru_cache() # type: ignore def __init__( self, component: Optional[str] = None, @@ -575,28 +628,40 @@ def allowlist_denylist_to_requirements( def raw_selectors_to_requirements( - raw_selectors: Mapping[str, Any] + raw_selectors: Mapping[str, NodeSelectorConfig] ) -> List[Tuple[str, str, List[str]]]: """Converts certain node_selectors into requirements, which can be converted to node affinities. 
""" - requirements = [] + requirements: List[Tuple[str, str, List[str]]] = [] for label, configs in raw_selectors.items(): + operator_configs: List[NodeSelectorOperator] = [] + if type(configs) is not list or len(configs) == 0: continue elif type(configs[0]) is str: # specifying an array/list of strings for a label is shorthand # for the "In" operator - configs = [{"operator": "In", "values": configs}] + operator_configs = [ + NodeSelectorInNotIn( + {"operator": "In", "values": cast(List[str], configs)} + ) + ] + else: + # configs should already be a List[NodeSelectorOperator] + operator_configs = cast(List[NodeSelectorOperator], configs) label = to_node_label(label) - for config in configs: + for config in operator_configs: if config["operator"] in {"In", "NotIn"}: + config = cast(NodeSelectorInNotIn, config) values = config["values"] elif config["operator"] in {"Exists", "DoesNotExist"}: + config = cast(NodeSelectorExistsDoesNotExist, config) values = [] elif config["operator"] in {"Gt", "Lt"}: + config = cast(NodeSelectorGtLt, config) # config["value"] is validated by jsonschema to be an int. but, # k8s expects singleton list of the int represented as a str # for these operators. @@ -723,7 +788,7 @@ def get_autoscaling_metric_spec( name: str, cluster: str, kube_client: KubeClient, - namespace: str = "paasta", + namespace: str, ) -> Optional[Union[V2beta2HorizontalPodAutoscaler, Dict]]: # Returns None if an HPA should not be attached based on the config, # or the config is invalid. @@ -742,7 +807,7 @@ def get_autoscaling_metric_spec( max_replicas = self.get_max_instances() if min_replicas == 0 or max_replicas == 0: log.error( - f"Invalid value for min or max_instances: {min_replicas}, {max_replicas}" + f"Invalid value for min or max_instances on {name}: {min_replicas}, {max_replicas}" ) return None @@ -796,7 +861,7 @@ def get_autoscaling_metric_spec( ), ) ) - elif metrics_provider in {"uwsgi", "piscina", "gunicorn"}: + elif metrics_provider in {"uwsgi", "piscina", "gunicorn", "active-requests"}: metrics.append( V2beta2MetricSpec( type="Object", @@ -972,9 +1037,6 @@ def get_sidecar_containers( service_namespace_config, hacheck_sidecar_volumes, ) - uwsgi_exporter_container = self.get_uwsgi_exporter_sidecar_container( - system_paasta_config - ) gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container( system_paasta_config ) @@ -982,8 +1044,6 @@ def get_sidecar_containers( sidecars = [] if hacheck_container: sidecars.append(hacheck_container) - if uwsgi_exporter_container: - sidecars.append(uwsgi_exporter_container) if gunicorn_exporter_container: sidecars.append(gunicorn_exporter_container) return sidecars @@ -1072,57 +1132,14 @@ def get_hacheck_sidecar_container( ) return None - def get_uwsgi_exporter_sidecar_container( - self, - system_paasta_config: SystemPaastaConfig, - ) -> Optional[V1Container]: - - if self.should_run_uwsgi_exporter_sidecar(system_paasta_config): - stats_port_env = V1EnvVar( - name="STATS_PORT", - value=str(self.get_autoscaling_params().get("uwsgi_stats_port", 8889)), - ) - - return V1Container( - image=system_paasta_config.get_uwsgi_exporter_sidecar_image_url(), - resources=self.get_sidecar_resource_requirements( - "uwsgi_exporter", - system_paasta_config, - ), - name=UWSGI_EXPORTER_POD_NAME, - env=self.get_kubernetes_environment() + [stats_port_env], - ports=[V1ContainerPort(container_port=9117)], - lifecycle=V1Lifecycle( - pre_stop=V1Handler( - _exec=V1ExecAction( - command=[ - "/bin/sh", - "-c", - # we sleep for the same amount of time as we do 
after an hadown to ensure that we have accurate - # metrics up until our Pod dies - f"sleep {DEFAULT_HADOWN_PRESTOP_SLEEP_SECONDS}", - ] - ) - ) - ), - ) - - return None - - def should_run_uwsgi_exporter_sidecar( + def should_use_uwsgi_exporter( self, system_paasta_config: SystemPaastaConfig, ) -> bool: - if self.is_autoscaling_enabled(): - autoscaling_params = self.get_autoscaling_params() - if autoscaling_params["metrics_provider"] == "uwsgi": - if autoscaling_params.get( - "use_prometheus", - DEFAULT_USE_PROMETHEUS_UWSGI - or system_paasta_config.default_should_run_uwsgi_exporter_sidecar(), - ): - return True - return False + return ( + self.is_autoscaling_enabled() + and self.get_autoscaling_params()["metrics_provider"] == "uwsgi" + ) def get_gunicorn_exporter_sidecar_container( self, @@ -1535,6 +1552,7 @@ def get_pod_volumes( ), default_mode=mode_to_int(secret_volume.get("default_mode")), items=items, + optional=False, ), ) ) @@ -1984,7 +2002,9 @@ def get_enable_envoy_readiness_check( def get_namespace(self) -> str: """Get namespace from config, default to 'paasta'""" - return self.config_dict.get("namespace", "paasta") + return self.config_dict.get( + "namespace", f"paastasvc-{self.get_sanitised_service_name()}" + ) def get_pod_management_policy(self) -> str: """Get sts pod_management_policy from config, default to 'OrderedReady'""" @@ -2093,7 +2113,7 @@ def has_routable_ip( self.config_dict.get("routable_ip", False) or service_namespace_config.is_in_smartstack() or self.get_prometheus_port() is not None - or self.should_run_uwsgi_exporter_sidecar(system_paasta_config) + or self.should_use_uwsgi_exporter(system_paasta_config) or self.should_run_gunicorn_exporter_sidecar() ): return "true" @@ -2161,7 +2181,9 @@ def get_pod_template_spec( pod_topology_spread_constraints = create_pod_topology_spread_constraints( service=self.get_service(), instance=self.get_instance(), - topology_spread_constraints=system_paasta_config.get_topology_spread_constraints(), + topology_spread_constraints=self.get_topology_spread_constraints( + system_paasta_config.get_topology_spread_constraints() + ), ) if pod_topology_spread_constraints: constraints = pod_spec_kwargs.get("topology_spread_constraints", []) @@ -2248,16 +2270,13 @@ def get_pod_template_spec( if self.is_istio_sidecar_injection_enabled(): labels["sidecar.istio.io/inject"] = "true" - # not all services use uwsgi autoscaling, so we label those that do in order to have + # not all services use autoscaling, so we label those that do in order to have # prometheus selectively discover/scrape them - if self.should_run_uwsgi_exporter_sidecar( - system_paasta_config=system_paasta_config - ): - # this is kinda silly, but k8s labels must be strings - labels["paasta.yelp.com/scrape_uwsgi_prometheus"] = "true" - + if self.should_use_uwsgi_exporter(system_paasta_config=system_paasta_config): + # UWSGI no longer needs a label to indicate it needs to be scraped as all pods are checked for the uwsgi stats port by our centralized uwsgi-exporter + # But we do still need deploy_group for relabeling properly # this should probably eventually be made into a default label, - # but for now we're fine with it being behind this feature toggle. + # but for now we're fine with it being behind these feature toggles. 
# ideally, we'd also have the docker image here for ease-of-use # in Prometheus relabeling, but that information is over the # character limit for k8s labels (63 chars) @@ -2307,26 +2326,60 @@ def get_node_affinity(self) -> Optional[V1NodeAffinity]: raw_selectors=self.config_dict.get("node_selectors", {}), ) ) + + preferred_terms = [] + for node_selectors_prefered_config_dict in self.config_dict.get( + "node_selectors_preferred", [] + ): + preferred_terms.append( + V1PreferredSchedulingTerm( + weight=node_selectors_prefered_config_dict["weight"], + preference=V1NodeSelectorTerm( + match_expressions=[ + V1NodeSelectorRequirement( + key=key, + operator=op, + values=vs, + ) + for key, op, vs in raw_selectors_to_requirements( + raw_selectors=node_selectors_prefered_config_dict[ + "preferences" + ] + ) + ] + ), + ) + ) + # package everything into a node affinity - lots of layers :P - if len(requirements) == 0: + if len(requirements) == 0 and len(preferred_terms) == 0: return None - term = V1NodeSelectorTerm( - match_expressions=[ - V1NodeSelectorRequirement( - key=key, - operator=op, - values=vs, - ) - for key, op, vs in requirements - ] + + required_term = ( + V1NodeSelectorTerm( + match_expressions=[ + V1NodeSelectorRequirement( + key=key, + operator=op, + values=vs, + ) + for key, op, vs in requirements + ] + ) + if requirements + else None ) - selector = V1NodeSelector(node_selector_terms=[term]) + + if not preferred_terms: + preferred_terms = None + return V1NodeAffinity( - # this means that the selectors are only used during scheduling. - # changing it while the pod is running will not cause an eviction. - # this should be fine since if there are whitelist/blacklist config - # changes, we will bounce anyway. - required_during_scheduling_ignored_during_execution=selector, + required_during_scheduling_ignored_during_execution=V1NodeSelector( + node_selector_terms=[required_term] + ) + if required_term + else None, + preferred_during_scheduling_ignored_during_execution=preferred_terms, ) def get_pod_required_anti_affinity_terms( @@ -2460,9 +2513,17 @@ def get_prometheus_path(self) -> Optional[str]: def get_prometheus_port(self) -> Optional[int]: return self.config_dict.get("prometheus_port") + def get_topology_spread_constraints( + self, + default_pod_topology_spread_constraints: List[TopologySpreadConstraintDict], + ) -> List[TopologySpreadConstraintDict]: + return self.config_dict.get( + "topology_spread_constraints", default_pod_topology_spread_constraints + ) + def get_kubernetes_secret_hashes( - environment_variables: Mapping[str, str], service: str, namespace: str = "paasta" + environment_variables: Mapping[str, str], service: str, namespace: str ) -> Mapping[str, str]: hashes = {} to_get_hash = [] @@ -2620,6 +2681,7 @@ def force_delete_pods( paasta_service, instance, kube_client, + namespace=namespace, ) delete_options = V1DeleteOptions() for pod in pods_to_delete: @@ -2628,6 +2690,7 @@ def force_delete_pods( ) +@time_cache(ttl=60) def get_all_namespaces( kube_client: KubeClient, label_selector: Optional[str] = None ) -> List[str]: @@ -2654,6 +2717,7 @@ def get_matching_namespaces( ] +@functools.lru_cache() def ensure_namespace(kube_client: KubeClient, namespace: str) -> None: paasta_namespace = V1Namespace( metadata=V1ObjectMeta( @@ -2665,13 +2729,21 @@ def ensure_namespace(kube_client: KubeClient, namespace: str) -> None: }, ) ) - namespaces = kube_client.core.list_namespace() - namespace_names = [item.metadata.name for item in namespaces.items] + namespace_names = 
get_all_namespaces(kube_client) if namespace not in namespace_names: log.warning(f"Creating namespace: {namespace} as it does not exist") - kube_client.core.create_namespace(body=paasta_namespace) + try: + kube_client.core.create_namespace(body=paasta_namespace) + except ApiException as e: + if e.status == 409: + log.warning( + "Got HTTP 409 when creating namespace; it must already exist. Continuing." + ) + else: + raise ensure_paasta_api_rolebinding(kube_client, namespace) + ensure_paasta_namespace_limits(kube_client, namespace) def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> None: @@ -2679,7 +2751,7 @@ def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> No rolebinding_names = [item.metadata.name for item in rolebindings] if "paasta-api-server-per-namespace" not in rolebinding_names: log.warning( - f"Creating rolebinding paasta-api-server-per-namespace as it does not exist" + f"Creating rolebinding paasta-api-server-per-namespace on {namespace} namespace as it does not exist" ) role_binding = V1RoleBinding( metadata=V1ObjectMeta( @@ -2703,6 +2775,45 @@ def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> No ) +def ensure_paasta_namespace_limits(kube_client: KubeClient, namespace: str) -> None: + if not namespace.startswith("paastasvc-"): + log.debug( + f"Not creating LimitRange because {namespace} does not start with paastasvc-" + ) + return + + limits = get_all_limit_ranges(kube_client, namespace=namespace) + limits_names = {item.metadata.name for item in limits} + if "limit-mem-cpu-disk-per-container" not in limits_names: + log.warning( + f"Creating limit: limit-mem-cpu-disk-per-container on {namespace} namespace as it does not exist" + ) + limit = V1LimitRange( + metadata=V1ObjectMeta( + name="limit-mem-cpu-disk-per-container", + namespace=namespace, + ), + spec=V1LimitRangeSpec( + limits=[ + V1LimitRangeItem( + type="Container", + default={ + "cpu": "1", + "memory": "1024Mi", + "ephemeral-storage": "1Gi", + }, + default_request={ + "cpu": "1", + "memory": "1024Mi", + "ephemeral-storage": "1Gi", + }, + ) + ] + ), + ) + kube_client.core.create_namespaced_limit_range(namespace=namespace, body=limit) + + def list_deployments_in_all_namespaces( kube_client: KubeClient, label_selector: str ) -> List[KubeDeployment]: @@ -2733,8 +2844,9 @@ def list_deployments_in_all_namespaces( def list_deployments( kube_client: KubeClient, + *, + namespace: str, label_selector: str = "", - namespace: str = "paasta", ) -> Sequence[KubeDeployment]: deployments = kube_client.deployments.list_namespaced_deployment( @@ -3029,7 +3141,7 @@ def pod_disruption_budget_for_service_instance( service: str, instance: str, max_unavailable: Union[str, int], - namespace: str = "paasta", + namespace: str, ) -> V1beta1PodDisruptionBudget: return V1beta1PodDisruptionBudget( metadata=V1ObjectMeta( @@ -3051,7 +3163,7 @@ def pod_disruption_budget_for_service_instance( def create_pod_disruption_budget( kube_client: KubeClient, pod_disruption_budget: V1beta1PodDisruptionBudget, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.policy.create_namespaced_pod_disruption_budget( namespace=namespace, body=pod_disruption_budget @@ -3126,7 +3238,7 @@ def list_all_paasta_deployments(kube_client: KubeClient) -> Sequence[KubeDeploym def list_all_deployments( - kube_client: KubeClient, namespace: str = "paasta" + kube_client: KubeClient, namespace: str ) -> Sequence[KubeDeployment]: return list_deployments(kube_client=kube_client, 
namespace=namespace) @@ -3134,12 +3246,13 @@ def list_all_deployments( def list_matching_deployments( service: str, instance: str, + *, + namespace: str, kube_client: KubeClient, - namespace: str = "paasta", ) -> Sequence[KubeDeployment]: return list_deployments( kube_client, - f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}", + label_selector=f"paasta.yelp.com/service={service},paasta.yelp.com/instance={instance}", namespace=namespace, ) @@ -3157,7 +3270,7 @@ def list_matching_deployments_in_all_namespaces( @async_timeout() async def replicasets_for_service_instance( - service: str, instance: str, kube_client: KubeClient, namespace: str = "paasta" + service: str, instance: str, kube_client: KubeClient, namespace: str ) -> Sequence[V1ReplicaSet]: async_list_replica_set = a_sync.to_async( kube_client.deployments.list_namespaced_replica_set @@ -3171,7 +3284,7 @@ async def replicasets_for_service_instance( @async_timeout() async def controller_revisions_for_service_instance( - service: str, instance: str, kube_client: KubeClient, namespace: str = "paasta" + service: str, instance: str, kube_client: KubeClient, namespace: str ) -> Sequence[V1ControllerRevision]: async_list_controller_revisions = a_sync.to_async( kube_client.deployments.list_namespaced_controller_revision @@ -3185,7 +3298,7 @@ async def controller_revisions_for_service_instance( @async_timeout(15) async def pods_for_service_instance( - service: str, instance: str, kube_client: KubeClient, namespace: str = "paasta" + service: str, instance: str, kube_client: KubeClient, namespace: str ) -> Sequence[V1Pod]: async_list_pods = a_sync.to_async(kube_client.core.list_namespaced_pod) response = await async_list_pods( @@ -3201,14 +3314,12 @@ def get_pods_by_node(kube_client: KubeClient, node: V1Node) -> Sequence[V1Pod]: ).items -def get_all_pods(kube_client: KubeClient, namespace: str = "paasta") -> List[V1Pod]: +def get_all_pods(kube_client: KubeClient, namespace: str) -> List[V1Pod]: return kube_client.core.list_namespaced_pod(namespace=namespace).items @time_cache(ttl=300) -def get_all_pods_cached( - kube_client: KubeClient, namespace: str = "paasta" -) -> Sequence[V1Pod]: +def get_all_pods_cached(kube_client: KubeClient, namespace: str) -> Sequence[V1Pod]: pods: Sequence[V1Pod] = get_all_pods(kube_client, namespace) return pods @@ -3335,7 +3446,7 @@ def get_all_nodes( return kube_client.core.list_node().items -@time_cache(ttl=300) +@time_cache(ttl=60) def get_all_nodes_cached(kube_client: KubeClient) -> Sequence[V1Node]: nodes: Sequence[V1Node] = get_all_nodes(kube_client) return nodes @@ -3400,7 +3511,7 @@ def get_kubernetes_app_name(service: str, instance: str) -> str: def get_kubernetes_app_by_name( - name: str, kube_client: KubeClient, namespace: str = "paasta" + name: str, kube_client: KubeClient, namespace: str ) -> Union[V1Deployment, V1StatefulSet]: try: app = kube_client.deployments.read_namespaced_deployment_status( @@ -3420,7 +3531,7 @@ def get_kubernetes_app_by_name( def create_deployment( kube_client: KubeClient, formatted_deployment: V1Deployment, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.create_namespaced_deployment( namespace=namespace, body=formatted_deployment @@ -3430,7 +3541,7 @@ def create_deployment( def update_deployment( kube_client: KubeClient, formatted_deployment: V1Deployment, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.replace_namespaced_deployment( name=formatted_deployment.metadata.name, 
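# Illustrative sketch, not part of this diff: ensure_namespace above is now memoized per process with
# functools.lru_cache, reads the time-cached namespace list, and tolerates HTTP 409 so concurrent syncers
# cannot fail while racing to create the same namespace. A minimal standalone version of that
# create-if-missing pattern using the raw kubernetes client (ensure_namespace_exists is an illustrative
# name; the real helper also attaches the rolebinding and LimitRange shown above):
import functools
import logging

from kubernetes import client
from kubernetes.client.rest import ApiException

log = logging.getLogger(__name__)

@functools.lru_cache()  # bootstrap each (client, namespace) pair at most once per process
def ensure_namespace_exists(core: client.CoreV1Api, namespace: str) -> None:
    existing = {ns.metadata.name for ns in core.list_namespace().items}
    if namespace in existing:
        return
    log.warning("Creating namespace %s as it does not exist", namespace)
    try:
        core.create_namespace(
            body=client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
        )
    except ApiException as e:
        if e.status == 409:
            # another sync process won the race; the namespace exists, carry on
            log.warning("Namespace %s already exists; continuing", namespace)
        else:
            raise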
@@ -3442,7 +3553,7 @@ def update_deployment( def patch_deployment( kube_client: KubeClient, formatted_deployment: V1Deployment, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.patch_namespaced_deployment( name=formatted_deployment.metadata.name, @@ -3454,7 +3565,7 @@ def patch_deployment( def delete_deployment( kube_client: KubeClient, deployment_name: str, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.delete_namespaced_deployment( name=deployment_name, @@ -3465,7 +3576,7 @@ def delete_deployment( def create_stateful_set( kube_client: KubeClient, formatted_stateful_set: V1StatefulSet, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.create_namespaced_stateful_set( namespace=namespace, body=formatted_stateful_set @@ -3475,7 +3586,7 @@ def create_stateful_set( def update_stateful_set( kube_client: KubeClient, formatted_stateful_set: V1StatefulSet, - namespace: str = "paasta", + namespace: str, ) -> None: return kube_client.deployments.replace_namespaced_stateful_set( name=formatted_stateful_set.metadata.name, @@ -3653,6 +3764,7 @@ def update_secret( ) +@time_cache(ttl=300) def get_secret_signature( kube_client: KubeClient, signature_name: str, @@ -3684,7 +3796,7 @@ def update_secret_signature( service_name: str, signature_name: str, secret_signature: str, - namespace: str = "paasta", + namespace: str, ) -> None: """ :param service_name: Expect unsanitised service_name @@ -3713,7 +3825,7 @@ def create_secret_signature( service_name: str, signature_name: str, secret_signature: str, - namespace: str = "paasta", + namespace: str, ) -> None: """ :param service_name: Expect unsanitised service_name @@ -3765,7 +3877,7 @@ def load_custom_resource_definitions( def create_pod_topology_spread_constraints( service: str, instance: str, - topology_spread_constraints: List[Dict[str, Any]], + topology_spread_constraints: List[TopologySpreadConstraintDict], ) -> List[V1TopologySpreadConstraint]: """ Applies cluster-level topology spread constraints to every Pod template. @@ -3889,12 +4001,19 @@ def get_all_role_bindings( return kube_client.rbac.list_namespaced_role_binding(namespace=namespace).items +def get_all_limit_ranges( + kube_client: KubeClient, + namespace: str, +) -> Sequence[V1LimitRange]: + return kube_client.core.list_namespaced_limit_range(namespace).items + + _RE_NORMALIZE_IAM_ROLE = re.compile(r"[^0-9a-zA-Z]+") def create_or_find_service_account_name( iam_role: str, - namespace: str = "paasta", + namespace: str, k8s_role: Optional[str] = None, dry_run: bool = False, ) -> str: @@ -3909,9 +4028,9 @@ def create_or_find_service_account_name( # to support these two usecases, we'll suffix the name of a Service Account with the # Kubernetes Role name to disambiguate between the two. 
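# Illustrative sketch, not part of this diff: the comment above describes the Service Account naming
# scheme, and the patched lines below additionally lowercase the IAM role before normalizing it.
# A tiny illustration of that normalization with a made-up ARN (sa_name_for is a hypothetical helper,
# not a real function in this module):
import re
from typing import Optional

_RE_NORMALIZE_IAM_ROLE = re.compile(r"[^0-9a-zA-Z]+")

def sa_name_for(iam_role: str, k8s_role: Optional[str] = None) -> str:
    # lowercase first so differently-cased role names map to one Service Account,
    # then squash every run of non-alphanumeric characters to a single dash
    normalized = _RE_NORMALIZE_IAM_ROLE.sub("-", iam_role.lower())
    return f"paasta--{normalized}--{k8s_role}" if k8s_role else f"paasta--{normalized}"

# sa_name_for("arn:aws:iam::123456789012:role/My_Role")
#   == "paasta--arn-aws-iam-123456789012-role-my-role"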
if k8s_role: - sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role)}--{k8s_role}" + sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role.lower())}--{k8s_role}" else: - sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role)}" + sa_name = f"paasta--{_RE_NORMALIZE_IAM_ROLE.sub('-', iam_role.lower())}" # until Core ML migrates Spark to use Pod Identity, we need to support starting Spark drivers with a Service Account # that only has k8s access elif not iam_role and k8s_role: @@ -4144,7 +4263,8 @@ def get_secret( kube_client: KubeClient, secret_name: str, key_name: str, - namespace: str = "paasta", + *, + namespace: str, decode: bool = True, ) -> Union[str, bytes]: """ @@ -4167,7 +4287,7 @@ def get_kubernetes_secret_env_variables( kube_client: KubeClient, environment: Dict[str, str], service_name: str, - namespace: str = "paasta", + namespace: str, ) -> Dict[str, str]: decrypted_secrets = {} for k, v in environment.items(): @@ -4195,7 +4315,7 @@ def get_kubernetes_secret_volumes( kube_client: KubeClient, secret_volumes_config: Sequence[SecretVolume], service_name: str, - namespace: str = "paasta", + namespace: str, ) -> Dict[str, Union[str, bytes]]: secret_volumes = {} # The config might look one of two ways: diff --git a/paasta_tools/long_running_service_tools.py b/paasta_tools/long_running_service_tools.py index 2322ee5a58..41685cd935 100644 --- a/paasta_tools/long_running_service_tools.py +++ b/paasta_tools/long_running_service_tools.py @@ -29,6 +29,8 @@ DEFAULT_CONTAINER_PORT = 8888 DEFAULT_AUTOSCALING_SETPOINT = 0.8 +DEFAULT_DESIRED_ACTIVE_REQUESTS_PER_REPLICA = 1 +DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800 @@ -42,15 +44,16 @@ class AutoscalingParamsDict(TypedDict, total=False): metrics_provider: str decision_policy: str setpoint: float + desired_active_requests_per_replica: int forecast_policy: Optional[str] offset: Optional[float] moving_average_window_seconds: Optional[int] use_prometheus: bool use_resource_metrics: bool - uwsgi_stats_port: int scaledown_policies: Optional[dict] good_enough_window: List[float] prometheus_adapter_config: Optional[dict] + max_instances_alert_threshold: float class LongRunningServiceConfigDict(InstanceConfigDict, total=False): @@ -354,6 +357,12 @@ def get_autoscaling_params(self) -> AutoscalingParamsDict: defaults=default_params, ) + def get_autoscaling_max_instances_alert_threshold(self) -> float: + autoscaling_params = self.get_autoscaling_params() + return autoscaling_params.get( + "max_instances_alert_threshold", autoscaling_params["setpoint"] + ) + def validate( self, params: Optional[List[str]] = None, diff --git a/paasta_tools/metrics/metastatus_lib.py b/paasta_tools/metrics/metastatus_lib.py index bf9abfe450..3bb6d75faf 100755 --- a/paasta_tools/metrics/metastatus_lib.py +++ b/paasta_tools/metrics/metastatus_lib.py @@ -461,7 +461,7 @@ def assert_mesos_tasks_running( def assert_kube_pods_running( - kube_client: KubeClient, namespace: str = "paasta" + kube_client: KubeClient, namespace: str ) -> HealthCheckResult: statuses = [ get_pod_status(pod) for pod in get_all_pods_cached(kube_client, namespace) @@ -884,9 +884,10 @@ def get_resource_utilization_by_grouping( def get_resource_utilization_by_grouping_kube( grouping_func: _GenericNodeGroupingFunctionT, kube_client: KubeClient, + *, + namespace: str, filters: 
Sequence[_GenericNodeFilterFunctionT] = [], sort_func: _GenericNodeSortFunctionT = None, - namespace: str = "paasta", ) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]: """Given a function used to group nodes, calculate resource utilization for each value of a given attribute. @@ -1045,7 +1046,7 @@ def assert_marathon_deployments( def assert_kube_deployments( - kube_client: KubeClient, namespace: str = "paasta" + kube_client: KubeClient, namespace: str ) -> HealthCheckResult: num_deployments = len(list_all_deployments(kube_client, namespace)) return HealthCheckResult( @@ -1065,7 +1066,7 @@ def get_marathon_status( def get_kube_status( - kube_client: KubeClient, namespace: str = "paasta" + kube_client: KubeClient, namespace: str ) -> Sequence[HealthCheckResult]: """Gather information about Kubernetes. :param kube_client: the KUbernetes client diff --git a/paasta_tools/paasta_execute_docker_command.py b/paasta_tools/paasta_execute_docker_command.py index 212d9717a8..0b74abd40e 100755 --- a/paasta_tools/paasta_execute_docker_command.py +++ b/paasta_tools/paasta_execute_docker_command.py @@ -32,6 +32,7 @@ from paasta_tools.mesos_tools import get_container_id_for_mesos_id from paasta_tools.utils import get_docker_client +from paasta_tools.utils import is_using_unprivileged_containers def parse_args(): @@ -68,7 +69,11 @@ def signal_handler(signum, frame): def execute_in_container(docker_client, container_id, cmd, timeout): container_info = docker_client.inspect_container(container_id) - if container_info["ExecIDs"] and len(container_info["ExecIDs"]) > 0: + if ( + container_info["ExecIDs"] + and len(container_info["ExecIDs"]) > 0 + and not is_using_unprivileged_containers() + ): for possible_exec_id in container_info["ExecIDs"]: exec_info = docker_client.exec_inspect(possible_exec_id)["ProcessConfig"] if exec_info["entrypoint"] == "/bin/sh" and exec_info["arguments"] == [ diff --git a/paasta_tools/paasta_metastatus.py b/paasta_tools/paasta_metastatus.py index a0e79cfaa2..f05323d9ca 100755 --- a/paasta_tools/paasta_metastatus.py +++ b/paasta_tools/paasta_metastatus.py @@ -230,8 +230,9 @@ def utilization_table_by_grouping_from_kube( groupings: Sequence[str], threshold: float, kube_client: KubeClient, + *, + namespace: str, service_instance_stats: Optional[ServiceInstanceStats] = None, - namespace: str = "paasta", ) -> Tuple[Sequence[MutableSequence[str]], bool]: grouping_function = metastatus_lib.key_func_for_attribute_multi_kube(groupings) @@ -317,7 +318,7 @@ def get_service_instance_stats( def _run_kube_checks( - kube_client: KubeClient, namespace: str = "paasta" + kube_client: KubeClient, namespace: str ) -> Sequence[HealthCheckResult]: kube_status = metastatus_lib.get_kube_status(kube_client, namespace) kube_metrics_status = metastatus_lib.get_kube_resource_utilization_health( diff --git a/paasta_tools/paastaapi/api/service_api.py b/paasta_tools/paastaapi/api/service_api.py index 1bd8d94943..19d19e7684 100644 --- a/paasta_tools/paastaapi/api/service_api.py +++ b/paasta_tools/paastaapi/api/service_api.py @@ -1186,7 +1186,6 @@ def __mesh_instance( instance (str): Instance name Keyword Args: - include_smartstack (bool): Include Smartstack information. [optional] if omitted the server will use the default value of True include_envoy (bool): Include Envoy information. [optional] if omitted the server will use the default value of True _return_http_data_only (bool): response data without head status code and headers. Default is True. 
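# Illustrative sketch, not part of this diff: a pattern repeated throughout this change is dropping the
# old namespace="paasta" default and, for several helpers, putting namespace behind a bare "*" so it must
# be passed by keyword. A minimal illustration of that calling convention (list_pod_names and kube_client
# are assumptions for the example, not real paasta_tools names):
from typing import List

def list_pod_names(kube_client, *, namespace: str) -> List[str]:
    # everything after "*" is keyword-only; with no default, callers must now
    # state which namespace they mean instead of silently getting "paasta"
    return [
        pod.metadata.name
        for pod in kube_client.core.list_namespaced_pod(namespace=namespace).items
    ]

# list_pod_names(kube_client, "paasta")                        -> TypeError
# list_pod_names(kube_client, namespace="paastasvc-example")   -> OK, explicit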
@@ -1251,7 +1250,6 @@ def __mesh_instance( 'all': [ 'service', 'instance', - 'include_smartstack', 'include_envoy', ], 'required': [ @@ -1275,21 +1273,17 @@ def __mesh_instance( (str,), 'instance': (str,), - 'include_smartstack': - (bool,), 'include_envoy': (bool,), }, 'attribute_map': { 'service': 'service', 'instance': 'instance', - 'include_smartstack': 'include_smartstack', 'include_envoy': 'include_envoy', }, 'location_map': { 'service': 'path', 'instance': 'path', - 'include_smartstack': 'query', 'include_envoy': 'query', }, 'collection_format_map': { @@ -1325,7 +1319,6 @@ def __status_instance( Keyword Args: verbose (int): Include verbose status information. [optional] - include_smartstack (bool): Include Smartstack information. [optional] include_envoy (bool): Include Envoy information. [optional] include_mesos (bool): Include Mesos information. [optional] new (bool): Use new version of paasta status for services. [optional] @@ -1393,7 +1386,6 @@ def __status_instance( 'service', 'instance', 'verbose', - 'include_smartstack', 'include_envoy', 'include_mesos', 'new', @@ -1421,8 +1413,6 @@ def __status_instance( (str,), 'verbose': (int,), - 'include_smartstack': - (bool,), 'include_envoy': (bool,), 'include_mesos': @@ -1434,7 +1424,6 @@ def __status_instance( 'service': 'service', 'instance': 'instance', 'verbose': 'verbose', - 'include_smartstack': 'include_smartstack', 'include_envoy': 'include_envoy', 'include_mesos': 'include_mesos', 'new': 'new', @@ -1443,7 +1432,6 @@ def __status_instance( 'service': 'path', 'instance': 'path', 'verbose': 'query', - 'include_smartstack': 'query', 'include_envoy': 'query', 'include_mesos': 'query', 'new': 'query', diff --git a/paasta_tools/secret_tools.py b/paasta_tools/secret_tools.py index bfbefd00e6..b0fffe2bd4 100644 --- a/paasta_tools/secret_tools.py +++ b/paasta_tools/secret_tools.py @@ -219,7 +219,7 @@ def decrypt_secret_volumes( # This ^ should result in 2 files (/nail/foo/bar.yaml, /nail/foo/baz.yaml) # We need to support both cases for secret_volume in secret_volumes_config: - if "items" not in secret_volume: + if not secret_volume.get("items"): secret_contents = decrypt_secret( secret_provider_name=secret_provider_name, soa_dir=soa_dir, diff --git a/paasta_tools/setup_kubernetes_job.py b/paasta_tools/setup_kubernetes_job.py index 9f7a61f036..486957a421 100755 --- a/paasta_tools/setup_kubernetes_job.py +++ b/paasta_tools/setup_kubernetes_job.py @@ -23,11 +23,15 @@ import argparse import logging import sys +import traceback from typing import List from typing import Optional from typing import Sequence from typing import Tuple +from typing import Union +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.eks_tools import load_eks_service_config_no_cache from paasta_tools.kubernetes.application.controller_wrappers import Application from paasta_tools.kubernetes.application.controller_wrappers import ( get_application_wrapper, @@ -88,6 +92,13 @@ def parse_args() -> argparse.Namespace: type=int, help="Update or create up to this number of service instances. 
Default is 0 (no limit).", ) + parser.add_argument( + "--eks", + help="This flag deploys only k8 services that should run on EKS", + dest="eks", + action="store_true", + default=False, + ) args = parser.parse_args() return args @@ -112,11 +123,12 @@ def main() -> None: service_instances=args.service_instance_list ) - # returns a list of pairs of (No error?, KubernetesDeploymentConfig) for every service_instance + # returns a list of pairs of (No error?, KubernetesDeploymentConfig | EksDeploymentConfig) for every service_instance service_instance_configs_list = get_kubernetes_deployment_config( service_instances_with_valid_names=service_instances_with_valid_names, cluster=args.cluster or load_system_paasta_config().get_cluster(), soa_dir=soa_dir, + eks=args.eks, ) if ((False, None) in service_instance_configs_list) or ( @@ -138,6 +150,7 @@ def main() -> None: rate_limit=args.rate_limit, soa_dir=soa_dir, metrics_interface=deploy_metrics, + eks=args.eks, ) else: setup_kube_succeeded = False @@ -170,16 +183,28 @@ def get_kubernetes_deployment_config( service_instances_with_valid_names: list, cluster: str, soa_dir: str = DEFAULT_SOA_DIR, -) -> List[Tuple[bool, KubernetesDeploymentConfig]]: + eks: bool = False, +) -> List[Tuple[bool, Union[KubernetesDeploymentConfig, EksDeploymentConfig]]]: service_instance_configs_list = [] for service_instance in service_instances_with_valid_names: try: - service_instance_config = load_kubernetes_service_config_no_cache( - service=service_instance[0], - instance=service_instance[1], - cluster=cluster, - soa_dir=soa_dir, - ) + service_instance_config: Union[ + KubernetesDeploymentConfig, EksDeploymentConfig + ] + if eks: + service_instance_config = load_eks_service_config_no_cache( + service=service_instance[0], + instance=service_instance[1], + cluster=cluster, + soa_dir=soa_dir, + ) + else: + service_instance_config = load_kubernetes_service_config_no_cache( + service=service_instance[0], + instance=service_instance[1], + cluster=cluster, + soa_dir=soa_dir, + ) service_instance_configs_list.append((True, service_instance_config)) except NoDeploymentsAvailable: log.debug( @@ -200,24 +225,30 @@ def get_kubernetes_deployment_config( def setup_kube_deployments( kube_client: KubeClient, cluster: str, - service_instance_configs_list: List[Tuple[bool, KubernetesDeploymentConfig]], + service_instance_configs_list: List[ + Tuple[bool, Union[KubernetesDeploymentConfig, EksDeploymentConfig]] + ], rate_limit: int = 0, soa_dir: str = DEFAULT_SOA_DIR, metrics_interface: metrics_lib.BaseMetrics = metrics_lib.NoMetrics("paasta"), + eks: bool = False, ) -> bool: - if service_instance_configs_list: - existing_kube_deployments = set(list_all_paasta_deployments(kube_client)) - existing_apps = { - (deployment.service, deployment.instance, deployment.namespace) - for deployment in existing_kube_deployments - } + if not service_instance_configs_list: + return True + + existing_kube_deployments = set(list_all_paasta_deployments(kube_client)) + existing_apps = { + (deployment.service, deployment.instance, deployment.namespace) + for deployment in existing_kube_deployments + } applications = [ create_application_object( cluster=cluster, soa_dir=soa_dir, service_instance_config=service_instance, + eks=eks, ) if service_instance else (_, None) @@ -238,6 +269,19 @@ def setup_kube_deployments( app.kube_deployment.instance, app.kube_deployment.namespace, ) not in existing_apps: + if app.soa_config.get_bounce_method() == "downthenup": + if any( + ( + existing_app[:2] + == ( + 
app.kube_deployment.service, + app.kube_deployment.instance, + ) + ) + for existing_app in existing_apps + ): + # For downthenup, we don't want to create until cleanup_kubernetes_job has cleaned up the instance in the other namespace. + continue log.info(f"Creating {app} because it does not exist yet.") app.create(kube_client) app_dimensions["deploy_event"] = "create" @@ -273,16 +317,17 @@ def setup_kube_deployments( def create_application_object( cluster: str, soa_dir: str, - service_instance_config: KubernetesDeploymentConfig, + service_instance_config: Union[KubernetesDeploymentConfig, EksDeploymentConfig], + eks: bool = False, ) -> Tuple[bool, Optional[Application]]: try: formatted_application = service_instance_config.format_kubernetes_app() - except InvalidKubernetesConfig as e: - log.error(str(e)) + except InvalidKubernetesConfig: + log.error(traceback.format_exc()) return False, None app = get_application_wrapper(formatted_application) - app.load_local_config(soa_dir, cluster) + app.load_local_config(soa_dir, cluster, eks) return True, app diff --git a/paasta_tools/setup_prometheus_adapter_config.py b/paasta_tools/setup_prometheus_adapter_config.py index 4539763192..ee2ad09111 100755 --- a/paasta_tools/setup_prometheus_adapter_config.py +++ b/paasta_tools/setup_prometheus_adapter_config.py @@ -31,6 +31,7 @@ from kubernetes.client.rest import ApiException from mypy_extensions import TypedDict +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes_tools import DEFAULT_USE_PROMETHEUS_CPU from paasta_tools.kubernetes_tools import DEFAULT_USE_PROMETHEUS_UWSGI from paasta_tools.kubernetes_tools import ensure_namespace @@ -40,9 +41,15 @@ from paasta_tools.kubernetes_tools import sanitise_kubernetes_name from paasta_tools.kubernetes_tools import V1Pod from paasta_tools.long_running_service_tools import AutoscalingParamsDict +from paasta_tools.long_running_service_tools import ( + DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW, +) from paasta_tools.long_running_service_tools import ( DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW, ) +from paasta_tools.long_running_service_tools import ( + DEFAULT_DESIRED_ACTIVE_REQUESTS_PER_REPLICA, +) from paasta_tools.long_running_service_tools import ( DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW, ) @@ -75,6 +82,11 @@ CPU_METRICS_PROVIDER = "cpu" +K8S_INSTANCE_TYPE_CLASSES = ( + KubernetesDeploymentConfig, + EksDeploymentConfig, +) + class PrometheusAdapterResourceConfig(TypedDict, total=False): """ @@ -228,25 +240,37 @@ def should_create_piscina_scaling_rule( return False, "did not request piscina autoscaling" -def create_instance_uwsgi_scaling_rule( - service: str, - instance: str, +def should_create_active_requests_scaling_rule( autoscaling_config: AutoscalingParamsDict, +) -> Tuple[bool, Optional[str]]: + """ + Determines whether we should configure the prometheus adapter for a given service. + Returns a 2-tuple of (should_create, reason_to_skip) + """ + if autoscaling_config["metrics_provider"] == "active-requests": + return True, None + return False, "did not request active-requests autoscaling" + + +def create_instance_active_requests_scaling_rule( + service: str, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str = "paasta", ) -> PrometheusAdapterRule: """ Creates a Prometheus adapter rule config for a given service instance. 
""" - setpoint = autoscaling_config["setpoint"] + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() + desired_active_requests_per_replica = autoscaling_config.get( + "desired_active_requests_per_replica", + DEFAULT_DESIRED_ACTIVE_REQUESTS_PER_REPLICA, + ) moving_average_window = autoscaling_config.get( - "moving_average_window_seconds", DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW + "moving_average_window_seconds", + DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW, ) - # this should always be set, but we default to 0 for safety as the worst thing that would happen - # is that we take a couple more iterations than required to hit the desired setpoint - offset = autoscaling_config.get("offset", 0) - offset_multiplier = load_system_paasta_config().get_uwsgi_offset_multiplier() - deployment_name = get_kubernetes_app_name(service=service, instance=instance) # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to @@ -275,6 +299,104 @@ def create_instance_uwsgi_scaling_rule( ) ) by (kube_deployment) """ + + # Envoy tracks metrics at the smartstack namespace level. In most cases the paasta instance name matches the smartstack namespace. + # In rare cases, there are custom registration added to instance configs. + # If there is no custom registration the envoy and instance names match and no need to update the worker_filter_terms. + # If there is a single custom registration for an instance, we will process the registration value and extract the value to be used. + # The registrations usually follow the format of {service_name}.{smartstack_name}. Hence we split the string by dot and extract the last token. + # More than one custom registrations are not supported and config validation takes care of rejecting such configs. + registrations = instance_config.get_registrations() + + mesh_instance = registrations[0].split(".")[-1] if len(registrations) == 1 else None + envoy_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{mesh_instance or instance}'" + + # envoy-based metrics have no labels corresponding to the k8s resources that they + # front, but we can trivially add one in since our deployment names are of the form + # {service_name}-{instance_name} - which are both things in `worker_filter_terms` so + # it's safe to unconditionally add. + # This is necessary as otherwise the HPA/prometheus adapter does not know what these + # metrics are for. + total_load = f""" + ( + sum( + label_replace( + paasta_instance:envoy_cluster__egress_cluster_upstream_rq_active{{{envoy_filter_terms}}}, + "kube_deployment", "{deployment_name}", "", "" + ) + ) by (kube_deployment) + ) + """ + desired_instances_at_each_point_in_time = f""" + {total_load} / {desired_active_requests_per_replica} + """ + desired_instances = f""" + avg_over_time( + ( + {desired_instances_at_each_point_in_time} + )[{moving_average_window}s:] + ) + """ + + # The prometheus HPA adapter needs kube_deployment and kube_namespace labels attached to the metrics its scaling on. + # The envoy-based metrics have no labels corresponding to the k8s resources, so we can add them in. 
+ metrics_query = f""" + label_replace( + label_replace( + {desired_instances} / {current_replicas}, + "kube_deployment", "{deployment_name}", "", "" + ), + "kube_namespace", "{namespace}", "", "" + ) + """ + series_query = f""" + k8s:deployment:pods_status_ready{{{worker_filter_terms}}} + """ + + metric_name = f"{deployment_name}-active-requests-prom" + + return { + "name": {"as": metric_name}, + "seriesQuery": _minify_promql(series_query), + "resources": {"template": "kube_<<.Resource>>"}, + "metricsQuery": _minify_promql(metrics_query), + } + + +def create_instance_uwsgi_scaling_rule( + service: str, + instance_config: KubernetesDeploymentConfig, + paasta_cluster: str, +) -> PrometheusAdapterRule: + """ + Creates a Prometheus adapter rule config for a given service instance. + """ + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() + setpoint = autoscaling_config["setpoint"] + moving_average_window = autoscaling_config.get( + "moving_average_window_seconds", DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW + ) + # this should always be set, but we default to 0 for safety as the worst thing that would happen + # is that we take a couple more iterations than required to hit the desired setpoint + offset = autoscaling_config.get("offset", 0) + offset_multiplier = load_system_paasta_config().get_uwsgi_offset_multiplier() + + deployment_name = get_kubernetes_app_name(service=service, instance=instance) + + # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to + # make sure that the deployment in the new namespace is scaled up enough to handle _all_ the load. + # This is because once the new deployment is 100% healthy, cleanup_kubernetes_job will delete the deployment out of + # the old namespace all at once, suddenly putting all the load onto the deployment in the new namespace. + # To ensure this, we must: + # - DO NOT filter on namespace in worker_filter_terms (which is used when calculating desired_instances). + # - DO filter on namespace in replica_filter_terms (which is used to calculate current_replicas). + # This makes sure that desired_instances includes load from all namespaces, but that the scaling ratio calculated + # by (desired_instances / current_replicas) is meaningful for each namespace. + worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'" + replica_filter_terms = f"paasta_cluster='{paasta_cluster}',kube_deployment='{deployment_name}',namespace='{namespace}'" + # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready # over paasta service/instance/cluster. it counts the number of ready pods in a paasta # deployment. 
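As a quick reference for the namespace-migration comment above, here is a minimal sketch of the two label-filter strings the uwsgi rule builds (variable names taken from this diff, values hypothetical): the worker filter deliberately omits the namespace so load is summed across every namespace the service runs in, while the replica filter pins the namespace so the resulting ratio is meaningful for the deployment being scaled.

    # hypothetical values for illustration only
    paasta_cluster = "example-cluster"
    service, instance = "example_service", "main"
    deployment_name, namespace = "example-service-main", "paasta"

    # no namespace label here: desired_instances should count load from all namespaces
    worker_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',"
        f"paasta_service='{service}',paasta_instance='{instance}'"
    )
    # namespace pinned here: current/ready replicas are counted per target namespace
    replica_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',"
        f"kube_deployment='{deployment_name}',namespace='{namespace}'"
    )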
@@ -287,6 +409,17 @@ def create_instance_uwsgi_scaling_rule( ) ) by (kube_deployment)) """ + # as mentioned above: we want to get the overload by counting load across namespces - but we need + # to divide by the ready pods in the target namespace - which is done by using a namespace filter here + ready_pods_namespaced = f""" + (sum( + k8s:deployment:pods_status_ready{{{replica_filter_terms}}} >= 0 + or + max_over_time( + k8s:deployment:pods_status_ready{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s] + ) + ) by (kube_deployment)) + """ load_per_instance = f""" avg( uwsgi_worker_busy{{{worker_filter_terms}}} @@ -317,8 +450,15 @@ def create_instance_uwsgi_scaling_rule( )[{moving_average_window}s:] ) """ + + # our Prometheus query is calculating a desired number of replicas, and then k8s wants that expressed as an average utilization + # so as long as we divide by the number that k8s ends up multiplying by, we should be able to convince k8s to run any arbitrary + # number of replicas. + # k8s happens to multiply by the # of ready pods - so we divide by that rather than by the amount of current replicas (which may + # include non-ready pods) + # ref: https://github.com/kubernetes/kubernetes/blob/7ec1a89a509906dad9fd6a4635d7bfc157b47790/pkg/controller/podautoscaler/replica_calculator.go#L278 metrics_query = f""" - {desired_instances} / {current_replicas} + {desired_instances} / {ready_pods_namespaced} """ metric_name = f"{deployment_name}-uwsgi-prom" @@ -333,14 +473,15 @@ def create_instance_uwsgi_scaling_rule( def create_instance_piscina_scaling_rule( service: str, - instance: str, - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str = "paasta", ) -> PrometheusAdapterRule: """ Creates a Prometheus adapter rule config for a given service instance. """ + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() setpoint = autoscaling_config["setpoint"] moving_average_window = autoscaling_config.get( "moving_average_window_seconds", @@ -444,14 +585,15 @@ def should_create_cpu_scaling_rule( def create_instance_cpu_scaling_rule( service: str, - instance: str, - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str = "paasta", ) -> PrometheusAdapterRule: """ Creates a Prometheus adapter rule config for a given service instance. """ + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() deployment_name = get_kubernetes_app_name(service=service, instance=instance) sanitized_instance_name = sanitise_kubernetes_name(instance) metric_name = f"{deployment_name}-cpu-prom" @@ -592,14 +734,15 @@ def create_instance_cpu_scaling_rule( def create_instance_gunicorn_scaling_rule( service: str, - instance: str, - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str = "paasta", ) -> PrometheusAdapterRule: """ Creates a Prometheus adapter rule config for a given service instance. 
""" + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() setpoint = autoscaling_config["setpoint"] moving_average_window = autoscaling_config.get( "moving_average_window_seconds", @@ -704,11 +847,12 @@ def should_create_arbitrary_promql_scaling_rule( def create_instance_arbitrary_promql_scaling_rule( service: str, - instance: str, - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str, ) -> PrometheusAdapterRule: + autoscaling_config = instance_config.get_autoscaling_params() + instance = instance_config.instance + namespace = instance_config.get_namespace() prometheus_adapter_config = autoscaling_config["prometheus_adapter_config"] deployment_name = get_kubernetes_app_name(service=service, instance=instance) @@ -764,10 +908,8 @@ def create_instance_arbitrary_promql_scaling_rule( def get_rules_for_service_instance( service_name: str, - instance_name: str, - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, paasta_cluster: str, - namespace: str, ) -> List[PrometheusAdapterRule]: """ Returns a list of Prometheus Adapter rules for a given service instance. For now, this @@ -777,29 +919,31 @@ def get_rules_for_service_instance( rules: List[PrometheusAdapterRule] = [] for should_create_scaling_rule, create_instance_scaling_rule in ( + ( + should_create_active_requests_scaling_rule, + create_instance_active_requests_scaling_rule, + ), (should_create_uwsgi_scaling_rule, create_instance_uwsgi_scaling_rule), (should_create_piscina_scaling_rule, create_instance_piscina_scaling_rule), (should_create_cpu_scaling_rule, create_instance_cpu_scaling_rule), (should_create_gunicorn_scaling_rule, create_instance_gunicorn_scaling_rule), ): should_create, skip_reason = should_create_scaling_rule( - autoscaling_config=autoscaling_config, + autoscaling_config=instance_config.get_autoscaling_params(), ) if should_create: rules.append( create_instance_scaling_rule( service=service_name, - instance=instance_name, - autoscaling_config=autoscaling_config, + instance_config=instance_config, paasta_cluster=paasta_cluster, - namespace=namespace, ) ) else: log.debug( "Skipping %s.%s - %s.", service_name, - instance_name, + instance_config.instance, skip_reason, ) @@ -827,23 +971,30 @@ def create_prometheus_adapter_config( cluster=paasta_cluster, instance_type="kubernetes", soa_dir=str(soa_dir) ) } + services.update( + { + service_name + for service_name, _ in get_services_for_cluster( + cluster=paasta_cluster, instance_type="eks", soa_dir=str(soa_dir) + ) + } + ) for service_name in services: config_loader = PaastaServiceConfigLoader( service=service_name, soa_dir=str(soa_dir) ) - for instance_config in config_loader.instance_configs( - cluster=paasta_cluster, - instance_type_class=KubernetesDeploymentConfig, - ): - rules.extend( - get_rules_for_service_instance( - service_name=service_name, - instance_name=instance_config.instance, - autoscaling_config=instance_config.get_autoscaling_params(), - paasta_cluster=paasta_cluster, - namespace=instance_config.get_namespace(), + for instance_type_class in K8S_INSTANCE_TYPE_CLASSES: + for instance_config in config_loader.instance_configs( + cluster=paasta_cluster, + instance_type_class=instance_type_class, + ): + rules.extend( + get_rules_for_service_instance( + service_name=service_name, + instance_config=instance_config, + paasta_cluster=paasta_cluster, + ) ) - ) return 
{ # we sort our rules so that we can easily compare between two different configmaps diff --git a/paasta_tools/spark_tools.py b/paasta_tools/spark_tools.py index edd339fe1b..b1f8df744f 100644 --- a/paasta_tools/spark_tools.py +++ b/paasta_tools/spark_tools.py @@ -2,6 +2,7 @@ import logging import re import socket +import sys from functools import lru_cache from typing import cast from typing import Dict @@ -169,13 +170,27 @@ def get_volumes_from_spark_mesos_configs(spark_conf: Mapping[str, str]) -> List[ def get_volumes_from_spark_k8s_configs(spark_conf: Mapping[str, str]) -> List[str]: - volume_names = [ - re.match( - r"spark.kubernetes.executor.volumes.hostPath.(\d+).mount.path", key - ).group(1) - for key in spark_conf.keys() - if "spark.kubernetes.executor.volumes.hostPath." in key and ".mount.path" in key - ] + volume_names = [] + for key in list(spark_conf.keys()): + if ( + "spark.kubernetes.executor.volumes.hostPath." in key + and ".mount.path" in key + ): + v_name = re.match( + r"spark.kubernetes.executor.volumes.hostPath.([a-z0-9]([-a-z0-9]*[a-z0-9])?).mount.path", + key, + ) + if v_name: + volume_names.append(v_name.group(1)) + else: + log.error( + f"Volume names must consist of lower case alphanumeric characters or '-', " + f"and must start and end with an alphanumeric character. Config -> '{key}' must be fixed." + ) + # Failing here because the k8s pod fails to start if the volume names + # don't follow the lowercase RFC 1123 standard. + sys.exit(1) + volumes = [] for volume_name in volume_names: read_only = ( diff --git a/paasta_tools/tron_tools.py b/paasta_tools/tron_tools.py index cdc1c44ffc..f69b823369 100644 --- a/paasta_tools/tron_tools.py +++ b/paasta_tools/tron_tools.py @@ -210,10 +210,6 @@ def parse_time_variables(command: str, parse_time: datetime.datetime = None) -> return StringFormatter(job_context).format(command) -def _use_k8s_default() -> bool: - return load_system_paasta_config().get_tron_use_k8s_default() - - def _get_tron_k8s_cluster_override(cluster: str) -> Optional[str]: """ Return the name of a compute cluster if there's a different compute cluster that should be used to run a Tronjob. @@ -665,9 +661,6 @@ def __init__( # Indicate whether this config object is created for validation self.for_validation = for_validation - def get_use_k8s(self) -> bool: - return self.config_dict.get("use_k8s", _use_k8s_default()) - def get_name(self): return self.name @@ -738,7 +731,7 @@ def get_cluster(self): def get_expected_runtime(self): return self.config_dict.get("expected_runtime") - def _get_action_config(self, action_name, action_dict): + def _get_action_config(self, action_name, action_dict) -> TronActionConfig: action_service = action_dict.setdefault("service", self.get_service()) action_deploy_group = action_dict.setdefault( "deploy_group", self.get_deploy_group() @@ -781,24 +774,17 @@ def _get_action_config(self, action_name, action_dict): action_dict["monitoring"] = self.get_monitoring() cluster_override = _get_tron_k8s_cluster_override(self.get_cluster()) - # technically, we should also be checking if k8s is enabled, but at this stage - # of our migration we're not expecting any issues and when we clean up all the - # Mesos remnants on-completion we can also rip out all the code that fallsback - # to Mesos and just do this unconditionally. 
- use_k8s_cluster_override = cluster_override is not None and self.get_use_k8s() return TronActionConfig( service=action_service, instance=compose_instance(self.get_name(), action_name), - cluster=cluster_override - if use_k8s_cluster_override - else self.get_cluster(), + cluster=cluster_override or self.get_cluster(), config_dict=action_dict, branch_dict=branch_dict, soa_dir=self.soa_dir, for_validation=self.for_validation, ) - def get_actions(self): + def get_actions(self) -> List[TronActionConfig]: actions = self.config_dict.get("actions") return [ self._get_action_config(name, action_dict) @@ -893,7 +879,7 @@ def format_master_config(master_config, default_volumes, dockercfg_location): return master_config -def format_tron_action_dict(action_config: TronActionConfig, use_k8s: bool = False): +def format_tron_action_dict(action_config: TronActionConfig): """Generate a dict of tronfig for an action, from the TronActionConfig. :param job_config: TronActionConfig @@ -921,12 +907,7 @@ def format_tron_action_dict(action_config: TronActionConfig, use_k8s: bool = Fal "service_account_name": action_config.get_service_account_name(), } - # while we're tranisitioning, we want to be able to cleanly fallback to Mesos - # so we'll default to Mesos unless k8s usage is enabled for both the cluster - # and job. - # there are slight differences between k8s and Mesos configs, so we'll translate - # whatever is in soaconfigs to the k8s equivalent here as well. - if executor in KUBERNETES_EXECUTOR_NAMES and use_k8s: + if executor in KUBERNETES_EXECUTOR_NAMES: # we'd like Tron to be able to distinguish between spark and normal actions # even though they both run on k8s result["executor"] = EXECUTOR_NAME_TO_TRON_EXECUTOR_TYPE.get( @@ -1046,11 +1027,9 @@ def format_tron_job_dict(job_config: TronJobConfig, k8s_enabled: bool = False): :param job_config: TronJobConfig """ - # TODO: this use_k8s flag should be removed once we've fully migrated off of mesos - use_k8s = job_config.get_use_k8s() and k8s_enabled action_dict = { action_config.get_action_name(): format_tron_action_dict( - action_config=action_config, use_k8s=use_k8s + action_config=action_config, ) for action_config in job_config.get_actions() } @@ -1069,16 +1048,11 @@ def format_tron_job_dict(job_config: TronJobConfig, k8s_enabled: bool = False): "time_zone": job_config.get_time_zone(), "expected_runtime": job_config.get_expected_runtime(), } - # TODO: this should be directly inlined, but we need to update tron everywhere first so it'll - # be slightly less tedious to just conditionally send this now until we clean things up on the - # removal of all the Mesos code - if job_config.get_use_k8s(): - result["use_k8s"] = job_config.get_use_k8s() cleanup_config = job_config.get_cleanup_action() if cleanup_config: cleanup_action = format_tron_action_dict( - action_config=cleanup_config, use_k8s=use_k8s + action_config=cleanup_config, ) result["cleanup_action"] = cleanup_action @@ -1093,23 +1067,41 @@ def load_tron_instance_config( load_deployments: bool = True, soa_dir: str = DEFAULT_SOA_DIR, ) -> TronActionConfig: - jobs = load_tron_service_config( + for action in load_tron_instance_configs( service=service, cluster=cluster, load_deployments=load_deployments, soa_dir=soa_dir, - ) - requested_job, requested_action = instance.split(".") - for job in jobs: - if job.get_name() == requested_job: - for action in job.get_actions(): - if action.get_action_name() == requested_action: - return action + ): + if action.get_instance() == instance: + return action raise 
NoConfigurationForServiceError( f"No tron configuration found for {service} {instance}" ) +@time_cache(ttl=5) +def load_tron_instance_configs( + service: str, + cluster: str, + load_deployments: bool = True, + soa_dir: str = DEFAULT_SOA_DIR, +) -> Tuple[TronActionConfig, ...]: + ret: List[TronActionConfig] = [] + + jobs = load_tron_service_config( + service=service, + cluster=cluster, + load_deployments=load_deployments, + soa_dir=soa_dir, + ) + + for job in jobs: + ret.extend(job.get_actions()) + + return tuple(ret) + + @time_cache(ttl=5) def load_tron_service_config( service, diff --git a/paasta_tools/utils.py b/paasta_tools/utils.py index 2e61c016e2..2afcf98ab4 100644 --- a/paasta_tools/utils.py +++ b/paasta_tools/utils.py @@ -57,6 +57,7 @@ from typing import Iterable from typing import Iterator from typing import List +from typing import Literal from typing import Mapping from typing import NamedTuple from typing import Optional @@ -98,7 +99,6 @@ "itest", "itest-and-push-to-registry", "security-check", - "performance-check", "push-to-registry", ) # Default values for _log @@ -129,14 +129,21 @@ "paasta_native", "adhoc", "kubernetes", + "eks", "tron", "flink", "cassandracluster", "kafkacluster", + "vitesscluster", "monkrelays", "nrtsearchservice", ) +PAASTA_K8S_INSTANCE_TYPES = { + "kubernetes", + "eks", +} + INSTANCE_TYPE_TO_K8S_NAMESPACE = { "marathon": "paasta", "adhoc": "paasta", @@ -144,6 +151,7 @@ "flink": "paasta-flinks", "cassandracluster": "paasta-cassandraclusters", "kafkacluster": "paasta-kafkaclusters", + "vitesscluster": "paasta-vitessclusters", "nrtsearchservice": "paasta-nrtsearchservices", } @@ -330,6 +338,7 @@ class InstanceConfigDict(TypedDict, total=False): branch: str iam_role: str iam_role_provider: str + service: str class BranchDictV1(TypedDict, total=False): @@ -536,6 +545,19 @@ def get_cap_drop(self) -> Iterable[DockerParameter]: for cap in CAPS_DROP: yield {"key": "cap-drop", "value": cap} + def get_cap_args(self) -> Iterable[DockerParameter]: + """Generate all --cap-add/--cap-drop parameters, ensuring not to have overlapping settings""" + cap_adds = list(self.get_cap_add()) + if cap_adds and is_using_unprivileged_containers(): + log.warning( + "Unprivileged containerizer detected, adding capabilities will not work properly" + ) + yield from cap_adds + added_caps = [cap["value"] for cap in cap_adds] + for cap in self.get_cap_drop(): + if cap["value"] not in added_caps: + yield cap + def format_docker_parameters( self, with_labels: bool = True, @@ -570,9 +592,8 @@ def format_docker_parameters( if extra_docker_args: for key, value in extra_docker_args.items(): parameters.extend([{"key": key, "value": value}]) - parameters.extend(self.get_cap_add()) parameters.extend(self.get_docker_init()) - parameters.extend(self.get_cap_drop()) + parameters.extend(self.get_cap_args()) return parameters def use_docker_disk_quota( @@ -918,7 +939,7 @@ def check_deploy_group(self) -> Tuple[bool, str]: if deploy_group not in pipeline_deploy_groups: return ( False, - f"{self.service}.{self.instance} uses deploy_group {deploy_group}, but it is not deploy.yaml", + f"{self.service}.{self.instance} uses deploy_group {deploy_group}, but {deploy_group} is not deployed to in deploy.yaml", ) # noqa: E501 return True, "" @@ -939,7 +960,7 @@ def get_iam_role(self) -> str: return self.config_dict.get("iam_role", "") def get_iam_role_provider(self) -> str: - return self.config_dict.get("iam_role_provider", "kiam") + return self.config_dict.get("iam_role_provider", "aws") def get_role(self) -> 
Optional[str]: """Which mesos role of nodes this job should run on.""" @@ -1910,12 +1931,20 @@ class KubeStateMetricsCollectorConfigDict(TypedDict, total=False): label_renames: Dict[str, str] +class TopologySpreadConstraintDict(TypedDict, total=False): + topology_key: str + when_unsatisfiable: Literal["ScheduleAnyway", "DoNotSchedule"] + max_skew: int + + class SystemPaastaConfigDict(TypedDict, total=False): allowed_pools: Dict[str, List[str]] + api_client_timeout: int api_endpoints: Dict[str, str] api_profiling_config: Dict auth_certificate_ttl: str auto_config_instance_types_enabled: Dict[str, bool] + auto_config_instance_type_aliases: Dict[str, str] auto_hostname_unique_size: int boost_regions: List[str] cluster_autoscaler_max_decrease: float @@ -1930,7 +1959,7 @@ class SystemPaastaConfigDict(TypedDict, total=False): dashboard_links: Dict[str, Dict[str, str]] datastore_credentials_vault_env_overrides: Dict[str, str] default_push_groups: List - default_should_run_uwsgi_exporter_sidecar: bool + default_should_use_uwsgi_exporter: bool deploy_blacklist: UnsafeDeployBlacklist deployd_big_bounce_deadline: float deployd_log_level: str @@ -1988,7 +2017,7 @@ class SystemPaastaConfigDict(TypedDict, total=False): pdb_max_unavailable: Union[str, int] pki_backend: str pod_defaults: Dict[str, Any] - topology_spread_constraints: List[Dict[str, Any]] + topology_spread_constraints: List[TopologySpreadConstraintDict] previous_marathon_servers: List[MarathonConfigDict] readiness_check_prefix_template: List[str] register_k8s_pods: bool @@ -2009,13 +2038,11 @@ class SystemPaastaConfigDict(TypedDict, total=False): synapse_port: int taskproc: Dict tron: Dict - uwsgi_exporter_sidecar_image_url: str gunicorn_exporter_sidecar_image_url: str vault_cluster_map: Dict vault_environment: str volumes: List[DockerVolume] zookeeper: str - tron_use_k8s: bool tron_k8s_cluster_overrides: Dict[str, str] skip_cpu_override_validation: List[str] spark_k8s_role: str @@ -2033,6 +2060,7 @@ class SystemPaastaConfigDict(TypedDict, total=False): spark_use_eks_default: bool sidecar_requirements_config: Dict[str, KubeContainerResourceRequest] eks_cluster_aliases: Dict[str, str] + secret_sync_delay_seconds: float def load_system_paasta_config( @@ -2109,6 +2137,9 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str: return f"SystemPaastaConfig({self.config_dict!r}, {self.directory!r})" + def get_secret_sync_delay_seconds(self) -> float: + return self.config_dict.get("secret_sync_delay_seconds", 0) + def get_spark_use_eks_default(self) -> bool: return self.config_dict.get("spark_use_eks_default", False) @@ -2213,6 +2244,20 @@ def get_auto_hostname_unique_size(self) -> int: def get_auto_config_instance_types_enabled(self) -> Dict[str, bool]: return self.config_dict.get("auto_config_instance_types_enabled", {}) + def get_auto_config_instance_type_aliases(self) -> Dict[str, str]: + """ + Allow re-using another instance type's autotuned data. This is useful when an instance can be trivially moved around + type-wise as it allows us to avoid data races/issues with the autotuned recommendations generator/updater. + """ + return self.config_dict.get("auto_config_instance_type_aliases", {}) + + def get_api_client_timeout(self) -> int: + """ + We've seen the Paasta API get hung up sometimes and the client not realizing this will sit idle forever. 
+ This will be used to specify the default timeout + """ + return self.config_dict.get("api_client_timeout", 120) + def get_api_endpoints(self) -> Mapping[str, str]: return self.config_dict["api_endpoints"] @@ -2583,7 +2628,7 @@ def get_taskproc(self) -> Dict: def get_disabled_watchers(self) -> List: return self.config_dict.get("disabled_watchers", []) - def get_topology_spread_constraints(self) -> List[Dict[str, Any]]: + def get_topology_spread_constraints(self) -> List[TopologySpreadConstraintDict]: """List of TopologySpreadConstraints that will be applied to all Pods in the cluster""" return self.config_dict.get("topology_spread_constraints", []) @@ -2690,15 +2735,8 @@ def get_git_repo_config(self, repo_name: str) -> Dict: """ return self.get_git_config().get("repos", {}).get(repo_name, {}) - def get_uwsgi_exporter_sidecar_image_url(self) -> str: - """Get the docker image URL for the uwsgi_exporter sidecar container""" - return self.config_dict.get( - "uwsgi_exporter_sidecar_image_url", - "docker-paasta.yelpcorp.com:443/uwsgi_exporter-k8s-sidecar:v1.3.0-yelp0", - ) - - def default_should_run_uwsgi_exporter_sidecar(self) -> bool: - return self.config_dict.get("default_should_run_uwsgi_exporter_sidecar", False) + def default_should_use_uwsgi_exporter(self) -> bool: + return self.config_dict.get("default_should_use_uwsgi_exporter", False) def get_gunicorn_exporter_sidecar_image_url(self) -> str: """Get the docker image URL for the gunicorn_exporter sidecar container""" @@ -2728,9 +2766,6 @@ def get_mark_for_deployment_should_ping_for_unhealthy_pods(self) -> bool: "mark_for_deployment_should_ping_for_unhealthy_pods", True ) - def get_tron_use_k8s_default(self) -> bool: - return self.config_dict.get("tron_use_k8s", False) - def get_spark_k8s_role(self) -> str: return self.config_dict.get("spark_k8s_role", "spark") @@ -3405,8 +3440,18 @@ def load_service_instance_auto_configs( soa_dir: str = DEFAULT_SOA_DIR, ) -> Dict[str, Dict[str, Any]]: enabled_types = load_system_paasta_config().get_auto_config_instance_types_enabled() - conf_file = f"{instance_type}-{cluster}" - if enabled_types.get(instance_type): + # this looks a little funky: but what we're generally trying to do here is ensure that + # certain types of instances can be moved between instance types without having to worry + # about any sort of data races (or data weirdness) in autotune. + # instead, what we do is map certain instance types to whatever we've picked as the "canonical" + # instance type in autotune and always merge from there. 
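+    # Illustrative example (hypothetical config, not part of this change): with
+    #   auto_config_instance_type_aliases = {"eks": "kubernetes"}
+    # an "eks" instance reads its autotuned values from the same
+    # f"{AUTO_SOACONFIG_SUBDIR}/kubernetes-{cluster}" file that the equivalent
+    # "kubernetes" instance uses, so moving an instance between the two types does
+    # not reset (or race with) the autotune recommendations for it.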
+ realized_type = ( + load_system_paasta_config() + .get_auto_config_instance_type_aliases() + .get(instance_type, instance_type) + ) + conf_file = f"{realized_type}-{cluster}" + if enabled_types.get(realized_type): return service_configuration_lib.read_extra_service_information( service, f"{AUTO_SOACONFIG_SUBDIR}/{conf_file}", @@ -4230,3 +4275,8 @@ def get_runtimeenv() -> str: # we could also just crash or return None, but this seems a little easier to find # should we somehow run into this at Yelp return "unknown" + + +@lru_cache(maxsize=1) +def is_using_unprivileged_containers() -> bool: + return "podman" in os.getenv("DOCKER_HOST", "") diff --git a/paasta_tools/vitess_tools.py b/paasta_tools/vitesscluster_tools.py similarity index 64% rename from paasta_tools/vitess_tools.py rename to paasta_tools/vitesscluster_tools.py index 9cd47d66af..3337f2d7cc 100644 --- a/paasta_tools/vitess_tools.py +++ b/paasta_tools/vitesscluster_tools.py @@ -5,20 +5,16 @@ import service_configuration_lib -from paasta_tools.kubernetes_tools import sanitise_kubernetes_name from paasta_tools.kubernetes_tools import sanitised_cr_name from paasta_tools.long_running_service_tools import LongRunningServiceConfig from paasta_tools.long_running_service_tools import LongRunningServiceConfigDict from paasta_tools.utils import BranchDictV2 -from paasta_tools.utils import compose_job_id -from paasta_tools.utils import decompose_job_id from paasta_tools.utils import deep_merge_dictionaries from paasta_tools.utils import DEFAULT_SOA_DIR -from paasta_tools.utils import InvalidJobNameError from paasta_tools.utils import load_service_instance_config from paasta_tools.utils import load_v2_deployments_json -KUBERNETES_NAMESPACE = "paasta-vitess" +KUBERNETES_NAMESPACE = "paasta-vitessclusters" log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) @@ -31,7 +27,7 @@ class VitessDeploymentConfigDict(LongRunningServiceConfigDict, total=False): class VitessDeploymentConfig(LongRunningServiceConfig): config_dict: VitessDeploymentConfigDict - config_filename_prefix = "vitess" + config_filename_prefix = "vitesscluster" def __init__( self, @@ -52,61 +48,9 @@ def __init__( branch_dict=branch_dict, ) - def get_service_name_smartstack(self) -> str: - """ - We register in vitess.main - """ - return "vitess_" + self.get_instance() - - def get_nerve_namespace(self) -> str: - """ - We register in vitess.main - """ - return "main" - - def get_registrations(self) -> List[str]: - """ - We register in vitess.main - """ - registrations = self.config_dict.get("registrations", []) - for registration in registrations: - try: - decompose_job_id(registration) - except InvalidJobNameError: - log.error( - "Provided registration {} for service " - "{} is invalid".format(registration, self.service) - ) - - return registrations or [ - compose_job_id(self.get_service_name_smartstack(), "main") - ] - - def get_kubernetes_namespace(self) -> str: - return KUBERNETES_NAMESPACE - - def get_namespace(self) -> str: - """Get namespace from config, default to 'paasta'""" - return self.config_dict.get("namespace", KUBERNETES_NAMESPACE) - def get_instances(self, with_limit: bool = True) -> int: return self.config_dict.get("replicas", 1) - def get_bounce_method(self) -> str: - """ - Need to map to a paasta bounce method and crossover is the closest - """ - return "crossover" - - def get_sanitised_service_name(self) -> str: - return sanitise_kubernetes_name(self.get_service()) - - def get_sanitised_instance_name(self) -> str: - return 
sanitise_kubernetes_name(self.get_instance()) - - def get_sanitised_deployment_name(self) -> str: - return self.get_sanitised_instance_name() - def validate( self, params: List[str] = [ @@ -139,7 +83,7 @@ def load_vitess_instance_config( service, soa_dir=soa_dir ) instance_config = load_service_instance_config( - service, instance, " vitesscluster", cluster, soa_dir=soa_dir + service, instance, "vitesscluster", cluster, soa_dir=soa_dir ) general_config = deep_merge_dictionaries( overrides=instance_config, defaults=general_config @@ -176,6 +120,6 @@ def cr_id(service: str, instance: str) -> Mapping[str, str]: group="yelp.com", version="v1alpha1", namespace=KUBERNETES_NAMESPACE, - plural="vitess", + plural="vitessclusters", name=sanitised_cr_name(service, instance), ) diff --git a/requirements-dev-minimal.txt b/requirements-dev-minimal.txt index f586bdfef8..1c46473aa1 100644 --- a/requirements-dev-minimal.txt +++ b/requirements-dev-minimal.txt @@ -1,6 +1,9 @@ astroid asynctest coverage +# VSCode debugging requirement +# See https://code.visualstudio.com/docs/python/debugging#_local-script-debugging +debugpy docutils flake8 freezegun diff --git a/requirements-dev.txt b/requirements-dev.txt index a926dc7482..dbe193a3be 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,9 @@ asynctest==0.12.0 Babel==2.9.1 cfgv==2.0.1 coverage==6.5.0 +debugpy==1.8.0 distlib==0.3.4 +docutils==0.12 exceptiongroup==1.1.2 filelock==3.0.12 flake8==3.5.0 diff --git a/requirements-docs.txt b/requirements-docs.txt index f765b4d555..d585e6aa06 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -11,6 +11,7 @@ freezegun==0.3.7 identify==1.0.6 imagesize==0.7.1 isort==4.2.5 +jinja2==3.0.3 lazy-object-proxy==1.4.3 mccabe==0.6.1 mock==2.0.0 diff --git a/requirements-minimal.txt b/requirements-minimal.txt index dfffab112c..a42e3006c9 100644 --- a/requirements-minimal.txt +++ b/requirements-minimal.txt @@ -54,7 +54,7 @@ requests-cache >= 0.4.10 retry ruamel.yaml sensu-plugin -service-configuration-lib >= 2.18.0 +service-configuration-lib >= 2.18.11 signalfx slackclient >= 1.2.1 sticht >= 1.1.0 diff --git a/requirements.txt b/requirements.txt index f6b6bfc997..aa6201a9bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,9 +8,9 @@ async-timeout==3.0.0 attrs==19.2.0 binaryornot==0.4.4 boto==2.48.0 -boto3==1.11.11 +boto3==1.34.22 boto3-type-annotations==0.3.1 -botocore==1.14.11 +botocore==1.34.22 bravado==10.4.1 bravado-core==5.12.1 cachetools==2.0.1 @@ -22,7 +22,6 @@ cookiecutter==1.4.0 croniter==1.3.4 decorator==4.1.2 docker-py==1.2.3 -docutils==0.12 dulwich==0.17.3 ephemeral-port-reserve==1.1.0 future==0.16.0 @@ -89,9 +88,9 @@ retry==0.9.2 rfc3987==1.3.7 rsa==4.7.2 ruamel.yaml==0.15.96 -s3transfer==0.3.3 +s3transfer==0.10.0 sensu-plugin==0.3.1 -service-configuration-lib==2.18.0 +service-configuration-lib==2.18.11 setuptools==39.0.1 signalfx==1.0.17 simplejson==3.10.0 @@ -111,7 +110,7 @@ translationstring==1.3 typing-extensions==4.3.0 tzlocal==1.2 url-normalize==1.4.2 -urllib3==1.24.3 +urllib3==1.26.18 utaw==0.2.0 venusian==1.1.0 webcolors==1.7 diff --git a/setup.py b/setup.py index 1942cbb3a0..20a2544732 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,9 @@ def get_install_requires(): "paasta_tools/am_i_mesos_leader.py", "paasta_tools/apply_external_resources.py", "paasta_tools/autoscale_all_services.py", - "paasta_tools/check_flink_services_health.py", + "paasta_tools/check_autoscaler_max_instances.py", "paasta_tools/check_cassandracluster_services_replication.py", + 
"paasta_tools/check_flink_services_health.py", "paasta_tools/check_kubernetes_api.py", "paasta_tools/check_kubernetes_services_replication.py", "paasta_tools/check_oom_events.py", @@ -52,16 +53,16 @@ def get_install_requires(): "paasta_tools/cleanup_kubernetes_cr.py", "paasta_tools/cleanup_kubernetes_crd.py", "paasta_tools/cleanup_kubernetes_jobs.py", + "paasta_tools/cli/paasta_tabcomplete.sh", "paasta_tools/delete_kubernetes_deployments.py", - "paasta_tools/paasta_deploy_tron_jobs", "paasta_tools/generate_all_deployments", "paasta_tools/generate_deployments_for_service.py", "paasta_tools/generate_services_file.py", "paasta_tools/generate_services_yaml.py", "paasta_tools/get_mesos_leader.py", - "paasta_tools/kubernetes/bin/paasta_secrets_sync.py", - "paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py", "paasta_tools/kubernetes/bin/kubernetes_remove_evicted_pods.py", + "paasta_tools/kubernetes/bin/paasta_cleanup_stale_nodes.py", + "paasta_tools/kubernetes/bin/paasta_secrets_sync.py", "paasta_tools/list_marathon_service_instances.py", "paasta_tools/log_task_lifecycle_events.py", "paasta_tools/marathon_dashboard.py", @@ -69,20 +70,20 @@ def get_install_requires(): "paasta_tools/monitoring/check_marathon_has_apps.py", "paasta_tools/monitoring/check_mesos_active_frameworks.py", "paasta_tools/monitoring/check_mesos_duplicate_frameworks.py", - "paasta_tools/monitoring/check_mesos_quorum.py", "paasta_tools/monitoring/check_mesos_outdated_tasks.py", + "paasta_tools/monitoring/check_mesos_quorum.py", "paasta_tools/monitoring/kill_orphaned_docker_containers.py", - "paasta_tools/cli/paasta_tabcomplete.sh", "paasta_tools/paasta_cluster_boost.py", + "paasta_tools/paasta_deploy_tron_jobs", "paasta_tools/paasta_execute_docker_command.py", "paasta_tools/paasta_maintenance.py", "paasta_tools/paasta_metastatus.py", "paasta_tools/paasta_remote_run.py", - "paasta_tools/setup_kubernetes_job.py", - "paasta_tools/setup_kubernetes_crd.py", + "paasta_tools/setup_istio_mesh.py", "paasta_tools/setup_kubernetes_cr.py", + "paasta_tools/setup_kubernetes_crd.py", "paasta_tools/setup_kubernetes_internal_crd.py", - "paasta_tools/setup_istio_mesh.py", + "paasta_tools/setup_kubernetes_job.py", "paasta_tools/setup_prometheus_adapter_config.py", "paasta_tools/synapse_srv_namespaces_fact.py", ] @@ -106,6 +107,7 @@ def get_install_requires(): "paasta_oom_logger=paasta_tools.oom_logger:main", "paasta_broadcast_log=paasta_tools.broadcast_log_to_services:main", "paasta_dump_locally_running_services=paasta_tools.dump_locally_running_services:main", + "paasta_habitat_fixer=paasta_tools.contrib.habitat_fixer:main", ], "paste.app_factory": ["paasta-api-config=paasta_tools.api.api:make_app"], }, diff --git a/tests/api/test_autoscaler.py b/tests/api/test_autoscaler.py index c78cc63220..058ca85dc7 100644 --- a/tests/api/test_autoscaler.py +++ b/tests/api/test_autoscaler.py @@ -12,21 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import mock +import pytest from pyramid import testing from paasta_tools.api.views import autoscaler +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig from paasta_tools.marathon_tools import MarathonServiceConfig @mock.patch("paasta_tools.api.views.autoscaler.get_instance_config", autospec=True) -def test_get_autoscaler_count(mock_get_instance_config): +@pytest.mark.parametrize( + "instance_type_class", + ( + KubernetesDeploymentConfig, + EksDeploymentConfig, + ), +) +def test_get_autoscaler_count(mock_get_instance_config, instance_type_class): request = testing.DummyRequest() request.swagger_data = {"service": "fake_service", "instance": "fake_instance"} mock_get_instance_config.return_value = mock.MagicMock( get_instances=mock.MagicMock(return_value=123), - spec=KubernetesDeploymentConfig, + spec=instance_type_class, ) response = autoscaler.get_autoscaler_count(request) assert response.json_body["desired_instances"] == 123 @@ -54,7 +63,16 @@ def test_update_autoscaler_count_marathon(mock_get_instance_config): @mock.patch("paasta_tools.api.views.autoscaler.get_instance_config", autospec=True) -def test_update_autoscaler_count_kubernetes(mock_get_instance_config): +@pytest.mark.parametrize( + "instance_type_class", + ( + KubernetesDeploymentConfig, + EksDeploymentConfig, + ), +) +def test_update_autoscaler_count_kubernetes( + mock_get_instance_config, instance_type_class +): request = testing.DummyRequest() request.swagger_data = { "service": "fake_kubernetes_service", @@ -65,7 +83,7 @@ def test_update_autoscaler_count_kubernetes(mock_get_instance_config): mock_get_instance_config.return_value = mock.MagicMock( get_min_instances=mock.MagicMock(return_value=100), get_max_instances=mock.MagicMock(return_value=200), - spec=KubernetesDeploymentConfig, + spec=instance_type_class, ) response = autoscaler.update_autoscaler_count(request) @@ -74,7 +92,14 @@ def test_update_autoscaler_count_kubernetes(mock_get_instance_config): @mock.patch("paasta_tools.api.views.autoscaler.get_instance_config", autospec=True) -def test_update_autoscaler_count_warning(mock_get_instance_config): +@pytest.mark.parametrize( + "instance_type_class", + ( + KubernetesDeploymentConfig, + EksDeploymentConfig, + ), +) +def test_update_autoscaler_count_warning(mock_get_instance_config, instance_type_class): request = testing.DummyRequest() request.swagger_data = { "service": "fake_service", @@ -85,7 +110,7 @@ def test_update_autoscaler_count_warning(mock_get_instance_config): mock_get_instance_config.return_value = mock.MagicMock( get_min_instances=mock.MagicMock(return_value=10), get_max_instances=mock.MagicMock(return_value=100), - spec=KubernetesDeploymentConfig, + spec=instance_type_class, ) response = autoscaler.update_autoscaler_count(request) diff --git a/tests/api/test_instance.py b/tests/api/test_instance.py index 829fc10f94..a089b13c1b 100644 --- a/tests/api/test_instance.py +++ b/tests/api/test_instance.py @@ -18,11 +18,13 @@ import mock import pytest from kubernetes.client import V1Pod +from kubernetes.client.rest import ApiException from marathon.models.app import MarathonApp from marathon.models.app import MarathonTask from pyramid import testing from requests.exceptions import ReadTimeout +from paasta_tools import eks_tools from paasta_tools import kubernetes_tools from paasta_tools import marathon_tools from paasta_tools.api import settings @@ -48,7 +50,6 @@ @pytest.mark.parametrize("include_mesos", [False, True]) 
@pytest.mark.parametrize("include_envoy", [False, True]) -@pytest.mark.parametrize("include_smartstack", [False, True]) @mock.patch("paasta_tools.api.views.instance.marathon_mesos_status", autospec=True) @mock.patch( "paasta_tools.api.views.instance.marathon_service_mesh_status", autospec=True @@ -82,7 +83,6 @@ def test_instance_status_marathon( mock_load_service_namespace_config, mock_marathon_service_mesh_status, mock_marathon_mesos_status, - include_smartstack, include_envoy, include_mesos, ): @@ -121,7 +121,6 @@ def test_instance_status_marathon( "service": "fake_service", "instance": "fake_instance", "verbose": 2, - "include_smartstack": include_smartstack, "include_envoy": include_envoy, "include_mesos": include_mesos, } @@ -131,8 +130,6 @@ def test_instance_status_marathon( "marathon_job_status_field1": "field1_value", "marathon_job_status_field2": "field2_value", } - if include_smartstack: - expected_response["smartstack"] = mock_marathon_service_mesh_status.return_value if include_envoy: expected_response["envoy"] = mock_marathon_service_mesh_status.return_value if include_mesos: @@ -151,18 +148,6 @@ def test_instance_status_marathon( "fake_service", "fake_instance", 2 ) expected_marathon_service_mesh_status_calls = [] - if include_smartstack: - expected_marathon_service_mesh_status_calls.append( - mock.call( - "fake_service", - ServiceMesh.SMARTSTACK, - "fake_instance", - mock_service_config, - mock_load_service_namespace_config.return_value, - mock_app.tasks, - should_return_individual_backends=True, - ), - ) if include_envoy: expected_marathon_service_mesh_status_calls.append( mock.call( @@ -530,7 +515,26 @@ def test_marathon_service_mesh_status( @pytest.mark.asyncio -async def test_kubernetes_smartstack_status(): +@pytest.mark.parametrize( + "mock_job_config", + ( + kubernetes_tools.KubernetesDeploymentConfig( + service="fake_service", + cluster="fake_cluster", + instance="fake_instance", + config_dict={"bounce_method": "fake_bounce"}, + branch_dict=None, + ), + eks_tools.EksDeploymentConfig( + service="fake_service", + cluster="fake_cluster", + instance="fake_instance", + config_dict={"bounce_method": "fake_bounce"}, + branch_dict=None, + ), + ), +) +async def test_kubernetes_smartstack_status(mock_job_config): with asynctest.patch( "paasta_tools.api.views.instance.pik.match_backends_and_pods", autospec=True ) as mock_match_backends_and_pods, asynctest.patch( @@ -568,13 +572,6 @@ async def test_kubernetes_smartstack_status(): mock_pod = mock.create_autospec(V1Pod) mock_match_backends_and_pods.return_value = [(mock_backend, mock_pod)] - mock_job_config = kubernetes_tools.KubernetesDeploymentConfig( - service="fake_service", - cluster="fake_cluster", - instance="fake_instance", - config_dict={"bounce_method": "fake_bounce"}, - branch_dict=None, - ) mock_service_namespace_config = ServiceNamespaceConfig() mock_settings = mock.Mock() @@ -1100,7 +1097,6 @@ def test_kubernetes_instance_status_bounce_method(): instance=inst, instance_type="kubernetes", verbose=0, - include_smartstack=False, include_envoy=False, settings=settings, ) @@ -1135,7 +1131,6 @@ def test_kubernetes_instance_status_evicted_nodes(): instance="fake-inst", instance_type="kubernetes", verbose=0, - include_smartstack=False, include_envoy=False, settings=mock_settings, ) @@ -1196,7 +1191,6 @@ def test_instance_mesh_status( request.swagger_data = { "service": "fake_service", "instance": "fake_inst", - "include_smartstack": False, } instance_mesh = instance.instance_mesh_status(request) @@ -1212,7 +1206,6 @@ def 
test_instance_mesh_status( instance="fake_inst", instance_type="flink", settings=settings, - include_smartstack=False, include_envoy=None, # default of true in api specs ), ] @@ -1249,7 +1242,6 @@ def test_instance_mesh_status_error( request.swagger_data = { "service": "fake_service", "instance": "fake_inst", - "include_smartstack": False, } with pytest.raises(ApiFailure) as excinfo: @@ -1279,13 +1271,21 @@ def mock_request(self): } return request + @pytest.mark.parametrize( + "instance_type", + ( + "kubernetes", + "eks", + ), + ) def test_success( self, mock_pik_bounce_status, mock_validate_service_instance, mock_request, + instance_type, ): - mock_validate_service_instance.return_value = "kubernetes" + mock_validate_service_instance.return_value = instance_type response = instance.bounce_status(mock_request) assert response == mock_pik_bounce_status.return_value @@ -1300,6 +1300,18 @@ def test_not_found( instance.bounce_status(mock_request) assert excinfo.value.err == 404 + def test_app_not_found( + self, + mock_pik_bounce_status, + mock_validate_service_instance, + mock_request, + ): + mock_validate_service_instance.return_value = "kubernetes" + mock_pik_bounce_status.side_effect = [ApiException(status=404)] + with pytest.raises(ApiFailure) as excinfo: + instance.bounce_status(mock_request) + assert excinfo.value.err == 404 + def test_not_kubernetes( self, mock_pik_bounce_status, diff --git a/tests/cli/test_cmds_autoscale.py b/tests/cli/test_cmds_autoscale.py index 6b87fe4926..bfa706d6bb 100644 --- a/tests/cli/test_cmds_autoscale.py +++ b/tests/cli/test_cmds_autoscale.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. import mock +import pytest from paasta_tools.cli.cmds.autoscale import paasta_autoscale +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig @mock.patch("paasta_tools.cli.cmds.autoscale.figure_out_service_name", autospec=True) @@ -21,8 +24,18 @@ "paasta_tools.cli.cmds.autoscale.client.get_paasta_oapi_client", autospec=True ) @mock.patch("paasta_tools.cli.cmds.autoscale._log_audit", autospec=True) +@pytest.mark.parametrize( + "instance_type_class", + ( + EksDeploymentConfig, + KubernetesDeploymentConfig, + ), +) def test_paasta_autoscale( - mock__log_audit, mock_get_paasta_oapi_client, mock_figure_out_service_name + mock__log_audit, + mock_get_paasta_oapi_client, + mock_figure_out_service_name, + instance_type_class, ): service = "fake_service" instance = "fake_instance" @@ -45,5 +58,42 @@ def test_paasta_autoscale( ) mock__log_audit.return_value = None - paasta_autoscale(args) + with mock.patch( + "paasta_tools.cli.cmds.autoscale.get_instance_configs_for_service", + return_value=iter([mock.Mock(__class__=instance_type_class)]), + autospec=True, + ): + paasta_autoscale(args) assert mock_api.update_autoscaler_count.call_count == 1 + + +@mock.patch("paasta_tools.cli.cmds.autoscale.figure_out_service_name", autospec=True) +@mock.patch( + "paasta_tools.cli.cmds.autoscale.client.get_paasta_oapi_client", autospec=True +) +@mock.patch("paasta_tools.cli.cmds.autoscale._log_audit", autospec=True) +def test_paasta_autoscale_no_config( + mock__log_audit, + mock_get_paasta_oapi_client, + mock_figure_out_service_name, +): + service = "fake_service" + instance = "fake_instance" + cluster = "fake_cluster" + + mock_figure_out_service_name.return_value = service + mock_api = mock.Mock() + mock_get_paasta_oapi_client.return_value = 
mock.Mock(autoscaler=mock_api) + + args = mock.MagicMock() + args.service = service + args.clusters = cluster + args.instances = instance + args.set = 14 + + with mock.patch( + "paasta_tools.cli.cmds.autoscale.get_instance_configs_for_service", + return_value=iter(()), + autospec=True, + ): + assert paasta_autoscale(args) == 1 diff --git a/tests/cli/test_cmds_local_run.py b/tests/cli/test_cmds_local_run.py index c5a77ed402..393af3039d 100644 --- a/tests/cli/test_cmds_local_run.py +++ b/tests/cli/test_cmds_local_run.py @@ -17,6 +17,7 @@ import docker import mock +import pytest from pytest import mark from pytest import raises @@ -137,6 +138,7 @@ def test_dry_run_json_dict( assert ret == 0 # Ensure it's a dict and check some keys + print("Output", out) expected_out = json.loads(out) assert isinstance(expected_out, dict) assert "docker_hash" in expected_out @@ -407,6 +409,7 @@ def test_configure_and_run_command_uses_cmd_from_config( cluster="fake_cluster", system_paasta_config=system_paasta_config, args=args, + assume_role_aws_account=None, ) assert return_code == 0 mock_run_docker_container.assert_called_once_with( @@ -430,6 +433,7 @@ def test_configure_and_run_command_uses_cmd_from_config( skip_secrets=False, assume_role_arn="", assume_pod_identity=False, + assume_role_aws_account=None, use_okta_role=False, ) @@ -475,6 +479,7 @@ def test_configure_and_run_uses_bash_by_default_when_interactive( cluster="fake_cluster", system_paasta_config=system_paasta_config, args=args, + assume_role_aws_account="dev", ) assert return_code == 0 mock_secret_provider_kwargs = { @@ -502,6 +507,7 @@ def test_configure_and_run_uses_bash_by_default_when_interactive( secret_provider_kwargs=mock_secret_provider_kwargs, skip_secrets=False, assume_role_arn="", + assume_role_aws_account="dev", assume_pod_identity=False, use_okta_role=False, ) @@ -555,6 +561,7 @@ def test_configure_and_run_pulls_image_when_asked( args=args, system_paasta_config=system_paasta_config, pull_image=True, + assume_role_aws_account="dev", ) assert return_code == 0 mock_docker_pull_image.assert_called_once_with("fake_registry/fake_image") @@ -584,6 +591,7 @@ def test_configure_and_run_pulls_image_when_asked( skip_secrets=False, assume_role_arn="", assume_pod_identity=False, + assume_role_aws_account="dev", use_okta_role=False, ) @@ -633,6 +641,7 @@ def test_configure_and_run_docker_container_defaults_to_interactive_instance( cluster="fake_cluster", args=args, system_paasta_config=system_paasta_config, + assume_role_aws_account="dev", ) assert return_code == 0 mock_secret_provider_kwargs = { @@ -661,6 +670,7 @@ def test_configure_and_run_docker_container_defaults_to_interactive_instance( skip_secrets=False, assume_role_arn="", assume_pod_identity=False, + assume_role_aws_account="dev", use_okta_role=False, ) @@ -718,6 +728,7 @@ def test_configure_and_run_docker_container_respects_docker_sha( cluster="fake_cluster", args=args, system_paasta_config=system_paasta_config, + assume_role_aws_account="dev", ) expected = "fake_registry/services-fake_service:paasta-abcdefg" assert mock_run_docker_container.call_args[1]["docker_url"] == expected @@ -762,6 +773,43 @@ def test_run_success( assert paasta_local_run(args) is None +@pytest.mark.parametrize( + "cluster, aws_account, expected_aws_account", + [ + ("pnw-devc", None, "dev"), + ("pnw-devc", "prod", "prod"), + ("pnw-prod", None, "prod"), + ("pnw-prod", "dev", "dev"), + ], +) +@mock.patch("paasta_tools.cli.cmds.local_run.load_system_paasta_config", autospec=True) 
+@mock.patch("paasta_tools.cli.cmds.local_run.figure_out_service_name", autospec=True) +@mock.patch("paasta_tools.cli.cmds.cook_image.validate_service_name", autospec=True) +@mock.patch( + "paasta_tools.cli.cmds.local_run.configure_and_run_docker_container", autospec=True +) +def test_assume_role_aws_account( + mock_run_docker_container, + mock_validate_service_name, + mock_figure_out_service_name, + mock_system_paasta_config, + cluster, + aws_account, + expected_aws_account, + system_paasta_config, +): + mock_system_paasta_config.return_value = system_paasta_config + + args = mock.MagicMock() + args.cluster = cluster + args.assume_role_aws_account = aws_account + + paasta_local_run(args) + + _, kwargs = mock_run_docker_container.call_args + assert kwargs.get("assume_role_aws_account", "") == expected_aws_account + + @mock.patch("paasta_tools.cli.cmds.local_run.figure_out_service_name", autospec=True) @mock.patch( "paasta_tools.cli.cmds.local_run.configure_and_run_docker_container", autospec=True @@ -1925,6 +1973,7 @@ def test_volumes_are_deduped(mock_exists): "/etc/paasta", ), args=mock.Mock(yelpsoa_config_root="/blurp/durp", volumes=[]), + assume_role_aws_account="dev", ) args, kwargs = mock_run_docker_container.call_args assert kwargs["volumes"] == ["/hostPath:/containerPath:ro"] @@ -1980,6 +2029,7 @@ def test_missing_volumes_skipped(mock_exists): "/etc/paasta", ), args=mock.Mock(yelpsoa_config_root="/blurp/durp", volumes=[]), + assume_role_aws_account="dev", ) args, kwargs = mock_run_docker_container.call_args assert kwargs["volumes"] == [] @@ -2113,6 +2163,13 @@ def test_run_docker_container_assume_aws_role( autospec=None, ) @mock.patch("os.makedirs", autospec=True) +@mark.parametrize( + "original_service_name, override_service_name", + ( + ("fake_service", "fake_service"), # no service override + ("fake_service", "super_fake_service"), # service override + ), +) def test_run_docker_container_secret_volumes( mock_os_makedirs, mock_open, @@ -2123,6 +2180,8 @@ def test_run_docker_container_secret_volumes( mock_execlpe, mock_get_docker_run_cmd, mock_pick_random_port, + original_service_name, + override_service_name, ): mock_docker_client = mock.MagicMock(spec_set=docker.Client) mock_docker_client.attach = mock.MagicMock(spec_set=docker.Client.attach) @@ -2130,8 +2189,12 @@ def test_run_docker_container_secret_volumes( mock_docker_client.remove_container = mock.MagicMock( spec_set=docker.Client.remove_container ) + mock_service_manifest = mock.MagicMock(spec=MarathonServiceConfig) mock_service_manifest.cluster = "fake_cluster" + mock_service_manifest.get_service = mock.MagicMock( + return_value=override_service_name + ) # Coverage for binary file vs non-binary file mock_text_io_wrapper = mock.Mock(name="text_io_wrapper", autospec=True) @@ -2149,7 +2212,7 @@ def test_run_docker_container_secret_volumes( os.environ["TMPDIR"] = "/tmp/" return_code = run_docker_container( docker_client=mock_docker_client, - service="fake_service", + service=original_service_name, instance="fake_instance", docker_url="fake_hash", volumes=[], @@ -2174,6 +2237,9 @@ def test_run_docker_container_secret_volumes( the_kwargs["volumes"][1], ), "Did not find the expected secret file volume mount" + _, decrypt_kwargs = mock_decrypt_secret_volumes.call_args_list[0] + assert decrypt_kwargs["service_name"] == override_service_name + assert 0 == return_code @@ -2503,12 +2569,18 @@ def test_assume_aws_role( assume_role, assume_pod_identity, use_okta_role, + "dev", ) assert sys_exit.value.code == 1 return else: env = 
assume_aws_role( - mock_config, mock_service, assume_role, assume_pod_identity, use_okta_role + mock_config, + mock_service, + assume_role, + assume_pod_identity, + use_okta_role, + "dev", ) if as_root: @@ -2526,3 +2598,33 @@ def test_assume_aws_role( assert env["AWS_ACCESS_KEY_ID"] == "AKIAFOOBAR" else: assert env["AWS_ACCESS_KEY_ID"] == "AKIAFOOBAR2" + + +@mock.patch("paasta_tools.cli.cmds.local_run.subprocess.run", autospec=True) +@mock.patch("paasta_tools.cli.cmds.local_run.boto3.Session", autospec=True) +def test_assume_aws_role_with_web_identity( + mock_boto, + mock_subprocess_run, +): + mock_config = mock.MagicMock() + mock_config.get_iam_role.return_value = None + mock_service = "mockservice" + + mock_credentials = mock.MagicMock() + mock_credentials.access_key = "AKIAFOOBAR" + mock_credentials.secret_key = "SECRETKEY" + mock_credentials.token = "SESSION_TOKEN" + mock_boto.return_value.get_credentials.return_value = mock_credentials + + os.environ["AWS_ROLE_ARN"] = "arn:aws:iam::123456789:role/mock_role" + os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = "/tokenfile" + + env = assume_aws_role(mock_config, mock_service, False, False, False, "dev") + + os.environ.pop("AWS_ROLE_ARN") + os.environ.pop("AWS_WEB_IDENTITY_TOKEN_FILE") + + assert "AWS_ACCESS_KEY_ID" in env + assert "AWS_SECRET_ACCESS_KEY" in env + assert "AWS_SESSION_TOKEN" in env + assert env["AWS_ACCESS_KEY_ID"] == "AKIAFOOBAR" diff --git a/tests/cli/test_cmds_mark_for_deployment.py b/tests/cli/test_cmds_mark_for_deployment.py index 9d523708e3..0deb476e00 100644 --- a/tests/cli/test_cmds_mark_for_deployment.py +++ b/tests/cli/test_cmds_mark_for_deployment.py @@ -43,6 +43,10 @@ class FakeArgs: auto_abandon_delay = 1.0 auto_rollback_delay = 1.0 authors = None + warn = 17 + polling_interval = None + diagnosis_interval = None + time_before_first_diagnosis = None @fixture @@ -137,31 +141,6 @@ class FakeArgsRollback(FakeArgs): mock_is_docker_image_already_in_registry.return_value = False with raises(ValueError): mark_for_deployment.paasta_mark_for_deployment(FakeArgsRollback) - - -@patch("paasta_tools.cli.cmds.mark_for_deployment.validate_service_name", autospec=True) -@patch( - "paasta_tools.cli.cmds.mark_for_deployment.is_docker_image_already_in_registry", - autospec=True, -) -@patch( - "paasta_tools.cli.cmds.mark_for_deployment.get_currently_deployed_version", - autospec=True, -) -@patch("paasta_tools.cli.cmds.mark_for_deployment.list_deploy_groups", autospec=True) -def test_paasta_mark_for_deployment_when_verify_image_succeeds( - mock_list_deploy_groups, - mock_get_currently_deployed_version, - mock_is_docker_image_already_in_registry, - mock_validate_service_name, -): - class FakeArgsRollback(FakeArgs): - verify_image = True - - mock_list_deploy_groups.return_value = ["test_deploy_groups"] - mock_is_docker_image_already_in_registry.return_value = False - with raises(ValueError): - mark_for_deployment.paasta_mark_for_deployment(FakeArgsRollback) mock_is_docker_image_already_in_registry.assert_called_with( "test_service", "fake_soa_dir", @@ -858,3 +837,65 @@ def test_MarkForDeployProcess_happy_path_skips_complete_if_no_auto_rollback( assert mfdp.run() == 0 assert mfdp.trigger_history == ["start_deploy", "mfd_succeeded", "deploy_finished"] assert mfdp.state_history == ["start_deploy", "deploying", "deployed"] + + +@patch( + "paasta_tools.cli.cmds.mark_for_deployment.get_instance_configs_for_service_in_deploy_group_all_clusters", + autospec=True, +) +@patch( + 
"paasta_tools.cli.cmds.mark_for_deployment.MarkForDeploymentProcess.any_slo_failing", + autospec=True, +) +def test_MarkForDeployProcess_get_available_buttons_failing_slos_show_disable_rollback( + mock_any_slo_failing, + mock_get_instance_configs, +): + mock_any_slo_failing.return_value = True + mfdp = WrappedMarkForDeploymentProcess( + service="service", + deploy_info=MagicMock(), + deploy_group="deploy_group", + commit="commit", + old_git_sha="old_git_sha", + git_url="git_url", + auto_rollback=True, + block=True, + soa_dir="soa_dir", + timeout=3600, + warn_pct=50, + auto_certify_delay=None, + auto_abandon_delay=600, + auto_rollback_delay=30, + authors=None, + ) + + # Test only get_available_buttons + mfdp.run_timeout = 1 + mfdp.state = "deploying" + assert "disable_auto_rollbacks" in mfdp.get_available_buttons() + assert "enable_auto_rollbacks" not in mfdp.get_available_buttons() + + mock_any_slo_failing.return_value = True + mfdp = WrappedMarkForDeploymentProcess( + service="service", + deploy_info=MagicMock(), + deploy_group="deploy_group", + commit="commit", + old_git_sha="old_git_sha", + git_url="git_url", + auto_rollback=False, + block=True, + soa_dir="soa_dir", + timeout=3600, + warn_pct=50, + auto_certify_delay=None, + auto_abandon_delay=600, + auto_rollback_delay=30, + authors=None, + ) + + mfdp.run_timeout = 1 + mfdp.state = "deploying" + assert "disable_auto_rollbacks" not in mfdp.get_available_buttons() + assert "enable_auto_rollbacks" in mfdp.get_available_buttons() diff --git a/tests/cli/test_cmds_mesh_status.py b/tests/cli/test_cmds_mesh_status.py index 17820510e1..1cf905ced8 100644 --- a/tests/cli/test_cmds_mesh_status.py +++ b/tests/cli/test_cmds_mesh_status.py @@ -3,6 +3,8 @@ import paasta_tools.paastaapi.models as paastamodels from paasta_tools.cli.cmds import mesh_status +from paasta_tools.eks_tools import EksDeploymentConfig +from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig from paasta_tools.paastaapi import ApiException @@ -50,22 +52,35 @@ def mock_get_oapi_client(fake_backend_location): @mock.patch( "paasta_tools.cli.cmds.mesh_status.get_smartstack_status_human", autospec=True ) +@pytest.mark.parametrize( + "instance_type_class", + ( + EksDeploymentConfig, + KubernetesDeploymentConfig, + ), +) def test_paasta_mesh_status_on_api_endpoint( mock_smtstk_status_human, mock_envoy_status_human, mock_get_oapi_client, fake_backend_location, system_paasta_config, + instance_type_class, ): envoy_output = mock.Mock() mock_envoy_status_human.return_value = [envoy_output] - - code, output = mesh_status.paasta_mesh_status_on_api_endpoint( - cluster="fake_cluster", - service="fake_service", - instance="fake_instance", - system_paasta_config=system_paasta_config, - ) + with mock.patch( + "paasta_tools.cli.cmds.mesh_status.get_instance_configs_for_service", + return_value=iter([mock.Mock(__class__=instance_type_class)]), + autospec=True, + ): + code, output = mesh_status.paasta_mesh_status_on_api_endpoint( + cluster="fake_cluster", + service="fake_service", + instance="fake_instance", + system_paasta_config=system_paasta_config, + soa_dir="/fake/path", + ) assert code == 0 assert output == [envoy_output] @@ -79,12 +94,20 @@ def test_paasta_mesh_status_on_api_endpoint( @mock.patch( "paasta_tools.cli.cmds.mesh_status.get_smartstack_status_human", autospec=True ) +@pytest.mark.parametrize( + "instance_type_class", + ( + EksDeploymentConfig, + KubernetesDeploymentConfig, + ), +) def test_paasta_mesh_status_on_api_endpoint_error( mock_smtstk_status_human, 
mock_envoy_status_human, mock_get_oapi_client, fake_backend_location, system_paasta_config, + instance_type_class, ): client = mock_get_oapi_client.return_value api_error = ApiException( @@ -100,11 +123,17 @@ def test_paasta_mesh_status_on_api_endpoint_error( for exc, expected_code, expected_msg in test_cases: client.service.mesh_instance.side_effect = [exc] + with mock.patch( + "paasta_tools.cli.cmds.mesh_status.get_instance_configs_for_service", + return_value=iter([mock.Mock(__class__=instance_type_class)]), + autospec=True, + ): code, output = mesh_status.paasta_mesh_status_on_api_endpoint( cluster="fake_cluster", service="fake_service", instance="fake_instance", system_paasta_config=system_paasta_config, + soa_dir="/fake/path", ) assert expected_code == code @@ -112,3 +141,32 @@ def test_paasta_mesh_status_on_api_endpoint_error( assert mock_smtstk_status_human.call_args_list == [] assert mock_envoy_status_human.call_args_list == [] + + +@mock.patch("paasta_tools.cli.cmds.mesh_status.get_envoy_status_human", autospec=True) +@mock.patch( + "paasta_tools.cli.cmds.mesh_status.get_smartstack_status_human", autospec=True +) +def test_paasta_mesh_status_on_api_endpoint_error_no_config( + mock_smtstk_status_human, + mock_envoy_status_human, + mock_get_oapi_client, + fake_backend_location, + system_paasta_config, +): + with mock.patch( + "paasta_tools.cli.cmds.mesh_status.get_instance_configs_for_service", + return_value=iter(()), + autospec=True, + ): + with pytest.raises(SystemExit): + mesh_status.paasta_mesh_status_on_api_endpoint( + cluster="fake_cluster", + service="fake_service", + instance="fake_instance", + system_paasta_config=system_paasta_config, + soa_dir="/fake/path", + ) + + assert mock_smtstk_status_human.call_args_list == [] + assert mock_envoy_status_human.call_args_list == [] diff --git a/tests/cli/test_cmds_performance_check.py b/tests/cli/test_cmds_performance_check.py deleted file mode 100644 index 57d7385d5d..0000000000 --- a/tests/cli/test_cmds_performance_check.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2015-2016 Yelp Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import mock -from pytest import raises - -from paasta_tools.cli.cmds import performance_check - - -@mock.patch( - "paasta_tools.cli.cmds.performance_check.validate_service_name", autospec=True -) -@mock.patch("requests.post", autospec=True) -@mock.patch( - "paasta_tools.cli.cmds.performance_check.load_performance_check_config", - autospec=True, -) -def test_submit_performance_check_job_happy( - mock_load_performance_check_config, mock_requests_post, mock_validate_service_name -): - fake_endpoint = "http://foo:1234/submit" - mock_load_performance_check_config.return_value = { - "endpoint": fake_endpoint, - "fake_param": "fake_value", - } - mock_validate_service_name.return_value = True - performance_check.submit_performance_check_job("fake_service", "fake_soa_dir") - mock_requests_post.assert_called_once_with( - url=fake_endpoint, params={"fake_param": "fake_value"} - ) - - -@mock.patch( - "paasta_tools.cli.cmds.performance_check.validate_service_name", autospec=True -) -@mock.patch( - "paasta_tools.cli.cmds.performance_check.submit_performance_check_job", - autospec=True, -) -def test_main_safely_returns_when_exceptions( - mock_submit_performance_check_job, mock_validate_service_name -): - mock_validate_service_name.return_value = True - fake_args = mock.Mock() - fake_args.service = "services-fake_service" - fake_args.soa_dir = "fake_soa_dir" - mock_submit_performance_check_job.side_effect = raises(Exception) - performance_check.perform_performance_check(fake_args) - mock_submit_performance_check_job.assert_called_once_with( - service="fake_service", soa_dir="fake_soa_dir" - ) diff --git a/tests/cli/test_cmds_secret.py b/tests/cli/test_cmds_secret.py index 0f00a18b37..a32abe1a7d 100644 --- a/tests/cli/test_cmds_secret.py +++ b/tests/cli/test_cmds_secret.py @@ -149,7 +149,7 @@ def test_paasta_secret(): "paasta_tools.cli.cmds.secret.is_secrets_for_teams_enabled", autospec=True ) as mock_is_secrets_for_teams_enabled, mock.patch( "paasta_tools.cli.cmds.secret.get_secret", autospec=True - ) as mock_get_kubernetes_secret, mock.patch( + ) as mock_get_secret, mock.patch( "paasta_tools.cli.cmds.secret.KubeClient", autospec=True ) as mock_kube_client, mock.patch( "paasta_tools.cli.cmds.secret.get_namespaces_for_secret", autospec=True @@ -242,8 +242,8 @@ def test_paasta_secret(): ) kube_client = mock.Mock() mock_is_secrets_for_teams_enabled.return_value = True - mock_get_namespaces_for_secret.return_value = {"paasta"} - mock_select_k8s_secret_namespace.return_value = "paasta" + mock_get_namespaces_for_secret.return_value = {"paastasvc-middleearth"} + mock_select_k8s_secret_namespace.return_value = "paastasvc-middleearth" mock_kube_client.return_value = kube_client secret.paasta_secret(mock_args) @@ -252,20 +252,24 @@ def test_paasta_secret(): mock_kube_client.assert_called_with( config_file=KUBE_CONFIG_USER_PATH, context="mesosstage" ) - mock_get_kubernetes_secret.assert_called_with( + mock_get_secret.assert_called_with( kube_client, - get_paasta_secret_name("paasta", "middleearth", "theonering"), - "paasta", + get_paasta_secret_name( + "paastasvc-middleearth", "middleearth", "theonering" + ), + key_name="theonering", + namespace="paastasvc-middleearth", ) # empty namespace list mock_get_namespaces_for_secret.return_value = set() mock_select_k8s_secret_namespace.return_value = None secret.paasta_secret(mock_args) - mock_get_kubernetes_secret.assert_called_with( + mock_get_secret.assert_called_with( kube_client, get_paasta_secret_name("paasta", "middleearth", "theonering"), - "paasta", + 
key_name="theonering", + namespace="paasta", ) mock_args = mock.Mock( diff --git a/tests/cli/test_cmds_spark_run.py b/tests/cli/test_cmds_spark_run.py index 18ed0f3b4b..6723a4d296 100644 --- a/tests/cli/test_cmds_spark_run.py +++ b/tests/cli/test_cmds_spark_run.py @@ -15,15 +15,15 @@ import mock import pytest -from boto3.exceptions import Boto3Error -from mock import Mock +from service_configuration_lib import spark_config from paasta_tools.cli.cmds import spark_run from paasta_tools.cli.cmds.spark_run import _should_get_resource_requirements +from paasta_tools.cli.cmds.spark_run import build_and_push_docker_image from paasta_tools.cli.cmds.spark_run import CLUSTER_MANAGER_K8S -from paasta_tools.cli.cmds.spark_run import CLUSTER_MANAGER_MESOS from paasta_tools.cli.cmds.spark_run import configure_and_run_docker_container from paasta_tools.cli.cmds.spark_run import decide_final_eks_toggle_state +from paasta_tools.cli.cmds.spark_run import DEFAULT_DOCKER_SHM_SIZE from paasta_tools.cli.cmds.spark_run import DEFAULT_DRIVER_CORES_BY_SPARK from paasta_tools.cli.cmds.spark_run import DEFAULT_DRIVER_MEMORY_BY_SPARK from paasta_tools.cli.cmds.spark_run import get_docker_run_cmd @@ -51,6 +51,7 @@ def test_get_docker_run_cmd(mock_getegid, mock_geteuid): docker_cmd = "pyspark" nvidia = False docker_memory_limit = "2g" + docker_shm_size = "1g" docker_cpu_limit = "2" actual = get_docker_run_cmd( @@ -61,10 +62,10 @@ def test_get_docker_run_cmd(mock_getegid, mock_geteuid): docker_cmd, nvidia, docker_memory_limit, + docker_shm_size, docker_cpu_limit, ) - - assert actual[7:] == [ + assert actual[-12:] == [ "--user=1234:100", "--name=fake_name", "--env", @@ -96,9 +97,7 @@ def test_sanitize_container_name(container_name, expected): @pytest.mark.parametrize( "disable_compact_bin_packing,cluster_manager,dir_access,expected", [ - (False, CLUSTER_MANAGER_MESOS, True, False), (False, CLUSTER_MANAGER_K8S, True, True), - (True, CLUSTER_MANAGER_MESOS, True, False), (True, CLUSTER_MANAGER_K8S, True, False), (True, CLUSTER_MANAGER_K8S, False, False), ], @@ -136,23 +135,6 @@ def mock_run(): yield m -@pytest.fixture -def mock_get_docker_client(): - fake_image_info = { - "RepoDigests": [ - DUMMY_DOCKER_IMAGE_DIGEST, - ], - } - docker_client = Mock(inspect_image=Mock(return_value=fake_image_info)) - - with mock.patch( - "paasta_tools.cli.cmds.spark_run.get_docker_client", - return_value=docker_client, - autospec=True, - ) as m: - yield m - - @pytest.mark.parametrize( "args,expected_output", [ @@ -340,6 +322,7 @@ def test_get_spark_env( ), ("spark.cores.max", False, None), (None, False, {}), + (None, True, {"spark.dynamicAllocation.enabled": "true"}), ], ) def test_parse_user_spark_args(spark_args, enable_spark_dra, expected, capsys): @@ -472,6 +455,7 @@ def test_run_docker_container( dry_run=dry_run, nvidia=nvidia, docker_memory_limit=DEFAULT_DRIVER_MEMORY_BY_SPARK, + docker_shm_size=DEFAULT_DOCKER_SHM_SIZE, docker_cpu_limit=DEFAULT_DRIVER_CORES_BY_SPARK, ) mock_get_docker_run_cmd.assert_called_once_with( @@ -482,6 +466,7 @@ def test_run_docker_container( docker_cmd=docker_cmd, nvidia=nvidia, docker_memory_limit=DEFAULT_DRIVER_MEMORY_BY_SPARK, + docker_shm_size=DEFAULT_DOCKER_SHM_SIZE, docker_cpu_limit=DEFAULT_DRIVER_CORES_BY_SPARK, ) if dry_run: @@ -500,11 +485,6 @@ def test_run_docker_container( @mock.patch("paasta_tools.cli.cmds.spark_run.get_username", autospec=True) @mock.patch("paasta_tools.cli.cmds.spark_run.run_docker_container", autospec=True) -@mock.patch( - 
"paasta_tools.cli.cmds.spark_run.send_and_calculate_resources_cost", - autospec=True, - return_value=(10, {"cpus": 10, "mem": 1024}), -) @mock.patch("paasta_tools.cli.cmds.spark_run.get_webui_url", autospec=True) @mock.patch("paasta_tools.cli.cmds.spark_run.create_spark_config_str", autospec=True) @mock.patch("paasta_tools.cli.cmds.spark_run.get_docker_cmd", autospec=True) @@ -539,13 +519,6 @@ def mock_create_spark_config_str(self): @pytest.mark.parametrize( ["cluster_manager", "spark_args_volumes", "expected_volumes"], [ - ( - spark_run.CLUSTER_MANAGER_MESOS, - { - "spark.mesos.executor.docker.volumes": "/mesos/volume:/mesos/volume:rw" - }, - ["/mesos/volume:/mesos/volume:rw"], - ), ( spark_run.CLUSTER_MANAGER_K8S, { @@ -579,7 +552,6 @@ def test_configure_and_run_docker_container( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, cluster_manager, @@ -606,7 +578,10 @@ def test_configure_and_run_docker_container( args.cluster_manager = cluster_manager args.docker_cpu_limit = False args.docker_memory_limit = False + args.docker_shm_size = False args.use_eks_override = False + args.tronfig = None + args.job_id = None with mock.patch.object( self.instance_config, "get_env_dictionary", return_value={"env1": "val1"} ): @@ -643,19 +618,13 @@ def test_configure_and_run_docker_container( dry_run=True, nvidia=False, docker_memory_limit="2g", + docker_shm_size=DEFAULT_DOCKER_SHM_SIZE, docker_cpu_limit="1", ) @pytest.mark.parametrize( ["cluster_manager", "spark_args_volumes", "expected_volumes"], [ - ( - spark_run.CLUSTER_MANAGER_MESOS, - { - "spark.mesos.executor.docker.volumes": "/mesos/volume:/mesos/volume:rw" - }, - ["/mesos/volume:/mesos/volume:rw"], - ), ( spark_run.CLUSTER_MANAGER_K8S, { @@ -689,7 +658,6 @@ def test_configure_and_run_docker_driver_resource_limits_config( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, cluster_manager, @@ -718,6 +686,7 @@ def test_configure_and_run_docker_driver_resource_limits_config( args.cluster_manager = cluster_manager args.docker_cpu_limit = 3 args.docker_memory_limit = "4g" + args.docker_shm_size = "1g" args.use_eks_override = False with mock.patch.object( self.instance_config, "get_env_dictionary", return_value={"env1": "val1"} @@ -755,19 +724,13 @@ def test_configure_and_run_docker_driver_resource_limits_config( dry_run=True, nvidia=False, docker_memory_limit="4g", + docker_shm_size="1g", docker_cpu_limit=3, ) @pytest.mark.parametrize( ["cluster_manager", "spark_args_volumes", "expected_volumes"], [ - ( - spark_run.CLUSTER_MANAGER_MESOS, - { - "spark.mesos.executor.docker.volumes": "/mesos/volume:/mesos/volume:rw" - }, - ["/mesos/volume:/mesos/volume:rw"], - ), ( spark_run.CLUSTER_MANAGER_K8S, { @@ -801,7 +764,6 @@ def test_configure_and_run_docker_driver_resource_limits( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, cluster_manager, @@ -830,6 +792,7 @@ def test_configure_and_run_docker_driver_resource_limits( args.cluster_manager = cluster_manager args.docker_cpu_limit = False args.docker_memory_limit = False + args.docker_shm_size = False args.use_eks_override = False with mock.patch.object( self.instance_config, "get_env_dictionary", return_value={"env1": "val1"} @@ -867,6 +830,7 @@ def 
test_configure_and_run_docker_driver_resource_limits( dry_run=True, nvidia=False, docker_memory_limit="2g", + docker_shm_size=DEFAULT_DOCKER_SHM_SIZE, docker_cpu_limit="2", ) @@ -877,7 +841,6 @@ def test_configure_and_run_docker_container_nvidia( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, ): @@ -886,9 +849,12 @@ def test_configure_and_run_docker_container_nvidia( ): spark_conf = { "spark.cores.max": "5", + "spark.executor.cores": 1, + "spark.executor.memory": "2g", "spark.master": "mesos://spark.master", "spark.ui.port": "1234", "spark.app.name": "fake app", + "spark.executorEnv.PAASTA_CLUSTER": "test-cluster", } args = mock.MagicMock(cmd="pyspark", nvidia=True) @@ -899,13 +865,12 @@ def test_configure_and_run_docker_container_nvidia( system_paasta_config=self.system_paasta_config, aws_creds=("id", "secret", "token"), spark_conf=spark_conf, - cluster_manager=spark_run.CLUSTER_MANAGER_MESOS, + cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", ) args, kwargs = mock_run_docker_container.call_args assert kwargs["nvidia"] - assert mock_send_and_calculate_resources_cost.called def test_configure_and_run_docker_container_mrjob( self, @@ -914,7 +879,6 @@ def test_configure_and_run_docker_container_mrjob( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, ): @@ -923,9 +887,12 @@ def test_configure_and_run_docker_container_mrjob( ): spark_conf = { "spark.cores.max": 5, + "spark.executor.cores": 1, + "spark.executor.memory": "2g", "spark.master": "mesos://spark.master", "spark.ui.port": "1234", "spark.app.name": "fake_app", + "spark.executorEnv.PAASTA_CLUSTER": "test-cluster", } args = mock.MagicMock(cmd="python mrjob_wrapper.py", mrjob=True) @@ -936,64 +903,13 @@ def test_configure_and_run_docker_container_mrjob( system_paasta_config=self.system_paasta_config, aws_creds=("id", "secret", "token"), spark_conf=spark_conf, - cluster_manager=spark_run.CLUSTER_MANAGER_MESOS, + cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", ) args, kwargs = mock_run_docker_container.call_args assert kwargs["docker_cmd"] == mock_get_docker_cmd.return_value - assert mock_send_and_calculate_resources_cost.called - - def test_suppress_clusterman_metrics_errors( - self, - mock_get_history_url, - mock_et_signalfx_url, - mock_get_docker_cmd, - mock_create_spark_config_str, - mock_get_webui_url, - mock_send_and_calculate_resources_cost, - mock_run_docker_container, - mock_get_username, - ): - with mock.patch( - "paasta_tools.cli.cmds.spark_run.clusterman_metrics", autospec=True - ): - mock_send_and_calculate_resources_cost.side_effect = Boto3Error - mock_create_spark_config_str.return_value = "--conf spark.cores.max=5" - spark_conf = { - "spark.cores.max": 5, - "spark.ui.port": "1234", - "spark.app.name": "fake app", - } - args = mock.MagicMock( - suppress_clusterman_metrics_errors=False, cmd="pyspark" - ) - with pytest.raises(Boto3Error): - configure_and_run_docker_container( - args=args, - docker_img="fake-registry/fake-service", - instance_config=self.instance_config, - system_paasta_config=self.system_paasta_config, - aws_creds=("id", "secret", "token"), - spark_conf=spark_conf, - cluster_manager=spark_run.CLUSTER_MANAGER_MESOS, - pod_template_path="unique-run", - ) - - # make sure we don't blow up when this setting is True - 
args.suppress_clusterman_metrics_errors = True - configure_and_run_docker_container( - args=args, - docker_img="fake-registry/fake-service", - instance_config=self.instance_config, - system_paasta_config=self.system_paasta_config, - aws_creds=("id", "secret", "token"), - spark_conf=spark_conf, - cluster_manager=spark_run.CLUSTER_MANAGER_MESOS, - pod_template_path="unique-run", - ) - def test_dont_emit_metrics_for_inappropriate_commands( self, mock_get_history_url, @@ -1001,7 +917,6 @@ def test_dont_emit_metrics_for_inappropriate_commands( mock_get_docker_cmd, mock_create_spark_config_str, mock_get_webui_url, - mock_send_and_calculate_resources_cost, mock_run_docker_container, mock_get_username, ): @@ -1018,10 +933,9 @@ def test_dont_emit_metrics_for_inappropriate_commands( system_paasta_config=self.system_paasta_config, aws_creds=("id", "secret", "token"), spark_conf={"spark.ui.port": "1234", "spark.app.name": "fake_app"}, - cluster_manager=spark_run.CLUSTER_MANAGER_MESOS, + cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", ) - assert not mock_send_and_calculate_resources_cost.called @pytest.mark.parametrize( @@ -1111,7 +1025,6 @@ def test_paasta_spark_run_bash( mock_load_system_paasta_config, mock_validate_work_dir, mock_generate_pod_template_path, - mock_get_docker_client, ): args = argparse.Namespace( work_dir="/tmp/local", @@ -1138,12 +1051,15 @@ def test_paasta_spark_run_bash( aws_role_duration=3600, use_eks_override=False, k8s_server_address=None, + tronfig=None, + job_id=None, ) mock_load_system_paasta_config.return_value.get_cluster_aliases.return_value = {} mock_load_system_paasta_config.return_value.get_cluster_pools.return_value = { "test-cluster": ["test-pool"] } mock_should_enable_compact_bin_packing.return_value = True + mock_get_docker_image.return_value = DUMMY_DOCKER_IMAGE_DIGEST spark_run.paasta_spark_run(args) mock_validate_work_dir.assert_called_once_with("/tmp/local") assert args.cmd == "/bin/bash" @@ -1195,6 +1111,7 @@ def test_paasta_spark_run_bash( aws_creds=mock_get_aws_credentials.return_value, cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", + extra_driver_envs=dict(), ) mock_generate_pod_template_path.assert_called_once() @@ -1225,7 +1142,6 @@ def test_paasta_spark_run( mock_load_system_paasta_config, mock_validate_work_dir, mock_generate_pod_template_path, - mock_get_docker_client, ): args = argparse.Namespace( work_dir="/tmp/local", @@ -1252,12 +1168,15 @@ def test_paasta_spark_run( aws_role_duration=3600, use_eks_override=False, k8s_server_address=None, + tronfig=None, + job_id=None, ) mock_load_system_paasta_config.return_value.get_cluster_aliases.return_value = {} mock_load_system_paasta_config.return_value.get_cluster_pools.return_value = { "test-cluster": ["test-pool"] } mock_should_enable_compact_bin_packing.return_value = True + mock_get_docker_image.return_value = DUMMY_DOCKER_IMAGE_DIGEST spark_run.paasta_spark_run(args) mock_validate_work_dir.assert_called_once_with("/tmp/local") assert args.cmd == "USER=test timeout 1m spark-submit test.py" @@ -1308,6 +1227,7 @@ def test_paasta_spark_run( aws_creds=mock_get_aws_credentials.return_value, cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", + extra_driver_envs=dict(), ) mock_generate_pod_template_path.assert_called_once() @@ -1338,7 +1258,6 @@ def test_paasta_spark_run_pyspark( mock_load_system_paasta_config, mock_validate_work_dir, mock_generate_pod_template_path, - mock_get_docker_client, ): args = 
argparse.Namespace( work_dir="/tmp/local", @@ -1365,6 +1284,8 @@ def test_paasta_spark_run_pyspark( aws_role_duration=3600, use_eks_override=False, k8s_server_address=None, + tronfig=None, + job_id=None, ) mock_load_system_paasta_config.return_value.get_spark_use_eks_default.return_value = ( False @@ -1374,6 +1295,7 @@ def test_paasta_spark_run_pyspark( "test-cluster": ["test-pool"] } + mock_get_docker_image.return_value = DUMMY_DOCKER_IMAGE_DIGEST spark_run.paasta_spark_run(args) mock_validate_work_dir.assert_called_once_with("/tmp/local") assert args.cmd == "pyspark" @@ -1430,6 +1352,7 @@ def test_paasta_spark_run_pyspark( aws_creds=mock_get_aws_credentials.return_value, cluster_manager=spark_run.CLUSTER_MANAGER_K8S, pod_template_path="unique-run", + extra_driver_envs=dict(), ) mock_generate_pod_template_path.assert_called_once() @@ -1472,3 +1395,99 @@ def test_decide_final_eks_toggle_state(override, default, expected): ) assert decide_final_eks_toggle_state(override) is expected + + +@mock.patch.object(spark_run, "makefile_responds_to", autospec=True) +@mock.patch.object(spark_run, "paasta_cook_image", autospec=True) +@mock.patch.object(spark_run, "get_username", autospec=True) +def test_build_and_push_docker_image_unprivileged_output_format( + mock_get_username, + mock_paasta_cook_image, + mock_makefile_responds_to, + mock_run, +): + args = mock.MagicMock( + docker_registry="MOCK-docker-dev.yelpcorp.com", + autospec=True, + ) + mock_makefile_responds_to.return_value = True + mock_paasta_cook_image.return_value = 0 + mock_run.side_effect = [ + (0, None), + ( + 0, + ( + "Using default tag: latest\n" + "The push refers to repository [MOCK-docker-dev.yelpcorp.com/paasta-spark-run-user:latest]\n" + "latest: digest: sha256:103ce91c65d42498ca61cdfe8d799fab8ab1c37dac58b743b49ced227bc7bc06" + ), + ), + (0, None), + ] + mock_get_username.return_value = "user" + docker_image_digest = build_and_push_docker_image(args) + assert DUMMY_DOCKER_IMAGE_DIGEST == docker_image_digest + + +@mock.patch.object(spark_run, "makefile_responds_to", autospec=True) +@mock.patch.object(spark_run, "paasta_cook_image", autospec=True) +@mock.patch.object(spark_run, "get_username", autospec=True) +def test_build_and_push_docker_image_privileged_output_format( + mock_get_username, + mock_paasta_cook_image, + mock_makefile_responds_to, + mock_run, +): + args = mock.MagicMock( + docker_registry="MOCK-docker-dev.yelpcorp.com", + autospec=True, + ) + mock_makefile_responds_to.return_value = True + mock_paasta_cook_image.return_value = 0 + mock_run.side_effect = [ + (0, None), + ( + 0, + ( + "Using default tag: latest\n" + "The push refers to repository [MOCK-docker-dev.yelpcorp.com/paasta-spark-run-user:latest]\n" + "latest: digest: sha256:103ce91c65d42498ca61cdfe8d799fab8ab1c37dac58b743b49ced227bc7bc06 size: 1337" + ), + ), + (0, None), + ] + mock_get_username.return_value = "user" + docker_image_digest = build_and_push_docker_image(args) + assert DUMMY_DOCKER_IMAGE_DIGEST == docker_image_digest + + +@mock.patch.object(spark_run, "makefile_responds_to", autospec=True) +@mock.patch.object(spark_run, "paasta_cook_image", autospec=True) +@mock.patch.object(spark_run, "get_username", autospec=True) +def test_build_and_push_docker_image_unexpected_output_format( + mock_get_username, + mock_paasta_cook_image, + mock_makefile_responds_to, + mock_run, +): + args = mock.MagicMock( + docker_registry="MOCK-docker-dev.yelpcorp.com", + autospec=True, + ) + mock_makefile_responds_to.return_value = True + 
mock_paasta_cook_image.return_value = 0 + mock_run.side_effect = [ + (0, None), + ( + 0, + ( + "Using default tag: latest\n" + "The push refers to repository [MOCK-docker-dev.yelpcorp.com/paasta-spark-run-user:latest]\n" + "the regex will not match this" + ), + ), + (0, None), + ] + with pytest.raises(ValueError) as e: + build_and_push_docker_image(args) + assert "Could not determine digest from output" in str(e.value) diff --git a/tests/cli/test_cmds_status.py b/tests/cli/test_cmds_status.py index 2f485d1a24..8e91cc09e2 100644 --- a/tests/cli/test_cmds_status.py +++ b/tests/cli/test_cmds_status.py @@ -920,7 +920,7 @@ def test_status_with_registration( @pytest.fixture -def mock_marathon_status(include_envoy=True, include_smartstack=True): +def mock_marathon_status(include_envoy=True): kwargs = dict( desired_state="start", desired_app_id="abc.def", @@ -937,12 +937,6 @@ def mock_marathon_status(include_envoy=True, include_smartstack=True): non_running_tasks=[], ), ) - if include_smartstack: - kwargs["smartstack"] = paastamodels.SmartstackStatus( - registration="fake_service.fake_instance", - expected_backends_per_location=1, - locations=[], - ) if include_envoy: kwargs["envoy"] = paastamodels.EnvoyStatus( registration="fake_service.fake_instance", @@ -2491,12 +2485,15 @@ def test_output( class TestPrintFlinkStatus: @patch("paasta_tools.cli.cmds.status.load_system_paasta_config", autospec=True) + @patch("paasta_tools.api.client.load_system_paasta_config", autospec=True) def test_error_no_flink( self, + mock_load_system_paasta_config_api, mock_load_system_paasta_config, mock_flink_status, system_paasta_config, ): + mock_load_system_paasta_config_api.return_value = system_paasta_config mock_load_system_paasta_config.return_value = system_paasta_config mock_flink_status["status"] = None output = [] diff --git a/tests/cli/test_cmds_validate.py b/tests/cli/test_cmds_validate.py index 854ca07a85..4350b574f9 100644 --- a/tests/cli/test_cmds_validate.py +++ b/tests/cli/test_cmds_validate.py @@ -99,15 +99,16 @@ def test_paasta_validate_calls_everything( assert mock_validate_cpu_burst.called -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) def test_validate_paasta_objects( mock_path_to_soa_dir_service, - mock_list_all_instances_for_service, mock_list_clusters, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, capsys, ): @@ -120,8 +121,9 @@ def test_validate_paasta_objects( mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", fake_service) mock_list_clusters.return_value = [fake_cluster] - mock_list_all_instances_for_service.return_value = [fake_instance] - mock_get_instance_config.return_value = mock_paasta_instance + mock_load_all_instance_configs_for_service.return_value = [ + (fake_instance, mock_paasta_instance) + ] assert validate_paasta_objects("fake-service-path") is False, capsys captured = capsys.readouterr() @@ -159,26 +161,31 @@ def test_validate_unknown_service_service_path(): assert not paasta_validate_soa_configs(service, service_path) -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", 
autospec=True) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) def test_validate_min_max_instances_success( mock_path_to_soa_dir_service, mock_list_clusters, - mock_list_all_instances_for_service, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, capsys, ): mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") mock_list_clusters.return_value = ["fake_cluster"] - mock_list_all_instances_for_service.return_value = {"fake_instance1"} - mock_get_instance_config.return_value = mock.Mock( - get_instance=mock.Mock(return_value="fake_instance1"), - get_instance_type=mock.Mock(return_value="fake_type"), - get_min_instances=mock.Mock(return_value=3), - get_max_instances=mock.Mock(return_value=1), - ) + mock_load_all_instance_configs_for_service.return_value = [ + ( + "fake_instance1", + mock.Mock( + get_instance=mock.Mock(return_value="fake_instance1"), + get_instance_type=mock.Mock(return_value="fake_type"), + get_min_instances=mock.Mock(return_value=3), + get_max_instances=mock.Mock(return_value=1), + ), + ) + ] assert validate_min_max_instances("fake-service-path") is False output, _ = capsys.readouterr() @@ -226,8 +233,8 @@ def is_schema(schema): assert "$schema" in schema -def test_get_schema_marathon_found(): - schema = get_schema("marathon") +def test_get_schema_eks_found(): + schema = get_schema("eks") is_schema(schema) @@ -241,8 +248,34 @@ def test_get_schema_missing(): @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_list_hashes_good(mock_get_file_contents, capsys): - marathon_content = """ +def test_k8s_namespace_schema_good(mock_get_file_contents, capsys): + mock_get_file_contents.return_value = """ +main: + namespace: this-is-good +""" + for schema_type in ["kubernetes", "eks"]: + assert validate_schema("unused_service_path.yaml", schema_type) + + output, _ = capsys.readouterr() + assert SCHEMA_VALID in output + + +@patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) +def test_k8s_namespace_schema_bad(mock_get_file_contents, capsys): + mock_get_file_contents.return_value = """ +main: + namspace: bad_namespace +""" + for schema_type in ["kubernetes", "eks"]: + assert not validate_schema("unused_service_path.yaml", schema_type) + + output, _ = capsys.readouterr() + assert SCHEMA_INVALID in output + + +@patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) +def test_kubernetes_validate_schema_list_hashes_good(mock_get_file_contents, capsys): + kubernetes_content = """ --- main_worker: cpus: 0.1 @@ -259,8 +292,8 @@ def test_marathon_validate_schema_list_hashes_good(mock_get_file_contents, capsy disk: 512 registrations: ['foo.bar', 'bar.baz'] """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output @@ -308,8 +341,8 @@ def test_validate_rollback_bounds(mock_config, expected): @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_understands_underscores(mock_get_file_contents, capsys): - marathon_content = """ +def 
test_kubernetes_validate_understands_underscores(mock_get_file_contents, capsys): + kubernetes_content = """ --- _template: &template foo: bar @@ -320,16 +353,16 @@ def test_marathon_validate_understands_underscores(mock_get_file_contents, capsy env: <<: *template """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_healthcheck_non_cmd(mock_get_file_contents, capsys): - marathon_content = """ +def test_kubernetes_validate_schema_healthcheck_non_cmd(mock_get_file_contents, capsys): + kubernetes_content = """ --- main_worker: cpus: 0.1 @@ -339,12 +372,12 @@ def test_marathon_validate_schema_healthcheck_non_cmd(mock_get_file_contents, ca cmd: virtualenv_run/bin/python adindexer/adindex_worker.py healthcheck_mode: tcp """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output - marathon_content = """ + kubernetes_content = """ --- main_worker: cpus: 0.1 @@ -353,16 +386,16 @@ def test_marathon_validate_schema_healthcheck_non_cmd(mock_get_file_contents, ca disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_id(mock_get_file_contents, capsys): - marathon_content = """ +def test_kubernetes_validate_id(mock_get_file_contents, capsys): + kubernetes_content = """ --- valid: cpus: 0.1 @@ -371,13 +404,13 @@ def test_marathon_validate_id(mock_get_file_contents, capsys): disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output - marathon_content = """ + kubernetes_content = """ --- this_is_okay_too_1: cpus: 0.1 @@ -386,13 +419,13 @@ def test_marathon_validate_id(mock_get_file_contents, capsys): disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_VALID in output - marathon_content = """ + kubernetes_content = """ --- dashes-are-okay-too: cpus: 0.1 @@ -401,14 +434,14 @@ def 
test_marathon_validate_id(mock_get_file_contents, capsys): disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) get_config_file_dict.cache_clear() # HACK: ensure cache is cleared for future calls output, _ = capsys.readouterr() assert SCHEMA_VALID in output - marathon_content = """ + kubernetes_content = """ --- main_worker_CAPITALS_INVALID: cpus: 0.1 @@ -417,14 +450,14 @@ def test_marathon_validate_id(mock_get_file_contents, capsys): disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.yaml", schema_type) get_config_file_dict.cache_clear() # HACK: ensure cache is cleared for future calls output, _ = capsys.readouterr() assert SCHEMA_INVALID in output - marathon_content = """ + kubernetes_content = """ --- $^&*()(&*^%&definitely_not_okay: cpus: 0.1 @@ -433,18 +466,18 @@ def test_marathon_validate_id(mock_get_file_contents, capsys): disk: 512 cmd: virtualenv_run/bin/python adindexer/adindex_worker.py """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() assert SCHEMA_INVALID in output @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_healthcheck_cmd_has_cmd( +def test_kubernetes_validate_schema_healthcheck_cmd_has_cmd( mock_get_file_contents, capsys ): - marathon_content = """ + kubernetes_content = """ --- main_worker: cpus: 0.1 @@ -454,13 +487,13 @@ def test_marathon_validate_schema_healthcheck_cmd_has_cmd( cmd: virtualenv_run/bin/python adindexer/adindex_worker.py healthcheck_mode: cmd """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.yaml", schema_type) get_config_file_dict.cache_clear() # HACK: ensure cache is cleared for future calls output, _ = capsys.readouterr() assert SCHEMA_INVALID in output - marathon_content = """ + kubernetes_content = """ --- main_worker: cpus: 0.1 @@ -471,8 +504,8 @@ def test_marathon_validate_schema_healthcheck_cmd_has_cmd( healthcheck_mode: cmd healthcheck_cmd: '/bin/true' """ - mock_get_file_contents.return_value = marathon_content - for schema_type in ["marathon", "kubernetes"]: + mock_get_file_contents.return_value = kubernetes_content + for schema_type in ["kubernetes", "eks"]: assert validate_schema("unused_service_path.yaml", schema_type) get_config_file_dict.cache_clear() # HACK: ensure cache is cleared for future calls output, _ = capsys.readouterr() @@ -480,7 +513,7 @@ def test_marathon_validate_schema_healthcheck_cmd_has_cmd( @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_keys_outside_instance_blocks_bad( 
+def test_kubernetes_validate_schema_keys_outside_instance_blocks_bad( mock_get_file_contents, capsys ): mock_get_file_contents.return_value = """ @@ -491,7 +524,7 @@ def test_marathon_validate_schema_keys_outside_instance_blocks_bad( "page": false } """ - for schema_type in ["marathon", "kubernetes"]: + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.json", schema_type) get_config_file_dict.cache_clear() # HACK: ensure cache is cleared for future calls @@ -500,28 +533,28 @@ def test_marathon_validate_schema_keys_outside_instance_blocks_bad( @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_security_good(mock_get_file_contents, capsys): +def test_kubernetes_validate_schema_security_good(mock_get_file_contents, capsys): mock_get_file_contents.return_value = """ main: dependencies_reference: main security: outbound_firewall: block """ - assert validate_schema("unused_service_path.yaml", "marathon") + assert validate_schema("unused_service_path.yaml", "kubernetes") output, _ = capsys.readouterr() assert SCHEMA_VALID in output @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_schema_security_bad(mock_get_file_contents, capsys): +def test_kubernetes_validate_schema_security_bad(mock_get_file_contents, capsys): mock_get_file_contents.return_value = """ main: dependencies_reference: main security: outbound_firewall: bblock """ - for schema_type in ["marathon", "kubernetes"]: + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.yaml", schema_type) output, _ = capsys.readouterr() @@ -529,7 +562,7 @@ def test_marathon_validate_schema_security_bad(mock_get_file_contents, capsys): @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -def test_marathon_validate_invalid_key_bad(mock_get_file_contents, capsys): +def test_kubernetes_validate_invalid_key_bad(mock_get_file_contents, capsys): mock_get_file_contents.return_value = """ { "main": { @@ -537,13 +570,46 @@ def test_marathon_validate_invalid_key_bad(mock_get_file_contents, capsys): } } """ - for schema_type in ["marathon", "kubernetes"]: + for schema_type in ["kubernetes", "eks"]: assert not validate_schema("unused_service_path.json", schema_type) output, _ = capsys.readouterr() assert SCHEMA_INVALID in output +@pytest.mark.parametrize( + "iam_role, expected, instance_type", + [ + ("not_an_arn", False, "kubernetes"), + ("not_an_arn", False, "eks"), + ("arn:aws:iam::12345678:role/some_role", True, "kubernetes"), + ("arn:aws:iam::12345678:role/some_role", True, "eks"), + ("arn:aws:iam::12345678:role/Some_Capitalized_Role", True, "kubernetes"), + ("arn:aws:iam::12345678:role/Some_Capitalized_Role", True, "eks"), + ("arn:aws:iam::12345678::role/malformed_role", False, "kubernetes"), + ("arn:aws:iam::12345678::role/malformed_role", False, "eks"), + ], +) +def test_instance_validate_schema_iam_role( + iam_role, + expected, + instance_type, + capsys, +): + instance_content = f""" +test_instance: + iam_role: {iam_role} +""" + with patch( + "paasta_tools.cli.cmds.validate.get_file_contents", autospec=True + ) as mock_get_file_contents: + mock_get_file_contents.return_value = instance_content + assert validate_schema("unused_service_path.yaml", instance_type) == expected + expected_output = SCHEMA_VALID if expected else SCHEMA_INVALID + output, _ = capsys.readouterr() + assert expected_output in output + + 
@patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) def test_tron_validate_schema_understands_underscores(mock_get_file_contents, capsys): tron_content = """ @@ -622,6 +688,35 @@ def test_tron_validate_schema_cleanup_action_extra_properties_bad( assert SCHEMA_INVALID in output +@pytest.mark.parametrize( + "iam_role, expected", + [ + ("not_an_arn", False), + ("arn:aws:iam::12345678:role/some_role", True), + ("arn:aws:iam::12345678:role/Some_Capitalized_Role", True), + ("arn:aws:iam::12345678::role/malformed_role", False), + ], +) +def test_tron_validate_schema_iam_role(iam_role, expected, capsys): + tron_content = f""" +test_job: + node: paasta + schedule: "daily 04:00:00" + actions: + first: + iam_role: {iam_role} + command: echo hello world +""" + with patch( + "paasta_tools.cli.cmds.validate.get_file_contents", autospec=True + ) as mock_get_file_contents: + mock_get_file_contents.return_value = tron_content + assert validate_schema("unused_service_path.yaml", "tron") == expected + output, _ = capsys.readouterr() + expected_output = SCHEMA_VALID if expected else SCHEMA_INVALID + assert expected_output in output + + @pytest.mark.parametrize( "mock_content", ( @@ -834,9 +929,11 @@ def test_validate_unique_service_name_failure( assert "instance_1" in output -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) @patch("paasta_tools.cli.cmds.validate.load_system_paasta_config", autospec=True) @patch("paasta_tools.cli.cmds.validate.check_secrets_for_instance", autospec=True) @@ -844,9 +941,8 @@ def test_validate_secrets( mock_check_secrets_for_instance, mock_load_system_paasta_config, mock_path_to_soa_dir_service, - mock_list_all_instances_for_service, mock_list_clusters, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, capsys, ): mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") @@ -856,17 +952,16 @@ def test_validate_secrets( return_value={"fake_cluster": "fake_vault_env"} ) ) - mock_list_all_instances_for_service.return_value = [ - "fake_instance", - "fake_instance2", - ] mock_paasta_instance = mock.Mock( config_dict={"env": {"SUPER_SECRET1": "SECRET(secret1)"}} ) mock_paasta_instance2 = mock.Mock( config_dict={"env": {"SUPER_SECRET1": "SHARED_SECRET(secret1)"}} ) - mock_get_instance_config.side_effect = [mock_paasta_instance, mock_paasta_instance2] + mock_load_all_instance_configs_for_service.return_value = [ + ("fake_instance", mock_paasta_instance), + ("fake_instance", mock_paasta_instance2), + ] mock_check_secrets_for_instance.return_value = True assert validate_secrets("fake-service-path"), capsys captured = capsys.readouterr() @@ -878,8 +973,9 @@ def test_validate_secrets( @patch("paasta_tools.cli.cmds.validate.os.path.isfile", autospec=True) def test_check_secrets_for_instance(mock_isfile, mock_get_file_contents): instance_config_dict = {"env": {"SUPER_SECRET1": "SECRET(secret1)"}} + overriding_service = "fake-other-service-name" soa_dir = "fake_soa_dir" - service_path = "fake-service-path" + service = "fake-service-name" vault_env = "fake_vault_env" secret_content = """ { @@ -892,18 +988,28 @@ def test_check_secrets_for_instance(mock_isfile, 
mock_get_file_contents): """ mock_get_file_contents.return_value = secret_content mock_isfile.return_value = True - assert check_secrets_for_instance( - instance_config_dict, soa_dir, service_path, vault_env + assert check_secrets_for_instance(instance_config_dict, soa_dir, service, vault_env) + mock_get_file_contents.assert_called_with( + "fake_soa_dir/fake-service-name/secrets/secret1.json" ) - mock_get_file_contents.assert_called_with("fake-service-path/secrets/secret1.json") instance_config_dict = {"env": {"SUPER_SECRET1": "SHARED_SECRET(secret1)"}} - assert check_secrets_for_instance( - instance_config_dict, soa_dir, service_path, vault_env - ) + assert check_secrets_for_instance(instance_config_dict, soa_dir, service, vault_env) mock_get_file_contents.assert_called_with( "fake_soa_dir/_shared/secrets/secret1.json" ) + # validation should also work on instances with service: override. + instance_config_dict = { + "env": {"SUPER_SECRET1": "SECRET(secret1)"}, + "service": overriding_service, + } + mock_get_file_contents.return_value = secret_content + mock_isfile.return_value = True + assert check_secrets_for_instance(instance_config_dict, soa_dir, service, vault_env) + mock_get_file_contents.assert_called_with( + f"{soa_dir}/{overriding_service}/secrets/secret1.json" + ) + @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) @patch("paasta_tools.cli.cmds.validate.os.path.isfile", autospec=True) @@ -912,7 +1018,7 @@ def test_check_secrets_for_instance_missing_secret( ): instance_config_dict = {"env": {"SUPER_SECRET1": "SECRET(secret1)"}} soa_dir = "fake_soa_dir" - service_path = "fake-service-path" + service = "fake-service-name" vault_env = "even_more_fake_vault_env" secret_content = """ { @@ -926,51 +1032,60 @@ def test_check_secrets_for_instance_missing_secret( mock_get_file_contents.return_value = secret_content mock_isfile.return_value = True assert not check_secrets_for_instance( - instance_config_dict, soa_dir, service_path, vault_env + instance_config_dict, soa_dir, service, vault_env ), capsys captured = capsys.readouterr() assert ( - "Secret secret1 not defined for ecosystem even_more_fake_vault_env on secret file fake-service-path/secrets/secret1.json" + "Secret secret1 not defined for ecosystem even_more_fake_vault_env on secret file fake_soa_dir/fake-service-name/secrets/secret1.json" in captured.out ) @pytest.mark.parametrize( - "setpoint,offset,expected", + "setpoint,offset,expected,instance_type", [ - (0.5, 0.5, False), - (0.5, 0.6, False), - (0.8, 0.25, True), + (0.5, 0.5, False, "kubernetes"), + (0.5, 0.6, False, "kubernetes"), + (0.8, 0.25, True, "kubernetes"), + (0.5, 0.5, False, "eks"), + (0.5, 0.6, False, "eks"), + (0.8, 0.25, True, "eks"), ], ) -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", autospec=True) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) def test_validate_autoscaling_configs( mock_path_to_soa_dir_service, mock_list_clusters, - mock_list_all_instances_for_service, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, setpoint, offset, expected, + instance_type, ): mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") mock_list_clusters.return_value = ["fake_cluster"] - 
mock_list_all_instances_for_service.return_value = {"fake_instance1"} - mock_get_instance_config.return_value = mock.Mock( - get_instance=mock.Mock(return_value="fake_instance1"), - get_instance_type=mock.Mock(return_value="kubernetes"), - is_autoscaling_enabled=mock.Mock(return_value=True), - get_autoscaling_params=mock.Mock( - return_value={ - "metrics_provider": "uwsgi", - "setpoint": setpoint, - "offset": offset, - } - ), - ) + mock_load_all_instance_configs_for_service.return_value = [ + ( + "fake_instance1", + mock.Mock( + get_instance=mock.Mock(return_value="fake_instance1"), + get_instance_type=mock.Mock(return_value=instance_type), + is_autoscaling_enabled=mock.Mock(return_value=True), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "uwsgi", + "setpoint": setpoint, + "offset": offset, + } + ), + ), + ) + ] with mock.patch( "paasta_tools.cli.cmds.validate.load_system_paasta_config", @@ -983,30 +1098,40 @@ def test_validate_autoscaling_configs( assert validate_autoscaling_configs("fake-service-path") is expected -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", autospec=True) +@pytest.mark.parametrize( + "instance_type", + [("kubernetes"), ("eks")], +) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) def test_validate_autoscaling_configs_no_offset_specified( mock_path_to_soa_dir_service, mock_list_clusters, - mock_list_all_instances_for_service, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, + instance_type, ): mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") mock_list_clusters.return_value = ["fake_cluster"] - mock_list_all_instances_for_service.return_value = {"fake_instance1"} - mock_get_instance_config.return_value = mock.Mock( - get_instance=mock.Mock(return_value="fake_instance1"), - get_instance_type=mock.Mock(return_value="kubernetes"), - is_autoscaling_enabled=mock.Mock(return_value=True), - get_autoscaling_params=mock.Mock( - return_value={ - "metrics_provider": "uwsgi", - "setpoint": 0.8, - } - ), - ) + mock_load_all_instance_configs_for_service.return_value = [ + ( + "fake_instance1", + mock.Mock( + get_instance=mock.Mock(return_value="fake_instance1"), + get_instance_type=mock.Mock(return_value=instance_type), + is_autoscaling_enabled=mock.Mock(return_value=True), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "uwsgi", + "setpoint": 0.8, + } + ), + ), + ) + ] with mock.patch( "paasta_tools.cli.cmds.validate.load_system_paasta_config", @@ -1019,43 +1144,137 @@ def test_validate_autoscaling_configs_no_offset_specified( assert validate_autoscaling_configs("fake-service-path") is True +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) +@patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) +@patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) @pytest.mark.parametrize( - "filecontents,expected", + "autoscaling_config,registrations,expected", [ - ("# overridexxx-cpu-setting", False), - ("# override-cpu-setting", False), - ("", False), - ("# override-cpu-setting (PAASTA-17522)", True), + ( + { + "metrics_provider": "active-requests", + }, + [], + True, + ), + ( + { + 
"metrics_provider": "active-requests", + "desired_active_requests_per_replica": -5, + }, + [], + False, + ), + ( + { + "metrics_provider": "active-requests", + "desired_active_requests_per_replica": 5, + }, + [], + True, + ), + ( + { + "metrics_provider": "active-requests", + "desired_active_requests_per_replica": 5, + }, + ["fake_service.abc"], + True, + ), + ( + { + "metrics_provider": "active-requests", + "desired_active_requests_per_replica": 5, + }, + ["fake_service.abc", "fake_service.def"], + False, + ), + ], +) +def test_validate_autoscaling_configs_active_requests( + mock_path_to_soa_dir_service, + mock_list_clusters, + mock_load_all_instance_configs_for_service, + autoscaling_config, + registrations, + expected, +): + mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") + mock_list_clusters.return_value = ["fake_cluster"] + mock_load_all_instance_configs_for_service.return_value = [ + ( + "fake_instance1", + mock.Mock( + get_instance=mock.Mock(return_value="fake_instance1"), + get_instance_type=mock.Mock(return_value="kubernetes"), + is_autoscaling_enabled=mock.Mock(return_value=True), + get_autoscaling_params=mock.Mock(return_value=autoscaling_config), + get_registrations=mock.Mock(return_value=registrations), + ), + ) + ] + + with mock.patch( + "paasta_tools.cli.cmds.validate.load_system_paasta_config", + autospec=True, + return_value=SystemPaastaConfig( + config={"skip_cpu_override_validation": ["not-a-real-service"]}, + directory="/some/test/dir", + ), + ): + assert validate_autoscaling_configs("fake-service-path") is expected + + +@pytest.mark.parametrize( + "filecontents,expected, instance_type", + [ + ("# overridexxx-cpu-setting", False, "kubernetes"), + ("# override-cpu-setting", False, "kubernetes"), + ("", False, "kubernetes"), + ("# override-cpu-setting (PAASTA-17522)", True, "kubernetes"), + ("# overridexxx-cpu-setting", False, "eks"), + ("# override-cpu-setting", False, "eks"), + ("", False, "eks"), + ("# override-cpu-setting (PAASTA-17522)", True, "eks"), ], ) @patch("paasta_tools.cli.cmds.validate.get_file_contents", autospec=True) -@patch("paasta_tools.cli.cmds.validate.get_instance_config", autospec=True) -@patch("paasta_tools.cli.cmds.validate.list_all_instances_for_service", autospec=True) +@patch( + "paasta_tools.cli.cmds.validate.load_all_instance_configs_for_service", + autospec=True, +) @patch("paasta_tools.cli.cmds.validate.list_clusters", autospec=True) @patch("paasta_tools.cli.cmds.validate.path_to_soa_dir_service", autospec=True) def test_validate_cpu_autotune_override( mock_path_to_soa_dir_service, mock_list_clusters, - mock_list_all_instances_for_service, - mock_get_instance_config, + mock_load_all_instance_configs_for_service, mock_get_file_contents, filecontents, expected, + instance_type, ): mock_path_to_soa_dir_service.return_value = ("fake_soa_dir", "fake_service") mock_list_clusters.return_value = ["fake_cluster"] - mock_list_all_instances_for_service.return_value = {"fake_instance1"} - mock_get_instance_config.return_value = mock.Mock( - get_instance=mock.Mock(return_value="fake_instance1"), - get_instance_type=mock.Mock(return_value="kubernetes"), - is_autoscaling_enabled=mock.Mock(return_value=True), - get_autoscaling_params=mock.Mock( - return_value={ - "metrics_provider": "cpu", - "setpoint": 0.8, - } - ), - ) + mock_load_all_instance_configs_for_service.return_value = [ + ( + "fake_instance1", + mock.Mock( + get_instance=mock.Mock(return_value="fake_instance1"), + 
get_instance_type=mock.Mock(return_value=instance_type), + is_autoscaling_enabled=mock.Mock(return_value=True), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "cpu", + "setpoint": 0.8, + } + ), + ), + ) + ] mock_get_file_contents.return_value = f""" --- fake_instance1: @@ -1114,22 +1333,25 @@ def test_list_upcoming_runs(schedule, starting_from, num_runs, expected): @pytest.mark.parametrize( - "burst, comment, expected", + "burst, comment, expected, instance_type", [ - (3, "# overridexxx-cpu-burst", False), - (4, "# override-cpu-burst", False), - (5, "", False), - (6, "# override-cpu-burst (MAGIC-42)", True), - (7, "# override-cpu-burst (SECURE-1234#some comment)", True), - (1, "# override-cpu-burst (HWAT-789)", True), - (1, "# override-cpu-burst", True), + (3, "# overridexxx-cpu-burst", False, "kubernetes"), + (4, "# override-cpu-burst", False, "kubernetes"), + (5, "", False, "kubernetes"), + (6, "# override-cpu-burst (MAGIC-42)", True, "kubernetes"), + (7, "# override-cpu-burst (SECURE-1234#some comment)", True, "kubernetes"), + (1, "# override-cpu-burst (HWAT-789)", True, "kubernetes"), + (1, "# override-cpu-burst", True, "kubernetes"), + (3, "# overridexxx-cpu-burst", False, "eks"), + (4, "# override-cpu-burst", False, "eks"), + (5, "", False, "eks"), + (6, "# override-cpu-burst (MAGIC-42)", True, "eks"), + (7, "# override-cpu-burst (SECURE-1234#some comment)", True, "eks"), + (1, "# override-cpu-burst (HWAT-789)", True, "eks"), + (1, "# override-cpu-burst", True, "eks"), ], ) -def test_validate_cpu_burst_override( - burst, - comment, - expected, -): +def test_validate_cpu_burst_override(burst, comment, expected, instance_type): instance_config = f""" --- fake_instance1: @@ -1152,7 +1374,7 @@ def test_validate_cpu_burst_override( autospec=True, return_value=mock.Mock( get_instance=mock.Mock(return_value="fake_instance1"), - get_instance_type=mock.Mock(return_value="kubernetes"), + get_instance_type=mock.Mock(return_value=instance_type), ), ), mock.patch( "paasta_tools.cli.cmds.validate.list_all_instances_for_service", diff --git a/tests/conftest.py b/tests/conftest.py index ae3b3cd939..b62bd7d8e3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,12 @@ import asyncio +import os import sys import time import mock import pytest +from paasta_tools.kubernetes_tools import KubeClient from paasta_tools.utils import SystemPaastaConfig @@ -26,6 +28,7 @@ def system_paasta_config(): { "cluster": "fake_cluster", "api_endpoints": {"fake_cluster": "http://fake_cluster:5054"}, + "api_client_timeout": 120, "docker_registry": "fake_registry", "volumes": [ { @@ -35,11 +38,29 @@ def system_paasta_config(): } ], "service_discovery_providers": {"smartstack": {}, "envoy": {}}, + "kube_clusters": { + "pnw-prod": {"aws_account": "prod"}, + "pnw-devc": {"aws_account": "dev"}, + }, }, "/fake_dir/", ) +@pytest.fixture(scope="function", autouse=True) +def remove_pod_identity_env_vars(): + with mock.patch.dict( + os.environ, + { + k: v + for k, v in os.environ.items() + if k not in ["AWS_ROLE_ARN", "AWS_WEB_IDENTITY_TOKEN_FILE"] + }, + clear=True, + ): + yield + + @pytest.fixture(autouse=True) def mock_read_soa_metadata(): with mock.patch( @@ -60,6 +81,12 @@ def mock_ktools_read_soa_metadata(mock_read_soa_metadata): yield mock_read_soa_metadata +@pytest.fixture(autouse=True) +def cache_clear_KubeClient(): + KubeClient.__new__.cache_clear() + KubeClient.__init__.cache_clear() + + class Struct: """ convert a dictionary to an object diff --git 
a/tests/contrib/test_get_running_task_allocation.py b/tests/contrib/test_get_running_task_allocation.py index 25208adb52..98c0467b0f 100644 --- a/tests/contrib/test_get_running_task_allocation.py +++ b/tests/contrib/test_get_running_task_allocation.py @@ -4,7 +4,7 @@ from paasta_tools.contrib.get_running_task_allocation import ( get_kubernetes_resource_request_limit, ) -from paasta_tools.contrib.get_running_task_allocation import get_matching_namespaces +from paasta_tools.contrib.get_running_task_allocation import get_unexcluded_namespaces def test_get_kubernetes_resource_request_limit(): @@ -26,25 +26,26 @@ def test_get_kubernetes_resource_request_limit(): @pytest.mark.parametrize( - "namespaces, namespace_prefix, additional_namespaces, expected", + "namespaces, namespaces_to_exclude, expected", ( ( ["paasta", "paasta-flink", "paasta-spark", "luisp-was-here", "tron"], - "paasta", ["tron"], - ["paasta", "paasta-flink", "paasta-spark", "tron"], + ["paasta", "paasta-flink", "paasta-spark", "luisp-was-here"], ), ( ["paasta", "paasta-flink", "paasta-spark", "luisp-was-here", "tron"], - "paasta", - [""], - ["paasta", "paasta-flink", "paasta-spark"], + [], + ["paasta", "paasta-flink", "paasta-spark", "luisp-was-here", "tron"], + ), + ( + ["paasta", "paasta-flink", "paasta-spark", "luisp-was-here", "tron"], + ["tron", "paasta"], + ["paasta-flink", "paasta-spark", "luisp-was-here"], ), ), ) -def test_get_matching_namespaces( - namespaces, namespace_prefix, additional_namespaces, expected -): +def test_get_matching_namespaces(namespaces, namespaces_to_exclude, expected): assert sorted( - get_matching_namespaces(namespaces, namespace_prefix, additional_namespaces) + get_unexcluded_namespaces(namespaces, namespaces_to_exclude) ) == sorted(expected) diff --git a/tests/instance/test_kubernetes.py b/tests/instance/test_kubernetes.py index 5c192d55d0..a86284870b 100644 --- a/tests/instance/test_kubernetes.py +++ b/tests/instance/test_kubernetes.py @@ -117,7 +117,6 @@ def instance_status_kwargs(): instance="", instance_type="", verbose=0, - include_smartstack=False, include_envoy=False, settings=mock.Mock(), use_new=False, @@ -189,7 +188,6 @@ def test_kubernetes_status(): service="", instance="", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="flink", settings=mock.Mock(), @@ -306,7 +304,6 @@ def test_replicaset( service="service", instance="instance", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="kubernetes", settings=mock.Mock(), @@ -421,7 +418,6 @@ def test_statefulset( service="service", instance="instance", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="kubernetes", settings=mock.Mock(), @@ -484,7 +480,6 @@ def test_statefulset_with_image_version( service="service", instance="instance", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="kubernetes", settings=mock.Mock(), @@ -546,7 +541,6 @@ def test_event_timeout( service="service", instance="instance", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="kubernetes", settings=mock.Mock(), @@ -600,7 +594,6 @@ def test_pod_timeout( service="service", instance="instance", verbose=0, - include_smartstack=False, include_envoy=False, instance_type="kubernetes", settings=mock.Mock(), @@ -628,52 +621,6 @@ def test_job_status_include_replicaset_non_verbose(mock_get_kubernetes_app_by_na assert len(kstatus["replicasets"]) == 3 -def test_kubernetes_status_include_smartstack(): - with asynctest.patch( - 
"paasta_tools.instance.kubernetes.job_status", - autospec=True, - ), asynctest.patch( - "paasta_tools.kubernetes_tools.load_service_namespace_config", autospec=True - ) as mock_load_service_namespace_config, asynctest.patch( - "paasta_tools.instance.kubernetes.mesh_status", - autospec=True, - ) as mock_mesh_status, asynctest.patch( - "paasta_tools.kubernetes_tools.replicasets_for_service_instance", autospec=True - ) as mock_replicasets_for_service_instance, asynctest.patch( - "paasta_tools.kubernetes_tools.pods_for_service_instance", - autospec=True, - ) as mock_pods_for_service_instance, asynctest.patch( - "paasta_tools.kubernetes_tools.get_kubernetes_app_by_name", - autospec=True, - ), asynctest.patch( - "paasta_tools.instance.kubernetes.LONG_RUNNING_INSTANCE_TYPE_HANDLERS", - autospec=True, - ) as mock_LONG_RUNNING_INSTANCE_TYPE_HANDLERS: - mock_load_service_namespace_config.return_value = {"proxy_port": 1234} - mock_LONG_RUNNING_INSTANCE_TYPE_HANDLERS["flink"] = mock.Mock() - mock_pods_for_service_instance.return_value = [] - mock_replicasets_for_service_instance.return_value = [] - mock_service = mock.Mock() - status = pik.kubernetes_status( - service=mock_service, - instance="", - verbose=0, - include_smartstack=True, - include_envoy=False, - instance_type="flink", - settings=mock.Mock(), - ) - assert ( - mock_load_service_namespace_config.mock_calls[0][2]["service"] - is mock_service - ) - assert mock_mesh_status.mock_calls[0][2]["service"] is mock_service - assert "app_count" in status - assert "evicted_count" in status - assert "bounce_method" in status - assert "desired_state" in status - - def test_cr_status_bad_instance_type(): with pytest.raises(RuntimeError) as excinfo: pik.cr_status( @@ -794,19 +741,7 @@ async def test_get_pod_status_mesh_ready(event_loop): assert not status["mesh_ready"] -@pytest.mark.parametrize( - "include_smartstack,include_envoy,expected", - [ - (True, True, ("smartstack", "envoy")), - (True, False, ("smartstack",)), - (False, True, ("envoy",)), - ], -) -def test_kubernetes_mesh_status( - include_smartstack, - include_envoy, - expected, -): +def test_kubernetes_mesh_status_include_envoy(): with asynctest.patch( "paasta_tools.kubernetes_tools.load_service_namespace_config", autospec=True ) as mock_load_service_namespace_config, asynctest.patch( @@ -830,26 +765,34 @@ def test_kubernetes_mesh_status( instance="fake_instance", instance_type="flink", settings=mock_settings, - include_smartstack=include_smartstack, - include_envoy=include_envoy, + include_envoy=True, ) - assert len(kmesh) == len(expected) - for i in range(len(expected)): - mesh_type = expected[i] - assert kmesh.get(mesh_type) == mock_mesh_status.return_value - assert mock_mesh_status.call_args_list[i] == mock.call( + assert len(kmesh) == 1 + assert kmesh.get("envoy") == mock_mesh_status.return_value + assert mock_mesh_status.call_args_list[0] == mock.call( + service="fake_service", + instance=mock_job_config.get_nerve_namespace.return_value, + job_config=mock_job_config, + service_namespace_config={"proxy_port": 1234}, + pods_task=mock.ANY, + should_return_individual_backends=True, + settings=mock_settings, + service_mesh=getattr(pik.ServiceMesh, "ENVOY"), + ) + _, kwargs = mock_mesh_status.call_args_list[0] + assert kwargs["pods_task"].result() == ["pod_1"] + + # include_envoy = False should error + with pytest.raises(RuntimeError) as excinfo: + kmesh = pik.kubernetes_mesh_status( service="fake_service", - instance=mock_job_config.get_nerve_namespace.return_value, - 
job_config=mock_job_config, - service_namespace_config={"proxy_port": 1234}, - pods_task=mock.ANY, - should_return_individual_backends=True, + instance="fake_instance", + instance_type="flink", settings=mock_settings, - service_mesh=getattr(pik.ServiceMesh, mesh_type.upper()), + include_envoy=False, ) - _, kwargs = mock_mesh_status.call_args_list[i] - assert kwargs["pods_task"].result() == ["pod_1"] + assert "No mesh types specified" in str(excinfo.value) @mock.patch( @@ -891,7 +834,6 @@ def test_kubernetes_mesh_status_error( instance="fake_instance", instance_type=inst_type, settings=mock_settings, - include_smartstack=include_mesh, include_envoy=include_mesh, ) diff --git a/tests/kubernetes/application/test_controller_wrapper.py b/tests/kubernetes/application/test_controller_wrapper.py index 081c3ad09e..d0c0970e8a 100644 --- a/tests/kubernetes/application/test_controller_wrapper.py +++ b/tests/kubernetes/application/test_controller_wrapper.py @@ -1,7 +1,5 @@ -import kubernetes.client import mock import pytest -from kubernetes.client import V1DeleteOptions from kubernetes.client.rest import ApiException from paasta_tools.kubernetes.application.controller_wrappers import Application @@ -27,75 +25,6 @@ def mock_load_system_paasta_config(): yield mock_load_system_paasta_config -def test_brutal_bounce(mock_load_system_paasta_config): - # mock the new client used to brutal bounce in the background using threading. - mock_cloned_client = mock.MagicMock() - - with mock.patch( - "paasta_tools.kubernetes.application.controller_wrappers.KubeClient", - return_value=mock_cloned_client, - autospec=True, - ): - with mock.patch( - "paasta_tools.kubernetes.application.controller_wrappers.threading.Thread", - autospec=True, - ) as mock_deep_delete_and_create: - mock_client = mock.MagicMock() - - app = mock.MagicMock() - app.item.metadata.name = "fake_name" - app.item.metadata.namespace = "faasta" - - # we do NOT call deep_delete_and_create - app = setup_app({}, True) - DeploymentWrapper.update(self=app, kube_client=mock_client) - - assert mock_deep_delete_and_create.call_count == 0 - - # we call deep_delete_and_create: when bounce_method is brutal - config_dict = {"instances": 1, "bounce_method": "brutal"} - - app = setup_app(config_dict, True) - app.update(kube_client=mock_client) - - mock_deep_delete_and_create.assert_called_once_with( - target=app.deep_delete_and_create, args=[mock_cloned_client] - ) - - -def test_deep_delete_and_create(mock_load_system_paasta_config): - with mock.patch( - "paasta_tools.kubernetes.application.controller_wrappers.sleep", autospec=True - ), mock.patch( - "paasta_tools.kubernetes.application.controller_wrappers.list_all_deployments", - autospec=True, - ) as mock_list_deployments, mock.patch( - "paasta_tools.kubernetes.application.controller_wrappers.force_delete_pods", - autospec=True, - ) as mock_force_delete_pods: - mock_kube_client = mock.MagicMock() - mock_kube_client.deployments = mock.Mock(spec=kubernetes.client.AppsV1Api) - config_dict = {"instances": 1, "bounce_method": "brutal"} - app = setup_app(config_dict, True) - # This mocks being unable to delete the deployment - mock_list_deployments.return_value = [app.kube_deployment] - delete_options = V1DeleteOptions(propagation_policy="Background") - - with pytest.raises(Exception): - # test deep_delete_and_create makes kubeclient calls correctly - app.deep_delete_and_create(mock_kube_client) - mock_force_delete_pods.assert_called_with( - app.item.metadata.name, - app.kube_deployment.service, - 
app.kube_deployment.instance, - app.item.metadata.namespace, - mock_kube_client, - ) - mock_kube_client.deployments.delete_namespaced_deployment.assert_called_with( - app.item.metadata.name, app.item.metadata.namespace, body=delete_options - ) - - @pytest.mark.parametrize("bounce_margin_factor_set", [True, False]) def test_ensure_pod_disruption_budget_create( bounce_margin_factor_set, @@ -120,7 +49,9 @@ def test_ensure_pod_disruption_budget_create( app.soa_config.get_bounce_margin_factor.return_value = 0.1 app.kube_deployment.service.return_value = "fake_service" app.kube_deployment.instance.return_value = "fake_instance" - Application.ensure_pod_disruption_budget(self=app, kube_client=mock_client) + Application.ensure_pod_disruption_budget( + self=app, kube_client=mock_client, namespace="paasta" + ) mock_client.policy.create_namespaced_pod_disruption_budget.assert_called_once_with( body=mock_req_pdr, namespace=mock_req_pdr.metadata.namespace ) @@ -145,7 +76,9 @@ def test_ensure_pod_disruption_budget_replaces_outdated( app.soa_config.get_bounce_margin_factor.return_value = 0.1 app.kube_deployment.service.return_value = "fake_service" app.kube_deployment.instance.return_value = "fake_instance" - Application.ensure_pod_disruption_budget(self=app, kube_client=mock_client) + Application.ensure_pod_disruption_budget( + self=app, kube_client=mock_client, namespace="paasta" + ) mock_client.policy.patch_namespaced_pod_disruption_budget.assert_called_once_with( name=mock_req_pdr.metadata.name, @@ -173,7 +106,9 @@ def test_ensure_pod_disruption_budget_noop_when_min_available_is_set( app.soa_config.get_bounce_margin_factor.return_value = 0.1 app.kube_deployment.service.return_value = "fake_service" app.kube_deployment.instance.return_value = "fake_instance" - Application.ensure_pod_disruption_budget(self=app, kube_client=mock_client) + Application.ensure_pod_disruption_budget( + self=app, kube_client=mock_client, namespace="paasta" + ) mock_client.policy.patch_namespaced_pod_disruption_budget.assert_not_called() diff --git a/tests/kubernetes/bin/test_paasta_secrets_sync.py b/tests/kubernetes/bin/test_paasta_secrets_sync.py index ab08becd7c..43809b94c7 100644 --- a/tests/kubernetes/bin/test_paasta_secrets_sync.py +++ b/tests/kubernetes/bin/test_paasta_secrets_sync.py @@ -64,6 +64,9 @@ def test_sync_all_secrets(): ) as mock_sync_secrets, mock.patch( "paasta_tools.kubernetes.bin.paasta_secrets_sync.PaastaServiceConfigLoader", autospec=True, + ), mock.patch( + "paasta_tools.kubernetes.bin.paasta_secrets_sync.ensure_namespace", + autospec=True, ): services_to_k8s_namespaces_to_allowlist = { "foo": {"paastasvc-foo": None}, @@ -261,7 +264,11 @@ def paasta_secrets_patches(): "paasta_tools.kubernetes.bin.paasta_secrets_sync.json.load", autospec=True ), mock.patch( "os.path.isdir", autospec=True, return_value=True + ), mock.patch( + "paasta_tools.kubernetes.bin.paasta_secrets_sync.load_system_paasta_config", + autospec=True, ): + yield ( mock_get_secret_provider, mock_scandir, @@ -620,7 +627,10 @@ def boto_keys_patches(): ) as mock_update_kubernetes_secret_signature, mock.patch( "paasta_tools.kubernetes.bin.paasta_secrets_sync.PaastaServiceConfigLoader", autospec=True, - ) as mock_config_loader: + ) as mock_config_loader, mock.patch( + "paasta_tools.kubernetes.bin.paasta_secrets_sync.load_system_paasta_config", + autospec=True, + ): yield ( mock_open, mock_open.return_value.__enter__.return_value, @@ -665,7 +675,16 @@ def test_sync_boto_secrets_create(boto_keys_patches): "scribereader-cfg": "ZmlsZTQ=", } - 
mock_open_handle.read.side_effect = ["file1", "file2", "file3", "file4"] + mock_open_handle.read.side_effect = [ + "file1", + "file2", + "file3", + "file4", + "eksfile1", + "eksfile2", + "eksfile3", + "eksfile4", + ] mock_get_kubernetes_secret_signature.return_value = None assert sync_boto_secrets( kube_client=mock.Mock(), @@ -711,7 +730,16 @@ def test_sync_boto_secrets_update(boto_keys_patches): ) mock_config_loader_instances.return_value = [deployment] - mock_open_handle.read.side_effect = ["file1", "file2", "file3", "file4"] + mock_open_handle.read.side_effect = [ + "file1", + "file2", + "file3", + "file4", + "eksfile1", + "eksfile2", + "eksfile3", + "eksfile4", + ] mock_get_kubernetes_secret_signature.return_value = "1235abc" assert sync_boto_secrets( kube_client=mock.Mock(), @@ -738,7 +766,16 @@ def test_sync_boto_secrets_noop(boto_keys_patches): mock_config_loader_instances, ) = boto_keys_patches - mock_open_handle.read.side_effect = ["file1", "file2", "file3", "file4"] + mock_open_handle.read.side_effect = [ + "file1", + "file2", + "file3", + "file4", + "eksfile1", + "eksfile2", + "eksfile3", + "eksfile4", + ] mock_get_kubernetes_secret_signature.return_value = ( "4c3da4da5d97294f69527dc92c2b930ce127522c" ) @@ -778,7 +815,16 @@ def test_sync_boto_secrets_exists_but_no_signature(boto_keys_patches): ) mock_config_loader_instances.return_value = [deployment] - mock_open_handle.read.side_effect = ["file1", "file2", "file3", "file4"] + mock_open_handle.read.side_effect = [ + "file1", + "file2", + "file3", + "file4", + "eksfile1", + "eksfile2", + "eksfile3", + "eksfile4", + ] mock_get_kubernetes_secret_signature.return_value = None mock_create_secret.side_effect = ApiException(409) @@ -818,7 +864,10 @@ def crypto_keys_patches(): ) as mock_update_kubernetes_secret_signature, mock.patch( "paasta_tools.kubernetes.bin.paasta_secrets_sync.PaastaServiceConfigLoader", autospec=True, - ) as mock_config_loader: + ) as mock_config_loader, mock.patch( + "paasta_tools.kubernetes.bin.paasta_secrets_sync.load_system_paasta_config", + autospec=True, + ): yield ( provider, mock_get_kubernetes_secret_signature, diff --git a/tests/metrics/test_metastatus_lib.py b/tests/metrics/test_metastatus_lib.py index d380ae6f65..65fa6d72ae 100644 --- a/tests/metrics/test_metastatus_lib.py +++ b/tests/metrics/test_metastatus_lib.py @@ -265,7 +265,7 @@ def test_assert_kube_deployments(): ) as mock_list_all_deployments: client = Mock() mock_list_all_deployments.return_value = ["KubeDeployment:1"] - output, ok = metastatus_lib.assert_kube_deployments(client) + output, ok = metastatus_lib.assert_kube_deployments(client, namespace="paasta") assert re.match("Kubernetes deployments: 1", output) assert ok @@ -283,7 +283,7 @@ def test_assert_kube_pods_running(): V1Pod(status=V1PodStatus(phase="Failed")), V1Pod(status=V1PodStatus(phase="Failed")), ] - output, ok = metastatus_lib.assert_kube_pods_running(client) + output, ok = metastatus_lib.assert_kube_pods_running(client, namespace="paasta") assert re.match("Pods: running: 1 pending: 2 failed: 3", output) assert ok diff --git a/tests/test_cleanup_kubernetes_jobs.py b/tests/test_cleanup_kubernetes_jobs.py index ac66d41504..96a0e82305 100644 --- a/tests/test_cleanup_kubernetes_jobs.py +++ b/tests/test_cleanup_kubernetes_jobs.py @@ -15,6 +15,7 @@ from copy import deepcopy import mock +import pytest from kubernetes.client import V1Deployment from kubernetes.client import V1StatefulSet from pytest import fixture @@ -23,6 +24,7 @@ from paasta_tools.cleanup_kubernetes_jobs import 
cleanup_unused_apps from paasta_tools.cleanup_kubernetes_jobs import DontKillEverythingError from paasta_tools.cleanup_kubernetes_jobs import main +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes.application.controller_wrappers import DeploymentWrapper from paasta_tools.kubernetes.application.controller_wrappers import StatefulSetWrapper from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig @@ -32,7 +34,7 @@ def fake_deployment(): fake_deployment = V1Deployment( metadata=mock.Mock( - namespace="paasta", + namespace="paastasvc-service", labels={ "yelp.com/paasta_service": "service", "yelp.com/paasta_instance": "instance-1", @@ -121,6 +123,39 @@ def fake_instance_config( return fake_instance_config +def fake_eks_instance_config( + cluster, service, instance, soa_dir="soa_dir", load_deployments=False +): + fake_eks_instance_config = EksDeploymentConfig( + service, + instance, + cluster, + { + "port": None, + "monitoring": {}, + "deploy": {"pipeline": [{"step": "default"}]}, + "data": {}, + "smartstack": {}, + "dependencies": {}, + "cpus": 0.1, + "mem": 100, + "min_instances": 1, + "max_instances": 10, + "deploy_group": "prod.main", + "autoscaling": {"setpoint": 0.7}, + }, + { + "docker_image": "services-compute-infra-test-service:paasta-5b861b3bd42ef9674d3ca04a1259c79eddb71694", + "git_sha": "5b861b3bd42ef9674d3ca04a1259c79eddb71694", + "image_version": None, + "desired_state": "start", + "force_bounce": None, + }, + soa_dir, + ) + return fake_eks_instance_config + + def get_fake_instances(self, with_limit: bool = True) -> int: return self.config_dict.get("max_instances", None) @@ -136,7 +171,7 @@ def test_main(fake_deployment, fake_stateful_set, invalid_app): load_config_patch.return_value.get_cluster.return_value = "fake_cluster" main(("--soa-dir", soa_dir, "--cluster", cluster)) cleanup_patch.assert_called_once_with( - soa_dir, cluster, kill_threshold=0.5, force=False + soa_dir, cluster, kill_threshold=0.5, force=False, eks=False ) @@ -166,7 +201,14 @@ def test_list_apps(fake_deployment, fake_stateful_set, invalid_app): ) -def test_cleanup_unused_apps(fake_deployment, fake_stateful_set, invalid_app): +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) +def test_cleanup_unused_apps(eks_flag, fake_deployment, fake_stateful_set, invalid_app): mock_kube_client = mock.MagicMock() with mock.patch( "paasta_tools.cleanup_kubernetes_jobs.KubeClient", @@ -180,6 +222,10 @@ def test_cleanup_unused_apps(fake_deployment, fake_stateful_set, invalid_app): "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.KubernetesDeploymentConfig.get_instances", side_effect=get_fake_instances, @@ -193,12 +239,21 @@ def test_cleanup_unused_apps(fake_deployment, fake_stateful_set, invalid_app): ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake cluster", kill_threshold=1, force=False) + cleanup_unused_apps( + "soa_dir", "fake cluster", kill_threshold=1, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 1 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) 
def test_cleanup_unused_apps_in_multiple_namespaces( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() fake_deployment2 = deepcopy(fake_deployment) @@ -222,6 +277,10 @@ def test_cleanup_unused_apps_in_multiple_namespaces( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.get_services_for_cluster", return_value={("service", "instance-1")}, @@ -235,12 +294,21 @@ def test_cleanup_unused_apps_in_multiple_namespaces( ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake cluster", kill_threshold=2, force=False) + cleanup_unused_apps( + "soa_dir", "fake cluster", kill_threshold=2, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 1 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_does_not_delete( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() with mock.patch( @@ -251,6 +319,10 @@ def test_cleanup_unused_apps_does_not_delete( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.list_all_applications", return_value={("service", "instance-1"): [DeploymentWrapper(fake_deployment)]}, @@ -268,12 +340,21 @@ def test_cleanup_unused_apps_does_not_delete( ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake cluster", kill_threshold=1, force=False) + cleanup_unused_apps( + "soa_dir", "fake cluster", kill_threshold=1, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_does_not_delete_bouncing_apps( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() fake_deployment2 = deepcopy(fake_deployment) @@ -297,6 +378,10 @@ def test_cleanup_unused_apps_does_not_delete_bouncing_apps( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.get_services_for_cluster", return_value={("service", "instance-1")}, @@ -310,12 +395,21 @@ def test_cleanup_unused_apps_does_not_delete_bouncing_apps( ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - 
cleanup_unused_apps("soa_dir", "fake cluster", kill_threshold=2, force=False) + cleanup_unused_apps( + "soa_dir", "fake cluster", kill_threshold=2, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_does_not_delete_recently_created_apps( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() fake_deployment.status.ready_replicas = 10 @@ -336,6 +430,10 @@ def test_cleanup_unused_apps_does_not_delete_recently_created_apps( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.get_services_for_cluster", return_value={("service", "instance-1")}, @@ -349,12 +447,21 @@ def test_cleanup_unused_apps_does_not_delete_recently_created_apps( ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake cluster", kill_threshold=2, force=False) + cleanup_unused_apps( + "soa_dir", "fake cluster", kill_threshold=2, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_dont_kill_everything( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() with mock.patch( @@ -365,6 +472,10 @@ def test_cleanup_unused_apps_dont_kill_everything( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.list_all_applications", return_value={("service", "instance-1"): [DeploymentWrapper(fake_deployment)]}, @@ -384,13 +495,20 @@ def test_cleanup_unused_apps_dont_kill_everything( mock_alert_state_change.__exit__ = mock.Mock(return_value=None) with raises(DontKillEverythingError): cleanup_unused_apps( - "soa_dir", "fake_cluster", kill_threshold=0, force=False + "soa_dir", "fake_cluster", kill_threshold=0, force=False, eks=eks_flag ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_dont_kill_statefulsets( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() with mock.patch( @@ -401,6 +519,10 @@ def test_cleanup_unused_apps_dont_kill_statefulsets( "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.list_all_applications", return_value={ @@ -419,11 +541,22 @@ def 
test_cleanup_unused_apps_dont_kill_statefulsets( ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake_cluster", kill_threshold=0.5, force=False) + cleanup_unused_apps( + "soa_dir", "fake_cluster", kill_threshold=0.5, force=False, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 -def test_cleanup_unused_apps_force(fake_deployment, fake_stateful_set, invalid_app): +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) +def test_cleanup_unused_apps_force( + eks_flag, fake_deployment, fake_stateful_set, invalid_app +): mock_kube_client = mock.MagicMock() with mock.patch( "paasta_tools.cleanup_kubernetes_jobs.KubeClient", @@ -433,6 +566,10 @@ def test_cleanup_unused_apps_force(fake_deployment, fake_stateful_set, invalid_a "paasta_tools.kubernetes_tools.load_kubernetes_service_config_no_cache", autospec=True, side_effect=fake_instance_config, + ), mock.patch( + "paasta_tools.eks_tools.load_eks_service_config_no_cache", + autospec=True, + side_effect=fake_eks_instance_config, ), mock.patch( "paasta_tools.cleanup_kubernetes_jobs.list_all_applications", return_value={("service", "instance-1"): [DeploymentWrapper(fake_deployment)]}, @@ -450,12 +587,21 @@ def test_cleanup_unused_apps_force(fake_deployment, fake_stateful_set, invalid_a ) as mock_alert_state_change: mock_alert_state_change.__enter__ = mock.Mock(return_value=(mock.Mock(), None)) mock_alert_state_change.__exit__ = mock.Mock(return_value=None) - cleanup_unused_apps("soa_dir", "fake_cluster", kill_threshold=0, force=True) + cleanup_unused_apps( + "soa_dir", "fake_cluster", kill_threshold=0, force=True, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 1 +@pytest.mark.parametrize( + "eks_flag", + [ + (False), + (True), + ], +) def test_cleanup_unused_apps_ignore_invalid_apps( - fake_deployment, fake_stateful_set, invalid_app + eks_flag, fake_deployment, fake_stateful_set, invalid_app ): mock_kube_client = mock.MagicMock() with mock.patch( @@ -474,5 +620,7 @@ def test_cleanup_unused_apps_ignore_invalid_apps( mock_kube_client.deployments.list_namespaced_deployment.return_value = ( mock.MagicMock(items=[invalid_app]) ) - cleanup_unused_apps("soa_dir", "fake_cluster", kill_threshold=0, force=True) + cleanup_unused_apps( + "soa_dir", "fake_cluster", kill_threshold=0, force=True, eks=eks_flag + ) assert mock_kube_client.deployments.delete_namespaced_deployment.call_count == 0 diff --git a/tests/test_config_utils.py b/tests/test_config_utils.py index 0efcb79d7b..17a76f3d53 100644 --- a/tests/test_config_utils.py +++ b/tests/test_config_utils.py @@ -36,7 +36,7 @@ def test_write_auto_config_data_service_dne(tmpdir): ) as mock_open: result = config_utils.write_auto_config_data( service="something", - extra_info="marathon-norcal-devc", + extra_info="kubernetes-norcal-devc", data={"a": 1}, soa_dir=tmpdir, ) @@ -46,7 +46,7 @@ def test_write_auto_config_data_service_dne(tmpdir): def test_write_auto_config_data_new_file(tmpdir): service = "foo" - conf_file = "marathon-norcal-devc" + conf_file = "kubernetes-norcal-devc" data = {"a": 1} tmpdir.mkdir(service) @@ -65,7 +65,7 @@ def test_write_auto_config_data_new_file(tmpdir): def test_write_auto_config_data_file_exists(tmpdir): service = "foo" - conf_file = "marathon-norcal-devc" + conf_file = "kubernetes-norcal-devc" tmpdir.mkdir(service) 
config_utils.write_auto_config_data( @@ -91,9 +91,14 @@ def test_write_auto_config_data_file_exists(tmpdir): @mock.patch("paasta_tools.config_utils.validate_schema", autospec=True) def test_validate_auto_config_file_config_types(mock_validate, tmpdir): - for config_type in config_utils.KNOWN_CONFIG_TYPES: + for config_type in ( + "kubernetes", + "deploy", + "smartstack", + "cassandracluster", + ): filepath = f"service/{config_type}-cluster.yaml" - config_utils.validate_auto_config_file(filepath, AUTO_SOACONFIG_SUBDIR) + assert config_utils.validate_auto_config_file(filepath, AUTO_SOACONFIG_SUBDIR) mock_validate.assert_called_with(filepath, f"autotuned_defaults/{config_type}") @@ -120,7 +125,7 @@ def test_validate_auto_config_file_unknown_type(mock_validate, tmpdir): ) def test_validate_auto_config_file_e2e(data, is_valid, tmpdir): service = "foo" - conf_file = "marathon-norcal-devc" + conf_file = "kubernetes-norcal-devc" tmpdir.mkdir(service) filepath = config_utils.write_auto_config_data( @@ -135,12 +140,20 @@ def test_validate_auto_config_file_e2e(data, is_valid, tmpdir): ) -@pytest.mark.parametrize("branch", ["master", "other_test"]) -def test_auto_config_updater_context(branch, tmpdir, mock_subprocess): +@pytest.mark.parametrize( + "branch, remote_branch_exists", + [("master", True), ("other_test", True), ("other_test", False)], +) +def test_auto_config_updater_context( + branch, remote_branch_exists, tmpdir, mock_subprocess +): remote = "git_remote" updater = config_utils.AutoConfigUpdater( "test_source", remote, branch=branch, working_dir=tmpdir ) + updater._remote_branch_exists = mock.MagicMock( + autospec=True, return_value=remote_branch_exists + ) initial_wd = os.getcwd() with updater: @@ -148,9 +161,19 @@ def test_auto_config_updater_context(branch, tmpdir, mock_subprocess): assert os.path.isdir(clone_dir) expected_calls = [mock.call.check_call(["git", "clone", remote, clone_dir])] if branch != "master": - expected_calls.append( - mock.call.check_call(["git", "checkout", "-b", branch]) - ) + if remote_branch_exists: + expected_calls.extend( + [ + mock.call.check_call(["git", "fetch", "origin", branch]), + mock.call.check_call( + ["git", "checkout", "-b", branch, f"origin/{branch}"] + ), + ] + ) + else: + expected_calls.append( + mock.call.check_call(["git", "checkout", "-b", branch]) + ) assert mock_subprocess.mock_calls == expected_calls assert os.getcwd() == clone_dir @@ -159,8 +182,13 @@ def test_auto_config_updater_context(branch, tmpdir, mock_subprocess): assert os.getcwd() == initial_wd -@pytest.mark.parametrize("branch", ["master", "other_test"]) -def test_auto_config_updater_context_no_clone(branch, tmpdir, mock_subprocess): +@pytest.mark.parametrize( + "branch, remote_branch_exists", + [("master", True), ("other_test", True), ("other_test", False)], +) +def test_auto_config_updater_context_no_clone( + branch, remote_branch_exists, tmpdir, mock_subprocess +): remote = "git_remote" working_dir = tmpdir.mkdir("testing") updater = config_utils.AutoConfigUpdater( @@ -170,13 +198,26 @@ def test_auto_config_updater_context_no_clone(branch, tmpdir, mock_subprocess): working_dir=working_dir, do_clone=False, ) + updater._remote_branch_exists = mock.MagicMock( + autospec=True, return_value=remote_branch_exists + ) initial_wd = os.getcwd() with updater: if branch == "master": expected_calls = [] else: - expected_calls = [mock.call.check_call(["git", "checkout", "-b", branch])] + if remote_branch_exists: + expected_calls = [ + mock.call.check_call(["git", "fetch", "origin", branch]), 
+ mock.call.check_call( + ["git", "checkout", "-b", branch, f"origin/{branch}"] + ), + ] + else: + expected_calls = [ + mock.call.check_call(["git", "checkout", "-b", branch]) + ] assert mock_subprocess.mock_calls == expected_calls assert os.getcwd() == working_dir @@ -190,11 +231,10 @@ def test_auto_config_updater_context_no_clone(branch, tmpdir, mock_subprocess): def test_auto_config_updater_validate(mock_validate_file, all_valid, updater): mock_validate_file.side_effect = [True, all_valid, True] - updater.write_configs("foo", "marathon-norcal-devc", {"a": 2}) updater.write_configs("foo", "kubernetes-norcal-devc", {"a": 2}) updater.write_configs("foo", "kubernetes-pnw-devc", {"a": 2}) assert updater.validate() == all_valid - assert mock_validate_file.call_count == 3 + assert mock_validate_file.call_count == 2 def test_auto_config_updater_read_write(updater): @@ -275,3 +315,71 @@ def test_auto_config_updater_commit(mock_push, mock_commit, did_commit, updater) mock_push.assert_called_once_with(updater.branch) else: assert mock_push.call_count == 0 + + +def test_auto_config_updater_merge_recommendations_limits(updater): + service = "foo" + conf_file = "notk8s-euwest-prod" + limited_instance = "some_instance" + noop_instance = "other_instance" + autotune_data = { + limited_instance: {"cpus": 0.1, "mem": 167, "disk": 256, "cpu_burst_add": 0.5} + } + user_data = { + limited_instance: { + "cmd": "ls", + "autotune_limits": { + "cpus": {"min": 1, "max": 2}, + "mem": {"min": 1024, "max": 2048}, + "disk": {"min": 512, "max": 1024}, + }, + }, + noop_instance: {"cmd": "ls"}, + } + + recs = { + (service, conf_file): { + limited_instance: { + "mem": 1, + "disk": 700, + "cpus": 3, + }, + noop_instance: { + "cpus": 100, + "mem": 10000, + "disk": 2048, + }, + } + } + + with mock.patch.object( + updater, + "get_existing_configs", + autospec=True, + side_effect=[ + # first get the autotune data + autotune_data, + # then we get both the eks- and kuberentes- data + user_data, + # there could be data in both of these, but for a + # simpler test, we just assume that we're looking + # at something that's 100% on Yelp-managed k8s + {}, + ], + ): + assert updater.merge_recommendations(recs) == { + (service, conf_file): { + limited_instance: { + "mem": 1024, # use lower bound + "disk": 700, # unchanged + "cpus": 2, # use upper bound + "cpu_burst_add": 0.5, # no updated rec or limit, leave alone + }, + # this instances recommendations should be left untouched + noop_instance: { + "cpus": 100, + "mem": 10000, + "disk": 2048, + }, + } + } diff --git a/tests/test_delete_kubernetes_deployments.py b/tests/test_delete_kubernetes_deployments.py index d3370505ca..73d3034a87 100644 --- a/tests/test_delete_kubernetes_deployments.py +++ b/tests/test_delete_kubernetes_deployments.py @@ -28,6 +28,7 @@ def test_main(): mock_delete_deployment.assert_called_with( kube_client=mock_kube_client.return_value, deployment_name="fake_pcm_deployment", + namespace="paasta", ) # Test main() failed diff --git a/tests/test_docker_wrapper.py b/tests/test_docker_wrapper.py index a5c2fe1e39..cb79eb5ba3 100644 --- a/tests/test_docker_wrapper.py +++ b/tests/test_docker_wrapper.py @@ -324,7 +324,7 @@ def test_env_not_present(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "foobar", ) @@ -364,7 +364,7 @@ def test_numa_string_value(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + 
f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", '--env=PIN_TO_NUMA_NODE="true"', ) @@ -378,7 +378,7 @@ def test_numa_bogus_node(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=True", ) @@ -407,7 +407,7 @@ def test_numa_unsupported(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=1", "--env=MARATHON_APP_RESOURCE_CPUS=1.5", @@ -438,7 +438,7 @@ def test_marathon_bogus_value(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -471,7 +471,7 @@ def test_numa_OK(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -505,7 +505,7 @@ def test_cpuset_already_set(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-cpus=0,2", "--env=PIN_TO_NUMA_NODE=1", @@ -538,7 +538,7 @@ def test_numa_req_bogus_mem_value(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -573,7 +573,7 @@ def test_numa_req_more_mem_than_available(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=1", "--env=MARATHON_APP_RESOURCE_CPUS=2", @@ -606,7 +606,7 @@ def test_numa_req_less_mem_than_available(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -640,7 +640,7 @@ def test_numa_req_exact_amount_of_cores(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -673,7 +673,7 @@ def test_numa_too_many_cores_requested(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=1", "--env=MARATHON_APP_RESOURCE_CPUS=3.0", @@ -699,7 +699,7 @@ def test_numa_enabled_unknown_cpu_requirement_skips_cpusets(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--cpuset-mems=1", "--cpuset-cpus=1,3", @@ -726,7 +726,7 @@ def test_numa_wrong_cpu(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=2", ) @@ -746,7 +746,7 @@ def test_numa_single_cpu_doesnt_bother_with_cpusets(self, mock_execlp): "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--env=PIN_TO_NUMA_NODE=1", ) @@ -801,7 +801,7 @@ def test_mac_address( 
"docker", "run", "--mac-address=00:00:00:00:00:00", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", *mock_firewall_env_args, ) @@ -846,7 +846,7 @@ def test_mac_address_already_set( "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", "--mac-address=12:34:56:78:90:ab", *mock_firewall_env_args, @@ -866,7 +866,7 @@ def test_mac_address_no_lockdir( "docker", "docker", "run", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", *mock_firewall_env_args, ) @@ -901,7 +901,7 @@ def test_prepare_new_container_error( "docker", "run", "--mac-address=00:00:00:00:00:00", - f"--hostname={socket.getfqdn()}", + f"--hostname={socket.gethostname()}", f"-e=PAASTA_HOST={socket.getfqdn()}", *mock_firewall_env_args, ) diff --git a/tests/test_kubernetes_tools.py b/tests/test_kubernetes_tools.py index 100b59efab..7ff63f9009 100644 --- a/tests/test_kubernetes_tools.py +++ b/tests/test_kubernetes_tools.py @@ -2,6 +2,7 @@ from base64 import b64encode from typing import Any from typing import Dict +from typing import List from typing import Sequence import asynctest @@ -41,6 +42,7 @@ from kubernetes.client import V1PodAntiAffinity from kubernetes.client import V1PodSpec from kubernetes.client import V1PodTemplateSpec +from kubernetes.client import V1PreferredSchedulingTerm from kubernetes.client import V1Probe from kubernetes.client import V1ResourceRequirements from kubernetes.client import V1RoleBinding @@ -89,6 +91,7 @@ from paasta_tools.kubernetes_tools import create_stateful_set from paasta_tools.kubernetes_tools import ensure_namespace from paasta_tools.kubernetes_tools import ensure_paasta_api_rolebinding +from paasta_tools.kubernetes_tools import ensure_paasta_namespace_limits from paasta_tools.kubernetes_tools import filter_nodes_by_blacklist from paasta_tools.kubernetes_tools import filter_pods_by_service_instance from paasta_tools.kubernetes_tools import force_delete_pods @@ -149,6 +152,7 @@ from paasta_tools.utils import SecretVolume from paasta_tools.utils import SecretVolumeItem from paasta_tools.utils import SystemPaastaConfig +from paasta_tools.utils import TopologySpreadConstraintDict def test_force_delete_pods(): @@ -353,26 +357,54 @@ def test_get_bounce_method(self): with pytest.raises(Exception): self.deployment.get_bounce_method() - def test_get_deployment_strategy(self): + @pytest.mark.parametrize( + "bounce_method,bounce_margin_factor,expected_strategy,expected_rolling_update_deploy", + [ + ( + "crossover", + 1, + "RollingUpdate", + V1RollingUpdateDeployment(max_surge="100%", max_unavailable="0%"), + ), + ( + "crossover", + 0.3, + "RollingUpdate", + V1RollingUpdateDeployment(max_surge="100%", max_unavailable="70%"), + ), + # b_m_f does not actually contribute to settings for brutal + ( + "brutal", + 0.5, + "RollingUpdate", + V1RollingUpdateDeployment(max_surge="100%", max_unavailable="100%"), + ), + ("downthenup", 1, "Recreate", None), + ], + ) + def test_get_deployment_strategy( + self, + bounce_method, + bounce_margin_factor, + expected_strategy, + expected_rolling_update_deploy, + ): with mock.patch( "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_bounce_method", autospec=True, - return_value="crossover", - ) as mock_get_bounce_method: + return_value=bounce_method, + ), mock.patch( + "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.get_bounce_margin_factor", 
+ autospec=True, + return_value=bounce_margin_factor, + ): assert ( self.deployment.get_deployment_strategy_config() == V1DeploymentStrategy( - type="RollingUpdate", - rolling_update=V1RollingUpdateDeployment( - max_surge="100%", max_unavailable="0%" - ), + type=expected_strategy, + rolling_update=expected_rolling_update_deploy, ) ) - mock_get_bounce_method.return_value = "downthenup" - assert ( - self.deployment.get_deployment_strategy_config() - == V1DeploymentStrategy(type="Recreate") - ) def test_get_sanitised_volume_name(self): assert ( @@ -714,11 +746,6 @@ def test_get_sidecar_resource_requirements_default_requirements(self): "memory": "512Mi", "ephemeral-storage": "256Mi", }, - "uwsgi_exporter": { - "cpu": 0.1, - "memory": "512Mi", - "ephemeral-storage": "256Mi", - }, } ) ) @@ -746,11 +773,6 @@ def test_get_sidecar_resource_requirements_limits_override_default_requirements( "memory": "1024Mi", "ephemeral-storage": "256Mi", }, - "uwsgi_exporter": { - "cpu": 0.1, - "memory": "1024Mi", - "ephemeral-storage": "256Mi", - }, } ) ) @@ -761,72 +783,7 @@ def test_get_sidecar_resource_requirements_limits_override_default_requirements( requests={"cpu": 0.1, "memory": "1024Mi", "ephemeral-storage": "256Mi"}, ) - def test_get_uwsgi_exporter_sidecar_container_should_run(self): - system_paasta_config = mock.Mock( - get_uwsgi_exporter_sidecar_image_url=mock.Mock( - return_value="uwsgi_exporter_image" - ) - ) - with mock.patch.object( - self.deployment, "should_run_uwsgi_exporter_sidecar", return_value=True - ): - ret = self.deployment.get_uwsgi_exporter_sidecar_container( - system_paasta_config - ) - assert ret is not None - assert ret.image == "uwsgi_exporter_image" - assert ret.ports[0].container_port == 9117 - - @pytest.mark.parametrize( - "uwsgi_stats_port,expected_port", - [ - (None, "8889"), - (31337, "31337"), - ], - ) - def test_get_uwsgi_exporter_sidecar_container_stats_port( - self, uwsgi_stats_port, expected_port - ): - system_paasta_config = mock.Mock( - get_uwsgi_exporter_sidecar_image_url=mock.Mock( - return_value="uwsgi_exporter_image" - ) - ) - self.deployment.config_dict.update( - { - "max_instances": 5, - "autoscaling": { - "metrics_provider": "uwsgi", - "use_prometheus": True, - }, - } - ) - if uwsgi_stats_port is not None: - self.deployment.config_dict["autoscaling"][ - "uwsgi_stats_port" - ] = uwsgi_stats_port - - ret = self.deployment.get_uwsgi_exporter_sidecar_container(system_paasta_config) - expected_env_var = V1EnvVar(name="STATS_PORT", value=expected_port) - assert expected_env_var in ret.env - - def test_get_uwsgi_exporter_sidecar_container_shouldnt_run(self): - system_paasta_config = mock.Mock( - get_uwsgi_exporter_sidecar_image_url=mock.Mock( - return_value="uwsgi_exporter_image" - ) - ) - with mock.patch.object( - self.deployment, "should_run_uwsgi_exporter_sidecar", return_value=False - ): - assert ( - self.deployment.get_uwsgi_exporter_sidecar_container( - system_paasta_config - ) - is None - ) - - def test_should_run_uwsgi_exporter_sidecar_explicit(self): + def test_should_use_uwsgi_exporter_explicit(self): self.deployment.config_dict.update( { "max_instances": 5, @@ -839,71 +796,10 @@ def test_should_run_uwsgi_exporter_sidecar_explicit(self): system_paasta_config = mock.Mock() - assert ( - self.deployment.should_run_uwsgi_exporter_sidecar(system_paasta_config) - is True - ) + assert self.deployment.should_use_uwsgi_exporter(system_paasta_config) is True - self.deployment.config_dict["autoscaling"]["use_prometheus"] = False - assert ( - 
self.deployment.should_run_uwsgi_exporter_sidecar(system_paasta_config) - is False - ) - - def test_should_run_uwsgi_exporter_sidecar_defaults(self): - self.deployment.config_dict.update( - { - "max_instances": 5, - "autoscaling": { - "metrics_provider": "uwsgi", - }, - } - ) - - system_paasta_config_enabled = mock.Mock( - default_should_run_uwsgi_exporter_sidecar=mock.Mock(return_value=True) - ) - system_paasta_config_disabled = mock.Mock( - default_should_run_uwsgi_exporter_sidecar=mock.Mock(return_value=False) - ) - - with mock.patch( - "paasta_tools.kubernetes_tools.DEFAULT_USE_PROMETHEUS_UWSGI", - autospec=False, - new=False, - ): - assert ( - self.deployment.should_run_uwsgi_exporter_sidecar( - system_paasta_config_enabled - ) - is True - ) - assert ( - self.deployment.should_run_uwsgi_exporter_sidecar( - system_paasta_config_disabled - ) - is False - ) - - # If the default for use_prometheus is True and config_dict doesn't specify use_prometheus, we should run - # uwsgi_exporter regardless of default_should_run_uwsgi_exporter_sidecar. - with mock.patch( - "paasta_tools.kubernetes_tools.DEFAULT_USE_PROMETHEUS_UWSGI", - autospec=False, - new=True, - ): - assert ( - self.deployment.should_run_uwsgi_exporter_sidecar( - system_paasta_config_enabled - ) - is True - ) - assert ( - self.deployment.should_run_uwsgi_exporter_sidecar( - system_paasta_config_disabled - ) - is True - ) + self.deployment.config_dict["autoscaling"]["metrics_provider"] = "cpu" + assert self.deployment.should_use_uwsgi_exporter(system_paasta_config) is False def test_get_gunicorn_exporter_sidecar_container_should_run(self): system_paasta_config = mock.Mock( @@ -1299,22 +1195,27 @@ def test_get_pod_volumes(self): ), V1Volume( name="secret--waldo", - secret=V1SecretVolumeSource(secret_name="paasta-secret-kurupt-waldo"), + secret=V1SecretVolumeSource( + secret_name="paastasvc-kurupt-secret-kurupt-waldo", optional=False + ), ), V1Volume( name="secret--waldo", secret=V1SecretVolumeSource( - secret_name="paasta-secret-kurupt-waldo", default_mode=0o765 + secret_name="paastasvc-kurupt-secret-kurupt-waldo", + default_mode=0o765, + optional=False, ), ), V1Volume( name="secret--waldo", secret=V1SecretVolumeSource( - secret_name="paasta-secret-kurupt-waldo", + secret_name="paastasvc-kurupt-secret-kurupt-waldo", items=[ V1KeyToPath(key="aaa", mode=0o567, path="bbb"), V1KeyToPath(key="ccc", path="ddd"), ], + optional=False, ), ), ] @@ -1389,14 +1290,14 @@ def test_get_volume_mounts(self): "zuora_integration", "sync_ads_settings_post_budget_edit_batch_daemon", "paasta-boto-key-zuora--integration-sync--ads--settings--po-4xbg", - "paasta-secret-zuora--integration-paasta-boto-key-zuora--integration-sync--ads--settings--po-4xbg-signature", + "paastasvc-zuora--integration-secret-zuora--integration-paasta-boto-key-zuora--integration-sync--ads--settings--po-4xbg-signature", ), ( {"boto_keys": ["few"]}, "zuora_integration", "reprocess_zuora_amend_callouts_batch_daemon", "paasta-boto-key-zuora--integration-reprocess--zuora--amend-jztw", - "paasta-secret-zuora--integration-paasta-boto-key-zuora--integration-reprocess--zuora--amend-jztw-signature", + "paastasvc-zuora--integration-secret-zuora--integration-paasta-boto-key-zuora--integration-reprocess--zuora--amend-jztw-signature", ), ( { @@ -1405,14 +1306,14 @@ def test_get_volume_mounts(self): "kafka_discovery", "main", "paasta-boto-key-kafka--discovery-main", - "paasta-secret-kafka--discovery-paasta-boto-key-kafka--discovery-main-signature", + 
"paastasvc-kafka--discovery-secret-kafka--discovery-paasta-boto-key-kafka--discovery-main-signature", ), ( {"boto_keys": ["pew"]}, "yelp-main", "lives_data_action_content_ingester_worker", "paasta-boto-key-yelp-main-lives--data--action--content--in-4pxl", - "paasta-secret-yelp-main-paasta-boto-key-yelp-main-lives--data--action--content--in-4pxl-signature", + "paastasvc-yelp-main-secret-yelp-main-paasta-boto-key-yelp-main-lives--data--action--content--in-4pxl-signature", ), ( { @@ -1906,9 +1807,10 @@ def test_get_pod_template_spec( if autoscaling_metric_provider: expected_labels["paasta.yelp.com/deploy_group"] = "fake_group" - expected_labels[ - f"paasta.yelp.com/scrape_{autoscaling_metric_provider}_prometheus" - ] = "true" + if autoscaling_metric_provider != "uwsgi": + expected_labels[ + f"paasta.yelp.com/scrape_{autoscaling_metric_provider}_prometheus" + ] = "true" if autoscaling_metric_provider in ("uwsgi", "gunicorn"): routable_ip = "true" @@ -1935,7 +1837,7 @@ def test_get_pod_template_spec( autospec=True, ) @mock.patch( - "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.should_run_uwsgi_exporter_sidecar", + "paasta_tools.kubernetes_tools.KubernetesDeploymentConfig.should_use_uwsgi_exporter", autospec=True, ) @mock.patch( @@ -1943,7 +1845,7 @@ def test_get_pod_template_spec( autospec=True, ) @pytest.mark.parametrize( - "ip_configured,in_smtstk,prometheus_port,should_run_uwsgi_exporter_sidecar_retval,should_run_gunicorn_exporter_sidecar_retval,expected", + "ip_configured,in_smtstk,prometheus_port,should_use_uwsgi_exporter_retval,should_run_gunicorn_exporter_sidecar_retval,expected", [ (False, True, 8888, False, False, "true"), (False, False, 8888, False, False, "true"), @@ -1956,20 +1858,18 @@ def test_get_pod_template_spec( ) def test_routable_ip( self, - mock_should_run_uwsgi_exporter_sidecar, + mock_should_use_uwsgi_exporter, mock_should_run_gunicorn_exporter_sidecar, mock_get_prometheus_port, ip_configured, in_smtstk, prometheus_port, - should_run_uwsgi_exporter_sidecar_retval, + should_use_uwsgi_exporter_retval, should_run_gunicorn_exporter_sidecar_retval, expected, ): mock_get_prometheus_port.return_value = prometheus_port - mock_should_run_uwsgi_exporter_sidecar.return_value = ( - should_run_uwsgi_exporter_sidecar_retval - ) + mock_should_use_uwsgi_exporter.return_value = should_use_uwsgi_exporter_retval mock_should_run_gunicorn_exporter_sidecar.return_value = ( should_run_gunicorn_exporter_sidecar_retval ) @@ -1985,7 +1885,7 @@ def test_routable_ip( assert ret == expected def test_create_pod_topology_spread_constraints(self): - configured_constraints = [ + configured_constraints: List[TopologySpreadConstraintDict] = [ { "topology_key": "kubernetes.io/hostname", "max_skew": 1, @@ -2049,47 +1949,102 @@ def test_get_node_selectors(self, raw_selectors, expected): assert self.deployment.get_node_selector() == expected def test_get_node_affinity_with_reqs(self): - with mock.patch( - "paasta_tools.kubernetes_tools.allowlist_denylist_to_requirements", - return_value=[("habitat", "In", ["habitat_a"])], - autospec=True, - ), mock.patch( - "paasta_tools.kubernetes_tools.raw_selectors_to_requirements", - return_value=[("instance_type", "In", ["a1.1xlarge"])], - autospec=True, - ): - assert self.deployment.get_node_affinity() == V1NodeAffinity( - required_during_scheduling_ignored_during_execution=V1NodeSelector( - node_selector_terms=[ - V1NodeSelectorTerm( - match_expressions=[ - V1NodeSelectorRequirement( - key="habitat", - operator="In", - values=["habitat_a"], - ), - 
V1NodeSelectorRequirement( - key="instance_type", - operator="In", - values=["a1.1xlarge"], - ), - ] - ) - ], - ), - ) + deployment = KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="brentford", + config_dict={ + "deploy_whitelist": ["habitat", ["habitat_a"]], + "node_selectors": { + "instance_type": ["a1.1xlarge"], + }, + }, + branch_dict=None, + soa_dir="/nail/blah", + ) + + assert deployment.get_node_affinity() == V1NodeAffinity( + required_during_scheduling_ignored_during_execution=V1NodeSelector( + node_selector_terms=[ + V1NodeSelectorTerm( + match_expressions=[ + V1NodeSelectorRequirement( + key="yelp.com/habitat", + operator="In", + values=["habitat_a"], + ), + V1NodeSelectorRequirement( + key="node.kubernetes.io/instance-type", + operator="In", + values=["a1.1xlarge"], + ), + ] + ) + ], + ), + ) def test_get_node_affinity_no_reqs(self): - with mock.patch( - "paasta_tools.kubernetes_tools.allowlist_denylist_to_requirements", - return_value=[], - autospec=True, - ), mock.patch( - "paasta_tools.kubernetes_tools.raw_selectors_to_requirements", - return_value=[], - autospec=True, - ): - assert self.deployment.get_node_affinity() is None + deployment = KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="brentford", + config_dict={}, + branch_dict=None, + soa_dir="/nail/blah", + ) + + assert deployment.get_node_affinity() is None + + def test_get_node_affinity_with_preferences(self): + deployment = KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="brentford", + config_dict={ + "deploy_whitelist": ["habitat", ["habitat_a"]], + "node_selectors_preferred": [ + { + "weight": 1, + "preferences": { + "instance_type": ["a1.1xlarge"], + }, + } + ], + }, + branch_dict=None, + soa_dir="/nail/blah", + ) + + assert deployment.get_node_affinity() == V1NodeAffinity( + required_during_scheduling_ignored_during_execution=V1NodeSelector( + node_selector_terms=[ + V1NodeSelectorTerm( + match_expressions=[ + V1NodeSelectorRequirement( + key="yelp.com/habitat", + operator="In", + values=["habitat_a"], + ), + ] + ) + ], + ), + preferred_during_scheduling_ignored_during_execution=[ + V1PreferredSchedulingTerm( + weight=1, + preference=V1NodeSelectorTerm( + match_expressions=[ + V1NodeSelectorRequirement( + key="node.kubernetes.io/instance-type", + operator="In", + values=["a1.1xlarge"], + ), + ] + ), + ) + ], + ) @pytest.mark.parametrize( "anti_affinity,expected", @@ -2221,7 +2176,7 @@ def test_raw_selectors_to_requirements_error(self): "error_key": [{"operator": "BadOperator"}], # type: ignore } with pytest.raises(ValueError): - raw_selectors_to_requirements(node_selectors) + raw_selectors_to_requirements(node_selectors) # type: ignore @pytest.mark.parametrize( "is_autoscaled, autoscaled_label", @@ -2261,7 +2216,7 @@ def test_get_kubernetes_metadata(self, is_autoscaled, autoscaled_label): "paasta.yelp.com/managed": "true", }, name="kurupt-fm", - namespace="paasta", + namespace="paastasvc-kurupt", ) @pytest.mark.parametrize( @@ -2289,6 +2244,7 @@ def test_get_autoscaling_metric_spec_cpu(self, metrics_provider): "fake_name", "cluster", KubeClient(), + "paasta", ) annotations: Dict[Any, Any] = {} expected_res = V2beta2HorizontalPodAutoscaler( @@ -2352,6 +2308,7 @@ def test_get_autoscaling_metric_spec_cpu_prometheus(self, metrics_provider): "fake_name", "cluster", KubeClient(), + "paasta", ) annotations: Dict[Any, Any] = {} expected_res = V2beta2HorizontalPodAutoscaler( @@ -2438,6 +2395,7 @@ def 
test_get_autoscaling_metric_spec_uwsgi_prometheus( "fake_name", "cluster", KubeClient(), + "paasta", ) expected_res = V2beta2HorizontalPodAutoscaler( kind="HorizontalPodAutoscaler", @@ -2515,6 +2473,7 @@ def test_get_autoscaling_metric_spec_gunicorn_prometheus( "fake_name", "cluster", KubeClient(), + "paasta", ) expected_res = V2beta2HorizontalPodAutoscaler( kind="HorizontalPodAutoscaler", @@ -2585,6 +2544,7 @@ def test_override_scaledown_policies(self): "fake_name", "cluster", KubeClient(), + "paasta", ) assert hpa_dict["spec"]["behavior"]["scaleDown"] == { "stabilizationWindowSeconds": 123, @@ -2612,6 +2572,7 @@ def test_get_autoscaling_metric_spec_bespoke(self): "fake_name", "cluster", KubeClient(), + "paasta", ) expected_res = None assert expected_res == return_value @@ -2684,7 +2645,9 @@ def test_get_kubernetes_secret_env_vars(self): name="SOME", value_from=V1EnvVarSource( secret_key_ref=V1SecretKeySelector( - name="paasta-secret-kurupt-a--ref", key="a_ref", optional=False + name="paastasvc-kurupt-secret-kurupt-a--ref", + key="a_ref", + optional=False, ) ), ), @@ -2692,7 +2655,7 @@ def test_get_kubernetes_secret_env_vars(self): name="A", value_from=V1EnvVarSource( secret_key_ref=V1SecretKeySelector( - name="paasta-secret-underscore-shared-underscore-ref1", + name="paastasvc-kurupt-secret-underscore-shared-underscore-ref1", key="_ref1", optional=False, ) @@ -3009,6 +2972,8 @@ def test_KubeClient(): def test_ensure_namespace_doesnt_create_if_namespace_exists(): with mock.patch( "paasta_tools.kubernetes_tools.ensure_paasta_api_rolebinding", autospec=True + ), mock.patch( + "paasta_tools.kubernetes_tools.ensure_paasta_namespace_limits", autospec=True ): mock_metadata = mock.Mock() type(mock_metadata).name = "paasta" @@ -3023,6 +2988,8 @@ def test_ensure_namespace_doesnt_create_if_namespace_exists(): def test_ensure_namespace_kube_system(): with mock.patch( "paasta_tools.kubernetes_tools.ensure_paasta_api_rolebinding", autospec=True + ), mock.patch( + "paasta_tools.kubernetes_tools.ensure_paasta_namespace_limits", autospec=True ): mock_metadata = mock.Mock() type(mock_metadata).name = "kube-system" @@ -3037,6 +3004,8 @@ def test_ensure_namespace_kube_system(): def test_ensure_namespace_creates_namespace_if_doesnt_exist(): with mock.patch( "paasta_tools.kubernetes_tools.ensure_paasta_api_rolebinding", autospec=True + ), mock.patch( + "paasta_tools.kubernetes_tools.ensure_paasta_namespace_limits", autospec=True ): mock_namespaces = mock.Mock(items=[]) mock_client = mock.Mock( @@ -3072,6 +3041,28 @@ def test_ensure_paasta_api_rolebinding_doesnt_create_if_exists(): assert not mock_client.rbac.create_namespaced_role_binding.called +def test_ensure_paasta_namespace_limits_creates_if_not_exist(): + mock_limits = mock.Mock(items=[]) + mock_client = mock.Mock( + core=mock.Mock(list_namespaced_limit_range=mock.Mock(return_value=mock_limits)), + ) + + ensure_paasta_namespace_limits(mock_client, namespace="paastasvc-cool-service-name") + assert mock_client.core.create_namespaced_limit_range.called + + +def test_ensure_paasta_namespace_limits_doesnt_create_if_exists(): + mock_metadata = mock.Mock() + type(mock_metadata).name = "limit-mem-cpu-disk-per-container" + mock_limits = mock.Mock(items=[mock.Mock(metadata=mock_metadata)]) + mock_client = mock.Mock( + core=mock.Mock(list_namespaced_limit_range=mock.Mock(return_value=mock_limits)), + ) + + ensure_paasta_namespace_limits(mock_client, namespace="paastasvc-cool-service-name") + assert not mock_client.core.create_namespaced_role_binding.called + + 
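# A minimal sketch of the behaviour the two ensure_paasta_namespace_limits tests
# above pin down, assuming only what the mocks show: the helper lists LimitRanges
# in the target namespace via kube_client.core and creates one named
# "limit-mem-cpu-disk-per-container" when it is missing. The function body and
# the LimitRange contents below are illustrative assumptions, not the real
# paasta_tools.kubernetes_tools implementation.
from kubernetes.client import (
    V1LimitRange,
    V1LimitRangeItem,
    V1LimitRangeSpec,
    V1ObjectMeta,
)


def ensure_paasta_namespace_limits_sketch(kube_client, namespace: str) -> None:
    # Skip creation if the per-container limit already exists in this namespace.
    existing = kube_client.core.list_namespaced_limit_range(namespace=namespace)
    if any(
        lr.metadata.name == "limit-mem-cpu-disk-per-container"
        for lr in existing.items
    ):
        return

    # Otherwise create it; the concrete limit values are placeholders here.
    kube_client.core.create_namespaced_limit_range(
        namespace=namespace,
        body=V1LimitRange(
            metadata=V1ObjectMeta(
                name="limit-mem-cpu-disk-per-container", namespace=namespace
            ),
            spec=V1LimitRangeSpec(limits=[V1LimitRangeItem(type="Container")]),
        ),
    )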
@pytest.mark.parametrize( "addl_labels,replicas", ( @@ -3181,7 +3172,7 @@ def test_list_all_deployments(addl_labels, replicas): list_namespaced_stateful_set=mock.Mock(return_value=mock_stateful_sets), ) ) - assert list_all_deployments(kube_client=mock_client) == [] + assert list_all_deployments(kube_client=mock_client, namespace="paasta") == [] mock_items = [ mock.Mock( @@ -3230,7 +3221,7 @@ def test_list_all_deployments(addl_labels, replicas): list_namespaced_stateful_set=mock.Mock(return_value=mock_stateful_sets), ) ) - assert list_all_deployments(mock_client) == [ + assert list_all_deployments(mock_client, namespace="paasta") == [ KubeDeployment( service="kurupt", instance="fm", @@ -3702,7 +3693,10 @@ def test_get_kubernetes_app_by_name(): mock_client.deployments.read_namespaced_deployment_status.return_value = ( mock_deployment ) - assert get_kubernetes_app_by_name("someservice", mock_client) == mock_deployment + assert ( + get_kubernetes_app_by_name("someservice", mock_client, namespace="paasta") + == mock_deployment + ) assert mock_client.deployments.read_namespaced_deployment_status.called assert not mock_client.deployments.read_namespaced_stateful_set_status.called @@ -3714,7 +3708,10 @@ def test_get_kubernetes_app_by_name(): mock_client.deployments.read_namespaced_stateful_set_status.return_value = ( mock_stateful_set ) - assert get_kubernetes_app_by_name("someservice", mock_client) == mock_stateful_set + assert ( + get_kubernetes_app_by_name("someservice", mock_client, namespace="paasta") + == mock_stateful_set + ) assert mock_client.deployments.read_namespaced_deployment_status.called assert mock_client.deployments.read_namespaced_stateful_set_status.called @@ -3723,7 +3720,7 @@ def test_get_kubernetes_app_by_name(): async def test_pods_for_service_instance(): mock_client = mock.Mock() assert ( - await pods_for_service_instance("kurupt", "fm", mock_client) + await pods_for_service_instance("kurupt", "fm", mock_client, namespace="paasta") == mock_client.core.list_namespaced_pod.return_value.items ) @@ -3782,7 +3779,7 @@ def test_get_active_versions_for_service(): def test_get_all_pods(): mock_client = mock.Mock() assert ( - get_all_pods(mock_client) + get_all_pods(mock_client, namespace="paasta") == mock_client.core.list_namespaced_pod.return_value.items ) @@ -4622,6 +4619,100 @@ def test_create_or_find_service_account_name_existing_create_rb_only(): assert mock_client.rbac.create_namespaced_role_binding.called is True +def test_create_or_find_service_account_name_caps(): + iam_role = "arn:aws:iam::000000000000:role/Some_Role" + namespace = "test_namespace" + expected_sa_name = "paasta--arn-aws-iam-000000000000-role-some-role" + with mock.patch( + "paasta_tools.kubernetes_tools.kube_config.load_kube_config", autospec=True + ), mock.patch( + "paasta_tools.kubernetes_tools.KubeClient", + autospec=False, + ) as mock_kube_client: + mock_client = mock.Mock() + mock_client.core = mock.Mock(spec=kube_client.CoreV1Api) + mock_client.core.list_namespaced_service_account.return_value = mock.Mock( + spec=V1ServiceAccountList + ) + mock_client.core.list_namespaced_service_account.return_value.items = [ + V1ServiceAccount( + kind="ServiceAccount", + metadata=V1ObjectMeta( + name=expected_sa_name, + namespace=namespace, + annotations={"eks.amazonaws.com/role-arn": iam_role}, + ), + ) + ] + mock_kube_client.return_value = mock_client + + assert expected_sa_name == create_or_find_service_account_name( + iam_role, + namespace=namespace, + ) + 
mock_client.core.create_namespaced_service_account.assert_not_called() + + +def test_create_or_find_service_account_name_caps_with_k8s(): + iam_role = "arn:aws:iam::000000000000:role/Some_Role" + namespace = "test_namespace" + k8s_role = "mega-admin" + expected_sa_name = "paasta--arn-aws-iam-000000000000-role-some-role--mega-admin" + with mock.patch( + "paasta_tools.kubernetes_tools.kube_config.load_kube_config", autospec=True + ), mock.patch( + "paasta_tools.kubernetes_tools.KubeClient", + autospec=False, + ) as mock_kube_client: + mock_client = mock.Mock() + mock_client.core = mock.Mock(spec=kube_client.CoreV1Api) + mock_client.rbac = mock.Mock(spec=kube_client.RbacAuthorizationV1Api) + mock_client.core.list_namespaced_service_account.return_value = mock.Mock( + spec=V1ServiceAccountList + ) + mock_client.core.list_namespaced_service_account.return_value.items = [ + V1ServiceAccount( + kind="ServiceAccount", + metadata=V1ObjectMeta( + name=expected_sa_name, + namespace=namespace, + annotations={"eks.amazonaws.com/role-arn": iam_role}, + ), + ) + ] + mock_client.rbac.list_namespaced_role_binding.return_value = mock.Mock( + spec=V1RoleBinding, + ) + mock_client.rbac.list_namespaced_role_binding.return_value.items = [ + V1RoleBinding( + kind="ServiceAccount", + metadata=V1ObjectMeta( + name=expected_sa_name, + namespace=namespace, + ), + role_ref=V1RoleRef( + api_group="rbac.authorization.k8s.io", + kind="Role", + name=k8s_role, + ), + subjects=[ + V1Subject( + kind="ServiceAccount", + namespace=namespace, + name=expected_sa_name, + ) + ], + ) + ] + mock_kube_client.return_value = mock_client + + assert expected_sa_name == create_or_find_service_account_name( + iam_role, namespace=namespace, k8s_role=k8s_role + ) + mock_client.core.create_namespaced_service_account.assert_not_called() + mock_client.rbac.create_namespaced_role_binding.assert_not_called() + + @pytest.mark.parametrize("decode", [(True), (False)]) def test_get_kubernetes_secret(decode): with mock.patch( @@ -4651,8 +4742,8 @@ def test_get_kubernetes_secret(decode): mock_client, get_paasta_secret_name(mock_namespace, service_name, secret_name), secret_name, - mock_namespace, - decode, + namespace=mock_namespace, + decode=decode, ) mock_client.core.read_namespaced_secret.assert_called_with( name="paasta-secret-example--service-example--secret", namespace="paasta" @@ -4689,6 +4780,7 @@ def test_get_kubernetes_secret_env_variables(): kube_client=mock_client, environment=mock_environment, service_name="universe", + namespace="paasta", ) assert ret == { "SECRET_NAME1": "123", @@ -4755,6 +4847,7 @@ def test_get_kubernetes_secret_volumes_multiple_files(): kube_client=mock_client, secret_volumes_config=mock_secret_volumes_config, service_name="universe", + namespace="paasta", ) assert ret == { "/the/container/path/the_secret_filename1": "secret_contents1", @@ -4784,6 +4877,7 @@ def test_get_kubernetes_secret_volumes_single_file(): kube_client=mock_client, secret_volumes_config=mock_secret_volumes_config, service_name="universe", + namespace="paasta", ) assert ret == { "/the/container/path/the_secret_name": "secret_contents", diff --git a/tests/test_list_kubernetes_service_instances.py b/tests/test_list_kubernetes_service_instances.py index 43560ccd13..94cd85d0d2 100644 --- a/tests/test_list_kubernetes_service_instances.py +++ b/tests/test_list_kubernetes_service_instances.py @@ -26,6 +26,8 @@ def test_parse_args(): ("kubernetes", False, "service_1.instance1\nservice_2.instance1", False, None), ("kubernetes", True, 
"service--1-instance1\nservice--2-instance1", False, None), ("flink", True, "service--1-instance1\nservice--2-instance1", False, None), + ("eks", False, "service_1.instance1\nservice_2.instance1", False, None), + ("eks", True, "service--1-instance1\nservice--2-instance1", False, None), ], ) def test_main( diff --git a/tests/test_paasta_execute_docker_command.py b/tests/test_paasta_execute_docker_command.py index 3b8108511f..01338eead3 100644 --- a/tests/test_paasta_execute_docker_command.py +++ b/tests/test_paasta_execute_docker_command.py @@ -38,6 +38,11 @@ def test_execute_in_container(): ) +@mock.patch( + "paasta_tools.paasta_execute_docker_command.is_using_unprivileged_containers", + lambda: False, + autospec=None, +) def test_execute_in_container_reuses_exec(): fake_container_id = "fake_container_id" fake_execid = "fake_execid" @@ -59,6 +64,11 @@ def test_execute_in_container_reuses_exec(): mock_docker_client.exec_start.assert_called_once_with(fake_execid, stream=False) +@mock.patch( + "paasta_tools.paasta_execute_docker_command.is_using_unprivileged_containers", + lambda: False, + autospec=None, +) def test_execute_in_container_reuses_only_valid_exec(): fake_container_id = "fake_container_id" fake_execid = "fake_execid" diff --git a/tests/test_setup_kubernetes_job.py b/tests/test_setup_kubernetes_job.py index 2a02a73e3a..a6f4c4188d 100644 --- a/tests/test_setup_kubernetes_job.py +++ b/tests/test_setup_kubernetes_job.py @@ -1,13 +1,19 @@ -# from typing import Sequence +from typing import List +from typing import Tuple +from typing import Union + import mock +import pytest from kubernetes.client import V1Deployment from kubernetes.client import V1StatefulSet from pytest import raises +from paasta_tools.eks_tools import EksDeploymentConfig from paasta_tools.kubernetes.application.controller_wrappers import Application from paasta_tools.kubernetes_tools import InvalidKubernetesConfig from paasta_tools.kubernetes_tools import KubeDeployment from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig +from paasta_tools.kubernetes_tools import KubernetesDeploymentConfigDict from paasta_tools.setup_kubernetes_job import create_application_object from paasta_tools.setup_kubernetes_job import get_kubernetes_deployment_config from paasta_tools.setup_kubernetes_job import get_service_instances_with_valid_names @@ -49,7 +55,7 @@ def test_main_logging(): service="my-service", instance="my-instance", cluster="cluster", - config_dict={}, + config_dict=KubernetesDeploymentConfigDict(), branch_dict=None, ) mock_service_instance_configs_list.return_value = [ @@ -70,7 +76,32 @@ def test_main_logging(): mock_logging.getLogger.assert_called_with("kazoo") -def test_main(): +@pytest.mark.parametrize( + "mock_kube_deploy_config, eks_flag", + [ + ( + KubernetesDeploymentConfig( + service="my-service", + instance="my-instance", + cluster="cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + False, + ), + ( + EksDeploymentConfig( + service="my-service", + instance="my-instance", + cluster="cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + True, + ), + ], +) +def test_main(mock_kube_deploy_config, eks_flag): with mock.patch( "paasta_tools.setup_kubernetes_job.metrics_lib.get_metrics_interface", autospec=True, @@ -88,13 +119,7 @@ def test_main(): ) as mock_setup_kube_deployments: mock_setup_kube_deployments.return_value = True mock_metrics_interface = mock_get_metrics_interface.return_value - mock_kube_deploy_config = 
KubernetesDeploymentConfig( - service="my-service", - instance="my-instance", - cluster="cluster", - config_dict={}, - branch_dict=None, - ) + mock_parse_args.return_value.eks = eks_flag mock_service_instance_configs_list.return_value = [ (True, mock_kube_deploy_config) ] @@ -109,6 +134,7 @@ def test_main(): rate_limit=mock_parse_args.return_value.rate_limit, service_instance_configs_list=mock_service_instance_configs_list.return_value, metrics_interface=mock_metrics_interface, + eks=mock_parse_args.return_value.eks, ) mock_setup_kube_deployments.return_value = False with raises(SystemExit) as e: @@ -191,7 +217,7 @@ def test_get_kubernetes_deployment_config(): instance="instance", cluster="fake_cluster", soa_dir="nail/blah", - config_dict={}, + config_dict=KubernetesDeploymentConfigDict(), branch_dict=None, ) mock_load_kubernetes_service_config_no_cache.side_effect = None @@ -210,14 +236,107 @@ def test_get_kubernetes_deployment_config(): instance="instance", cluster="fake_cluster", soa_dir="nail/blah", - config_dict={}, + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + ) + ] + + +def test_get_eks_deployment_config(): + with mock.patch( + "paasta_tools.setup_kubernetes_job.load_eks_service_config_no_cache", + autospec=True, + ) as mock_load_eks_service_config_no_cache: + + mock_get_service_instances_with_valid_names = [ + ("kurupt", "instance", None, None) + ] + + # Testing NoDeploymentsAvailable exception + mock_load_eks_service_config_no_cache.side_effect = NoDeploymentsAvailable + ret = get_kubernetes_deployment_config( + service_instances_with_valid_names=mock_get_service_instances_with_valid_names, + cluster="fake_cluster", + soa_dir="nail/blah", + eks=True, + ) + assert ret == [(True, None)] + + # Testing NoConfigurationForServiceError exception + mock_load_eks_service_config_no_cache.side_effect = ( + NoConfigurationForServiceError + ) + + ret = get_kubernetes_deployment_config( + service_instances_with_valid_names=mock_get_service_instances_with_valid_names, + cluster="fake_cluster", + soa_dir="nail/blah", + eks=True, + ) + assert ret == [(False, None)] + + # Testing returning a KubernetesDeploymentConfig + mock_kube_deploy = EksDeploymentConfig( + service="kurupt", + instance="instance", + cluster="fake_cluster", + soa_dir="nail/blah", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ) + mock_load_eks_service_config_no_cache.side_effect = None + mock_load_eks_service_config_no_cache.return_value = mock_kube_deploy + ret = get_kubernetes_deployment_config( + service_instances_with_valid_names=mock_get_service_instances_with_valid_names, + cluster="fake_cluster", + soa_dir="nail/blah", + eks=True, + ) + + assert ret == [ + ( + True, + EksDeploymentConfig( + service="kurupt", + instance="instance", + cluster="fake_cluster", + soa_dir="nail/blah", + config_dict=KubernetesDeploymentConfigDict(), branch_dict=None, ), ) ] -def test_create_application_object(): +@pytest.mark.parametrize( + "eks_flag, mock_service_config", + [ + ( + "False", + KubernetesDeploymentConfig( + service="kurupt", + instance="instance", + cluster="fake_cluster", + soa_dir="nail/blah", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + ), + ( + "True", + EksDeploymentConfig( + service="kurupt", + instance="instance", + cluster="fake_cluster", + soa_dir="nail/blah", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + ), + ], +) +def test_create_application_object(eks_flag, mock_service_config): with mock.patch( 
"paasta_tools.setup_kubernetes_job.load_system_paasta_config", autospec=True ), mock.patch( @@ -231,7 +350,7 @@ def test_create_application_object(): autospec=True, ) as mock_stateful_set_wrapper: mock_deploy = mock.MagicMock(spec=V1Deployment) - service_config = mock.MagicMock() + service_config = mock.MagicMock(spec=mock_service_config) service_config.format_kubernetes_app.return_value = mock_deploy # Create DeploymentWrapper @@ -239,6 +358,7 @@ def test_create_application_object(): cluster="fake_cluster", soa_dir="/nail/blah", service_instance_config=service_config, + eks=eks_flag, ) mock_deployment_wrapper.assert_called_with(mock_deploy) @@ -251,6 +371,7 @@ def test_create_application_object(): cluster="fake_cluster", soa_dir="/nail/blah", service_instance_config=service_config, + eks=eks_flag, ) mock_stateful_set_wrapper.assert_called_with(mock_deploy) @@ -261,6 +382,7 @@ def test_create_application_object(): cluster="fake_cluster", soa_dir="/nail/blah", service_instance_config=service_config, + eks=eks_flag, ) mock_deployment_wrapper.reset_mock() @@ -273,6 +395,7 @@ def test_create_application_object(): cluster="fake_cluster", soa_dir="/nail/blah", service_instance_config=service_config, + eks=eks_flag, ) assert ret == (False, None) @@ -280,12 +403,41 @@ def test_create_application_object(): assert not mock_stateful_set_wrapper.called -def test_setup_kube_deployment_create_update(): +@pytest.mark.parametrize( + "mock_kube_deploy_config, eks_flag", + [ + ( + KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="fake_cluster", + soa_dir="/nail/blah", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + False, + ), + ( + EksDeploymentConfig( + service="kurupt", + instance="fm", + cluster="fake_cluster", + soa_dir="/nail/blah", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + True, + ), + ], +) +def test_setup_kube_deployment_create_update(mock_kube_deploy_config, eks_flag): fake_create = mock.MagicMock() fake_update = mock.MagicMock() fake_update_related_api_objects = mock.MagicMock() - def simple_create_application_object(cluster, soa_dir, service_instance_config): + def simple_create_application_object( + cluster, soa_dir, service_instance_config, eks + ): fake_app = mock.MagicMock(spec=Application) fake_app.kube_deployment = KubeDeployment( service=service_instance_config.service, @@ -300,7 +452,14 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): fake_app.update = fake_update fake_app.update_related_api_objects = fake_update_related_api_objects fake_app.item = None - fake_app.soa_config = None + fake_app.soa_config = KubernetesDeploymentConfig( + service=service_instance_config.service, + cluster=cluster, + instance=service_instance_config.instance, + config_dict=service_instance_config.config_dict, + branch_dict=None, + soa_dir=soa_dir, + ) fake_app.__str__ = lambda app: "fake_app" return True, fake_app @@ -317,10 +476,12 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): ) as mock_no_metrics, mock.patch( "paasta_tools.setup_kubernetes_job.get_kubernetes_deployment_config", autospec=True, - ) as mock_service_instance_configs_list: + ): mock_client = mock.Mock() # No instances created - mock_service_instance_configs_list = [] + mock_service_instance_configs_list: List[ + Tuple[bool, Union[KubernetesDeploymentConfig, EksDeploymentConfig]] + ] = [] setup_kube_deployments( kube_client=mock_client, 
service_instance_configs_list=mock_service_instance_configs_list, @@ -335,26 +496,60 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): mock_log_obj.info.reset_mock() # Create a new instance - mock_kube_deploy_config = KubernetesDeploymentConfig( + mock_service_instance_configs_list = [(True, mock_kube_deploy_config)] + setup_kube_deployments( + kube_client=mock_client, + service_instance_configs_list=mock_service_instance_configs_list, + cluster="fake_cluster", + soa_dir="/nail/blah", + metrics_interface=mock_no_metrics, + eks=eks_flag, + ) + assert fake_create.call_count == 1 + assert fake_update.call_count == 0 + assert fake_update_related_api_objects.call_count == 1 + assert mock_no_metrics.emit_event.call_count == 1 + mock_log_obj.info.reset_mock() + mock_no_metrics.reset_mock() + + # Skipping downthenup instance cuz of existing_apps + fake_create.reset_mock() + fake_update.reset_mock() + fake_update_related_api_objects.reset_mock() + mock_list_all_paasta_deployments.return_value = [ + KubeDeployment( + service="kurupt", + instance="fm", + git_sha="2", + namespace="paastasvc-kurupt", + image_version="extrastuff-1", + config_sha="1", + replicas=1, + ) + ] + mock_downthenup_kube_deploy_config = KubernetesDeploymentConfig( service="kurupt", instance="fm", cluster="fake_cluster", soa_dir="/nail/blah", - config_dict={}, + config_dict=KubernetesDeploymentConfigDict(bounce_method="downthenup"), branch_dict=None, ) - mock_service_instance_configs_list = [(True, mock_kube_deploy_config)] + mock_service_instance_configs_list = [ + (True, mock_downthenup_kube_deploy_config) + ] setup_kube_deployments( kube_client=mock_client, service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", metrics_interface=mock_no_metrics, + eks=eks_flag, ) - assert fake_create.call_count == 1 + assert fake_create.call_count == 0 assert fake_update.call_count == 0 - assert fake_update_related_api_objects.call_count == 1 - assert mock_no_metrics.emit_event.call_count == 1 + assert fake_update_related_api_objects.call_count == 0 + assert mock_no_metrics.emit_event.call_count == 0 mock_log_obj.info.reset_mock() mock_no_metrics.reset_mock() @@ -379,6 +574,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): cluster="fake_cluster", soa_dir="/nail/blah", metrics_interface=mock_no_metrics, + eks=eks_flag, ) assert fake_update.call_count == 1 @@ -417,6 +613,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", + eks=eks_flag, ) assert fake_update.call_count == 1 assert fake_create.call_count == 0 @@ -443,6 +640,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", + eks=eks_flag, ) assert fake_update.call_count == 1 assert fake_create.call_count == 0 @@ -469,6 +667,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", + eks=eks_flag, ) assert fake_update.call_count == 1 assert fake_create.call_count == 0 @@ -484,7 +683,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): instance="garage", cluster="fake_cluster", soa_dir="/nail/blah", - config_dict={}, 
+ config_dict=KubernetesDeploymentConfigDict(), branch_dict=None, ) mock_service_instance_configs_list = [ @@ -507,6 +706,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", + eks=eks_flag, ) assert fake_update.call_count == 1 assert fake_create.call_count == 1 @@ -534,6 +734,7 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): service_instance_configs_list=mock_service_instance_configs_list, cluster="fake_cluster", soa_dir="/nail/blah", + eks=eks_flag, ) assert fake_update.call_count == 0 assert fake_create.call_count == 0 @@ -543,7 +744,65 @@ def simple_create_application_object(cluster, soa_dir, service_instance_config): ) -def test_setup_kube_deployments_rate_limit(): +@pytest.mark.parametrize( + "mock_kube_deploy_config_fm, mock_kube_deploy_config_garage, mock_kube_deploy_config_radio, eks_flag", + [ + ( + KubernetesDeploymentConfig( + service="kurupt", + instance="fm", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + KubernetesDeploymentConfig( + service="kurupt", + instance="garage", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + KubernetesDeploymentConfig( + service="kurupt", + instance="radio", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + False, + ), + ( + EksDeploymentConfig( + service="kurupt", + instance="fm", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + EksDeploymentConfig( + service="kurupt", + instance="garage", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + EksDeploymentConfig( + service="kurupt", + instance="radio", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + True, + ), + ], +) +def test_setup_kube_deployments_rate_limit( + mock_kube_deploy_config_fm, + mock_kube_deploy_config_garage, + mock_kube_deploy_config_radio, + eks_flag, +): with mock.patch( "paasta_tools.setup_kubernetes_job.create_application_object", autospec=True, @@ -553,28 +812,9 @@ def test_setup_kube_deployments_rate_limit(): "paasta_tools.setup_kubernetes_job.log", autospec=True ) as mock_log_obj: mock_client = mock.Mock() - mock_kube_deploy_config_fm = KubernetesDeploymentConfig( - service="kurupt", - instance="fm", - cluster="fake_cluster", - config_dict={}, - branch_dict=None, - ) - mock_kube_deploy_config_garage = KubernetesDeploymentConfig( - service="kurupt", - instance="garage", - cluster="fake_cluster", - config_dict={}, - branch_dict=None, - ) - mock_kube_deploy_config_radio = KubernetesDeploymentConfig( - service="kurupt", - instance="radio", - cluster="fake_cluster", - config_dict={}, - branch_dict=None, - ) - mock_service_instance_configs_list = [ + mock_service_instance_configs_list: List[ + Tuple[bool, Union[KubernetesDeploymentConfig, EksDeploymentConfig]] + ] = [ (True, mock_kube_deploy_config_fm), (True, mock_kube_deploy_config_garage), (True, mock_kube_deploy_config_radio), @@ -589,6 +829,7 @@ def test_setup_kube_deployments_rate_limit(): cluster="fake_cluster", soa_dir="/nail/blah", rate_limit=2, + eks=eks_flag, ) assert fake_app.create.call_count == 2 mock_log_obj.info.assert_any_call( @@ -603,11 +844,53 @@ def test_setup_kube_deployments_rate_limit(): cluster="fake_cluster", 
soa_dir="/nail/blah", rate_limit=0, + eks=eks_flag, ) assert fake_app.create.call_count == 3 -def test_setup_kube_deployments_skip_malformed_apps(): +@pytest.mark.parametrize( + "mock_kube_deploy_config_fake, mock_kube_deploy_config_mock, eks_flag", + [ + ( + KubernetesDeploymentConfig( + service="fake", + instance="instance", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + KubernetesDeploymentConfig( + service="mock", + instance="instance", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + False, + ), + ( + EksDeploymentConfig( + service="fake", + instance="instance", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + EksDeploymentConfig( + service="mock", + instance="instance", + cluster="fake_cluster", + config_dict=KubernetesDeploymentConfigDict(), + branch_dict=None, + ), + True, + ), + ], +) +def test_setup_kube_deployments_skip_malformed_apps( + mock_kube_deploy_config_fake, mock_kube_deploy_config_mock, eks_flag +): with mock.patch( "paasta_tools.setup_kubernetes_job.create_application_object", autospec=True, @@ -617,21 +900,9 @@ def test_setup_kube_deployments_skip_malformed_apps(): "paasta_tools.setup_kubernetes_job.log", autospec=True ) as mock_log_obj: mock_client = mock.Mock() - mock_kube_deploy_config_fake = KubernetesDeploymentConfig( - service="fake", - instance="instance", - cluster="fake_cluster", - config_dict={}, - branch_dict=None, - ) - mock_kube_deploy_config_mock = KubernetesDeploymentConfig( - service="mock", - instance="instance", - cluster="fake_cluster", - config_dict={}, - branch_dict=None, - ) - mock_service_instance_configs_list = [ + mock_service_instance_configs_list: List[ + Tuple[bool, Union[KubernetesDeploymentConfig, EksDeploymentConfig]] + ] = [ (True, mock_kube_deploy_config_fake), (True, mock_kube_deploy_config_mock), ] @@ -648,6 +919,7 @@ def test_setup_kube_deployments_skip_malformed_apps(): cluster="fake_cluster", soa_dir="/nail/blah", rate_limit=0, + eks=eks_flag, ) assert fake_app.create.call_count == 2 assert len(mock_log_obj.exception.call_args_list) == 1 diff --git a/tests/test_setup_prometheus_adapter_config.py b/tests/test_setup_prometheus_adapter_config.py index e02381d9c1..211330309e 100644 --- a/tests/test_setup_prometheus_adapter_config.py +++ b/tests/test_setup_prometheus_adapter_config.py @@ -1,8 +1,12 @@ import mock import pytest +from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig from paasta_tools.long_running_service_tools import AutoscalingParamsDict from paasta_tools.setup_prometheus_adapter_config import _minify_promql +from paasta_tools.setup_prometheus_adapter_config import ( + create_instance_active_requests_scaling_rule, +) from paasta_tools.setup_prometheus_adapter_config import ( create_instance_arbitrary_promql_scaling_rule, ) @@ -16,6 +20,9 @@ create_instance_uwsgi_scaling_rule, ) from paasta_tools.setup_prometheus_adapter_config import get_rules_for_service_instance +from paasta_tools.setup_prometheus_adapter_config import ( + should_create_active_requests_scaling_rule, +) from paasta_tools.setup_prometheus_adapter_config import should_create_cpu_scaling_rule from paasta_tools.setup_prometheus_adapter_config import ( should_create_gunicorn_scaling_rule, @@ -31,6 +38,113 @@ ) +@pytest.mark.parametrize( + "autoscaling_config,expected", + [ + ( + { + "metrics_provider": "cpu", + "decision_policy": "bespoke", + "moving_average_window_seconds": 123, + 
"setpoint": 0.653, + }, + False, + ), + ( + { + "metrics_provider": "active-requests", + "moving_average_window_seconds": 124, + "desired_active_requests_per_replica": 0.425, + }, + True, + ), + ( + { + "metrics_provider": "active-requests", + "desired_active_requests_per_replica": 0.764, + }, + True, + ), + ], +) +def test_should_create_active_requests_scaling_rule( + autoscaling_config: AutoscalingParamsDict, expected: bool +) -> None: + should_create, reason = should_create_active_requests_scaling_rule( + autoscaling_config=autoscaling_config + ) + + assert should_create == expected + if expected: + assert reason is None + else: + assert reason is not None + + +@pytest.mark.parametrize( + "registrations,expected_instance", + [ + ( + ["test_service.abc", "test_service.xyz", "test_service.123"], + "test_instance", + ), + ( + ["test_service.xyz"], + "xyz", + ), + ], +) +def test_create_instance_active_requests_scaling_rule( + registrations: list, expected_instance: str +) -> None: + service_name = "test_service" + instance_config = mock.Mock( + instance="test_instance", + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "active-requests", + "desired_active_requests_per_replica": 12, + "moving_average_window_seconds": 20120302, + } + ), + get_registrations=mock.Mock(return_value=registrations), + ) + paasta_cluster = "test_cluster" + + with mock.patch( + "paasta_tools.setup_prometheus_adapter_config.load_system_paasta_config", + autospec=True, + return_value=MOCK_SYSTEM_PAASTA_CONFIG, + ): + rule = create_instance_active_requests_scaling_rule( + service=service_name, + instance_config=instance_config, + paasta_cluster=paasta_cluster, + ) + + # we test that the format of the dictionary is as expected with mypy + # and we don't want to test the full contents of the retval since then + # we're basically just writting a change-detector test - instead, we test + # that we're actually using our inputs + assert service_name in rule["seriesQuery"] + assert instance_config.instance in rule["seriesQuery"] + assert paasta_cluster in rule["seriesQuery"] + # these two numbers are distinctive and unlikely to be used as constants + assert ( + str( + instance_config.get_autoscaling_params()[ + "desired_active_requests_per_replica" + ] + ) + in rule["metricsQuery"] + ) + assert ( + str(instance_config.get_autoscaling_params()["moving_average_window_seconds"]) + in rule["metricsQuery"] + ) + assert f"paasta_instance='{expected_instance}'" in rule["metricsQuery"] + + @pytest.mark.parametrize( "autoscaling_config,expected", [ @@ -87,14 +201,18 @@ def test_should_create_uswgi_scaling_rule( def test_create_instance_uwsgi_scaling_rule() -> None: service_name = "test_service" - instance_name = "test_instance" + instance_config = mock.Mock( + instance="test_instance", + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "uwsgi", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + ), + ) paasta_cluster = "test_cluster" - autoscaling_config: AutoscalingParamsDict = { - "metrics_provider": "uwsgi", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": True, - } with mock.patch( "paasta_tools.setup_prometheus_adapter_config.load_system_paasta_config", @@ -103,9 +221,8 @@ def test_create_instance_uwsgi_scaling_rule() -> None: ): rule = create_instance_uwsgi_scaling_rule( service=service_name, - instance=instance_name, + instance_config=instance_config, paasta_cluster=paasta_cluster, - 
autoscaling_config=autoscaling_config, ) # we test that the format of the dictionary is as expected with mypy @@ -113,12 +230,16 @@ def test_create_instance_uwsgi_scaling_rule() -> None: # we're basically just writting a change-detector test - instead, we test # that we're actually using our inputs assert service_name in rule["seriesQuery"] - assert instance_name in rule["seriesQuery"] + assert instance_config.instance in rule["seriesQuery"] assert paasta_cluster in rule["seriesQuery"] # these two numbers are distinctive and unlikely to be used as constants - assert str(autoscaling_config["setpoint"]) in rule["metricsQuery"] assert ( - str(autoscaling_config["moving_average_window_seconds"]) in rule["metricsQuery"] + str(instance_config.get_autoscaling_params()["setpoint"]) + in rule["metricsQuery"] + ) + assert ( + str(instance_config.get_autoscaling_params()["moving_average_window_seconds"]) + in rule["metricsQuery"] ) @@ -177,29 +298,33 @@ def test_should_create_cpu_scaling_rule( def test_create_instance_cpu_scaling_rule() -> None: service_name = "test_service" - instance_name = "test_instance" + instance_config = mock.Mock( + instance="test_instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "cpu", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + ), + ) paasta_cluster = "test_cluster" - namespace = "test_namespace" - autoscaling_config: AutoscalingParamsDict = { - "metrics_provider": "cpu", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": True, - } rule = create_instance_cpu_scaling_rule( service=service_name, - instance=instance_name, + instance_config=instance_config, paasta_cluster=paasta_cluster, - autoscaling_config=autoscaling_config, - namespace=namespace, ) # our query doesn't include the setpoint as we'll just give the HPA the current CPU usage and # let the HPA compare that to the setpoint directly assert ( - str(autoscaling_config["moving_average_window_seconds"]) in rule["metricsQuery"] + str(instance_config.get_autoscaling_params()["moving_average_window_seconds"]) + in rule["metricsQuery"] ) + assert str(instance_config.get_namespace()) in rule["metricsQuery"] @pytest.mark.parametrize( @@ -240,14 +365,18 @@ def test_should_create_gunicorn_scaling_rule( def test_create_instance_gunicorn_scaling_rule() -> None: service_name = "test_service" - instance_name = "test_instance" + instance_config = mock.Mock( + instance="test_instance", + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "gunicorn", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + ), + ) paasta_cluster = "test_cluster" - autoscaling_config: AutoscalingParamsDict = { - "metrics_provider": "gunicorn", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": True, - } with mock.patch( "paasta_tools.setup_prometheus_adapter_config.load_system_paasta_config", @@ -256,9 +385,8 @@ def test_create_instance_gunicorn_scaling_rule() -> None: ): rule = create_instance_gunicorn_scaling_rule( service=service_name, - instance=instance_name, + instance_config=instance_config, paasta_cluster=paasta_cluster, - autoscaling_config=autoscaling_config, ) # we test that the format of the dictionary is as expected with mypy @@ -266,58 +394,86 @@ def test_create_instance_gunicorn_scaling_rule() -> None: # we're basically just writting 
a change-detector test - instead, we test # that we're actually using our inputs assert service_name in rule["seriesQuery"] - assert instance_name in rule["seriesQuery"] + assert instance_config.instance in rule["seriesQuery"] assert paasta_cluster in rule["seriesQuery"] # these two numbers are distinctive and unlikely to be used as constants - assert str(autoscaling_config["setpoint"]) in rule["metricsQuery"] assert ( - str(autoscaling_config["moving_average_window_seconds"]) in rule["metricsQuery"] + str(instance_config.get_autoscaling_params()["setpoint"]) + in rule["metricsQuery"] + ) + assert ( + str(instance_config.get_autoscaling_params()["moving_average_window_seconds"]) + in rule["metricsQuery"] ) @pytest.mark.parametrize( - "autoscaling_config,expected_rules", + "instance_config,expected_rules", [ ( - { - "metrics_provider": "uwsgi", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": True, - }, + mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "uwsgi", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + ), + ), 1, ), ( - { - "metrics_provider": "uwsgi", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": False, - }, + mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "uwsgi", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": False, + } + ), + ), 0, ), ( - { - "metrics_provider": "cpu", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": False, - }, + mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "cpu", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": False, + } + ), + ), 0, ), ( - { - "metrics_provider": "cpu", - "setpoint": 0.1234567890, - "moving_average_window_seconds": 20120302, - "use_prometheus": True, - }, + mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "metrics_provider": "cpu", + "setpoint": 0.1234567890, + "moving_average_window_seconds": 20120302, + "use_prometheus": True, + } + ), + ), 1, ), ], ) def test_get_rules_for_service_instance( - autoscaling_config: AutoscalingParamsDict, + instance_config: KubernetesDeploymentConfig, expected_rules: int, ) -> None: with mock.patch( @@ -329,10 +485,8 @@ def test_get_rules_for_service_instance( len( get_rules_for_service_instance( service_name="service", - instance_name="instance", - autoscaling_config=autoscaling_config, + instance_config=instance_config, paasta_cluster="cluster", - namespace="test_namespace", ) ) == expected_rules @@ -359,10 +513,14 @@ def test__minify_promql(query: str, expected: str) -> None: def test_create_instance_arbitrary_promql_scaling_rule_no_seriesQuery(): rule = create_instance_arbitrary_promql_scaling_rule( service="service", - instance="instance", - autoscaling_config={"prometheus_adapter_config": {"metricsQuery": "foo"}}, + instance_config=mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="paasta"), + get_autoscaling_params=mock.Mock( + 
return_value={"prometheus_adapter_config": {"metricsQuery": "foo"}} + ), + ), paasta_cluster="cluster", - namespace="paasta", ) assert rule == { @@ -381,12 +539,19 @@ def test_create_instance_arbitrary_promql_scaling_rule_no_seriesQuery(): def test_create_instance_arbitrary_promql_scaling_rule_with_seriesQuery(): rule = create_instance_arbitrary_promql_scaling_rule( service="service", - instance="instance", - autoscaling_config={ - "prometheus_adapter_config": {"metricsQuery": "foo", "seriesQuery": "bar"} - }, + instance_config=mock.Mock( + instance="instance", + get_namespace=mock.Mock(return_value="test_namespace"), + get_autoscaling_params=mock.Mock( + return_value={ + "prometheus_adapter_config": { + "metricsQuery": "foo", + "seriesQuery": "bar", + } + } + ), + ), paasta_cluster="cluster", - namespace="test_namespace", ) assert rule == { diff --git a/tests/test_spark_tools.py b/tests/test_spark_tools.py index acf6cd2052..62a71a8711 100644 --- a/tests/test_spark_tools.py +++ b/tests/test_spark_tools.py @@ -1,3 +1,4 @@ +import sys from unittest import mock import pytest @@ -29,3 +30,39 @@ def test_inject_spark_conf_str(cmd, expected): assert ( spark_tools.inject_spark_conf_str(cmd, "--conf spark.max.cores=100") == expected ) + + +@pytest.mark.parametrize( + "spark_conf,expected", + [ + ( + { + "spark.kubernetes.executor.volumes.hostPath.nailsrv-123.mount.path": "/nail/srv", + "spark.kubernetes.executor.volumes.hostPath.nailsrv-123.options.path": "/nail/srv", + "spark.kubernetes.executor.volumes.hostPath.nailsrv-123.mount.readOnly": "true", + "spark.kubernetes.executor.volumes.hostPath.123.mount.path": "/nail/123", + "spark.kubernetes.executor.volumes.hostPath.123.options.path": "/nail/123", + "spark.kubernetes.executor.volumes.hostPath.123.mount.readOnly": "false", + }, + ["/nail/srv:/nail/srv:ro", "/nail/123:/nail/123:rw"], + ), + ( + { + "spark.kubernetes.executor.volumes.hostPath.NAILsrv-123.mount.path": "/one/two", + "spark.kubernetes.executor.volumes.hostPath.NAILsrv-123.options.path": "/one/two", + "spark.kubernetes.executor.volumes.hostPath.NAILsrv-123.mount.readOnly": "true", + }, + [""], + ), + ], +) +@mock.patch.object(sys, "exit") +def test_get_volumes_from_spark_k8s_configs(mock_sys, spark_conf, expected): + result = spark_tools.get_volumes_from_spark_k8s_configs(spark_conf) + if ( + "spark.kubernetes.executor.volumes.hostPath.NAILsrv-123.mount.path" + in spark_conf + ): + mock_sys.assert_called_with(1) + else: + assert result == expected diff --git a/tests/test_tron_tools.py b/tests/test_tron_tools.py index aec22adef0..8cfbb2bb34 100644 --- a/tests/test_tron_tools.py +++ b/tests/test_tron_tools.py @@ -32,7 +32,6 @@ "volumes": [], "dockercfg_location": "/mock/dockercfg", "tron_default_pool_override": "big_pool", - "tron_use_k8s": True, "tron_k8s_cluster_overrides": { "paasta-dev-test": "paasta-dev", }, @@ -231,32 +230,20 @@ def mock_list_teams(self): yield f @pytest.mark.parametrize( - "action_service,action_deploy,cluster,expected_cluster,use_k8s", + "action_service,action_deploy,cluster,expected_cluster", [ # normal case - no cluster override present and k8s enabled - (None, None, "paasta-dev", "paasta-dev", True), - (None, "special_deploy", "paasta-dev", "paasta-dev", True), - ("other_service", None, "paasta-dev", "paasta-dev", True), - (None, None, "paasta-dev", "paasta-dev", True), - (None, None, "paasta-dev", "paasta-dev", True), + (None, None, "paasta-dev", "paasta-dev"), + (None, "special_deploy", "paasta-dev", "paasta-dev"), + ("other_service", None, "paasta-dev", 
"paasta-dev"), + (None, None, "paasta-dev", "paasta-dev"), + (None, None, "paasta-dev", "paasta-dev"), # cluster override present and k8s enabled - (None, None, "paasta-dev-test", "paasta-dev", True), - (None, "special_deploy", "paasta-dev-test", "paasta-dev", True), - ("other_service", None, "paasta-dev-test", "paasta-dev", True), - (None, None, "paasta-dev-test", "paasta-dev", True), - (None, None, "paasta-dev-test", "paasta-dev", True), - # no cluster override present and k8s disabled - (None, None, "paasta-dev", "paasta-dev", False), - (None, "special_deploy", "paasta-dev", "paasta-dev", False), - ("other_service", None, "paasta-dev", "paasta-dev", False), - (None, None, "paasta-dev", "paasta-dev", False), - (None, None, "paasta-dev", "paasta-dev", False), - # cluster override present and k8s disabled - (None, None, "paasta-dev-test", "paasta-dev-test", False), - (None, "special_deploy", "paasta-dev-test", "paasta-dev-test", False), - ("other_service", None, "paasta-dev-test", "paasta-dev-test", False), - (None, None, "paasta-dev-test", "paasta-dev-test", False), - (None, None, "paasta-dev-test", "paasta-dev-test", False), + (None, None, "paasta-dev-test", "paasta-dev"), + (None, "special_deploy", "paasta-dev-test", "paasta-dev"), + ("other_service", None, "paasta-dev-test", "paasta-dev"), + (None, None, "paasta-dev-test", "paasta-dev"), + (None, None, "paasta-dev-test", "paasta-dev"), ], ) @mock.patch("paasta_tools.tron_tools.load_v2_deployments_json", autospec=True) @@ -267,7 +254,6 @@ def test_get_action_config( action_deploy, cluster, expected_cluster, - use_k8s, ): """Check resulting action config with various overrides from the action.""" action_dict = {"command": "echo first"} @@ -289,7 +275,6 @@ def test_get_action_config( "max_runtime": "2h", "actions": {"normal": action_dict}, "monitoring": {"team": "noop"}, - "use_k8s": use_k8s, } soa_dir = "/other_dir" @@ -429,7 +414,6 @@ def test_format_tron_job_dict(self, mock_format_action, mock_get_action_config): ) mock_format_action.assert_called_once_with( action_config=mock_get_action_config.return_value, - use_k8s=False, ) assert result == { @@ -455,7 +439,6 @@ def test_format_tron_job_dict_k8s_enabled( actions = {action_name: action_dict} job_dict = { - "use_k8s": True, "node": "batch_server", "schedule": "daily 12:10:00", "service": "my_service", @@ -484,7 +467,6 @@ def test_format_tron_job_dict_k8s_enabled( ) mock_format_action.assert_called_once_with( action_config=mock_get_action_config.return_value, - use_k8s=True, ) assert result == { @@ -496,7 +478,6 @@ def test_format_tron_job_dict_k8s_enabled( }, "expected_runtime": "1h", "monitoring": {"team": "noop"}, - "use_k8s": True, } @mock.patch( @@ -831,7 +812,7 @@ def test_format_tron_action_dict_default_executor(self): autospec=True, ): result = tron_tools.format_tron_action_dict(action_config) - assert result["executor"] == "mesos" + assert result["executor"] == "kubernetes" def test_format_tron_action_dict_paasta(self): action_dict = { @@ -898,7 +879,7 @@ def test_format_tron_action_dict_paasta(self): "retries": 2, "retries_delay": "5m", "docker_image": mock.ANY, - "executor": "mesos", + "executor": "kubernetes", "cpus": 2, "mem": 1200, "disk": 42, @@ -915,10 +896,19 @@ def test_format_tron_action_dict_paasta(self): "extra_volumes": [ {"container_path": "/nail/tmp", "host_path": "/nail/tmp", "mode": "RW"} ], - "docker_parameters": mock.ANY, - "constraints": [ - {"attribute": "pool", "operator": "LIKE", "value": "special_pool"} - ], + "field_selector_env": {"PAASTA_POD_IP": 
{"field_path": "status.podIP"}}, + "node_selectors": {"yelp.com/pool": "special_pool"}, + "labels": { + "paasta.yelp.com/cluster": "test-cluster", + "paasta.yelp.com/instance": "my_job.do_something", + "paasta.yelp.com/pool": "special_pool", + "paasta.yelp.com/service": "my_service", + "yelp.com/owner": "compute_infra_platform_experience", + }, + "annotations": {"paasta.yelp.com/routable_ip": "false"}, + "cap_drop": CAPS_DROP, + "cap_add": [], + "secret_env": {}, "trigger_downstreams": True, "triggered_by": ["foo.bar.{shortdate}"], "trigger_timeout": "5m", @@ -928,7 +918,6 @@ def test_format_tron_action_dict_paasta(self): ) assert result["docker_image"] == expected_docker assert result["env"]["SHELL"] == "/bin/bash" - assert isinstance(result["docker_parameters"], list) @mock.patch("paasta_tools.spark_tools.spark_config.SparkConfBuilder", autospec=True) def test_format_tron_action_dict_spark(self, mock_spark_conf_builder): @@ -1031,7 +1020,7 @@ def test_format_tron_action_dict_spark(self, mock_spark_conf_builder): "spark.sql.files.minPartitionNum": "12", "spark.default.parallelism": "12", } - result = tron_tools.format_tron_action_dict(action_config, use_k8s=True) + result = tron_tools.format_tron_action_dict(action_config) assert result == { "command": "spark-submit " @@ -1172,7 +1161,7 @@ def test_format_tron_action_dict_paasta_k8s_service_account(self): "paasta_tools.tron_tools.load_system_paasta_config", autospec=True, ): - result = tron_tools.format_tron_action_dict(action_config, use_k8s=True) + result = tron_tools.format_tron_action_dict(action_config) assert result == { "command": "echo something", @@ -1295,7 +1284,7 @@ def test_format_tron_action_dict_paasta_k8s( autospec=True, return_value=False, ): - result = tron_tools.format_tron_action_dict(action_config, use_k8s=True) + result = tron_tools.format_tron_action_dict(action_config) assert result == { "command": "echo something", @@ -1411,7 +1400,7 @@ def test_format_tron_action_dict_paasta_no_branch_dict(self): "requires": ["required_action"], "retries": 2, "docker_image": "", - "executor": "mesos", + "executor": "kubernetes", "cpus": 2, "mem": 1200, "disk": 42, @@ -1428,13 +1417,21 @@ def test_format_tron_action_dict_paasta_no_branch_dict(self): "extra_volumes": [ {"container_path": "/nail/tmp", "host_path": "/nail/tmp", "mode": "RW"} ], - "docker_parameters": mock.ANY, - "constraints": [ - {"attribute": "pool", "operator": "LIKE", "value": "special_pool"} - ], + "field_selector_env": {"PAASTA_POD_IP": {"field_path": "status.podIP"}}, + "node_selectors": {"yelp.com/pool": "special_pool"}, + "labels": { + "paasta.yelp.com/cluster": "paasta-dev", + "paasta.yelp.com/instance": "my_job.do_something", + "paasta.yelp.com/pool": "special_pool", + "paasta.yelp.com/service": "my_service", + "yelp.com/owner": "compute_infra_platform_experience", + }, + "annotations": {"paasta.yelp.com/routable_ip": "false"}, + "cap_drop": CAPS_DROP, + "cap_add": [], + "secret_env": {}, } assert result["env"]["SHELL"] == "/bin/bash" - assert isinstance(result["docker_parameters"], list) @mock.patch("paasta_tools.tron_tools.read_extra_service_information", autospec=True) def test_load_tron_service_config(self, mock_read_extra_service_information): @@ -1571,7 +1568,7 @@ def test_create_complete_config_e2e(self, tmpdir): # that are not static, this will cause continuous reconfiguration, which # will add significant load to the Tron API, which happened in DAR-1461. # but if this is intended, just change the hash. 
- assert hasher.hexdigest() == "35972651618a848ac6bf7947245dbaea" + assert hasher.hexdigest() == "ba2ccfd2477b2ce2233de42619aa810a" def test_override_default_pool_override(self, tmpdir): soa_dir = tmpdir.mkdir("test_create_complete_config_soa") diff --git a/tests/test_utils.py b/tests/test_utils.py index ff78f6958a..11bb80bd76 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1169,7 +1169,7 @@ def test_load_service_instance_config( autospec=True, ) @pytest.mark.parametrize("instance_type_enabled", [(True,), (False,)]) -def test_load_service_instance_auto_configs( +def test_load_service_instance_auto_configs_no_aliases( mock_load_system_paasta_config, mock_read_extra_service_information, instance_type_enabled, @@ -1177,6 +1177,9 @@ def test_load_service_instance_auto_configs( mock_load_system_paasta_config.return_value.get_auto_config_instance_types_enabled.return_value = { "marathon": instance_type_enabled, } + mock_load_system_paasta_config.return_value.get_auto_config_instance_type_aliases.return_value = ( + {} + ) result = utils.load_service_instance_auto_configs( service="fake_service", instance_type="marathon", @@ -1195,6 +1198,41 @@ def test_load_service_instance_auto_configs( assert result == {} +@pytest.mark.parametrize( + "instance_type_aliases, instance_type, expected_instance_type", + (({}, "kubernetes", "kubernetes"), ({"eks": "kubernetes"}, "eks", "kubernetes")), +) +def test_load_service_instance_auto_configs_with_autotune_aliases( + instance_type_aliases, instance_type, expected_instance_type +): + with mock.patch( + "paasta_tools.utils.service_configuration_lib.read_extra_service_information", + autospec=True, + ) as mock_read_extra_service_information, mock.patch( + "paasta_tools.utils.load_system_paasta_config", + autospec=True, + ) as mock_load_system_paasta_config: + mock_load_system_paasta_config.return_value.get_auto_config_instance_types_enabled.return_value = { + expected_instance_type: True, + } + mock_load_system_paasta_config.return_value.get_auto_config_instance_type_aliases.return_value = ( + instance_type_aliases + ) + result = utils.load_service_instance_auto_configs( + service="fake_service", + instance_type=instance_type, + cluster="fake", + soa_dir="fake_dir", + ) + mock_read_extra_service_information.assert_called_with( + "fake_service", + f"{utils.AUTO_SOACONFIG_SUBDIR}/{expected_instance_type}-fake", + soa_dir="fake_dir", + deepcopy=False, + ) + assert result == mock_read_extra_service_information.return_value + + def test_get_services_for_cluster(): cluster = "honey_bunches_of_oats" soa_dir = "completely_wholesome" @@ -1541,9 +1579,9 @@ def test_format_docker_parameters_non_default(self): {"key": "cpu-quota", "value": "600000"}, {"key": "label", "value": "paasta_service=fake_name"}, {"key": "label", "value": "paasta_instance=fake_instance"}, + {"key": "init", "value": "true"}, {"key": "cap-add", "value": "IPC_LOCK"}, {"key": "cap-add", "value": "SYS_PTRACE"}, - {"key": "init", "value": "true"}, {"key": "cap-drop", "value": "SETPCAP"}, {"key": "cap-drop", "value": "MKNOD"}, {"key": "cap-drop", "value": "AUDIT_WRITE"}, @@ -1560,6 +1598,51 @@ def test_format_docker_parameters_non_default(self): {"key": "cap-drop", "value": "SETFCAP"}, ] + def test_format_docker_parameters_overlapping_caps(self): + fake_conf = utils.InstanceConfig( + service="fake_name", + cluster="", + instance="fake_instance", + config_dict={ + "cpu_burst_add": 2, + "cfs_period_us": 200000, + "cpus": 1, + "mem": 1024, + "disk": 1234, + "cap_add": ["IPC_LOCK", "SYS_PTRACE", 
"DAC_OVERRIDE", "NET_RAW"], + }, + branch_dict=None, + ) + with mock.patch( + "paasta_tools.utils.InstanceConfig.use_docker_disk_quota", + autospec=True, + return_value=False, + ): + assert fake_conf.format_docker_parameters() == [ + {"key": "memory-swap", "value": "1088m"}, + {"key": "cpu-period", "value": "200000"}, + {"key": "cpu-quota", "value": "600000"}, + {"key": "label", "value": "paasta_service=fake_name"}, + {"key": "label", "value": "paasta_instance=fake_instance"}, + {"key": "init", "value": "true"}, + {"key": "cap-add", "value": "IPC_LOCK"}, + {"key": "cap-add", "value": "SYS_PTRACE"}, + {"key": "cap-add", "value": "DAC_OVERRIDE"}, + {"key": "cap-add", "value": "NET_RAW"}, + {"key": "cap-drop", "value": "SETPCAP"}, + {"key": "cap-drop", "value": "MKNOD"}, + {"key": "cap-drop", "value": "AUDIT_WRITE"}, + {"key": "cap-drop", "value": "CHOWN"}, + {"key": "cap-drop", "value": "FOWNER"}, + {"key": "cap-drop", "value": "FSETID"}, + {"key": "cap-drop", "value": "KILL"}, + {"key": "cap-drop", "value": "SETGID"}, + {"key": "cap-drop", "value": "SETUID"}, + {"key": "cap-drop", "value": "NET_BIND_SERVICE"}, + {"key": "cap-drop", "value": "SYS_CHROOT"}, + {"key": "cap-drop", "value": "SETFCAP"}, + ] + def test_format_docker_parameters_with_disk_quota_non_default(self): fake_conf = utils.InstanceConfig( service="fake_name", @@ -1587,9 +1670,9 @@ def test_format_docker_parameters_with_disk_quota_non_default(self): {"key": "storage-opt", "value": "size=1293942784"}, {"key": "label", "value": "paasta_service=fake_name"}, {"key": "label", "value": "paasta_instance=fake_instance"}, + {"key": "init", "value": "true"}, {"key": "cap-add", "value": "IPC_LOCK"}, {"key": "cap-add", "value": "SYS_PTRACE"}, - {"key": "init", "value": "true"}, {"key": "cap-drop", "value": "SETPCAP"}, {"key": "cap-drop", "value": "MKNOD"}, {"key": "cap-drop", "value": "AUDIT_WRITE"}, @@ -2275,12 +2358,14 @@ def test_validate_service_instance_invalid(): mock_paasta_native_instances = [("service1", "main2"), ("service1", "main3")] mock_adhoc_instances = [("service1", "interactive")] mock_k8s_instances = [("service1", "k8s")] + mock_eks_instances = [("service1", "eks")] mock_tron_instances = [("service1", "job.action")] mock_flink_instances = [("service1", "flink")] mock_cassandracluster_instances = [("service1", "cassandracluster")] mock_kafkacluster_instances = [("service1", "kafkacluster")] mock_nrtsearch_instances = [("service1", "nrtsearch")] mock_monkrelaycluster_instances = [("service1", "monkrelays")] + mock_vitesscluster_instances = [("service1", "vitesscluster")] my_service = "service1" my_instance = "main" fake_cluster = "fake_cluster" @@ -2293,12 +2378,14 @@ def test_validate_service_instance_invalid(): mock_paasta_native_instances, mock_adhoc_instances, mock_k8s_instances, + mock_eks_instances, mock_tron_instances, mock_flink_instances, mock_cassandracluster_instances, mock_kafkacluster_instances, mock_nrtsearch_instances, mock_monkrelaycluster_instances, + mock_vitesscluster_instances, ], ): with raises( @@ -2560,7 +2647,6 @@ def test_is_deploy_step(): assert utils.is_deploy_step("thingy") assert not utils.is_deploy_step("itest") - assert not utils.is_deploy_step("performance-check") assert not utils.is_deploy_step("command-thingy") diff --git a/tox.ini b/tox.ini index 75311abec3..472ba7f09e 100644 --- a/tox.ini +++ b/tox.ini @@ -78,13 +78,15 @@ commands = /bin/rm -rf docs/source/generated/ # The last arg to apidoc is a list of excluded paths sphinx-apidoc -f -e -o docs/source/generated/ paasta_tools - 
sphinx-build -b html -d docs/build/doctrees docs/source docs/build/html + sphinx-build -j auto -b html -d docs/build/doctrees docs/source docs/build/html [testenv:k8s_itests] basepython = python3.8 whitelist_externals = bash +# one day we'll use a fully pinned venv here... deps = urllib3<2.0 + cryptography<42 docker-compose=={[tox]docker_compose_version} setenv = passenv = @@ -107,7 +109,7 @@ commands = {toxinidir}/k8s_itests/scripts/setup.sh # Run paasta-tools k8s_itests in docker docker-compose down - docker-compose --verbose build --build-arg DOCKER_REGISTRY={env:DOCKER_REGISTRY:docker-dev.yelpcorp.com/} --build-arg {env:INDEX_URL_BUILD_ARG:UNUSED}=https://pypi.org/simple + docker-compose --verbose build --parallel --build-arg DOCKER_REGISTRY={env:DOCKER_REGISTRY:docker-dev.yelpcorp.com/} --build-arg {env:INDEX_URL_BUILD_ARG:UNUSED}=https://pypi.org/simple docker-compose up \ --abort-on-container-exit @@ -136,7 +138,7 @@ basepython = python3.8 setenv = PAASTA_SYSTEM_CONFIG_DIR = {toxinidir}/general_itests/fake_etc_paasta changedir=general_itests/ -passenv = DOCKER_TLS_VERIFY DOCKER_HOST DOCKER_CERT_PATH +passenv = DOCKER_TLS_VERIFY DOCKER_HOST DOCKER_CERT_PATH DOCKER_REGISTRY deps = {[testenv]deps} behave==1.2.5 diff --git a/yelp_package/Makefile b/yelp_package/Makefile index bf172eb9f9..981d29be41 100644 --- a/yelp_package/Makefile +++ b/yelp_package/Makefile @@ -13,7 +13,7 @@ # limitations under the License. # Edit this release and run "make release" -RELEASE=0.196.0 +RELEASE=0.218.6 SHELL=/bin/bash diff --git a/yelp_package/itest/ubuntu.sh b/yelp_package/itest/ubuntu.sh index c0adab7b1c..c1c6716d3d 100755 --- a/yelp_package/itest/ubuntu.sh +++ b/yelp_package/itest/ubuntu.sh @@ -51,7 +51,6 @@ itest local-run mark-for-deployment metastatus -performance-check push-to-registry security-check status
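
A note on the reworked parametrize table for test_get_rules_for_service_instance earlier in this patch: across all four cases the expected_rules column is 1 exactly when the instance's autoscaling params set use_prometheus, regardless of whether metrics_provider is uwsgi or cpu. A tiny sketch of that gate, with the predicate name assumed rather than taken from setup_prometheus_adapter_config:

    def should_emit_prometheus_rule(instance_config) -> bool:
        # expected_rules in the test table is 1 only when use_prometheus is truthy;
        # the real rule builders may apply further provider-specific checks
        params = instance_config.get_autoscaling_params()
        return bool(params.get("use_prometheus", False))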
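
The new test_get_volumes_from_spark_k8s_configs cases pin down a mapping from spark.kubernetes.executor.volumes.hostPath.<name>.* settings to Docker-style host:container:mode mount strings, with an invalid volume name aborting via sys.exit(1). A minimal sketch of that behaviour, assuming the key grouping, the lowercase name check, and the host/container ordering (the real paasta_tools.spark_tools implementation may differ):

    import re
    import sys
    from typing import Dict, List

    PREFIX = "spark.kubernetes.executor.volumes.hostPath."
    K8S_NAME_RE = re.compile(r"^[a-z0-9]([-a-z0-9]*[a-z0-9])?$")  # assumed lowercase-only name rule


    def get_volumes_from_spark_k8s_configs(spark_conf: Dict[str, str]) -> List[str]:
        # Group dotted keys by volume name: <PREFIX><name>.{mount.path,options.path,mount.readOnly}
        volumes: Dict[str, Dict[str, str]] = {}
        for key, value in spark_conf.items():
            if key.startswith(PREFIX):
                name, _, field = key[len(PREFIX):].partition(".")
                volumes.setdefault(name, {})[field] = value

        mounts: List[str] = []
        for name, fields in volumes.items():
            if not K8S_NAME_RE.match(name):
                # assumed: bad names are fatal, matching the mocked sys.exit(1) in the test
                print(f"Invalid hostPath volume name: {name}", file=sys.stderr)
                sys.exit(1)
            mode = "ro" if fields.get("mount.readOnly", "false") == "true" else "rw"
            mounts.append(f"{fields['options.path']}:{fields['mount.path']}:{mode}")
        return mounts

Running this sketch against the first fixture yields ["/nail/srv:/nail/srv:ro", "/nail/123:/nail/123:rw"], and the uppercase NAILsrv-123 fixture trips the name check.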
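
Likewise, the added test_load_service_instance_auto_configs_with_autotune_aliases encodes the expectation that get_auto_config_instance_type_aliases() can redirect one instance type (eks) to another type's autotune directory (kubernetes) before the soaconfigs read. A rough sketch of that lookup, with the helper names and the subdir constant's value assumed rather than copied from utils.py:

    AUTO_SOACONFIG_SUBDIR = "autotuned_defaults"  # assumed value, not verified against utils.py


    def resolve_autotune_instance_type(instance_type: str, system_paasta_config) -> str:
        # e.g. {"eks": "kubernetes"}: eks instances reuse the kubernetes autotune files
        aliases = system_paasta_config.get_auto_config_instance_type_aliases()
        return aliases.get(instance_type, instance_type)


    def autotune_extra_info_name(instance_type: str, cluster: str, system_paasta_config) -> str:
        # Mirrors the f"{utils.AUTO_SOACONFIG_SUBDIR}/{expected_instance_type}-fake" assertion in the test
        resolved = resolve_autotune_instance_type(instance_type, system_paasta_config)
        return f"{AUTO_SOACONFIG_SUBDIR}/{resolved}-{cluster}"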