From dce0c1b767807725b08ec494fe975090eac31e52 Mon Sep 17 00:00:00 2001
From: tenequm
Date: Thu, 30 Jan 2025 20:10:24 +0000
Subject: [PATCH] feat: enable metrics collection for all avs services

---
Review notes (text between the `---` line above and the first `diff --git`
is not part of the commit message and is not applied):

* Line breaks and indentation were rebuilt from a whitespace-collapsed copy,
  using the indent/nindent arguments and the hunk line counts as a guide.
  NOTE(review): context-line indentation in statefulset.yaml and
  helmfile.yaml is an assumption; confirm against the target tree.
* service.yaml: `default true <value>` returns true for an explicit false, so
  `allocateLoadBalancerNodePorts: false` could never render. The check now
  compares the stringified value with "false".
* service.yaml: annotations are rendered with `nindent 4` so that every key
  lands on the same column when more than one annotation is set.
* service-monitor.yaml: the endpoint port honours `serviceMonitor.port` (as
  documented in values.yaml) and falls back to `service.name`.
* values.yaml: the sample exec probe command is one list item per argument.
* service.yaml and values.yaml now end with a trailing newline.
* The blob ids on the `index` lines were not recomputed after these edits.
* NOTE(review): the Service selector requires a `service` label on the pods;
  this patch does not add one to the pod template. Confirm it already exists.

 .../gasp-avs/templates/service-monitor.yaml   | 46 +++++++++
 .../gasp-avs/templates/service.yaml           | 35 +++++++
 .../gasp-avs/templates/statefulset.yaml       | 63 ++++++++++++
 ops/helm-charts/gasp-avs/values.yaml          | 96 ++++++++++++++++++-
 ops/helmfiles/helmfile.yaml                   | 46 +++++++++
 5 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 ops/helm-charts/gasp-avs/templates/service-monitor.yaml
 create mode 100644 ops/helm-charts/gasp-avs/templates/service.yaml

diff --git a/ops/helm-charts/gasp-avs/templates/service-monitor.yaml b/ops/helm-charts/gasp-avs/templates/service-monitor.yaml
new file mode 100644
index 000000000..cc3ae0446
--- /dev/null
+++ b/ops/helm-charts/gasp-avs/templates/service-monitor.yaml
@@ -0,0 +1,46 @@
+{{- if .Values.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "node.name" . }}
+  labels: {{- include "node.labels" . | nindent 4 }}
+spec:
+  selector:
+    matchLabels: {{- include "node.selectorLabels" . | nindent 6 }}
+      service: {{ include "node.name" . }}
+  endpoints:
+    - port: {{ .Values.serviceMonitor.port | default .Values.service.name }}
+      {{- if .Values.serviceMonitor.interval }}
+      interval: {{ .Values.serviceMonitor.interval }}
+      {{- end }}
+      {{- if .Values.serviceMonitor.scrapeTimeout }}
+      scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
+      {{- end }}
+      {{- if .Values.serviceMonitor.honorLabels }}
+      honorLabels: {{ .Values.serviceMonitor.honorLabels }}
+      {{- end }}
+      path: {{ .Values.serviceMonitor.path | default "/metrics" }}
+      {{- with .Values.serviceMonitor.relabelings }}
+      relabelings:
+{{ toYaml . | indent 6 }}
+      {{- end }}
+      {{- with .Values.serviceMonitor.metricRelabelings }}
+      metricRelabelings:
+{{ toYaml . | indent 6 }}
+      {{- end }}
+  {{- with .Values.serviceMonitor.namespaceSelector }}
+  namespaceSelector:
+{{ toYaml . | indent 4 }}
+  {{- end }}
+  {{- with .Values.serviceMonitor.targetLabels }}
+  targetLabels:
+{{ toYaml . | indent 4 }}
+  {{- end }}
+  {{- if .Values.serviceMonitor.jobLabel }}
+  jobLabel: {{ .Values.serviceMonitor.jobLabel }}
+  {{- end }}
+  {{- if .Values.serviceMonitor.sampleLimit }}
+  sampleLimit: {{ .Values.serviceMonitor.sampleLimit }}
+  {{- end }}
+
+{{- end }}
diff --git a/ops/helm-charts/gasp-avs/templates/service.yaml b/ops/helm-charts/gasp-avs/templates/service.yaml
new file mode 100644
index 000000000..892bb7385
--- /dev/null
+++ b/ops/helm-charts/gasp-avs/templates/service.yaml
@@ -0,0 +1,35 @@
+{{- if .Values.service.enabled }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "node.name" . }}
+  labels: {{- include "node.labels" . | nindent 4 }}
+    service: {{ include "node.name" . }}
+  annotations:
+  {{- with .Values.service.annotations }}
+    {{- tpl (toYaml .) $ | nindent 4 }}
+  {{- end }}
+spec:
+  type: {{ .Values.service.type }}
+{{- if default false .Values.service.headless }}
+  clusterIP: None
+{{- end }}
+{{- if eq (toString .Values.service.allocateLoadBalancerNodePorts) "false" }}
+  allocateLoadBalancerNodePorts: false
+{{- end }}
+{{- if .Values.service.externalTrafficPolicy }}
+  externalTrafficPolicy: {{ .Values.service.externalTrafficPolicy }}
+{{- end }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: {{ .Values.service.targetPort }}
+      protocol: TCP
+      name: {{ .Values.service.name }}
+{{- if .Values.service.additionalPorts }}
+{{ toYaml .Values.service.additionalPorts | indent 4 }}
+{{- end }}
+  selector:
+    {{- include "node.selectorLabels" . | nindent 4 }}
+    service: {{ include "node.name" . }}
+
+{{- end }}
diff --git a/ops/helm-charts/gasp-avs/templates/statefulset.yaml b/ops/helm-charts/gasp-avs/templates/statefulset.yaml
index 73cef9606..71794e3fb 100644
--- a/ops/helm-charts/gasp-avs/templates/statefulset.yaml
+++ b/ops/helm-charts/gasp-avs/templates/statefulset.yaml
@@ -67,6 +67,69 @@ spec:
                   key: {{ $key }}
             {{- end }}
           {{- end }}
+
+# If we have services enabled let's expose their ports
+{{- if .Values.service.enabled }}
+          ports:
+            - name: {{ .Values.service.name }}
+              containerPort: {{ .Values.service.targetPort }}
+              protocol: TCP
+            # Of course a service can use additional ports if desired
+            {{- if .Values.additionalPorts }}
+{{ toYaml .Values.additionalPorts | indent 12 }}
+            {{- end }}
+{{ end }}
+
+# This is for our liveness probes, which dictate if a pod is healthy or should be replaced
+{{- if .Values.livenessProbe.enabled }}
+          livenessProbe:
+            initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
+            periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
+            successThreshold: {{ .Values.livenessProbe.successThreshold }}
+            failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
+{{- if eq .Values.livenessProbe.probeType "httpGet" }}
+            httpGet:
+              scheme: {{ .Values.livenessProbe.scheme }}
+              path: {{ .Values.livenessProbe.path }}
+              port: {{ .Values.livenessProbe.port }}
+{{- else if eq .Values.livenessProbe.probeType "tcpSocket" }}
+            tcpSocket:
+              port: {{ .Values.livenessProbe.port }}
+{{- else if eq .Values.livenessProbe.probeType "exec" }}
+            exec:
+              command:
+{{- with .Values.livenessProbe.command }}
+{{ toYaml . | indent 16 }}
+{{- end -}}
+{{- end -}}
+{{- end }}
+
+# This is for our readiness probes, which dictate if a pod is ready to receive traffic
+{{- if .Values.readinessProbe.enabled }}
+          readinessProbe:
+            initialDelaySeconds: {{ .Values.readinessProbe.initialDelaySeconds }}
+            periodSeconds: {{ .Values.readinessProbe.periodSeconds }}
+            timeoutSeconds: {{ .Values.readinessProbe.timeoutSeconds }}
+            successThreshold: {{ .Values.readinessProbe.successThreshold }}
+            failureThreshold: {{ .Values.readinessProbe.failureThreshold }}
+{{- if eq .Values.readinessProbe.probeType "httpGet" }}
+            httpGet:
+              scheme: {{ .Values.readinessProbe.scheme }}
+              path: {{ .Values.readinessProbe.path }}
+              port: {{ .Values.readinessProbe.port }}
+{{- else if eq .Values.readinessProbe.probeType "tcpSocket" }}
+            tcpSocket:
+              port: {{ .Values.readinessProbe.port }}
+{{- else if eq .Values.readinessProbe.probeType "exec" }}
+            exec:
+              command:
+{{- with .Values.readinessProbe.command }}
+{{ toYaml . | indent 16 }}
+{{- end -}}
+{{- end -}}
+{{- end }}
+
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
           volumeMounts:
diff --git a/ops/helm-charts/gasp-avs/values.yaml b/ops/helm-charts/gasp-avs/values.yaml
index ca0fcc9ab..875255409 100644
--- a/ops/helm-charts/gasp-avs/values.yaml
+++ b/ops/helm-charts/gasp-avs/values.yaml
@@ -64,4 +64,98 @@ podAnnotations: {}
 extraContainers: []
 
 # -- Additional init containers to run in the pod
-extraInitContainers: []
\ No newline at end of file
+extraInitContainers: []
+
+# Service definitions
+service:
+  enabled: false
+  annotations: {cloud.google.com/neg: '{"ingress":true}'}
+  type: ClusterIP
+  port: 80
+  targetPort: 80
+  # `name` sets the name of the default port
+  name: default-service
+  # allocateLoadBalancerNodePorts: false
+  # externalTrafficPolicy: Local
+  additionalPorts: []
+
+# ServiceMonitor definitions to configure Prometheus metrics collection
+serviceMonitor:
+  enabled: false
+  # `port` parameter should be the name of the service port this endpoint refers to
+  # by default it is called `default-service`
+  port: default-service
+  interval: 30s
+  path: /metrics
+  # honorLabels: true
+  # annotations:
+  #   custom: annotation
+  # scrapeTimeout: 10s
+  # relabelings: []
+  # metricRelabelings: []
+  # namespaceSelector:
+  #   matchNames:
+  #     - monitoring
+  # targetLabels: []
+  # jobLabel: ""
+  # sampleLimit: 0
+# Example usage:
+# serviceMonitor:
+#   enabled: true
+#   port: default-service
+#   interval: 30s
+#   path: /metrics
+
+
+# Additional container ports to open
+# This can allow scraping by prometheus, or exposure to other services
+additionalPorts: []
+  # - containerPort: 8001
+  #   name: http-admin
+  #   protocol: TCP
+
+# livenessProbes are used to determine when to restart a container
+livenessProbe:
+  enabled: false
+  # For the liveness probe we'll wait a full 2 minutes, just in case this service takes a while to start-up
+  initialDelaySeconds: 120
+  periodSeconds: 10
+  timeoutSeconds: 9
+  successThreshold: 1
+  failureThreshold: 3
+
+  # Specify either httpGet, tcpSocket or exec
+  # httpGet uses scheme, path and port (below)
+  # tcpSocket uses port (below)
+  # exec uses command (below)
+  probeType: httpGet
+
+  # parameters for probes
+  scheme: HTTP
+  path: /alive
+  port: default-service
+  # exec probes run the command directly (no shell): one list item per argument
+  command: ['ls', '-la', '/']
+
+
+# readinessProbes are used to determine when a container is ready to start accepting traffic
+readinessProbe:
+  enabled: false
+  initialDelaySeconds: 5
+  periodSeconds: 5
+  timeoutSeconds: 4
+  successThreshold: 2
+  failureThreshold: 2
+
+  # Specify either httpGet, tcpSocket or exec
+  # httpGet uses scheme, path and port (below)
+  # tcpSocket uses port (below)
+  # exec uses command (below)
+  probeType: httpGet
+
+  # parameters for probes
+  scheme: HTTP
+  path: /ready
+  port: default-service
+  # exec probes run the command directly (no shell): one list item per argument
+  command: ['ls', '-la', '/']
diff --git a/ops/helmfiles/helmfile.yaml b/ops/helmfiles/helmfile.yaml
index d7c4c6a3e..0cbf2d4f8 100644
--- a/ops/helmfiles/helmfile.yaml
+++ b/ops/helmfiles/helmfile.yaml
@@ -42,6 +42,8 @@ releases:
           tls:
             - hosts: ["avs-aggregator-{{ .Values.environmentName }}.gasp.xyz"]
               secretName: avs-aggregator-tls
+        serviceMonitor:
+          enabled: true
         podReplacementPolicy: Failed
         resources:
           requests:
@@ -63,6 +65,10 @@ releases:
         envSecrets: {{ .Values.gaspAvsEnvSecrets | expandSecretRefs | toYaml | nindent 10 }}
         env: {{ .Values.gaspAvsEnv | toYaml | nindent 10 }}
         data: {{ .Values | getOrNil "gaspAvsData" | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 1
@@ -81,6 +87,10 @@ releases:
           tag: {{ .Values | get "updaterImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.updaterEnv | toYaml | nindent 10 }}
         envSecrets: {{ .Values.updaterEnvSecrets | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -98,6 +108,10 @@ releases:
           tag: {{ .Values | get "updaterImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.updaterEnvBase | toYaml | nindent 10 }}
         envSecrets: {{ .Values.updaterEnvSecretsBase | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -308,6 +322,10 @@ releases:
           tag: {{ .Values | get "ferryDepositImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.ferryDepositEnvEth | toYaml | nindent 10 }}
         envSecrets: {{ .Values.ferryDepositEnvSecretsEth | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -325,6 +343,10 @@ releases:
           tag: {{ .Values | get "ferryDepositImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.ferryDepositEnvArb | toYaml | nindent 10 }}
         envSecrets: {{ .Values.ferryDepositEnvSecretsArb | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -342,6 +364,10 @@ releases:
           tag: {{ .Values | get "ferryDepositImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.ferryDepositEnvBase | toYaml | nindent 10 }}
         envSecrets: {{ .Values.ferryDepositEnvSecretsBase | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -360,6 +386,10 @@ releases:
           tag: {{ .Values | get "ferryWithdrawalImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.ferryWithdrawalEnvEth | toYaml | nindent 10 }}
         envSecrets: {{ .Values.ferryWithdrawalEnvSecretsEth | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -377,6 +407,10 @@ releases:
           tag: {{ .Values | get "ferryWithdrawalImageTag" (requiredEnv "IMAGE_TAG") | quote }}
         env: {{ .Values.ferryWithdrawalEnvArb | toYaml | nindent 10 }}
         envSecrets: {{ .Values.ferryWithdrawalEnvSecretsArb | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -414,6 +448,10 @@ releases:
         args: ["node", "build/src/closer.js"]
         env: {{ .Values.closerEnvEth | toYaml | nindent 10 }}
         envSecrets: {{ .Values.closerEnvSecretsEth | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -433,6 +471,10 @@ releases:
         args: ["node", "build/src/closer.js"]
         env: {{ .Values.closerEnvArb | toYaml | nindent 10 }}
         envSecrets: {{ .Values.closerEnvSecretsArb | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m
@@ -452,6 +494,10 @@ releases:
         args: ["node", "build/src/closer.js"]
         env: {{ .Values.closerEnvBase | toYaml | nindent 10 }}
         envSecrets: {{ .Values.closerEnvSecretsBase | expandSecretRefs | toYaml | nindent 10 }}
+        service:
+          enabled: true
+        serviceMonitor:
+          enabled: true
         resources:
           requests:
             cpu: 100m