fix: top-level ELB error metric and logs, downscale to min capacities (#592)

* fix: upgrade grafonnet-lib

* feat: top-level metrics

* fix: disable invalid panel

* fix: downscale

* fix: central logging of server errors

* fix: lint

* chore: rename devloop

* chore: move up availability metric

* fix: use local

* fix: from grafonnet-lib upgrade

* fix: missing vars.namespace

* fix: use shared Redis panels

* chore: put Redis at the bottom

* fix: increase threshold for ELB errors
chris13524 authored Mar 25, 2024
1 parent 9fd3a41 commit 7cf125d
Showing 25 changed files with 460 additions and 121 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ curl -X POST "http://localhost:3000/v1?chainId=eip155:1&projectId=someid" --data
## Testing

```bash
just amigood
just devloop
```

### Docker
2 changes: 1 addition & 1 deletion justfile
@@ -62,7 +62,7 @@ docs target='all': (_check-string-in-set target "all,rust,tf")
[[ '{{ target }}' == 'all' || '{{ target }}' == 'tf' ]] && { just tf-docs; }

# Run linting and tests
amigood: lint cargo-test-all
devloop: lint cargo-test-all

################################################################################
# Linting recipes
27 changes: 17 additions & 10 deletions src/error.rs
@@ -7,7 +7,7 @@ use {
axum::{response::IntoResponse, Json},
cerberus::registry::RegistryError,
hyper::StatusCode,
tracing::log::error,
tracing::{debug, log::error},
};

pub type RpcResult<T> = Result<T, RpcError>;
@@ -159,7 +159,7 @@ pub enum RpcError {

impl IntoResponse for RpcError {
fn into_response(self) -> axum::response::Response {
match self {
let response = match &self {
Self::AxumTungstenite(err) => (StatusCode::GONE, err.to_string()).into_response(),
Self::UnsupportedChain(chain_id) => (
StatusCode::BAD_REQUEST,
@@ -357,15 +357,22 @@ impl IntoResponse for RpcError {
.into_response(),

// Any other errors considering as 500
e => {
error!("Internal server error: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
"Internal server error".to_string(),
)
.into_response()
}
_ => (
StatusCode::INTERNAL_SERVER_ERROR,
"Internal server error".to_string(),
)
.into_response(),
};

if response.status().is_client_error() {
debug!("HTTP client error: {self:?}");
}

if response.status().is_server_error() {
error!("HTTP server error: {self:?}");
}

response
}
}

23 changes: 14 additions & 9 deletions terraform/ecs/cluster.tf
@@ -1,13 +1,18 @@
locals {
image = "${var.ecr_repository_url}:${var.image_version}"

desired_count = module.this.stage == "prod" ? var.autoscaling_desired_count : 1

task_cpu = module.this.stage == "prod" ? var.task_cpu : 256
task_memory = module.this.stage == "prod" ? var.task_memory : 512

otel_port = var.port + 1
otel_cpu = 128
otel_memory = 128
otel_cpu = module.this.stage == "prod" ? 128 : 64
otel_memory = module.this.stage == "prod" ? 128 : 64

prometheus_proxy_port = var.port + 2
prometheus_proxy_cpu = 128
prometheus_proxy_memory = 128
prometheus_proxy_cpu = module.this.stage == "prod" ? 128 : 64
prometheus_proxy_memory = module.this.stage == "prod" ? 128 : 64

file_descriptor_soft_limit = pow(2, 18)
file_descriptor_hard_limit = local.file_descriptor_soft_limit * 2
@@ -16,8 +21,8 @@ locals {
module "ecs_cpu_mem" {
source = "app.terraform.io/wallet-connect/ecs_cpu_mem/aws"
version = "1.0.0"
cpu = var.task_cpu + local.otel_cpu + local.prometheus_proxy_cpu
memory = var.task_memory + local.otel_memory + local.prometheus_proxy_memory
cpu = local.task_cpu
memory = local.task_memory
}

#-------------------------------------------------------------------------------
@@ -65,8 +70,8 @@ resource "aws_ecs_task_definition" "app_task" {
{
name = module.this.id,
image = local.image,
cpu = var.task_cpu,
memory = var.task_memory,
cpu = local.task_cpu - local.otel_cpu - local.prometheus_proxy_cpu,
memory = local.task_memory - local.otel_memory - local.prometheus_proxy_memory,
essential = true,

environment = [
@@ -206,7 +211,7 @@ resource "aws_ecs_service" "app_service" {
cluster = aws_ecs_cluster.app_cluster.id
task_definition = aws_ecs_task_definition.app_task.arn
launch_type = "FARGATE"
desired_count = var.autoscaling_desired_count
desired_count = local.desired_count
propagate_tags = "TASK_DEFINITION"

# Wait for the service deployment to succeed
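The downscaling change above follows one pattern: every capacity knob is gated on `module.this.stage`, so non-prod environments run a single, minimally sized task while prod keeps the variable-driven values, and the app container gets whatever the sidecars leave over. The sketch below consolidates the pattern from the hunks above; `is_prod`, `app_cpu`, and `app_memory` are illustrative local names, not identifiers from the diff.

```hcl
locals {
  # Stage gate: anything that is not "prod" gets the minimal footprint.
  is_prod = module.this.stage == "prod"

  # One task in non-prod; the autoscaling-driven count in prod.
  desired_count = local.is_prod ? var.autoscaling_desired_count : 1

  # Total Fargate task size (CPU units / MiB).
  task_cpu    = local.is_prod ? var.task_cpu : 256
  task_memory = local.is_prod ? var.task_memory : 512

  # Sidecars are sized the same way...
  otel_cpu                = local.is_prod ? 128 : 64
  otel_memory             = local.is_prod ? 128 : 64
  prometheus_proxy_cpu    = local.is_prod ? 128 : 64
  prometheus_proxy_memory = local.is_prod ? 128 : 64

  # ...and the app container receives the remainder, so the container
  # definitions always sum to the task size.
  app_cpu    = local.task_cpu - local.otel_cpu - local.prometheus_proxy_cpu
  app_memory = local.task_memory - local.otel_memory - local.prometheus_proxy_memory
}
```

Consistent with this, the diff now feeds `local.task_cpu` and `local.task_memory` directly into `module.ecs_cpu_mem`, since the sidecar budgets are already included in the task totals instead of being added on top.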
6 changes: 5 additions & 1 deletion terraform/ecs/cluster_autoscaling.tf
@@ -1,5 +1,9 @@
locals {
autoscaling_min_capacity = module.this.stage == "prod" ? var.autoscaling_min_capacity : 1
}

resource "aws_appautoscaling_target" "ecs_target" {
min_capacity = var.autoscaling_min_capacity
min_capacity = local.autoscaling_min_capacity
max_capacity = var.autoscaling_max_capacity
resource_id = "service/${aws_ecs_cluster.app_cluster.name}/${aws_ecs_service.app_service.name}"
scalable_dimension = "ecs:service:DesiredCount"
10 changes: 10 additions & 0 deletions terraform/ecs/outputs.tf
@@ -32,3 +32,13 @@ output "load_balancer_arn_suffix" {
description = "The ARN suffix of the load balancer"
value = aws_lb.load_balancer.arn_suffix
}

output "log_group_app_name" {
description = "The name of the log group for the app"
value = aws_cloudwatch_log_group.cluster.name
}

output "log_group_app_arn" {
description = "The ARN of the log group for the app"
value = aws_cloudwatch_log_group.cluster.arn
}
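These two outputs exist so the app's CloudWatch log group can be referenced by the monitoring stack, presumably for the dashboard's new `error_5xx_logs` panel. A hypothetical root-module wiring is sketched below; the module names, the `data.aws_caller_identity` source, and the omitted inputs are assumptions, not part of this diff.

```hcl
# Assumed root-module wiring: forward the ECS module's new log group outputs
# (plus the account ID) into the monitoring module, which passes them to the
# dashboard as Jsonnet ext vars (see terraform/monitoring/main.tf below).
data "aws_caller_identity" "this" {}

module "monitoring" {
  source = "./monitoring"

  log_group_app_name = module.ecs.log_group_app_name
  log_group_app_arn  = module.ecs.log_group_app_arn
  aws_account_id     = data.aws_caller_identity.this.account_id

  # ... other monitoring inputs (ecs_service_name, load_balancer_arn, etc.)
}
```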
34 changes: 26 additions & 8 deletions terraform/monitoring/dashboard.jsonnet
@@ -17,13 +17,17 @@ local ds = {
},
};
local vars = {
namespace: 'Notify',
environment: std.extVar('environment'),
notifications: std.parseJson(std.extVar('notifications')),

ecs_service_name: std.extVar('ecs_service_name'),
load_balancer: std.extVar('load_balancer'),
target_group: std.extVar('target_group'),
redis_cluster_id: std.extVar('redis_cluster_id'),
ecs_service_name: std.extVar('ecs_service_name'),
load_balancer: std.extVar('load_balancer'),
target_group: std.extVar('target_group'),
redis_cluster_id: std.extVar('redis_cluster_id'),
log_group_app_name: std.extVar('log_group_app_name'),
log_group_app_arn: std.extVar('log_group_app_arn'),
aws_account_id: std.extVar('aws_account_id'),
};

////////////////////////////////////////////////////////////////////////////////
@@ -52,8 +56,14 @@ dashboard.new(
)

.addPanels(layout.generate_grid([
row.new('ECS'),
row.new('Application'),
// panels.app.http_request_rate(ds, vars) { gridPos: pos._4 },
// panels.app.http_request_latency(ds, vars) { gridPos: pos._4 },
panels.ecs.availability(ds, vars) { gridPos: pos._3 },
panels.lb.error_5xx(ds, vars) { gridPos: pos._3 },
panels.lb.error_5xx_logs(ds, vars) { gridPos: pos._3 },

row.new('ECS'),
panels.ecs.memory(ds, vars) { gridPos: pos._3 },
panels.ecs.cpu(ds, vars) { gridPos: pos._3 },

@@ -109,9 +119,6 @@ dashboard.new(
panels.proxy.http_codes(ds, vars) { gridPos: pos.two_thirds },
panels.proxy.healthy_hosts(ds, vars) { gridPos: pos._3 },

row.new('Database'),
panels.db.redis_cpu_memory(ds, vars) { gridPos: pos._2 },

row.new('History Metrics'),
panels.history.requests(ds, vars) { gridPos: pos_short._3 },
panels.history.latency(ds, vars) { gridPos: pos_short._3 },
@@ -123,4 +130,15 @@
panels.identity.latency(ds, vars) { gridPos: pos_short._2 },
panels.identity.cache(ds, vars) { gridPos: pos_short._2 },
panels.identity.usage(ds, vars) { gridPos: pos_short._2 },

row.new('Redis'),
panels.redis.cpu(ds, vars) { gridPos: pos._2 },
panels.redis.memory(ds, vars) { gridPos: pos._2 },

row.new('Load Balancer'),
panels.lb.active_connections(ds, vars) { gridPos: pos._2 },
panels.lb.requests(ds, vars) { gridPos: pos._2 },

panels.lb.healthy_hosts(ds, vars) { gridPos: pos._3 },
panels.lb.error_4xx(ds, vars) { gridPos: pos._3 },
]))
2 changes: 1 addition & 1 deletion terraform/monitoring/grafonnet-lib
Submodule grafonnet-lib updated 61 files
+5 −0 .editorconfig
+21 −0 LICENSE
+12 −6 alert.libsonnet
+34 −8 alert_condition.libsonnet
+3 −0 defaults.libsonnet
+110 −58 defaults/alerts.libsonnet
+16 −25 defaults/configuration.libsonnet
+31 −51 defaults/overrides.libsonnet
+35 −0 defaults/panels.libsonnet
+29 −0 defaults/panels/aws/amqp/available_messages.libsonnet
+66 −0 defaults/panels/aws/amqp/cpu.libsonnet
+31 −0 defaults/panels/aws/amqp/in_flight_messages.libsonnet
+30 −0 defaults/panels/aws/amqp/memory.libsonnet
+41 −0 defaults/panels/aws/amqp/storage.libsonnet
+90 −0 defaults/panels/aws/docdb/available_memory.libsonnet
+35 −0 defaults/panels/aws/docdb/buffer_cache_hit_ratio.libsonnet
+31 −0 defaults/panels/aws/docdb/connections.libsonnet
+114 −0 defaults/panels/aws/docdb/cpu.libsonnet
+68 −0 defaults/panels/aws/docdb/low_mem_op_throttled.libsonnet
+28 −0 defaults/panels/aws/docdb/net_throughput.libsonnet
+36 −0 defaults/panels/aws/docdb/volume.libsonnet
+40 −0 defaults/panels/aws/docdb/write_latency.libsonnet
+78 −0 defaults/panels/aws/ecs/cpu.libsonnet
+118 −0 defaults/panels/aws/ecs/cpu_memory.libsonnet
+78 −0 defaults/panels/aws/ecs/memory.libsonnet
+60 −0 defaults/panels/aws/redis/cpu.libsonnet
+47 −0 defaults/panels/aws/redis/memory.libsonnet
+80 −0 defaults/panels/aws/redis/swap_usage.libsonnet
+60 −0 defaults/values.libsonnet
+242 −89 field_config.libsonnet
+61 −37 grafana.libsonnet
+19 −0 override.libsonnet
+26 −23 panels/panel.libsonnet
+4 −0 panels/table.libsonnet
+167 −202 panels/timeseries.libsonnet
+631 −18 targets/cloudwatch.libsonnet
+27 −0 tests/defaults/alerts.jsonnet
+6 −0 tests/defaults/configuration.jsonnet
+11 −0 tests/defaults/panels/docdb/available_memory.jsonnet
+8 −0 tests/defaults/panels/docdb/buffer_cache_hit_ratio.jsonnet
+8 −0 tests/defaults/panels/docdb/connections.jsonnet
+11 −0 tests/defaults/panels/docdb/cpu.jsonnet
+11 −0 tests/defaults/panels/docdb/low_mem_op_throttled.jsonnet
+8 −0 tests/defaults/panels/docdb/net_throughput.jsonnet
+8 −0 tests/defaults/panels/docdb/volume.jsonnet
+8 −0 tests/defaults/panels/docdb/write_latency.jsonnet
+11 −0 tests/defaults/panels/ecs/cpu.jsonnet
+11 −0 tests/defaults/panels/ecs/cpu_memory.jsonnet
+11 −0 tests/defaults/panels/ecs/memory.jsonnet
+11 −0 tests/defaults/panels/redis/cpu.jsonnet
+11 −0 tests/defaults/panels/redis/memory.jsonnet
+11 −0 tests/defaults/panels/redis/swap_usage.jsonnet
+37 −0 tests/field_config.jsonnet
+43 −0 tests/panels/timeseries.jsonnet
+36 −0 tests/utils/arrays.jsonnet
+27 −0 tests/utils/strings.jsonnet
+64 −0 tests/utils/units.jsonnet
+6 −0 threshold.libsonnet
+14 −0 utils/arrays.libsonnet
+17 −4 utils/strings.libsonnet
+89 −0 utils/units.libsonnet
13 changes: 8 additions & 5 deletions terraform/monitoring/main.tf
@@ -11,11 +11,14 @@ data "jsonnet_file" "dashboard" {
environment = module.this.stage
notifications = jsonencode(var.notification_channels)

ecs_service_name = var.ecs_service_name
ecs_task_family = var.ecs_task_family
load_balancer = var.load_balancer_arn
target_group = var.ecs_target_group_arn
redis_cluster_id = var.redis_cluster_id
ecs_service_name = var.ecs_service_name
ecs_task_family = var.ecs_task_family
load_balancer = var.load_balancer_arn
target_group = var.ecs_target_group_arn
redis_cluster_id = var.redis_cluster_id
log_group_app_name = var.log_group_app_name
log_group_app_arn = var.log_group_app_arn
aws_account_id = var.aws_account_id
}
}

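The new ext vars above imply matching inputs on the monitoring module; the actual `variables.tf` change is among the files not expanded in this view. A minimal sketch of what those declarations would look like, with descriptions borrowed from the ECS outputs:

```hcl
# Assumed monitoring-module inputs for the new ext vars; the real variables.tf
# diff is not shown above, so treat names and descriptions as a sketch.
variable "log_group_app_name" {
  description = "The name of the log group for the app"
  type        = string
}

variable "log_group_app_arn" {
  description = "The ARN of the log group for the app"
  type        = string
}

variable "aws_account_id" {
  description = "The AWS account ID used when querying the app logs"
  type        = string
}
```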
76 changes: 0 additions & 76 deletions terraform/monitoring/panels/db/redis_cpu_memory.libsonnet

This file was deleted.

2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/availability.libsonnet
@@ -44,7 +44,7 @@ local error_alert(vars) = alert.new(
axisSoftMax = 100,
)
)
.setAlert(error_alert(vars))
.setAlert(vars.environment, error_alert(vars))

.addTarget(targets.prometheus(
datasource = ds.prometheus,
2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/cpu.libsonnet
@@ -14,7 +14,7 @@ local overrides = defaults.overrides;
datasource = ds.prometheus,
)
.configure(overrides.cpu(defaults.configuration.timeseries_resource))
.setAlert(alert.new(
.setAlert(vars.environment, alert.new(
namespace = 'RPC Proxy',
name = "RPC %s - High CPU usage" % vars.environment,
message = "RPC %s - High CPU usage" % vars.environment,
2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/memory.libsonnet
@@ -14,7 +14,7 @@ local overrides = defaults.overrides;
datasource = ds.prometheus,
)
.configure(defaults.overrides.memory(defaults.configuration.timeseries_resource))
.setAlert(alert.new(
.setAlert(vars.environment, alert.new(
namespace = 'RPC Proxy',
name = "RPC %s - High Memory (RAM) usage" % vars.environment,
message = "RPC %s - High Memory (RAM) usage" % vars.environment,
24 changes: 24 additions & 0 deletions terraform/monitoring/panels/lb/active_connections.libsonnet
@@ -0,0 +1,24 @@
local grafana = import '../../grafonnet-lib/grafana.libsonnet';
local defaults = import '../../grafonnet-lib/defaults.libsonnet';

local panels = grafana.panels;
local targets = grafana.targets;

{
new(ds, vars)::
panels.timeseries(
title = 'Active Connections',
datasource = ds.cloudwatch,
)
.configure(defaults.configuration.timeseries)

.addTarget(targets.cloudwatch(
datasource = ds.cloudwatch,
namespace = 'AWS/ApplicationELB',
metricName = 'ActiveConnectionCount',
dimensions = {
LoadBalancer: vars.load_balancer
},
statistic = 'Average',
))
}
