Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: top-level ELB error metric and logs, downscale to min capacities #592

Merged
merged 14 commits into from
Mar 25, 2024
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ curl -X POST "http://localhost:3000/v1?chainId=eip155:1&projectId=someid" --data
## Testing

```bash
just amigood
just devloop

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the name change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See PR description

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

amigood makes more sense than devloop imo but it's fine

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea was "am I good to commit?", but it was awkward to type, and the concept of a "developer loop" is more familiar to people.

```

### Docker
Expand Down
2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ docs target='all': (_check-string-in-set target "all,rust,tf")
[[ '{{ target }}' == 'all' || '{{ target }}' == 'tf' ]] && { just tf-docs; }

# Run linting and tests
amigood: lint cargo-test-all
devloop: lint cargo-test-all

################################################################################
# Linting recipes
Expand Down
27 changes: 17 additions & 10 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use {
axum::{response::IntoResponse, Json},
cerberus::registry::RegistryError,
hyper::StatusCode,
tracing::log::error,
tracing::{debug, log::error},
};

pub type RpcResult<T> = Result<T, RpcError>;
Expand Down Expand Up @@ -159,7 +159,7 @@ pub enum RpcError {

impl IntoResponse for RpcError {
fn into_response(self) -> axum::response::Response {
match self {
let response = match &self {
Self::AxumTungstenite(err) => (StatusCode::GONE, err.to_string()).into_response(),
Self::UnsupportedChain(chain_id) => (
StatusCode::BAD_REQUEST,
Expand Down Expand Up @@ -357,15 +357,22 @@ impl IntoResponse for RpcError {
.into_response(),

// Treat any other error as a 500 Internal Server Error
e => {
error!("Internal server error: {}", e);
(
StatusCode::INTERNAL_SERVER_ERROR,
"Internal server error".to_string(),
)
.into_response()
}
_ => (
StatusCode::INTERNAL_SERVER_ERROR,
"Internal server error".to_string(),
)
.into_response(),
};

if response.status().is_client_error() {
debug!("HTTP client error: {self:?}");
}

if response.status().is_server_error() {
error!("HTTP server error: {self:?}");
}

response
}
}

Expand Down
23 changes: 14 additions & 9 deletions terraform/ecs/cluster.tf
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
locals {
image = "${var.ecr_repository_url}:${var.image_version}"

desired_count = module.this.stage == "prod" ? var.autoscaling_desired_count : 1

task_cpu = module.this.stage == "prod" ? var.task_cpu : 256
task_memory = module.this.stage == "prod" ? var.task_memory : 512

otel_port = var.port + 1
otel_cpu = 128
otel_memory = 128
otel_cpu = module.this.stage == "prod" ? 128 : 64
otel_memory = module.this.stage == "prod" ? 128 : 64

prometheus_proxy_port = var.port + 2
prometheus_proxy_cpu = 128
prometheus_proxy_memory = 128
prometheus_proxy_cpu = module.this.stage == "prod" ? 128 : 64
prometheus_proxy_memory = module.this.stage == "prod" ? 128 : 64

file_descriptor_soft_limit = pow(2, 18)
file_descriptor_hard_limit = local.file_descriptor_soft_limit * 2
Expand All @@ -16,8 +21,8 @@ locals {
module "ecs_cpu_mem" {
source = "app.terraform.io/wallet-connect/ecs_cpu_mem/aws"
version = "1.0.0"
cpu = var.task_cpu + local.otel_cpu + local.prometheus_proxy_cpu
memory = var.task_memory + local.otel_memory + local.prometheus_proxy_memory
cpu = local.task_cpu
memory = local.task_memory
}

#-------------------------------------------------------------------------------
Expand Down Expand Up @@ -65,8 +70,8 @@ resource "aws_ecs_task_definition" "app_task" {
{
name = module.this.id,
image = local.image,
cpu = var.task_cpu,
memory = var.task_memory,
cpu = local.task_cpu - local.otel_cpu - local.prometheus_proxy_cpu,
memory = local.task_memory - local.otel_memory - local.prometheus_proxy_memory,
essential = true,

environment = [
Expand Down Expand Up @@ -206,7 +211,7 @@ resource "aws_ecs_service" "app_service" {
cluster = aws_ecs_cluster.app_cluster.id
task_definition = aws_ecs_task_definition.app_task.arn
launch_type = "FARGATE"
desired_count = var.autoscaling_desired_count
desired_count = local.desired_count
propagate_tags = "TASK_DEFINITION"

# Wait for the service deployment to succeed
Expand Down
6 changes: 5 additions & 1 deletion terraform/ecs/cluster_autoscaling.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Stage-dependent autoscaling floor: the "prod" stage uses the configured
# var.autoscaling_min_capacity; every other stage is pinned to a minimum of 1.
locals {
autoscaling_min_capacity = module.this.stage == "prod" ? var.autoscaling_min_capacity : 1
}

resource "aws_appautoscaling_target" "ecs_target" {
min_capacity = var.autoscaling_min_capacity
min_capacity = local.autoscaling_min_capacity
max_capacity = var.autoscaling_max_capacity
resource_id = "service/${aws_ecs_cluster.app_cluster.name}/${aws_ecs_service.app_service.name}"
scalable_dimension = "ecs:service:DesiredCount"
Expand Down
10 changes: 10 additions & 0 deletions terraform/ecs/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,13 @@ output "load_balancer_arn_suffix" {
description = "The ARN suffix of the load balancer"
value = aws_lb.load_balancer.arn_suffix
}

# Exposes the name of the aws_cloudwatch_log_group.cluster resource to callers
# of this module.
output "log_group_app_name" {
description = "The name of the log group for the app"
value = aws_cloudwatch_log_group.cluster.name
}

# Exposes the ARN of the aws_cloudwatch_log_group.cluster resource to callers
# of this module.
output "log_group_app_arn" {
description = "The ARN of the log group for the app"
value = aws_cloudwatch_log_group.cluster.arn
}
34 changes: 26 additions & 8 deletions terraform/monitoring/dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@ local ds = {
},
};
local vars = {
namespace: 'Notify',
environment: std.extVar('environment'),
notifications: std.parseJson(std.extVar('notifications')),

ecs_service_name: std.extVar('ecs_service_name'),
load_balancer: std.extVar('load_balancer'),
target_group: std.extVar('target_group'),
redis_cluster_id: std.extVar('redis_cluster_id'),
ecs_service_name: std.extVar('ecs_service_name'),
load_balancer: std.extVar('load_balancer'),
target_group: std.extVar('target_group'),
redis_cluster_id: std.extVar('redis_cluster_id'),
log_group_app_name: std.extVar('log_group_app_name'),
log_group_app_arn: std.extVar('log_group_app_arn'),
aws_account_id: std.extVar('aws_account_id'),
};

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -52,8 +56,14 @@ dashboard.new(
)

.addPanels(layout.generate_grid([
row.new('ECS'),
row.new('Application'),
// panels.app.http_request_rate(ds, vars) { gridPos: pos._4 },
// panels.app.http_request_latency(ds, vars) { gridPos: pos._4 },
Comment on lines +60 to +61
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These panels were specific to Notify Server, but I'm leaving them here (commented out) as a reminder to put request rate and request latency at the top, since those are important high-level metrics.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened issue: #593

panels.ecs.availability(ds, vars) { gridPos: pos._3 },
panels.lb.error_5xx(ds, vars) { gridPos: pos._3 },
panels.lb.error_5xx_logs(ds, vars) { gridPos: pos._3 },

row.new('ECS'),
panels.ecs.memory(ds, vars) { gridPos: pos._3 },
panels.ecs.cpu(ds, vars) { gridPos: pos._3 },

Expand Down Expand Up @@ -109,9 +119,6 @@ dashboard.new(
panels.proxy.http_codes(ds, vars) { gridPos: pos.two_thirds },
panels.proxy.healthy_hosts(ds, vars) { gridPos: pos._3 },

row.new('Database'),
panels.db.redis_cpu_memory(ds, vars) { gridPos: pos._2 },

row.new('History Metrics'),
panels.history.requests(ds, vars) { gridPos: pos_short._3 },
panels.history.latency(ds, vars) { gridPos: pos_short._3 },
Expand All @@ -123,4 +130,15 @@ dashboard.new(
panels.identity.latency(ds, vars) { gridPos: pos_short._2 },
panels.identity.cache(ds, vars) { gridPos: pos_short._2 },
panels.identity.usage(ds, vars) { gridPos: pos_short._2 },

row.new('Redis'),
panels.redis.cpu(ds, vars) { gridPos: pos._2 },
panels.redis.memory(ds, vars) { gridPos: pos._2 },

row.new('Load Balancer'),
panels.lb.active_connections(ds, vars) { gridPos: pos._2 },
panels.lb.requests(ds, vars) { gridPos: pos._2 },

panels.lb.healthy_hosts(ds, vars) { gridPos: pos._3 },
panels.lb.error_4xx(ds, vars) { gridPos: pos._3 },
]))
2 changes: 1 addition & 1 deletion terraform/monitoring/grafonnet-lib
Submodule grafonnet-lib updated 61 files
+5 −0 .editorconfig
+21 −0 LICENSE
+12 −6 alert.libsonnet
+34 −8 alert_condition.libsonnet
+3 −0 defaults.libsonnet
+110 −58 defaults/alerts.libsonnet
+16 −25 defaults/configuration.libsonnet
+31 −51 defaults/overrides.libsonnet
+35 −0 defaults/panels.libsonnet
+29 −0 defaults/panels/aws/amqp/available_messages.libsonnet
+66 −0 defaults/panels/aws/amqp/cpu.libsonnet
+31 −0 defaults/panels/aws/amqp/in_flight_messages.libsonnet
+30 −0 defaults/panels/aws/amqp/memory.libsonnet
+41 −0 defaults/panels/aws/amqp/storage.libsonnet
+90 −0 defaults/panels/aws/docdb/available_memory.libsonnet
+35 −0 defaults/panels/aws/docdb/buffer_cache_hit_ratio.libsonnet
+31 −0 defaults/panels/aws/docdb/connections.libsonnet
+114 −0 defaults/panels/aws/docdb/cpu.libsonnet
+68 −0 defaults/panels/aws/docdb/low_mem_op_throttled.libsonnet
+28 −0 defaults/panels/aws/docdb/net_throughput.libsonnet
+36 −0 defaults/panels/aws/docdb/volume.libsonnet
+40 −0 defaults/panels/aws/docdb/write_latency.libsonnet
+78 −0 defaults/panels/aws/ecs/cpu.libsonnet
+118 −0 defaults/panels/aws/ecs/cpu_memory.libsonnet
+78 −0 defaults/panels/aws/ecs/memory.libsonnet
+60 −0 defaults/panels/aws/redis/cpu.libsonnet
+47 −0 defaults/panels/aws/redis/memory.libsonnet
+80 −0 defaults/panels/aws/redis/swap_usage.libsonnet
+60 −0 defaults/values.libsonnet
+242 −89 field_config.libsonnet
+61 −37 grafana.libsonnet
+19 −0 override.libsonnet
+26 −23 panels/panel.libsonnet
+4 −0 panels/table.libsonnet
+167 −202 panels/timeseries.libsonnet
+631 −18 targets/cloudwatch.libsonnet
+27 −0 tests/defaults/alerts.jsonnet
+6 −0 tests/defaults/configuration.jsonnet
+11 −0 tests/defaults/panels/docdb/available_memory.jsonnet
+8 −0 tests/defaults/panels/docdb/buffer_cache_hit_ratio.jsonnet
+8 −0 tests/defaults/panels/docdb/connections.jsonnet
+11 −0 tests/defaults/panels/docdb/cpu.jsonnet
+11 −0 tests/defaults/panels/docdb/low_mem_op_throttled.jsonnet
+8 −0 tests/defaults/panels/docdb/net_throughput.jsonnet
+8 −0 tests/defaults/panels/docdb/volume.jsonnet
+8 −0 tests/defaults/panels/docdb/write_latency.jsonnet
+11 −0 tests/defaults/panels/ecs/cpu.jsonnet
+11 −0 tests/defaults/panels/ecs/cpu_memory.jsonnet
+11 −0 tests/defaults/panels/ecs/memory.jsonnet
+11 −0 tests/defaults/panels/redis/cpu.jsonnet
+11 −0 tests/defaults/panels/redis/memory.jsonnet
+11 −0 tests/defaults/panels/redis/swap_usage.jsonnet
+37 −0 tests/field_config.jsonnet
+43 −0 tests/panels/timeseries.jsonnet
+36 −0 tests/utils/arrays.jsonnet
+27 −0 tests/utils/strings.jsonnet
+64 −0 tests/utils/units.jsonnet
+6 −0 threshold.libsonnet
+14 −0 utils/arrays.libsonnet
+17 −4 utils/strings.libsonnet
+89 −0 utils/units.libsonnet
13 changes: 8 additions & 5 deletions terraform/monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@ data "jsonnet_file" "dashboard" {
environment = module.this.stage
notifications = jsonencode(var.notification_channels)

ecs_service_name = var.ecs_service_name
ecs_task_family = var.ecs_task_family
load_balancer = var.load_balancer_arn
target_group = var.ecs_target_group_arn
redis_cluster_id = var.redis_cluster_id
ecs_service_name = var.ecs_service_name
ecs_task_family = var.ecs_task_family
load_balancer = var.load_balancer_arn
target_group = var.ecs_target_group_arn
redis_cluster_id = var.redis_cluster_id
log_group_app_name = var.log_group_app_name
log_group_app_arn = var.log_group_app_arn
aws_account_id = var.aws_account_id
}
}

Expand Down
76 changes: 0 additions & 76 deletions terraform/monitoring/panels/db/redis_cpu_memory.libsonnet

This file was deleted.

2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/availability.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ local error_alert(vars) = alert.new(
axisSoftMax = 100,
)
)
.setAlert(error_alert(vars))
.setAlert(vars.environment, error_alert(vars))

.addTarget(targets.prometheus(
datasource = ds.prometheus,
Expand Down
2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/cpu.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ local overrides = defaults.overrides;
datasource = ds.prometheus,
)
.configure(overrides.cpu(defaults.configuration.timeseries_resource))
.setAlert(alert.new(
.setAlert(vars.environment, alert.new(
namespace = 'RPC Proxy',
name = "RPC %s - High CPU usage" % vars.environment,
message = "RPC %s - High CPU usage" % vars.environment,
Expand Down
2 changes: 1 addition & 1 deletion terraform/monitoring/panels/ecs/memory.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ local overrides = defaults.overrides;
datasource = ds.prometheus,
)
.configure(defaults.overrides.memory(defaults.configuration.timeseries_resource))
.setAlert(alert.new(
.setAlert(vars.environment, alert.new(
namespace = 'RPC Proxy',
name = "RPC %s - High Memory (RAM) usage" % vars.environment,
message = "RPC %s - High Memory (RAM) usage" % vars.environment,
Expand Down
24 changes: 24 additions & 0 deletions terraform/monitoring/panels/lb/active_connections.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Load balancer panel: 'Active Connections'.
// Charts the CloudWatch metric 'ActiveConnectionCount' from the
// 'AWS/ApplicationELB' namespace for a single load balancer.
local grafana = import '../../grafonnet-lib/grafana.libsonnet';
local defaults = import '../../grafonnet-lib/defaults.libsonnet';

local panels = grafana.panels;
local targets = grafana.targets;

{
// Builds the timeseries panel.
//   ds   - datasource bundle; only ds.cloudwatch is read here.
//   vars - dashboard variables; only vars.load_balancer is read here.
new(ds, vars)::
panels.timeseries(
title = 'Active Connections',
datasource = ds.cloudwatch,
)
// Apply the shared default timeseries configuration from grafonnet-lib.
.configure(defaults.configuration.timeseries)

.addTarget(targets.cloudwatch(
datasource = ds.cloudwatch,
namespace = 'AWS/ApplicationELB',
metricName = 'ActiveConnectionCount',
dimensions = {
// NOTE(review): presumably this must be the load balancer identifier in the
// form CloudWatch expects for the LoadBalancer dimension — confirm what
// value the caller passes in vars.load_balancer.
LoadBalancer: vars.load_balancer
},
statistic = 'Average',
))
}
Loading
Loading