From b77f1c0b9b2468fd41f4a2a5681d9bf409420590 Mon Sep 17 00:00:00 2001 From: John Duffell Date: Wed, 9 Oct 2024 20:09:20 +0100 Subject: [PATCH] add log links to alarm chat messages --- handlers/alarms-handler/src/alarmMappings.ts | 116 +++++++--- handlers/alarms-handler/src/cloudwatch.ts | 23 +- handlers/alarms-handler/src/index.ts | 210 ++++++++++++------ .../alarms-handler/test/alarmMappings.test.ts | 25 ++- handlers/alarms-handler/test/index.test.ts | 108 ++++++++- 5 files changed, 363 insertions(+), 119 deletions(-) diff --git a/handlers/alarms-handler/src/alarmMappings.ts b/handlers/alarms-handler/src/alarmMappings.ts index 411bd6d384..163b140e6b 100644 --- a/handlers/alarms-handler/src/alarmMappings.ts +++ b/handlers/alarms-handler/src/alarmMappings.ts @@ -1,3 +1,4 @@ +import { groupBy } from '@modules/arrayFunctions'; import { getIfDefined } from '@modules/nullAndUndefined'; type Team = 'VALUE' | 'GROWTH' | 'PORTFOLIO' | 'PLATFORM' | 'SRE'; @@ -17,7 +18,8 @@ const sharedMobilePurchasesApps = [ 'mobile-purchases-google-update-subscriptions', ]; -const teamToAppMappings: Record = { +type AppInfo = string | { app: string; logGroups: string[] }; +export const teamToAppMappings: Record = { GROWTH: [ 'acquisition-events-api', 'admin-console', @@ -71,11 +73,17 @@ const teamToAppMappings: Record = { 'zuora-creditor', // support-frontend - 'frontend', + { app: 'frontend', logGroups: ['support-frontend'] }, 'it-test-runner', 'stripe-intent', - 'workers', - 'payment-api', + { + app: 'workers', + logGroups: [ + '/aws/lambda/CreatePaymentMethod', + '/aws/lambda/CreateZuoraSubscription', //etc + ], + }, + { app: 'payment-api', logGroups: ['support-payment-api'] }, // support-service-lambdas 'digital-voucher-suspension-processor', @@ -107,33 +115,81 @@ const teamToAppMappings: Record = { ], }; -const buildAppToTeamMappings = (): Record => { - const mappings: Record = {}; - - for (const [team, apps] of Object.entries(teamToAppMappings)) { - for (const app of apps) { - const teams = mappings[app] ?? []; - teams.push(team as Team); - - mappings[app] = teams; - } +export class AlarmMappings { + constructor(mappings: Record = teamToAppMappings) { + this.appToTeamMappings = this.buildAppToTeamMappings(mappings); + this.appToLogGroupOverrides = this.buildAppToLogGroupOverrides(mappings); } - return mappings; -}; - -const appToTeamMappings: Record = buildAppToTeamMappings(); -export const getTeams = (appName?: string): Team[] => { - if (appName && appToTeamMappings[appName]) { - return appToTeamMappings[appName] as Team[]; - } + private buildAppToTeamMappings = ( + theMappings: Record, + ): Record => { + const entries: Array<[Team, AppInfo[]]> = Object.entries(theMappings) as Array<[ + Team, + AppInfo[], + ]>; // `as` - hmm? + + const teamToApp: Array<{ app: string; team: Team }> = entries.flatMap( + ([team, appInfos]) => + appInfos.map((appInfo) => { + const app = typeof appInfo === 'string' ? appInfo : appInfo.app; + return { team, app }; + }), + ); + const groups = groupBy(teamToApp, ({ app }) => app); + + const mappings: Record = Object.fromEntries( + Object.entries(groups).map(([app, info]) => [ + app, + info.map(({ team }) => team), + ]), + ); + + return mappings; + }; + + private buildAppToLogGroupOverrides = ( + theMappings: Record, + ): Record => { + return Object.fromEntries( + Object.values(theMappings) + .flatMap((appInfos) => appInfos) + .flatMap((appInfo) => + typeof appInfo !== 'string' ? [[appInfo.app, appInfo.logGroups]] : [], + ), + ); + }; + + private appToTeamMappings: Record; + private appToLogGroupOverrides: Record; + + getTeams = (appName?: string): Team[] => { + if (appName && this.appToTeamMappings[appName]) { + return this.appToTeamMappings[appName] as Team[]; + } - return ['SRE']; -}; + return ['SRE']; + }; + + getTeamWebhookUrl = (team: Team): string => { + return getIfDefined( + process.env[`${team}_WEBHOOK`], + `${team}_WEBHOOK environment variable not set`, + ); + }; + + getLogGroups = (appName: string, stage: string): string[] => { + // currently we assume the log group is /aws/lambda/-, we can add overrides to the appToTeamMappings later + const logGroup = this.appToLogGroupOverrides[appName]; + if (logGroup === undefined) { + // assume it's a lambda + console.log('logGroup', logGroup); + const lambdaName = appName + '-' + stage; + + const logGroupName = '/aws/lambda/' + lambdaName; + return [logGroupName]; + } -export const getTeamWebhookUrl = (team: Team): string => { - return getIfDefined( - process.env[`${team}_WEBHOOK`], - `${team}_WEBHOOK environment variable not set`, - ); -}; + return logGroup.map((override) => override + '-' + stage); + }; +} diff --git a/handlers/alarms-handler/src/cloudwatch.ts b/handlers/alarms-handler/src/cloudwatch.ts index 0b585eb662..fa8eba36ff 100644 --- a/handlers/alarms-handler/src/cloudwatch.ts +++ b/handlers/alarms-handler/src/cloudwatch.ts @@ -49,10 +49,15 @@ const buildCloudwatchClient = (awsAccountId: string): CloudWatchClient => { return new CloudWatchClient({ region: 'eu-west-1' }); }; -const getTags = async ( +export type Tags = { + App?: string; + Stage?: string; +}; + +export const getTags = async ( alarmArn: string, awsAccountId: string, -): Promise => { +): Promise => { const client = buildCloudwatchClient(awsAccountId); const request = new ListTagsForResourceCommand({ @@ -60,13 +65,9 @@ const getTags = async ( }); const response = await client.send(request); - return response.Tags ?? []; -}; - -export const getAppNameTag = async ( - alarmArn: string, - awsAccountId: string, -): Promise => { - const tags = await getTags(alarmArn, awsAccountId); - return tags.find((tag: Tag) => tag.Key === 'App')?.Value; + const tags = response.Tags ?? []; + const entries = tags.flatMap((tag: Tag) => + tag.Key && tag.Value ? [[tag.Key, tag.Value]] : [], + ); + return Object.fromEntries(entries) as Tags; }; diff --git a/handlers/alarms-handler/src/index.ts b/handlers/alarms-handler/src/index.ts index f2eeaa12d1..88e4f6c85e 100644 --- a/handlers/alarms-handler/src/index.ts +++ b/handlers/alarms-handler/src/index.ts @@ -1,7 +1,7 @@ -import type { SNSEventRecord, SQSEvent } from 'aws-lambda'; +import type { SNSEventRecord, SQSEvent, SQSRecord } from 'aws-lambda'; import { z } from 'zod'; -import { getTeams, getTeamWebhookUrl } from './alarmMappings'; -import { getAppNameTag } from './cloudwatch'; +import { AlarmMappings } from './alarmMappings'; +import { getTags } from './cloudwatch'; const cloudWatchAlarmMessageSchema = z.object({ AlarmArn: z.string(), @@ -10,30 +10,33 @@ const cloudWatchAlarmMessageSchema = z.object({ NewStateReason: z.string(), NewStateValue: z.string(), AWSAccountId: z.string(), + StateChangeTime: z.coerce.date(), + Trigger: z + .object({ + Period: z.number(), + EvaluationPeriods: z.number(), + }) + .optional(), }); type CloudWatchAlarmMessage = z.infer; export const handler = async (event: SQSEvent): Promise => { try { + const alarmMappings = new AlarmMappings(); for (const record of event.Records) { - console.log(record); - - const { Message, MessageAttributes } = JSON.parse( - record.body, - ) as SNSEventRecord['Sns']; - - const parsedMessage = attemptToParseMessageString({ - messageString: Message, - }); - - if (parsedMessage) { - await handleCloudWatchAlarmMessage({ message: parsedMessage }); - } else { - await handleSnsPublishMessage({ - message: Message, - messageAttributes: MessageAttributes, - }); + const maybeChatMessages = await getChatMessages(record, alarmMappings); + + if (maybeChatMessages) { + await Promise.all( + maybeChatMessages.webhookUrls.map((webhookUrl) => { + return fetch(webhookUrl, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: maybeChatMessages.text }), + }); + }), + ); } } } catch (error) { @@ -42,11 +45,38 @@ export const handler = async (event: SQSEvent): Promise => { } }; -const attemptToParseMessageString = ({ - messageString, -}: { - messageString: string; -}): CloudWatchAlarmMessage | null => { +export async function getChatMessages( + record: SQSRecord, + alarmMappings: AlarmMappings, +) { + console.log('sqsRecord', record); + + const snsEvent = JSON.parse(record.body) as SNSEventRecord['Sns']; + + console.log('snsEvent', snsEvent); + + const parsedMessage = attemptToParseMessageString(snsEvent.Message); + + console.log('parsedMessage', parsedMessage); + + const message = parsedMessage + ? await getCloudWatchAlarmMessage(parsedMessage, alarmMappings) + : getSnsPublishMessage({ + message: snsEvent.Message, + messageAttributes: snsEvent.MessageAttributes, + }); + + if (message) { + const teams = alarmMappings.getTeams(message.app); + console.log('sending message to teams', teams); + const webhookUrls = teams.map(alarmMappings.getTeamWebhookUrl); + return { webhookUrls, text: message.text }; + } else return undefined; +} + +const attemptToParseMessageString = ( + messageString: string, +): CloudWatchAlarmMessage | null => { try { return cloudWatchAlarmMessageSchema.parse(JSON.parse(messageString)); } catch (error) { @@ -54,47 +84,96 @@ const attemptToParseMessageString = ({ } }; -const handleCloudWatchAlarmMessage = async ({ - message, -}: { - message: CloudWatchAlarmMessage; -}) => { - const { +async function getCloudWatchAlarmMessage( + { AlarmArn, AlarmName, NewStateReason, NewStateValue, AlarmDescription, AWSAccountId, - } = message; - - const app = await getAppNameTag(AlarmArn, AWSAccountId); - const teams = getTeams(app); - - await Promise.all( - teams.map((team) => { - const webhookUrl = getTeamWebhookUrl(team); - - const title = - NewStateValue === 'OK' - ? `✅ *ALARM OK:* ${AlarmName} has recovered!` - : `🚨 *ALARM:* ${AlarmName} has triggered!`; - const text = `${title}\n\n*Description:* ${ - AlarmDescription ?? '' - }\n\n*Reason:* ${NewStateReason}`; - - console.log(`CloudWatch alarm from ${app} owned by ${team}`); + StateChangeTime, + Trigger, + }: CloudWatchAlarmMessage, + alarmMappings: AlarmMappings, +) { + const tags = await getTags(AlarmArn, AWSAccountId); + console.log('tags', tags); + const { App, Stage } = tags; + + const logGroupNames = + App && Stage ? alarmMappings.getLogGroups(App, Stage) : []; + + const links = logGroupNames.map((logGroupName) => + getCloudwatchLogsLink(logGroupName, Trigger, StateChangeTime), + ); - return fetch(webhookUrl, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ text }), - }); - }), + const text = getText( + NewStateValue, + AlarmName, + AlarmDescription, + NewStateReason, + links, ); -}; -const handleSnsPublishMessage = async ({ + console.log(`CloudWatch alarm from ${App}, content ${text}`); + + return text ? { app: App, text } : undefined; +} + +function getCloudwatchLogsLink( + logGroupName: string, + Trigger: + | { + Period: number; + EvaluationPeriods: number; + } + | undefined, + StateChangeTime: Date, +) { + const assumedTimeForCompositeAlarms = 300; + const alarmCoveredTimeSeconds = Trigger + ? Trigger.EvaluationPeriods * Trigger.Period + : assumedTimeForCompositeAlarms; + const alarmEndTimeMillis = StateChangeTime.getTime(); + const alarmStartTimeMillis = + alarmEndTimeMillis - 1000 * alarmCoveredTimeSeconds; + + const cloudwatchLogsBaseUrl = + 'https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logsV2:log-groups/log-group/'; + const logLink = + cloudwatchLogsBaseUrl + + logGroupName.replaceAll('/', '$252F') + + '/log-events$3Fstart$3D' + + alarmStartTimeMillis + + '$26filterPattern$3D$26end$3D' + + alarmEndTimeMillis; + + return logLink; +} + +function getText( + NewStateValue: string, + AlarmName: string, + AlarmDescription: string | null | undefined, + NewStateReason: string, + links: string[], +) { + const title = + NewStateValue === 'OK' + ? `✅ *ALARM OK:* ${AlarmName} has recovered!` + : `🚨 *ALARM:* ${AlarmName} has triggered!`; + const text = [ + title, + `*Description:* ${AlarmDescription ?? ''}`, + `*Reason:* ${NewStateReason}`, + ] + .concat(links.map((link) => `*LogLink*: ${link}`)) + .join('\n\n'); + return text; +} + +const getSnsPublishMessage = ({ message, messageAttributes, }: { @@ -106,21 +185,10 @@ const handleSnsPublishMessage = async ({ if (stage && stage !== 'PROD') return; const app = messageAttributes.app?.Value; - const teams = getTeams(app); - await Promise.all( - teams.map((team) => { - const webhookUrl = getTeamWebhookUrl(team); + const text = message; - const text = message; + console.log(`SNS publish message from ${app}, content ${text}`); - console.log(`SNS publish message from ${app} owned by ${team}`); - - return fetch(webhookUrl, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ text }), - }); - }), - ); + return { app, text }; }; diff --git a/handlers/alarms-handler/test/alarmMappings.test.ts b/handlers/alarms-handler/test/alarmMappings.test.ts index 222bda04a7..b5516f0d41 100644 --- a/handlers/alarms-handler/test/alarmMappings.test.ts +++ b/handlers/alarms-handler/test/alarmMappings.test.ts @@ -1,4 +1,6 @@ -import { getTeams } from '../src/alarmMappings'; +import { AlarmMappings } from '../src/alarmMappings'; + +const { getTeams, getLogGroups } = new AlarmMappings(); describe('getTeam', () => { it('returns the correct team for a given app', () => { @@ -33,3 +35,24 @@ describe('getTeam', () => { expect(team).toEqual(['GROWTH', 'VALUE']); }); }); + +describe('getLogGroups', () => { + it('generates a default log group for a lambda', () => { + const app = 'discount-api'; + + const logGroups = getLogGroups(app, 'CODE'); + + expect(logGroups).toEqual(['/aws/lambda/discount-api-CODE']); + }); + + it('handles multi log groups', () => { + const app = 'workers'; + + const logGroups = getLogGroups(app, 'CODE'); + + expect(logGroups).toEqual([ + '/aws/lambda/CreatePaymentMethod-CODE', + '/aws/lambda/CreateZuoraSubscription-CODE', + ]); + }); +}); diff --git a/handlers/alarms-handler/test/index.test.ts b/handlers/alarms-handler/test/index.test.ts index ebda8ac1c5..d1305362a7 100644 --- a/handlers/alarms-handler/test/index.test.ts +++ b/handlers/alarms-handler/test/index.test.ts @@ -1,6 +1,7 @@ -import { type SQSEvent } from 'aws-lambda'; -import { handler } from '../src'; -import { getAppNameTag } from '../src/cloudwatch'; +import { type SQSEvent, SQSRecord } from 'aws-lambda'; +import { getChatMessages, handler } from '../src'; +import { getTags } from '../src/cloudwatch'; +import { AlarmMappings } from '../src/alarmMappings'; jest.mock('../src/cloudwatch'); @@ -23,6 +24,7 @@ describe('Handler', () => { NewStateValue: 'ALARM', AlarmDescription: 'description', AWSAccountId: '111111', + StateChangeTime: '2024-10-09T07:23:16.236+0000', }), }), }, @@ -51,7 +53,10 @@ describe('Handler', () => { }); it('should handle CloudWatch alarm message', async () => { - (getAppNameTag as jest.Mock).mockResolvedValueOnce('mock-app'); + (getTags as jest.Mock).mockResolvedValueOnce({ + App: 'mock-app', + Stage: 'CODE', + }); jest .spyOn(global, 'fetch') @@ -59,11 +64,36 @@ describe('Handler', () => { await handler(mockCloudWatchAlarmEvent); - expect(getAppNameTag).toHaveBeenCalledWith('mock-arn', '111111'); + expect(getTags).toHaveBeenCalledWith('mock-arn', '111111'); expect(fetch).toHaveBeenCalledWith(mockEnv.SRE_WEBHOOK, expect.any(Object)); }); + it('should handle captured CloudWatch alarm message', async () => { + (getTags as jest.Mock).mockResolvedValueOnce({ + App: 'mock-app', + Stage: 'CODE', + }); + + const result = await getChatMessages( + fullCloudWatchAlarmEvent, + new AlarmMappings({ SRE: ['mock-app'] }), + ); + + expect(getTags).toHaveBeenCalledWith( + 'arn:aws:cloudwatch:eu-west-1:1234:alarm:DISCOUNT-API-CODE Discount-api 5XX response', + '1234', + ); + const expectedText = + '🚨 *ALARM:* DISCOUNT-API-CODE Discount-api 5XX response has triggered!\n\n' + + '*Description:* Impact - Discount api returned a 5XX response check the logs for more information: https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252Fdiscount-api-CODE. Follow the process in https://docs.google.com/document/d/sdkjfhskjdfhksjdhf/edit\n\n' + + '*Reason:* Threshold Crossed: 1 datapoint [2.0 (09/10/24 07:18:00)] was greater than or equal to the threshold (1.0).\n\n' + + '*LogLink*: https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252Fmock-app-CODE/log-events$3Fstart$3D1728458296236$26filterPattern$3D$26end$3D1728458596236'; + expect(result?.webhookUrls).toEqual([mockEnv.SRE_WEBHOOK]); + expect(result?.text).toEqual(expectedText); + }); + it('should handle SNS publish message', async () => { + (getTags as jest.Mock).mockResolvedValueOnce({}); jest .spyOn(global, 'fetch') .mockResolvedValue(Promise.resolve(new Response(JSON.stringify({})))); @@ -74,6 +104,7 @@ describe('Handler', () => { }); it('should throw error if the fetch HTTP call fails', async () => { + (getTags as jest.Mock).mockResolvedValueOnce({}); jest .spyOn(global, 'fetch') .mockResolvedValue(Promise.reject(new Error('Fetch error'))); @@ -84,7 +115,10 @@ describe('Handler', () => { }); it('calls the webhook with the correct data for an OK action', async () => { - (getAppNameTag as jest.Mock).mockResolvedValueOnce('mock-app'); + (getTags as jest.Mock).mockResolvedValueOnce({ + App: 'mock-app', + Stage: 'CODE', + }); jest .spyOn(global, 'fetch') .mockResolvedValue(Promise.resolve(new Response(JSON.stringify({})))); @@ -99,6 +133,7 @@ describe('Handler', () => { NewStateValue: 'OK', AlarmDescription: 'description', AWSAccountId: '111111', + StateChangeTime: '2024-10-09T07:23:16.236+0000', }), }), }, @@ -115,3 +150,64 @@ describe('Handler', () => { ); }); }); + +const fullCloudWatchAlarmEvent = { + messageId: 'askjdhaskjhdjkashdakjsdjkashd', + receiptHandle: 'skdfhksjdfhksjdhfkjsdhfjkhsdfksd==', + body: JSON.stringify({ + Type: 'Notification', + MessageId: 'sdkfjhslkdfhjksjdhfkjsdhf', + TopicArn: 'arn:aws:sns:eu-west-1:123456:alarms-handler-topic-CODE', + Subject: + 'ALARM: "DISCOUNT-API-CODE Discount-api 5XX response" in EU (Ireland)', + Message: JSON.stringify({ + AlarmName: 'DISCOUNT-API-CODE Discount-api 5XX response', + AlarmDescription: + 'Impact - Discount api returned a 5XX response check the logs for more information: https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logsV2:log-groups/log-group/$252Faws$252Flambda$252Fdiscount-api-CODE. Follow the process in https://docs.google.com/document/d/sdkjfhskjdfhksjdhf/edit', + AWSAccountId: '1234', + AlarmConfigurationUpdatedTimestamp: '2024-09-23T09:21:15.363+0000', + NewStateValue: 'ALARM', + NewStateReason: + 'Threshold Crossed: 1 datapoint [2.0 (09/10/24 07:18:00)] was greater than or equal to the threshold (1.0).', + StateChangeTime: '2024-10-09T07:23:16.236+0000', + Region: 'EU (Ireland)', + AlarmArn: + 'arn:aws:cloudwatch:eu-west-1:1234:alarm:DISCOUNT-API-CODE Discount-api 5XX response', + OldStateValue: 'OK', + OKActions: [], + AlarmActions: ['arn:aws:sns:eu-west-1:1234:alarms-handler-topic-CODE'], + InsufficientDataActions: [], + Trigger: { + MetricName: '5XXError', + Namespace: 'AWS/ApiGateway', + StatisticType: 'Statistic', + Statistic: 'SUM', + Unit: null, + Dimensions: [[Object]], + Period: 300, + EvaluationPeriods: 1, + ComparisonOperator: 'GreaterThanOrEqualToThreshold', + Threshold: 1, + TreatMissingData: '', + EvaluateLowSampleCountPercentile: '', + }, + }), + Timestamp: '2024-10-09T07:23:16.318Z', + SignatureVersion: '1', + Signature: 'skjefhksjdhfkjsdhfkjsdhfkjsdf==', + SigningCertURL: 'https://sns.eu-west-1.amazonaws.com/smhdfsmdfhgsdjf.pem', + UnsubscribeURL: + 'https://sns.eu-west-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:eu-west-1:1234:alarms-handler-topic-CODE:sdkjfhsdkjfhskdjf', + }), + attributes: { + ApproximateReceiveCount: '1', + SentTimestamp: '1728458596353', + SenderId: 'askjdhaskjdhaksdj', + ApproximateFirstReceiveTimestamp: '1728458596364', + }, + messageAttributes: {}, + md5OfBody: 'askdjalksdjlasdjlaksjd', + eventSource: 'aws:sqs', + eventSourceARN: 'arn:aws:sqs:eu-west-1:1234:alarms-handler-queue-CODE', + awsRegion: 'eu-west-1', +} as SQSRecord;