From b8c3648231912b43bcf8a814b92cc24796a297e7 Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:14:49 -0400 Subject: [PATCH 1/5] fix: make otel tracing leaner and synchronous --- package.json | 2 +- src/controllers/opentelemetry.ts | 14 +++++++++----- src/controllers/threading/taskHandler.ts | 3 +-- src/controllers/threading/threadHandler.ts | 7 ++++++- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/package.json b/package.json index d24b3de..2a7c03e 100644 --- a/package.json +++ b/package.json @@ -81,10 +81,10 @@ "@bull-board/api": "^5.9.1", "@bull-board/express": "^5.9.1", "@opentelemetry/api": "^1.7.0", - "@opentelemetry/auto-instrumentations-node": "^0.44.0", "@opentelemetry/exporter-jaeger": "^1.19.0", "@opentelemetry/exporter-metrics-otlp-proto": "^0.50.0", "@opentelemetry/exporter-trace-otlp-proto": "^0.50.0", + "@opentelemetry/instrumentation-http": "^0.53.0", "@opentelemetry/resources": "^1.18.1", "@opentelemetry/sdk-metrics": "^1.18.1", "@opentelemetry/sdk-node": "^0.50.0", diff --git a/src/controllers/opentelemetry.ts b/src/controllers/opentelemetry.ts index e05793a..4e5dfa1 100644 --- a/src/controllers/opentelemetry.ts +++ b/src/controllers/opentelemetry.ts @@ -1,5 +1,6 @@ import { NodeSDK } from "@opentelemetry/sdk-node"; -import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node"; +import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-node"; +import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; import { Resource } from "@opentelemetry/resources"; import Debug from "debug"; import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-proto'; @@ -9,17 +10,20 @@ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; const jaegerHost = process.env.JAEGER_HOST ?? 'jaeger-otel-collector.sri'; const jaegerPort = process.env.JAEGER_PORT ?? 4318; const jaegerResName = process.env.JAEGER_RES_NAME ?? '/v1/traces'; +const traceExporter = new OTLPTraceExporter({ + url: `http://${jaegerHost}:${jaegerPort}${jaegerResName}` +}); debug("Initializing Opentelemetry instrumentation..."); const sdk = new NodeSDK({ // metrics, if needed, shall be exported on a different endpoint - traceExporter: new OTLPTraceExporter({ - url: `http://${jaegerHost}:${jaegerPort}${jaegerResName}` - }), - instrumentations: [getNodeAutoInstrumentations()], + // trace a subset of instrumentations to avoid performance overhead + instrumentations: [new HttpInstrumentation()], resource: new Resource({ [ATTR_SERVICE_NAME]: "biothings-explorer", }), + // use simple span processor to avoid losing data when the forked process exits (taskHandler) + spanProcessors: [new SimpleSpanProcessor(traceExporter)], }); debug(`OTel URL http://${jaegerHost}:${jaegerPort}${jaegerResName}`); sdk.start(); diff --git a/src/controllers/threading/taskHandler.ts b/src/controllers/threading/taskHandler.ts index 78b2629..4e9705a 100644 --- a/src/controllers/threading/taskHandler.ts +++ b/src/controllers/threading/taskHandler.ts @@ -91,14 +91,13 @@ async function runTask({ }); let activeContext: Context = propagation.extract(context.active(), { traceparent, tracestate }); - debug(`OTel task ${traceparent} and ${tracestate}`); + debug(`OTel task context: ${traceparent} and ${tracestate}`); let tracer = trace.getTracer("biothings-explorer-thread") span = tracer.startSpan( routeNames[route], {kind: 1}, // specifies internal span activeContext, ); - debug(`OTel span created ${span}`); span.setAttribute("bte.requestData", JSON.stringify(req.data.queryGraph)); Telemetry.setOtelSpan(span); diff --git a/src/controllers/threading/threadHandler.ts b/src/controllers/threading/threadHandler.ts index e9a595c..894caa8 100644 --- a/src/controllers/threading/threadHandler.ts +++ b/src/controllers/threading/threadHandler.ts @@ -1,6 +1,6 @@ import { MessageChannel, threadId } from "worker_threads"; import Debug from "debug"; -import { context, propagation } from "@opentelemetry/api"; +import { context, propagation, trace } from "@opentelemetry/api"; const debug = Debug("bte:biothings-explorer-trapi:threading"); import path from "path"; import { redisClient } from "@biothings-explorer/utils"; @@ -221,6 +221,11 @@ export async function runTask(req: Request, res: Response, route: string, useBul propagation.inject(context.active(), otelData); const { traceparent, tracestate } = otelData; + // add req dest to root span name as HTTP instrumentation doesn't do it automatically + const activeContext = context.active(); + const rootSpan = trace.getSpan(activeContext); + rootSpan.updateName(`${req.method} ${req.originalUrl}`); + const taskInfo: TaskInfo = { data: { route, From 677886ccd47a3b455484c448a5dfb362ea442307 Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:44:44 -0400 Subject: [PATCH 2/5] chore: ignore ts lint error --- src/controllers/opentelemetry.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/controllers/opentelemetry.ts b/src/controllers/opentelemetry.ts index 4e5dfa1..252b286 100644 --- a/src/controllers/opentelemetry.ts +++ b/src/controllers/opentelemetry.ts @@ -23,6 +23,7 @@ const sdk = new NodeSDK({ [ATTR_SERVICE_NAME]: "biothings-explorer", }), // use simple span processor to avoid losing data when the forked process exits (taskHandler) + // @ts-ignore - fix from MetinSeylan/Nestjs-OpenTelemetry#63 spanProcessors: [new SimpleSpanProcessor(traceExporter)], }); debug(`OTel URL http://${jaegerHost}:${jaegerPort}${jaegerResName}`); From daac52bc50358bba542810f0538ae5779ce12ccd Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:57:56 -0400 Subject: [PATCH 3/5] fix: update root span only if it exists --- src/controllers/threading/threadHandler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/controllers/threading/threadHandler.ts b/src/controllers/threading/threadHandler.ts index 894caa8..5a8a771 100644 --- a/src/controllers/threading/threadHandler.ts +++ b/src/controllers/threading/threadHandler.ts @@ -224,7 +224,7 @@ export async function runTask(req: Request, res: Response, route: string, useBul // add req dest to root span name as HTTP instrumentation doesn't do it automatically const activeContext = context.active(); const rootSpan = trace.getSpan(activeContext); - rootSpan.updateName(`${req.method} ${req.originalUrl}`); + if (rootSpan != undefined) rootSpan.updateName(`${req.method} ${req.originalUrl}`); const taskInfo: TaskInfo = { data: { From 08aefaabf08440bf1b34dd7d7959cd258a0f2edc Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:01:10 -0400 Subject: [PATCH 4/5] chore: add type hints to otel vars --- src/controllers/threading/taskHandler.ts | 4 ++-- src/controllers/threading/threadHandler.ts | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/controllers/threading/taskHandler.ts b/src/controllers/threading/taskHandler.ts index 4e9705a..1a42dbd 100644 --- a/src/controllers/threading/taskHandler.ts +++ b/src/controllers/threading/taskHandler.ts @@ -14,7 +14,7 @@ import { tasks } from "../../routes/index"; import { getQueryQueue } from "../async/asyncquery_queue"; import * as Sentry from "@sentry/node"; import { ProfilingIntegration } from "@sentry/profiling-node"; -import { Span, trace, context, propagation, Context } from "@opentelemetry/api"; +import { Span, trace, context, propagation, Context, Tracer } from "@opentelemetry/api"; import { Telemetry } from "@biothings-explorer/utils"; import { InnerTaskData } from "@biothings-explorer/types"; @@ -92,7 +92,7 @@ async function runTask({ let activeContext: Context = propagation.extract(context.active(), { traceparent, tracestate }); debug(`OTel task context: ${traceparent} and ${tracestate}`); - let tracer = trace.getTracer("biothings-explorer-thread") + let tracer: Tracer = trace.getTracer("biothings-explorer-thread") span = tracer.startSpan( routeNames[route], {kind: 1}, // specifies internal span diff --git a/src/controllers/threading/threadHandler.ts b/src/controllers/threading/threadHandler.ts index 5a8a771..c23e69c 100644 --- a/src/controllers/threading/threadHandler.ts +++ b/src/controllers/threading/threadHandler.ts @@ -1,6 +1,6 @@ import { MessageChannel, threadId } from "worker_threads"; import Debug from "debug"; -import { context, propagation, trace } from "@opentelemetry/api"; +import { context, propagation, trace, Context, Span } from "@opentelemetry/api"; const debug = Debug("bte:biothings-explorer-trapi:threading"); import path from "path"; import { redisClient } from "@biothings-explorer/utils"; @@ -222,8 +222,8 @@ export async function runTask(req: Request, res: Response, route: string, useBul const { traceparent, tracestate } = otelData; // add req dest to root span name as HTTP instrumentation doesn't do it automatically - const activeContext = context.active(); - const rootSpan = trace.getSpan(activeContext); + const activeContext: Context = context.active(); + const rootSpan: Span = trace.getSpan(activeContext); if (rootSpan != undefined) rootSpan.updateName(`${req.method} ${req.originalUrl}`); const taskInfo: TaskInfo = { From 0d30d9b56d6188223874edf93e48d3401c6e0d5f Mon Sep 17 00:00:00 2001 From: NeuralFlux <40491005+NeuralFlux@users.noreply.github.com> Date: Thu, 10 Oct 2024 17:50:59 -0400 Subject: [PATCH 5/5] fix: switch to BatchSpanProcessor and flush spans when thread exits --- src/controllers/opentelemetry.ts | 15 +++++++++++---- src/controllers/threading/taskHandler.ts | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/controllers/opentelemetry.ts b/src/controllers/opentelemetry.ts index 252b286..77b81e3 100644 --- a/src/controllers/opentelemetry.ts +++ b/src/controllers/opentelemetry.ts @@ -1,5 +1,5 @@ import { NodeSDK } from "@opentelemetry/sdk-node"; -import { SimpleSpanProcessor } from "@opentelemetry/sdk-trace-node"; +import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-node"; import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; import { Resource } from "@opentelemetry/resources"; import Debug from "debug"; @@ -10,9 +10,11 @@ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; const jaegerHost = process.env.JAEGER_HOST ?? 'jaeger-otel-collector.sri'; const jaegerPort = process.env.JAEGER_PORT ?? 4318; const jaegerResName = process.env.JAEGER_RES_NAME ?? '/v1/traces'; + const traceExporter = new OTLPTraceExporter({ url: `http://${jaegerHost}:${jaegerPort}${jaegerResName}` }); +const spanProcessor = new BatchSpanProcessor(traceExporter); debug("Initializing Opentelemetry instrumentation..."); const sdk = new NodeSDK({ @@ -22,10 +24,15 @@ const sdk = new NodeSDK({ resource: new Resource({ [ATTR_SERVICE_NAME]: "biothings-explorer", }), - // use simple span processor to avoid losing data when the forked process exits (taskHandler) - // @ts-ignore - fix from MetinSeylan/Nestjs-OpenTelemetry#63 - spanProcessors: [new SimpleSpanProcessor(traceExporter)], + // @ts-ignore because MetinSeylan/Nestjs-OpenTelemetry#63 + spanProcessors: [spanProcessor], }); debug(`OTel URL http://${jaegerHost}:${jaegerPort}${jaegerResName}`); sdk.start(); debug("Opentelemetry instrumentation initialized."); + +export async function flushRemainingSpans(): Promise { + // avoid losing any spans in the buffer when taskHandler exits + debug("Flushing remaining spans..."); + await spanProcessor.forceFlush(); +} diff --git a/src/controllers/threading/taskHandler.ts b/src/controllers/threading/taskHandler.ts index 1a42dbd..b22ae41 100644 --- a/src/controllers/threading/taskHandler.ts +++ b/src/controllers/threading/taskHandler.ts @@ -17,6 +17,7 @@ import { ProfilingIntegration } from "@sentry/profiling-node"; import { Span, trace, context, propagation, Context, Tracer } from "@opentelemetry/api"; import { Telemetry } from "@biothings-explorer/utils"; import { InnerTaskData } from "@biothings-explorer/types"; +import { flushRemainingSpans } from "../opentelemetry"; // use SENTRY_DSN environment variable try { @@ -113,6 +114,7 @@ async function runTask({ transaction.finish(); span.end(); Telemetry.removeOtelSpan(); + await flushRemainingSpans(); } catch (error) { debug("Sentry/OpenTelemetry transaction finish error. This does not affect execution."); debug(error);