Skip to content

Commit

Permalink
feat/fix: allow disabling high cardinality metrics, and add an option…
Browse files Browse the repository at this point in the history
… to periodically reset them (#51)

This fixes #50 by periodically
resetting the prometheus counters to keep the cardinality in check.

Even though that mitigates the issue greatly, it requires tweaking
`Metrics.resetPeriodMinutes` (depending on your traffic).

So I still decided to disable high cardinality metrics by default in
order to keep the rest of the metrics working (and protect people's
privacy) by default.
  • Loading branch information
cottand authored Jan 9, 2024
1 parent 0f54972 commit 9cb692d
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 8 deletions.
9 changes: 7 additions & 2 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ type Upstream struct {
}

type Metrics struct {
Enabled bool
Path string
Enabled bool
Path string
HighCardinalityEnabled bool
ResetPeriodMinutes int64
}

type DnsOverHttpServer struct {
Expand Down Expand Up @@ -167,6 +169,9 @@ followCnameDepth = 12
[Metrics]
enabled = false
path = "/metrics"
# see https://cottand.github.io/leng/Prometheus-Metrics.html
highCardinalityEnabled = false
resetPeriodMinutes = 60
[DnsOverHttpServer]
enabled = false
Expand Down
5 changes: 4 additions & 1 deletion doc/src/Configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,13 @@ customdnsrecords = [
# cache capacity, 0 for infinite
maxcount = 0

# Prometheus metrics - enable
# Prometheus metrics
[Metrics]
enabled = false
path = "/metrics"
# see https://cottand.github.io/leng/Prometheus-Metrics.html
highCardinalityEnabled = false
resetPeriodMinutes = 60

[DnsOverHttpServer]
enabled = false
Expand Down
35 changes: 34 additions & 1 deletion doc/src/Prometheus-Metrics.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,41 @@
# Prometheus metrics

The HTTP API has a `/metrics` endpoint that exposes Go runtime metrics as well as things including:

- downstream DNS requests, broken down by type
- upstream DNS requests
- upstream DNS-over-HTTPS success rate
- downstream DNS-over-HTTPS success rate

No grafana dashboards exist for leng yet. If you make one, please make a PR!

## High cardinality metrics

Tags can be added to some metrics (`upstream_request`, `request_total`) so that
they include information such as the name of the DNS request (ie, `example.com.`)
or the IP of host making the request.

If leng is left to run for a few hours (and you have enough traffic),
the cardinality of these metrics will grow, to the point
the
size of the `/metrics`
response will grow to be so big the metrics stop being updated.
While resetting the counters periodically can help
(and you can tweak that with the config `Metrics.resetPeriodMinutes`)
but you might still see issues depending on your traffic.
You can
read [this SO post](https://stackoverflow.com/questions/46373442/how-dangerous-are-high-cardinality-labels-in-prometheus)
to learn more.

High cardinality metrics **can also compromise your privacy** by exposing in the metrics endpoint
what domains clients are querying as well as their IPs.

For these reasons, **high cardinality metrics are disabled by default**. You can enable them
with the following config:

No grafana dashboards exist for leng yet. If you make one, please make a PR!
```toml
[Metrics]
enabled = true
path = "/metrics"
highCardinalityEnabled = true
```
56 changes: 56 additions & 0 deletions grimd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"fmt"
"github.com/cottand/leng/internal/metric"
"github.com/pelletier/go-toml/v2"
"io"
"net/http"
Expand Down Expand Up @@ -36,6 +37,8 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client,

changeConfig(&config)

cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled)
defer cancelMetrics()
quitActivation := make(chan bool)
actChannel := make(chan *ActivationHandler)

Expand All @@ -59,6 +62,9 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client,
// QuestionCache contains all queries to the dns server
questionCache := makeQuestionCache(config.QuestionCacheCap)

reloadChan := make(chan bool)
_, _ = StartAPIServer(&config, reloadChan, blockCache, questionCache)
defer close(reloadChan)
server.Run(&config, blockCache, questionCache)

time.Sleep(200 * time.Millisecond)
Expand All @@ -67,6 +73,56 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client,
test(c, testDnsHost)
}

func TestHighCardinalityMetricsOff(t *testing.T) {
var config *Config
integrationTest(
func(c *Config) {
c.CustomDNSRecords = []string{
"example.com. IN A 10.10.0.1 ",
"example.org. IN A 10.10.0.2 ",
}
c.Metrics.Enabled = true
c.Metrics.HighCardinalityEnabled = false
config = c
},
func(client *dns.Client, target string) {
c := new(dns.Client)
m := new(dns.Msg)

// make 2 request
m.SetQuestion(dns.Fqdn("example.com"), dns.TypeA)

_, _, err := c.Exchange(m, target)
if err != nil {
t.Fatal(err)
}
m.SetQuestion(dns.Fqdn("example.org"), dns.TypeA)

_, _, err = c.Exchange(m, target)
if err != nil {
t.Fatal(err)
}

metrics, err := http.Get(fmt.Sprintf("http://%s%s", config.API, config.Metrics.Path))
if err != nil {
t.Fatal(err)
}
defer func() {
_ = metrics.Body.Close()
}()
body, _ := io.ReadAll(metrics.Body)
bodyStr := string(body)

if !strings.Contains(bodyStr, "q_name=\"\"") {
t.Fatalf("Expected an empty `q_name` label, but it was not in the metrics response:\n%s", bodyStr)
}
if !strings.Contains(bodyStr, "remote_ip=\"\"") {
t.Fatalf("Expected an empty `remote_ip` label, but it was not in the metrics response:\n%s", bodyStr)
}
},
)
}

func TestMultipleARecords(t *testing.T) {
integrationTest(
func(c *Config) {
Expand Down
66 changes: 62 additions & 4 deletions internal/metric/metric.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
package metric

import (
"context"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"net"
"strconv"
"time"
)

const (
Expand Down Expand Up @@ -60,33 +62,89 @@ var (
Name: "doh_response_count",
Help: "Successful DoH responses",
}, []string{"status"})

allVecMetrics = []*prometheus.CounterVec{
responseCounter,
RequestUpstreamResolveCounter,
RequestUpstreamDohRequest,
DohResponseCount,
}

configHighCardinality = false
)

func init() {
prometheus.MustRegister(
responseCounter,
RequestUpstreamResolveCounter,
RequestUpstreamDohRequest,
CustomDNSConfigReload,
DohResponseCount,
)
}

func Start(resetPeriodMinutes int64, highCardinality bool) (closeChan context.CancelFunc) {
configHighCardinality = highCardinality
ctx, cancel := context.WithCancel(context.Background())
mark := time.Now()

go func(ctx context.Context) {
for {
select {
case <-ctx.Done():
return
default:
time.Sleep(time.Duration(resetPeriodMinutes) * time.Minute)
if time.Now().Sub(mark) > time.Duration(resetPeriodMinutes)*time.Minute {
for _, m := range allVecMetrics {
m.Reset()
}
mark = time.Now()
}
}
time.Sleep(400 * time.Millisecond)
}

}(ctx)

return cancel
}

func ReportDNSResponse(w dns.ResponseWriter, message *dns.Msg, blocked bool) {
question := message.Question[0]
remoteHost, _, _ := net.SplitHostPort(w.RemoteAddr().String())
var remoteHost string
var qName string
if !configHighCardinality {
remoteHost = ""
qName = ""
} else {
remoteHost, _, _ = net.SplitHostPort(w.RemoteAddr().String())
qName = question.Name
}
responseCounter.With(prometheus.Labels{
"remote_ip": remoteHost,
"q_type": dns.Type(question.Qtype).String(),
"q_name": question.Name,
"q_name": qName,
"rcode": dns.RcodeToString[message.Rcode],
"blocked": strconv.FormatBool(blocked),
}).Inc()
}

func ReportDNSRespond(remote net.IP, message *dns.Msg, blocked bool) {
question := message.Question[0]
var remoteHost string
var qName string
if !configHighCardinality {
remoteHost = ""
qName = ""
} else {
remoteHost = remote.String()
qName = question.Name
}
responseCounter.With(prometheus.Labels{
"remote_ip": remote.String(),
"remote_ip": remoteHost,
"q_type": dns.Type(question.Qtype).String(),
"q_name": question.Name,
"q_name": qName,
"rcode": dns.RcodeToString[message.Rcode],
"blocked": strconv.FormatBool(blocked),
}).Inc()
Expand Down
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"flag"
"github.com/cottand/leng/internal/metric"
"golang.org/x/sys/unix"
"net/http"
"os"
Expand Down Expand Up @@ -59,6 +60,8 @@ func main() {
loggingState.cleanUp()
}()

cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled)

lengActive = true
quitActivation := make(chan bool)
actChannel := make(chan *ActivationHandler)
Expand Down Expand Up @@ -103,6 +106,7 @@ forever:
case os.Interrupt:
logger.Error("SIGINT received, stopping\n")
quitActivation <- true
cancelMetrics()
break forever
case unix.SIGHUP:
logger.Error("SIGHUP received: rotating logs\n")
Expand Down

0 comments on commit 9cb692d

Please sign in to comment.