diff --git a/config.go b/config.go index a79308a..1decced 100644 --- a/config.go +++ b/config.go @@ -58,8 +58,10 @@ type Upstream struct { } type Metrics struct { - Enabled bool - Path string + Enabled bool + Path string + HighCardinalityEnabled bool + ResetPeriodMinutes int64 } type DnsOverHttpServer struct { @@ -167,6 +169,9 @@ followCnameDepth = 12 [Metrics] enabled = false path = "/metrics" + # see https://cottand.github.io/leng/Prometheus-Metrics.html + highCardinalityEnabled = false + resetPeriodMinutes = 60 [DnsOverHttpServer] enabled = false diff --git a/doc/src/Configuration.md b/doc/src/Configuration.md index 8c368a2..d7d33e1 100644 --- a/doc/src/Configuration.md +++ b/doc/src/Configuration.md @@ -103,10 +103,13 @@ customdnsrecords = [ # cache capacity, 0 for infinite maxcount = 0 -# Prometheus metrics - enable +# Prometheus metrics [Metrics] enabled = false path = "/metrics" + # see https://cottand.github.io/leng/Prometheus-Metrics.html + highCardinalityEnabled = false + resetPeriodMinutes = 60 [DnsOverHttpServer] enabled = false diff --git a/doc/src/Prometheus-Metrics.md b/doc/src/Prometheus-Metrics.md index 6bbdcfc..be9429b 100644 --- a/doc/src/Prometheus-Metrics.md +++ b/doc/src/Prometheus-Metrics.md @@ -1,8 +1,41 @@ +# Prometheus metrics + The HTTP API has a `/metrics` endpoint that exposes Go runtime metrics as well as things including: + - downstream DNS requests, broken down by type - upstream DNS requests - upstream DNS-over-HTTPS success rate - downstream DNS-over-HTTPS success rate +No grafana dashboards exist for leng yet. If you make one, please make a PR! + +## High cardinality metrics + +Tags can be added to some metrics (`upstream_request`, `request_total`) so that +they include information such as the name of the DNS request (ie, `example.com.`) +or the IP of host making the request. + +If leng is left to run for a few hours (and you have enough traffic), +the cardinality of these metrics will grow, to the point +the +size of the `/metrics` +response will grow to be so big the metrics stop being updated. +While resetting the counters periodically can help +(and you can tweak that with the config `Metrics.resetPeriodMinutes`) +but you might still see issues depending on your traffic. +You can +read [this SO post](https://stackoverflow.com/questions/46373442/how-dangerous-are-high-cardinality-labels-in-prometheus) +to learn more. + +High cardinality metrics **can also compromise your privacy** by exposing in the metrics endpoint +what domains clients are querying as well as their IPs. + +For these reasons, **high cardinality metrics are disabled by default**. You can enable them +with the following config: -No grafana dashboards exist for leng yet. If you make one, please make a PR! \ No newline at end of file +```toml +[Metrics] +enabled = true +path = "/metrics" +highCardinalityEnabled = true +``` \ No newline at end of file diff --git a/grimd_test.go b/grimd_test.go index 5f1a97e..44ca177 100644 --- a/grimd_test.go +++ b/grimd_test.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "github.com/cottand/leng/internal/metric" "github.com/pelletier/go-toml/v2" "io" "net/http" @@ -36,6 +37,8 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client, changeConfig(&config) + cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled) + defer cancelMetrics() quitActivation := make(chan bool) actChannel := make(chan *ActivationHandler) @@ -59,6 +62,9 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client, // QuestionCache contains all queries to the dns server questionCache := makeQuestionCache(config.QuestionCacheCap) + reloadChan := make(chan bool) + _, _ = StartAPIServer(&config, reloadChan, blockCache, questionCache) + defer close(reloadChan) server.Run(&config, blockCache, questionCache) time.Sleep(200 * time.Millisecond) @@ -67,6 +73,56 @@ func integrationTest(changeConfig func(c *Config), test func(client *dns.Client, test(c, testDnsHost) } +func TestHighCardinalityMetricsOff(t *testing.T) { + var config *Config + integrationTest( + func(c *Config) { + c.CustomDNSRecords = []string{ + "example.com. IN A 10.10.0.1 ", + "example.org. IN A 10.10.0.2 ", + } + c.Metrics.Enabled = true + c.Metrics.HighCardinalityEnabled = false + config = c + }, + func(client *dns.Client, target string) { + c := new(dns.Client) + m := new(dns.Msg) + + // make 2 request + m.SetQuestion(dns.Fqdn("example.com"), dns.TypeA) + + _, _, err := c.Exchange(m, target) + if err != nil { + t.Fatal(err) + } + m.SetQuestion(dns.Fqdn("example.org"), dns.TypeA) + + _, _, err = c.Exchange(m, target) + if err != nil { + t.Fatal(err) + } + + metrics, err := http.Get(fmt.Sprintf("http://%s%s", config.API, config.Metrics.Path)) + if err != nil { + t.Fatal(err) + } + defer func() { + _ = metrics.Body.Close() + }() + body, _ := io.ReadAll(metrics.Body) + bodyStr := string(body) + + if !strings.Contains(bodyStr, "q_name=\"\"") { + t.Fatalf("Expected an empty `q_name` label, but it was not in the metrics response:\n%s", bodyStr) + } + if !strings.Contains(bodyStr, "remote_ip=\"\"") { + t.Fatalf("Expected an empty `remote_ip` label, but it was not in the metrics response:\n%s", bodyStr) + } + }, + ) +} + func TestMultipleARecords(t *testing.T) { integrationTest( func(c *Config) { diff --git a/internal/metric/metric.go b/internal/metric/metric.go index 30493eb..01089f8 100644 --- a/internal/metric/metric.go +++ b/internal/metric/metric.go @@ -1,11 +1,13 @@ package metric import ( + "context" "github.com/miekg/dns" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "net" "strconv" + "time" ) const ( @@ -60,6 +62,15 @@ var ( Name: "doh_response_count", Help: "Successful DoH responses", }, []string{"status"}) + + allVecMetrics = []*prometheus.CounterVec{ + responseCounter, + RequestUpstreamResolveCounter, + RequestUpstreamDohRequest, + DohResponseCount, + } + + configHighCardinality = false ) func init() { @@ -67,26 +78,73 @@ func init() { responseCounter, RequestUpstreamResolveCounter, RequestUpstreamDohRequest, + CustomDNSConfigReload, + DohResponseCount, ) } +func Start(resetPeriodMinutes int64, highCardinality bool) (closeChan context.CancelFunc) { + configHighCardinality = highCardinality + ctx, cancel := context.WithCancel(context.Background()) + mark := time.Now() + + go func(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + default: + time.Sleep(time.Duration(resetPeriodMinutes) * time.Minute) + if time.Now().Sub(mark) > time.Duration(resetPeriodMinutes)*time.Minute { + for _, m := range allVecMetrics { + m.Reset() + } + mark = time.Now() + } + } + time.Sleep(400 * time.Millisecond) + } + + }(ctx) + + return cancel +} + func ReportDNSResponse(w dns.ResponseWriter, message *dns.Msg, blocked bool) { question := message.Question[0] - remoteHost, _, _ := net.SplitHostPort(w.RemoteAddr().String()) + var remoteHost string + var qName string + if !configHighCardinality { + remoteHost = "" + qName = "" + } else { + remoteHost, _, _ = net.SplitHostPort(w.RemoteAddr().String()) + qName = question.Name + } responseCounter.With(prometheus.Labels{ "remote_ip": remoteHost, "q_type": dns.Type(question.Qtype).String(), - "q_name": question.Name, + "q_name": qName, "rcode": dns.RcodeToString[message.Rcode], "blocked": strconv.FormatBool(blocked), }).Inc() } + func ReportDNSRespond(remote net.IP, message *dns.Msg, blocked bool) { question := message.Question[0] + var remoteHost string + var qName string + if !configHighCardinality { + remoteHost = "" + qName = "" + } else { + remoteHost = remote.String() + qName = question.Name + } responseCounter.With(prometheus.Labels{ - "remote_ip": remote.String(), + "remote_ip": remoteHost, "q_type": dns.Type(question.Qtype).String(), - "q_name": question.Name, + "q_name": qName, "rcode": dns.RcodeToString[message.Rcode], "blocked": strconv.FormatBool(blocked), }).Inc() diff --git a/main.go b/main.go index 8b0b855..104ddd9 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "context" "flag" + "github.com/cottand/leng/internal/metric" "golang.org/x/sys/unix" "net/http" "os" @@ -59,6 +60,8 @@ func main() { loggingState.cleanUp() }() + cancelMetrics := metric.Start(config.Metrics.ResetPeriodMinutes, config.Metrics.HighCardinalityEnabled) + lengActive = true quitActivation := make(chan bool) actChannel := make(chan *ActivationHandler) @@ -103,6 +106,7 @@ forever: case os.Interrupt: logger.Error("SIGINT received, stopping\n") quitActivation <- true + cancelMetrics() break forever case unix.SIGHUP: logger.Error("SIGHUP received: rotating logs\n")