From 53ecaf36bef16130069600989223c306956f5e32 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 12:49:08 -0500 Subject: [PATCH 1/7] ref: switch to slog for structured logging Resolves https://github.com/aarnaud/vault-pki-exporter/issues/7 switches to slog with standard logging levels of info,debug,warn,error. will make logging easier to include and more customizable. using standard slog package reduces third party dependencies too --- README.md | 3 +- cmd/main.go | 57 +++++++-- compose.yaml | 1 + pkg/logger/logger.go | 234 ------------------------------------ pkg/vault-mon/pki.go | 42 +++---- pkg/vault-mon/prometheus.go | 25 +++- pkg/vault/client.go | 36 +++--- 7 files changed, 109 insertions(+), 289 deletions(-) delete mode 100644 pkg/logger/logger.go diff --git a/README.md b/README.md index 36ec2ca..f6ee7c1 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,8 @@ Flags: --prometheus Enable prometheus exporter, default if nothing else --refresh-interval duration How many sec between metrics update (default 1m0s) --batch-size-percent How large of a batch of certificates to get data for at once, supports floats (e.g 0.0 - 100.0) (default 1) - -v, --verbose Enable verbose + --log-level Set log level (options: info, warn, error, debug) + -v, --verbose (deprecated) Enable verbose logging. Defaults to debug level logging Use " [command] --help" for more information about a command. ``` diff --git a/cmd/main.go b/cmd/main.go index 52e6165..90a65c7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -2,9 +2,11 @@ package main import ( "fmt" + "log" + "log/slog" + "os" "time" - log "github.com/aarnaud/vault-pki-exporter/pkg/logger" "github.com/aarnaud/vault-pki-exporter/pkg/vault" vaultMon "github.com/aarnaud/vault-pki-exporter/pkg/vault-mon" "github.com/spf13/cobra" @@ -37,41 +39,56 @@ func init() { log.Fatal(err) } + flags.String("log-level", "info", "Set log level (options: info, warn, error, debug)") + if err := viper.BindPFlag("log-level", flags.Lookup("log-level")); err != nil { + log.Fatal("Could not set log level:", err) + } + flags.BoolP("prometheus", "", false, "Enable prometheus exporter, default if nothing else") if err := viper.BindPFlag("prometheus", flags.Lookup("prometheus")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind prometheus flag:", err) } flags.BoolP("influx", "", false, "Enable InfluxDB Line Protocol") if err := viper.BindPFlag("influx", flags.Lookup("influx")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind influx flag:", err) } flags.Int("port", 9333, "Prometheus exporter HTTP port") if err := viper.BindPFlag("port", flags.Lookup("port")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind port flag:", err) } flags.Duration("fetch-interval", time.Minute, "How many sec between fetch certs on vault") if err := viper.BindPFlag("fetch_interval", flags.Lookup("fetch-interval")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind fetch-interval flag:", err) } flags.Duration("refresh-interval", time.Minute, "How many sec between metrics update") if err := viper.BindPFlag("refresh_interval", flags.Lookup("refresh-interval")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind refresh-interval flag:", err) } flags.Float64("batch-size-percent", 1, "loadCerts batch size percentage, supports floats (e.g 0.0 - 100.0)") if err := viper.BindPFlag("batch_size_percent", flags.Lookup("batch-size-percent")); err != nil { - log.Fatal(err) + log.Fatal("Could not bind batch-size-percent flag:", err) } } func main() { + cli.ParseFlags(os.Args[1:]) + + // preserve deprecated verbose flag + if viper.GetBool("verbose") { + setLogLevel("debug") + } else { + setLogLevel(viper.GetString("log-level")) + slog.Info("Log level initialized", "log-level", viper.GetString("log-level")) + } + err := cli.Execute() if err != nil { - log.Fatal(err) + log.Fatal("CLI execution failed:", err) } } @@ -83,13 +100,13 @@ func entrypoint() { pkiMon := vaultMon.PKIMon{} err := pkiMon.Init(vaultcli.Client) if err != nil { - log.Errorln(err.Error()) + slog.Error("PKIMon initialization failed", "error", err) } pkiMon.Watch(viper.GetDuration("fetch_interval")) if viper.GetBool("prometheus") || !viper.GetBool("influx") { - log.Infoln("start prometheus exporter") + slog.Info("start prometheus exporter") vaultMon.PromWatchCerts(&pkiMon, viper.GetDuration("refresh_interval")) vaultMon.PromStartExporter(viper.GetInt("port")) } @@ -98,3 +115,23 @@ func entrypoint() { vaultMon.InfluxWatchCerts(&pkiMon, viper.GetDuration("refresh_interval"), viper.GetBool("prometheus")) } } + +// https://pkg.go.dev/log/slog#example-SetLogLoggerLevel-Log +func setLogLevel(level string) { + var slogLevel slog.Level + switch level { + case "debug": + slogLevel = slog.LevelDebug + case "info": + slogLevel = slog.LevelInfo + case "warn": + slogLevel = slog.LevelWarn + case "error": + slogLevel = slog.LevelError + default: + slogLevel = slog.LevelInfo + } + + handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slogLevel}) + slog.SetDefault(slog.New(handler)) +} diff --git a/compose.yaml b/compose.yaml index 9a51a63..5929f7b 100644 --- a/compose.yaml +++ b/compose.yaml @@ -28,6 +28,7 @@ services: - ./vault-pki-exporter - --fetch-interval=5s - --refresh-interval=5s + - --log-level=debug networks: - vault-pki-exporter ports: diff --git a/pkg/logger/logger.go b/pkg/logger/logger.go deleted file mode 100644 index 13f44ca..0000000 --- a/pkg/logger/logger.go +++ /dev/null @@ -1,234 +0,0 @@ -package logger - -import ( - "github.com/sirupsen/logrus" - logrusSyslog "github.com/sirupsen/logrus/hooks/syslog" - "golang.org/x/crypto/ssh/terminal" - "io" - "log/syslog" - "os" - "path" -) - -var ( - // std is the name of the standard logger in stdlib `log` - std *logrus.Logger = New() -) - -func New() *logrus.Logger { - log := &logrus.Logger{ - Out: os.Stderr, - Formatter: new(logrus.TextFormatter), - Hooks: make(logrus.LevelHooks), - Level: logrus.InfoLevel, - } - - if !checkIfTerminal(log.Out) { - hook, err := logrusSyslog.NewSyslogHook("", "", syslog.LOG_INFO, path.Base(os.Args[0])) - if err != nil { - log.Error("Unable to connect to local syslog daemon") - } else { - log.Hooks.Add(hook) - } - } - - return log -} - -func StandardLogger() *logrus.Logger { - return std -} - -// SetOutput sets the standard logger output. -func SetOutput(out io.Writer) { - std.Out = out -} - -// SetFormatter sets the standard logger formatter. -func SetFormatter(formatter logrus.Formatter) { - std.Formatter = formatter -} - -// SetLevel sets the standard logger level. -func SetLevel(level logrus.Level) { - std.SetLevel(level) -} - -// GetLevel returns the standard logger level. -func GetLevel() logrus.Level { - return std.Level -} - -// AddHook adds a hook to the standard logger hooks. -func AddHook(hook logrus.Hook) { - std.Hooks.Add(hook) -} - -// WithError creates an entry from the standard logger and adds an error to it, using the value defined in ErrorKey as key. -func WithError(err error) *logrus.Entry { - return std.WithField(logrus.ErrorKey, err) -} - -// WithField creates an entry from the standard logger and adds a field to -// it. If you want multiple fields, use `WithFields`. -// -// Note that it doesn't log until you call Debug, Print, Info, Warn, Fatal -// or Panic on the Entry it returns. -func WithField(key string, value interface{}) *logrus.Entry { - return std.WithField(key, value) -} - -// WithFields creates an entry from the standard logger and adds multiple -// fields to it. This is simply a helper for `WithField`, invoking it -// once for each field. -// -// Note that it doesn't log until you call Debug, Print, Info, Warn, Fatal -// or Panic on the Entry it returns. -func WithFields(fields logrus.Fields) *logrus.Entry { - return std.WithFields(fields) -} - -// Debug logs a message at level Debug on the standard logger. -func Debug(args ...interface{}) { - std.Debug(args...) -} - -// Print logs a message at level Info on the standard logger. -func Print(args ...interface{}) { - std.Print(args...) -} - -// Info logs a message at level Info on the standard logger. -func Info(args ...interface{}) { - std.Info(args...) -} - -// Warn logs a message at level Warn on the standard logger. -func Warn(args ...interface{}) { - std.Warn(args...) -} - -// Warning logs a message at level Warn on the standard logger. -func Warning(args ...interface{}) { - std.Warning(args...) -} - -// Error logs a message at level Error on the standard logger. -func Error(args ...interface{}) { - std.Error(args...) -} - -// Panic logs a message at level Panic on the standard logger. -func Panic(args ...interface{}) { - std.Panic(args...) -} - -// Fatal logs a message at level Fatal on the standard logger. -func Fatal(args ...interface{}) { - std.Fatal(args...) -} - -// Debugf logs a message at level Debug on the standard logger. -func Debugf(format string, args ...interface{}) { - std.Debugf(format, args...) -} - -// Printf logs a message at level Info on the standard logger. -func Printf(format string, args ...interface{}) { - std.Printf(format, args...) -} - -// Infof logs a message at level Info on the standard logger. -func Infof(format string, args ...interface{}) { - std.Infof(format, args...) -} - -// Warnf logs a message at level Warn on the standard logger. -func Warnf(format string, args ...interface{}) { - std.Warnf(format, args...) -} - -// Warningf logs a message at level Warn on the standard logger. -func Warningf(format string, args ...interface{}) { - std.Warningf(format, args...) -} - -// Errorf logs a message at level Error on the standard logger. -func Errorf(format string, args ...interface{}) { - std.Errorf(format, args...) -} - -// Panicf logs a message at level Panic on the standard logger. -func Panicf(format string, args ...interface{}) { - std.Panicf(format, args...) -} - -// Fatalf logs a message at level Fatal on the standard logger. -func Fatalf(format string, args ...interface{}) { - std.Fatalf(format, args...) -} - -// Debugln logs a message at level Debug on the standard logger. -func Debugln(args ...interface{}) { - std.Debugln(args...) -} - -// Println logs a message at level Info on the standard logger. -func Println(args ...interface{}) { - std.Println(args...) -} - -// Infoln logs a message at level Info on the standard logger. -func Infoln(args ...interface{}) { - std.Infoln(args...) -} - -// Warnln logs a message at level Warn on the standard logger. -func Warnln(args ...interface{}) { - std.Warnln(args...) -} - -// Warningln logs a message at level Warn on the standard logger. -func Warningln(args ...interface{}) { - std.Warningln(args...) -} - -// Errorln logs a message at level Error on the standard logger. -func Errorln(args ...interface{}) { - std.Errorln(args...) -} - -// Panicln logs a message at level Panic on the standard logger. -func Panicln(args ...interface{}) { - std.Panicln(args...) -} - -// Fatalln logs a message at level Fatal on the standard logger. -func Fatalln(args ...interface{}) { - std.Fatalln(args...) -} - -func checkIfTerminal(w io.Writer) bool { - switch v := w.(type) { - case *os.File: - return terminal.IsTerminal(int(v.Fd())) - default: - return false - } -} - -func GetFormater() logrus.Formatter { - if checkIfTerminal(std.Out) { - return &logrus.TextFormatter{ - FullTimestamp: true, - } - } - return &FlatFormatter{} -} - -type FlatFormatter struct { -} - -func (f *FlatFormatter) Format(entry *logrus.Entry) ([]byte, error) { - return []byte(entry.Message), nil -} diff --git a/pkg/vault-mon/pki.go b/pkg/vault-mon/pki.go index 5aaf377..52caeab 100644 --- a/pkg/vault-mon/pki.go +++ b/pkg/vault-mon/pki.go @@ -4,10 +4,10 @@ import ( "crypto/x509" "encoding/pem" "fmt" + "log/slog" "sync" "time" - log "github.com/aarnaud/vault-pki-exporter/pkg/logger" "github.com/aarnaud/vault-pki-exporter/pkg/vault" vaultapi "github.com/hashicorp/vault/api" "github.com/mitchellh/mapstructure" @@ -64,7 +64,7 @@ func (mon *PKIMon) loadPKI() error { if _, ok := mon.pkis[name]; !ok { pki := PKI{path: name, vault: mon.vault} mon.pkis[name] = &pki - log.Infof("%s loaded", pki.path) + slog.Info("PKI loaded", "pki", pki.path) } } } @@ -72,27 +72,27 @@ func (mon *PKIMon) loadPKI() error { } func (mon *PKIMon) Watch(interval time.Duration) { - log.Infoln("Start watching pki certs") + slog.Info("Start watching PKI certs") go func() { for { - log.Infoln("Refresh PKI list") + slog.Info("Refresh PKI list") err := mon.loadPKI() if err != nil { - log.Errorln(err) + slog.Error("Error loading PKI", "error", err) } for _, pki := range mon.pkis { - log.Infof("Refresh PKI certificate for %s", pki.path) + slog.Info("Refresh PKI certificate", "pki", pki.path) pki.clearCerts() err := pki.loadCerts() if err != nil { - log.Errorln(err) + slog.Error("Error loading certs", "pki", pki.path, "error", err) } err = pki.loadCrl() if err != nil { - log.Errorln(err) + slog.Error("Error loading CRL", "pki", pki.path, "error", err) } } mon.Loaded = true @@ -119,15 +119,15 @@ func (pki *PKI) loadCrl() error { if pki.crls == nil { pki.crls = make(map[string]*x509.RevocationList) - log.Warningln("init an empty certs list") + slog.Warn("Initialized an empty certs list", "pki", pki.path) } for _, issuerRef := range issuers { crl, err := pki.loadCrlForIssuer(issuerRef) if err != nil { - log.Errorf("loadCrl() failed to load CRL for issuer %s, error: %v", issuerRef, err) + slog.Error("Failed to load CRL", "pki", pki.path, "issuer", issuerRef, "error", err) } else if crl == nil { - log.Errorf("CRL cannot be loaded for issuer %s", issuerRef) + slog.Error("CRL cannot be loaded", "pki", pki.path, "issuer", issuerRef) } else { pki.crls[issuerRef] = crl } @@ -178,7 +178,7 @@ func (pki *PKI) loadCrlForIssuer(issuerRef string) (*x509.RevocationList, error) crlData, ok := secret.Data["crl"].(string) if !ok || crlData == "" { - return nil, fmt.Errorf("crl data missing or invalid for issuer %s", issuerRef) + return nil, fmt.Errorf("CRL data missing or invalid for issuer %s", issuerRef) } block, _ := pem.Decode([]byte(crlData)) @@ -193,7 +193,7 @@ func (pki *PKI) loadCrlForIssuer(issuerRef string) (*x509.RevocationList, error) return nil, fmt.Errorf("error parsing CRL for issuer %s: %w", issuerRef, err) } - log.Debugf("Successfully loaded CRL for issuer %s", issuerRef) + slog.Debug("Successfully loaded CRL", "pki", pki.path, "issuer", issuerRef) return crl, nil } @@ -206,7 +206,7 @@ func (pki *PKI) loadCerts() error { if pki.certs == nil { pki.certs = make(map[string]*x509.Certificate) - log.Warningln("init an empty certs list") + slog.Warn("Initialized an empty certs list", "pki", pki.path) } secret, err := pki.vault.Logical().List(fmt.Sprintf("%scerts", pki.path)) @@ -246,9 +246,7 @@ func (pki *PKI) loadCerts() error { batchKeys := serialsList.Keys[i:end] var wg sync.WaitGroup - if viper.GetBool("verbose") { - log.WithField("batchsize", len(batchKeys)).Infof("processing batch of certs in loadCerts") - } + slog.Info("Processing batch of certs", "pki", pki.path, "batchsize", len(batchKeys)) // add a mutex for protecting concurrent access to the certs map var certsMux sync.Mutex @@ -259,26 +257,26 @@ func (pki *PKI) loadCerts() error { secret, err := pki.vault.Logical().Read(fmt.Sprintf("%scert/%s", pki.path, serial)) if err != nil || secret == nil || secret.Data == nil { - log.Errorf("failed to get certificate for %s%s, got error: %v", pki.path, serial, err) + slog.Error("Failed to get certificate", "pki", pki.path, "serial", serial, "error", err) return } secretCert := vault.SecretCertificate{} err = mapstructure.Decode(secret.Data, &secretCert) if err != nil { - log.Errorf("failed to decode secret for %s/%s, error: %v", pki.path, serial, err) + slog.Error("Failed to decode secret", "pki", pki.path, "serial", serial, "error", err) return } block, _ := pem.Decode([]byte([]byte(secretCert.Certificate))) if block == nil { - log.Errorf("failed to decode PEM block for %s/%s", pki.path, serial) + slog.Error("Failed to decode PEM block", "pki", pki.path, "serial", serial) return } cert, err := x509.ParseCertificate(block.Bytes) if err != nil { - log.Errorf("failed to load certificate for %s/%s, error: %v", pki.path, serial, err) + slog.Error("Failed to load certificate", "pki", pki.path, "serial", serial, "error", err) return } @@ -295,7 +293,7 @@ func (pki *PKI) loadCerts() error { if _, ok := pki.certs[cert.Subject.CommonName]; !ok && cert.NotAfter.Unix() > time.Now().Unix() { pki.certs[cert.Subject.CommonName] = cert if err != nil { - log.Errorln(err) + slog.Error("Error adding certificate", "pki", pki.path, "error", err) } } certsMux.Unlock() diff --git a/pkg/vault-mon/prometheus.go b/pkg/vault-mon/prometheus.go index 5c441b4..e3a7232 100644 --- a/pkg/vault-mon/prometheus.go +++ b/pkg/vault-mon/prometheus.go @@ -3,12 +3,12 @@ package vault_mon import ( "crypto/x509" "fmt" + "log" + "log/slog" "net/http" "strings" "time" - log "github.com/aarnaud/vault-pki-exporter/pkg/logger" - "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -72,7 +72,11 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { now := time.Now() revokedCerts := make(map[string]struct{}) + slog.Debug("Starting new PromWatchCerts loop", "interval_seconds", interval.Seconds(), "pkis_count", len(pkis)) + for pkiname, pki := range pkis { + slog.Info("Processing PKI", "pki", pkiname) + for _, crl := range pki.GetCRLs() { if crl != nil { issuer := crl.Issuer.CommonName @@ -81,12 +85,16 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { crl_nextupdate.WithLabelValues(pkiname, issuer).Set(float64(crl.NextUpdate.Unix())) crl_length.WithLabelValues(pkiname, issuer).Set(float64(len(crl.RevokedCertificates))) crl_byte_size.WithLabelValues(pkiname, issuer).Set(float64(pki.crlRawSize)) + + slog.Debug("Updated CRL metrics", "pki", pkiname, "issuer", issuer, "next_update", crl.NextUpdate) + // gather revoked certs from the CRL so we can exclude their metrics later for _, revokedCert := range crl.RevokedCertificates { revokedCerts[revokedCert.SerialNumber.String()] = struct{}{} } } } + for _, cert := range pki.GetCerts() { certlabels := getLabelValues(pkiname, cert) if _, isRevoked := revokedCerts[cert.SerialNumber.String()]; isRevoked { @@ -96,6 +104,7 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { age.DeleteLabelValues(certlabels...) startdate.DeleteLabelValues(certlabels...) enddate.DeleteLabelValues(certlabels...) + slog.Debug("Cleared metrics for revoked certificate", "pki", pkiname, "serial", cert.SerialNumber, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) continue } @@ -104,15 +113,20 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { startdate.WithLabelValues(certlabels...).Set(float64(cert.NotBefore.Unix())) enddate.WithLabelValues(certlabels...).Set(float64(cert.NotAfter.Unix())) + slog.Debug("Updated certificate metrics", "pki", pkiname, "serial", cert.SerialNumber, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) } + certcount.WithLabelValues(pkiname).Set(float64(len(pki.certs))) expired_cert_count.WithLabelValues(pkiname).Set(float64(pki.expiredCertsCounter)) + slog.Info("PKI metrics updated", "pki", pkiname, "total_certs", len(pki.certs), "expired_certs", pki.expiredCertsCounter) } - promWatchCertsDuration.Observe(time.Since(startTime).Seconds()) + + duration := time.Since(startTime).Seconds() + promWatchCertsDuration.Observe(duration) + slog.Info("PromWatchCerts loop completed", "duration_seconds", duration, "pkis_processed", len(pkis)) time.Sleep(interval) } }() - } func getLabelValues(pkiname string, cert *x509.Certificate) []string { @@ -129,9 +143,10 @@ func getLabelValues(pkiname string, cert *x509.Certificate) []string { } func PromStartExporter(port int) { + slog.Info("Starting Prometheus exporter", "port", port) http.Handle("/metrics", promhttp.Handler()) err := http.ListenAndServe(fmt.Sprintf(":%d", port), nil) if err != nil { - log.Fatal(err) + log.Fatal("Failed to start Prometheus exporter ", err) } } diff --git a/pkg/vault/client.go b/pkg/vault/client.go index b1136cf..ef30814 100644 --- a/pkg/vault/client.go +++ b/pkg/vault/client.go @@ -3,13 +3,15 @@ package vault import ( "context" "fmt" - log "github.com/aarnaud/vault-pki-exporter/pkg/logger" + "log" + "log/slog" + "os" + "time" + jwtauth "github.com/hashicorp/vault-plugin-auth-jwt" vaultapi "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/api/auth/kubernetes" "github.com/mitchellh/mapstructure" - "os" - "time" ) type secretCallback func(secret *vaultapi.Secret) @@ -38,7 +40,7 @@ func (vault *ClientWrapper) Init() { vaultconf := &vaultapi.Config{} err = vaultconf.ReadEnvironment() if err != nil { - log.Fatalln(err.Error()) + log.Fatal("Failed to read Vault environment ", err) } vault.Client, err = vaultapi.NewClient(vaultconf) @@ -61,7 +63,7 @@ func (vault *ClientWrapper) Init() { } ttl, _ := token_secret.TokenTTL() - log.Infof("Token TTL: %d LeaseDuration: %d", int32(ttl/time.Second), token_secret.LeaseDuration) + slog.Info("Token TTL and LeaseDuration", "ttl", int32(ttl/time.Second), "lease_duration", token_secret.LeaseDuration) isRenewable, _ := token_secret.TokenIsRenewable() if isRenewable { @@ -81,7 +83,7 @@ func (vault *ClientWrapper) Init() { watch_renewer_vault(token_renewer) } else { ttl, _ := token_secret.TokenTTL() - log.Infof("[vault] token is not renewable, ttl: %d", int32(ttl/time.Second)) + slog.Info("[vault] token is not renewable", "ttl", int32(ttl/time.Second)) } } @@ -108,7 +110,7 @@ func (vault *ClientWrapper) GetSecret(path string, fn secretCallback) error { watch_renewer_vault(renewer) } else { - log.Infof("[vault] secret is not renewable, use TTL to refresh secret : %s", path) + slog.Info("[vault] secret is not renewable, use TTL to refresh secret", "path", path) // Refresh secret at the end of Lease if secret.LeaseDuration > 0 { go func() { @@ -116,15 +118,15 @@ func (vault *ClientWrapper) GetSecret(path string, fn secretCallback) error { time.Sleep(time.Duration(secret.LeaseDuration) * time.Second) secret, err = vault.Client.Logical().Read(path) if err != nil { - log.Error("[vault]", err) + slog.Error("[vault]", "error", err) continue } if secret == nil { - log.Errorf("[vault] secret not found : %s", path) + slog.Error("[vault] secret not found", "path", path) continue } fn(secret) - log.Infof("[vault] successfully refreshed : %s", path) + slog.Info("[vault] successfully refreshed", "path", path) } }() } @@ -137,7 +139,7 @@ func (vault *ClientWrapper) GetSecretKV2(path string, fn secretKV2Callback) erro return vault.GetSecret(path, func(secret *vaultapi.Secret) { err := mapstructure.WeakDecode(secret.Data, secret_kv2) if err != nil { - log.Errorln("Can decode secret as KV version 2 ", err) + slog.Error("Can decode secret as KV version 2", "error", err) } fn(secret_kv2) }) @@ -159,11 +161,11 @@ func watch_renewer_vault(renewer *vaultapi.Renewer) { if flag == "" { flag = "token" } - log.Infof("[vault] successfully renewed: %s", flag) + slog.Info("[vault] successfully renewed", "lease_id", flag) // LeaseDuration=0 => infinte time but if LeaseDuration < 10s secret wasn't renewed // Strange because Renewable is true if renewal.Secret.LeaseDuration < 10 && renewal.Secret.LeaseDuration != 0 { - log.Fatalf("[vault] not renewable anymore: %s", flag) + log.Fatal("[vault] not renewable anymore ", flag) renewer.Stop() break } @@ -191,10 +193,10 @@ func (vault *ClientWrapper) authOIDC() { data["mount"] = mount secret, err := jwthandler.Auth(vault.Client, data) if err != nil { - log.Fatalln(err.Error()) + log.Fatal(err.Error()) } if secret == nil || secret.Auth == nil { - log.Fatalln("Failed to auth with OIDC") + log.Fatal("Failed to auth with OIDC") } vault.Client.SetToken(secret.Auth.ClientToken) } @@ -207,9 +209,9 @@ func (vault *ClientWrapper) authK8S() { am, err := kubernetes.NewKubernetesAuth(os.Getenv("VAULT_K8S_ROLE"), kubernetes.WithMountPath(mount)) if err != nil { - log.Fatalln("Failed to create k8s auth method: ", err) + log.Fatal("Failed to create k8s auth method: ", err) } if _, err := vault.Client.Auth().Login(context.Background(), am); err != nil { - log.Fatalln("Failed to auth with k8s: ", err) + log.Fatal("Failed to auth with k8s: ", err) } } From d5c94f3030141360004e46b807207afb4bd08d31 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 15:03:49 -0500 Subject: [PATCH 2/7] fix: gen metrics for certs with same CN diff OUs Resolves https://github.com/aarnaud/vault-pki-exporter/issues/28 Implements logic to consider both CN and OU as unique values for a certificate in the certificate map. This way we get metrics for certs that have the same CN but different Subject OUs - before, only CN was considered. This should still handle renewals correctly for the same cert, updating metrics in-place. That is why serial number or certificate fingerprint was not used as certs renewed in-place should retain their time series. I debated also adding Issuer information to uniquely identify these certs but did not think it was needed --- pkg/vault-mon/influx.go | 12 ++++---- pkg/vault-mon/pki.go | 43 ++++++++++++++++----------- pkg/vault-mon/prometheus.go | 58 +++++++++++++++++++++---------------- vault-setup.sh | 25 ++++++++++++---- 4 files changed, 85 insertions(+), 53 deletions(-) diff --git a/pkg/vault-mon/influx.go b/pkg/vault-mon/influx.go index ba82511..17ba25b 100644 --- a/pkg/vault-mon/influx.go +++ b/pkg/vault-mon/influx.go @@ -41,13 +41,15 @@ func influxProcessData(pkimon *PKIMon) { printCrlInfluxPoint(pkiname, crl) } } - for _, cert := range pki.GetCerts() { - printCertificateInfluxPoint(pkiname, cert) + for commonName, orgUnits := range pki.GetCerts() { + for orgUnit, cert := range orgUnits { + printCertificateInfluxPoint(pkiname, commonName, orgUnit, cert) + } } } } -func printCertificateInfluxPoint(pkiname string, cert *x509.Certificate) { +func printCertificateInfluxPoint(pkiname, commonName, orgUnit string, cert *x509.Certificate) { now := time.Now() point := influx.Point{ Measurement: "x509_cert", @@ -55,9 +57,9 @@ func printCertificateInfluxPoint(pkiname string, cert *x509.Certificate) { "host": hostname, "source": pkiname, "serial": strings.ReplaceAll(fmt.Sprintf("% x", cert.SerialNumber.Bytes()), " ", "-"), - "common_name": cert.Subject.CommonName, + "common_name": commonName, "organization": getEmptyStringIfEmpty(cert.Subject.Organization), - "organizational_unit": getEmptyStringIfEmpty(cert.Subject.OrganizationalUnit), + "organizational_unit": orgUnit, "country": getEmptyStringIfEmpty(cert.Subject.Country), "province": getEmptyStringIfEmpty(cert.Subject.Province), "locality": getEmptyStringIfEmpty(cert.Subject.Locality), diff --git a/pkg/vault-mon/pki.go b/pkg/vault-mon/pki.go index 52caeab..042b590 100644 --- a/pkg/vault-mon/pki.go +++ b/pkg/vault-mon/pki.go @@ -18,7 +18,7 @@ import ( type PKI struct { path string - certs map[string]*x509.Certificate + certs map[string]map[string]*x509.Certificate crls map[string]*x509.RevocationList crlRawSize int expiredCertsCounter int @@ -62,7 +62,7 @@ func (mon *PKIMon) loadPKI() error { for name, mount := range mounts { if mount.Type == "pki" { if _, ok := mon.pkis[name]; !ok { - pki := PKI{path: name, vault: mon.vault} + pki := PKI{path: name, vault: mon.vault, certs: make(map[string]map[string]*x509.Certificate)} mon.pkis[name] = &pki slog.Info("PKI loaded", "pki", pki.path) } @@ -205,7 +205,7 @@ func (pki *PKI) loadCerts() error { defer pki.certsmux.Unlock() if pki.certs == nil { - pki.certs = make(map[string]*x509.Certificate) + pki.certs = make(map[string]map[string]*x509.Certificate) slog.Warn("Initialized an empty certs list", "pki", pki.path) } @@ -268,7 +268,7 @@ func (pki *PKI) loadCerts() error { return } - block, _ := pem.Decode([]byte([]byte(secretCert.Certificate))) + block, _ := pem.Decode([]byte(secretCert.Certificate)) if block == nil { slog.Error("Failed to decode PEM block", "pki", pki.path, "serial", serial) return @@ -280,35 +280,44 @@ func (pki *PKI) loadCerts() error { return } + commonName := cert.Subject.CommonName + orgUnit := "" + // define certs by their commonName and *Subject* (not issuer) OU + if len(cert.Subject.OrganizationalUnit) > 0 { + orgUnit = cert.Subject.OrganizationalUnit[0] + } + certsMux.Lock() - // if already in map check the expiration - if certInMap, ok := pki.certs[cert.Subject.CommonName]; ok && certInMap.NotAfter.Unix() < cert.NotAfter.Unix() { - pki.certs[cert.Subject.CommonName] = cert + if _, exists := pki.certs[commonName]; !exists { + pki.certs[commonName] = make(map[string]*x509.Certificate) } - if cert.NotAfter.Unix() < time.Now().Unix() { - pki.expiredCertsCounter++ + // if cert is in map already or the new cert has a *later* expiration date, update map + // handles renewal of existing cert smoothly + if existingCert, ok := pki.certs[commonName][orgUnit]; !ok || existingCert.NotAfter.Before(cert.NotAfter) { + pki.certs[commonName][orgUnit] = cert + + slog.Debug("Updated certificate in map", "pki", pki.path, "serial", serial, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) } - if _, ok := pki.certs[cert.Subject.CommonName]; !ok && cert.NotAfter.Unix() > time.Now().Unix() { - pki.certs[cert.Subject.CommonName] = cert - if err != nil { - slog.Error("Error adding certificate", "pki", pki.path, "error", err) - } + if cert.NotAfter.Before(time.Now()) { + pki.expiredCertsCounter++ + + slog.Debug("Cert rejected as it is expired", "pki", pki.path, "serial", serial, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) } + certsMux.Unlock() }(serial) } wg.Wait() } - loadCertsDuration.Observe(time.Since(startTime).Seconds()) return nil } func (pki *PKI) clearCerts() { pki.certsmux.Lock() - pki.certs = make(map[string]*x509.Certificate) + pki.certs = make(map[string]map[string]*x509.Certificate) pki.certsmux.Unlock() } @@ -318,7 +327,7 @@ func (pki *PKI) GetCRLs() map[string]*x509.RevocationList { return pki.crls } -func (pki *PKI) GetCerts() map[string]*x509.Certificate { +func (pki *PKI) GetCerts() map[string]map[string]*x509.Certificate { pki.certsmux.Lock() defer pki.certsmux.Unlock() return pki.certs diff --git a/pkg/vault-mon/prometheus.go b/pkg/vault-mon/prometheus.go index e3a7232..d7727b8 100644 --- a/pkg/vault-mon/prometheus.go +++ b/pkg/vault-mon/prometheus.go @@ -95,36 +95,44 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { } } - for _, cert := range pki.GetCerts() { - certlabels := getLabelValues(pkiname, cert) - if _, isRevoked := revokedCerts[cert.SerialNumber.String()]; isRevoked { - // in case we have prior existing metrics, clear them for revoked certs - // seems fine to run in case the metrics don't exist or are already deleted too - expiry.DeleteLabelValues(certlabels...) - age.DeleteLabelValues(certlabels...) - startdate.DeleteLabelValues(certlabels...) - enddate.DeleteLabelValues(certlabels...) - slog.Debug("Cleared metrics for revoked certificate", "pki", pkiname, "serial", cert.SerialNumber, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) - continue - } + for commonName, orgUnits := range pki.GetCerts() { + for orgUnit, cert := range orgUnits { + + certlabels := getLabelValues(pkiname, cert) + + slog.Debug("cert found") + + if _, isRevoked := revokedCerts[cert.SerialNumber.String()]; isRevoked { + // in case we have prior existing metrics, clear them for revoked certs + // seems fine to run in case the metrics don't exist or are already deleted too + expiry.DeleteLabelValues(certlabels...) + age.DeleteLabelValues(certlabels...) + startdate.DeleteLabelValues(certlabels...) + enddate.DeleteLabelValues(certlabels...) + + slog.Debug("Cleared metrics for revoked certificate", "pki", pkiname, "serial", cert.SerialNumber, "common_name", commonName, "organizational_unit", orgUnit) - expiry.WithLabelValues(certlabels...).Set(float64(cert.NotAfter.Sub(now).Seconds())) - age.WithLabelValues(certlabels...).Set(float64(now.Sub(cert.NotBefore).Seconds())) - startdate.WithLabelValues(certlabels...).Set(float64(cert.NotBefore.Unix())) - enddate.WithLabelValues(certlabels...).Set(float64(cert.NotAfter.Unix())) + continue + } + + expiry.WithLabelValues(certlabels...).Set(float64(cert.NotAfter.Sub(now).Seconds())) + age.WithLabelValues(certlabels...).Set(float64(now.Sub(cert.NotBefore).Seconds())) + startdate.WithLabelValues(certlabels...).Set(float64(cert.NotBefore.Unix())) + enddate.WithLabelValues(certlabels...).Set(float64(cert.NotAfter.Unix())) - slog.Debug("Updated certificate metrics", "pki", pkiname, "serial", cert.SerialNumber, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) + slog.Debug("Updated certificate metrics", "pki", pkiname, "serial", cert.SerialNumber, "common_name", cert.Subject.CommonName, "organizational_unit", cert.Subject.OrganizationalUnit) + } + certcount.WithLabelValues(pkiname).Set(float64(len(pki.certs))) + expired_cert_count.WithLabelValues(pkiname).Set(float64(pki.expiredCertsCounter)) + + slog.Info("PKI metrics updated", "pki", pkiname, "total_certs", len(pki.certs), "expired_certs", pki.expiredCertsCounter) } - certcount.WithLabelValues(pkiname).Set(float64(len(pki.certs))) - expired_cert_count.WithLabelValues(pkiname).Set(float64(pki.expiredCertsCounter)) - slog.Info("PKI metrics updated", "pki", pkiname, "total_certs", len(pki.certs), "expired_certs", pki.expiredCertsCounter) + duration := time.Since(startTime).Seconds() + promWatchCertsDuration.Observe(duration) + slog.Info("PromWatchCerts loop completed", "duration_seconds", duration, "pkis_processed", len(pkis)) + time.Sleep(interval) } - - duration := time.Since(startTime).Seconds() - promWatchCertsDuration.Observe(duration) - slog.Info("PromWatchCerts loop completed", "duration_seconds", duration, "pkis_processed", len(pkis)) - time.Sleep(interval) } }() } diff --git a/vault-setup.sh b/vault-setup.sh index bc748e0..4735137 100755 --- a/vault-setup.sh +++ b/vault-setup.sh @@ -29,19 +29,32 @@ vault write pki/config/urls \ vault write pki/config/crl expiry="400h" -vault write pki/roles/example-dot-com \ +# make two roles with different OUs to ensure we get metrics for the same CN with different OUs +vault write pki/roles/foo-role \ allowed_domains=my-website.com \ allow_subdomains=true \ - max_ttl=72h + max_ttl=72h \ + ou="Foo" + +vault write pki/roles/bar-role \ + allowed_domains=my-website.com \ + allow_subdomains=true \ + max_ttl=72h \ + ou="Bar" apk add jq -# test revoking a certificate for CRL metrics -CERT_OUTPUT=$(vault write -format=json pki/issue/example-dot-com common_name=www.revokme.my-website.com) + +# Test revoking a certificate for CRL metrics +CERT_OUTPUT=$(vault write -format=json pki/issue/foo-role common_name=www.revokme.my-website.com) CERT_SERIAL=$(echo $CERT_OUTPUT | jq -r '.data.serial_number') vault write pki/revoke serial_number="$CERT_SERIAL" -vault write pki/issue/example-dot-com \ - common_name=www.my-website.com +# issue 2 certs with same CNs but different OUs - want metrics for both +vault write pki/issue/foo-role \ + common_name=www.duplicate-ou-cert.my-website.com + +vault write pki/issue/bar-role \ + common_name=www.duplicate-ou-cert.my-website.com vault read pki/crl/rotate From c48f62c96739c9c2c1c47be50765aba47695b4a8 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 16:22:28 -0500 Subject: [PATCH 3/7] test: use regex for duplicate OU test --- .github/workflows/tests.yml | 2 +- README.md | 14 +++++++++++++- tests.yml | 12 ++++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 41827e5..cfcf39b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,7 +17,7 @@ jobs: run: docker compose build - name: Download venom - run: curl https://github.com/ovh/venom/releases/download/v1.1.0/venom.linux-amd64 -L -o /usr/local/bin/venom && chmod +x /usr/local/bin/venom + run: curl https://github.com/ovh/venom/releases/download/v1.2.0/venom.linux-amd64 -L -o /usr/local/bin/venom && chmod +x /usr/local/bin/venom - name: Run test run: venom run tests.yml \ No newline at end of file diff --git a/README.md b/README.md index f6ee7c1..f3f41a6 100644 --- a/README.md +++ b/README.md @@ -91,11 +91,23 @@ level=error msg="failed to get certificate for pki/26:97:08:32:44:40:30:de:11:5z Your batch size is probably too high. +## Certificate Selection + +Any certificate with a unique subject common name and organizational unit is considered for metrics. If a certificate is renewed in place with the same CN and OU, it will still retain the same time series to avoid false alarms. + +Revoked certificates are not considered for metrics and their time series will be deleted when an "active" certificate is deleted. + +Expired certificates still retain their time series too. + +## PKI Engine Selection + +Right now the exporter will find any Vault PKI secrets engines and attempt to get certs for all of them. PKI secrets engines are currently not selectable by the exporter. + ## Contributing ### Testing -Venom is used for tests, run `sudo venom run tests.yml` to perform integration tests. +Venom is used for tests, run `sudo venom run tests.yml` to perform integration tests. Make sure you have at least venom version 1.2.0. Unit tests would also most likely be welcome for contribution with go native tests. diff --git a/tests.yml b/tests.yml index c5bf0af..11dc83e 100644 --- a/tests.yml +++ b/tests.yml @@ -9,7 +9,7 @@ testcases: - name: docker compose up steps: - type: exec - script: docker compose up --build -d --wait + script: docker compose up --build -d --wait --force-recreate - name: vaultStatus steps: - name: vaultHealth @@ -303,9 +303,9 @@ testcases: - name: testMetrics steps: - - name: wait 10s + - name: wait 20s type: exec - script: sleep 10 + script: sleep 20 - name: validateMetrics type: http method: GET @@ -325,7 +325,11 @@ testcases: - result.body ShouldNotContainSubstring common_name="revokeme.first-ca.example.com" # CRLs for each issuer - result.body ShouldContainSubstring x509_crl_length{issuer="my-website.com",source="pki/"} 1 - - result.body ShouldContainSubstring x509_crl_length{issuer="mysecondwebsite.com",source="pki/"} 0 + - result.body ShouldContainSubstring x509_crl_length{issuer="Second CA",source="second-ca/"} 0 + # Certificate metrics for certs with same CN but multiple OUs + # https://github.com/ovh/venom/issues/700#issuecomment-1635502233 + - result.body ShouldMatchRegex '(?m).*x509_cert_age{common_name="www.duplicate-ou-cert.my-website.com",country="",locality="",organization="",organizational_unit="Foo",province="",serial=".*",source="pki\/"}.*' + - result.body ShouldMatchRegex '(?m).*x509_cert_age{common_name="www.duplicate-ou-cert.my-website.com",country="",locality="",organization="",organizational_unit="Bar",province="",serial=".*",source="pki\/"}.*' - name: docker compose down steps: From 9afc8fd7687e80fc4df8c7bc496ea47b2ea0f944 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 16:28:03 -0500 Subject: [PATCH 4/7] fix: reset crl test assertions --- tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests.yml b/tests.yml index 11dc83e..930cd61 100644 --- a/tests.yml +++ b/tests.yml @@ -325,7 +325,7 @@ testcases: - result.body ShouldNotContainSubstring common_name="revokeme.first-ca.example.com" # CRLs for each issuer - result.body ShouldContainSubstring x509_crl_length{issuer="my-website.com",source="pki/"} 1 - - result.body ShouldContainSubstring x509_crl_length{issuer="Second CA",source="second-ca/"} 0 + - result.body ShouldContainSubstring x509_crl_length{issuer="mysecondwebsite.com",source="pki/"} 0 # Certificate metrics for certs with same CN but multiple OUs # https://github.com/ovh/venom/issues/700#issuecomment-1635502233 - result.body ShouldMatchRegex '(?m).*x509_cert_age{common_name="www.duplicate-ou-cert.my-website.com",country="",locality="",organization="",organizational_unit="Foo",province="",serial=".*",source="pki\/"}.*' From af020969e7e0abf19264a03c17cfa9e5d58ec9bf Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 16:28:55 -0500 Subject: [PATCH 5/7] chore: add tool-versions standard for mise/asdf --- .tool-versions | 1 + 1 file changed, 1 insertion(+) create mode 100644 .tool-versions diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000..bf24d2f --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +venom 1.2.0 From 26dfa3bb1483250dd227812e3eab2d2d2f5c101e Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 16:41:20 -0500 Subject: [PATCH 6/7] chore: remove lone debug log --- pkg/vault-mon/prometheus.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/vault-mon/prometheus.go b/pkg/vault-mon/prometheus.go index d7727b8..ac47d05 100644 --- a/pkg/vault-mon/prometheus.go +++ b/pkg/vault-mon/prometheus.go @@ -100,8 +100,6 @@ func PromWatchCerts(pkimon *PKIMon, interval time.Duration) { certlabels := getLabelValues(pkiname, cert) - slog.Debug("cert found") - if _, isRevoked := revokedCerts[cert.SerialNumber.String()]; isRevoked { // in case we have prior existing metrics, clear them for revoked certs // seems fine to run in case the metrics don't exist or are already deleted too From 004da29940a12c54d936b09bbce0fa7629559a93 Mon Sep 17 00:00:00 2001 From: Will Bollock Date: Fri, 15 Nov 2024 16:59:15 -0500 Subject: [PATCH 7/7] chore: add log for initial cli flag values --- cmd/main.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/main.go b/cmd/main.go index 90a65c7..a8975d0 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -86,6 +86,9 @@ func main() { slog.Info("Log level initialized", "log-level", viper.GetString("log-level")) } + // note mix of underscores and dashes + slog.Info("CLI flag values", "fetch-interval", viper.GetDuration("fetch_interval"), "refresh-interval", viper.GetDuration("refresh_interval"), "batch-size-percent", viper.GetFloat64("batch_size_percent") ) + err := cli.Execute() if err != nil { log.Fatal("CLI execution failed:", err)