Skip to content

Commit

Permalink
Add FSM loop timings
Browse files Browse the repository at this point in the history
As the issue #2854 demonstrates, Main FSM loop performance is crucial
for overall GoBGP performance. While we're far from hitting those limits yet,
we want to be sure that we have enough capacity in FSM loop.

To achieve that, we add FSM loop timing histograms, which are showing:
    - timing_sec - amount of time handling each message took
    - wait_sec - amount of time each message spent in a channel before
      it was received by the main FSM loop.
The latter (wait times) allows us to detect situations where FSM loop performance
is inadequate, while the former (timings) allows us to understand which
component causes such delays.

For the sake of simplicity, no further classification of message types (such
as the RouterID of the peer or the name of the gRPC operation) is made.
  • Loading branch information
Sergey Klyaus committed Jan 15, 2025
1 parent f04adfb commit 792b985
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 3 deletions.
8 changes: 7 additions & 1 deletion cmd/gobgpd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,14 @@ func main() {

logger.Info("gobgpd started")
bgpLogger := &builtinLogger{logger: logger, cfgStrict: opts.ConfigStrict}
bgpServer := server.NewBgpServer(server.GrpcListenAddress(opts.GrpcHosts), server.GrpcOption(grpcOpts), server.LoggerOption(bgpLogger))
fsmTimingCollector := metrics.NewFSMTimingsCollector()
bgpServer := server.NewBgpServer(
server.GrpcListenAddress(opts.GrpcHosts),
server.GrpcOption(grpcOpts),
server.LoggerOption(bgpLogger),
server.TimingHookOption(fsmTimingCollector))
prometheus.MustRegister(metrics.NewBgpCollector(bgpServer))
prometheus.MustRegister(fsmTimingCollector)
go bgpServer.Serve()

if opts.UseSdNotify {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ require (
github.com/k-sone/critbitgo v1.4.0
github.com/kr/pretty v0.3.1
github.com/prometheus/client_golang v1.16.0
github.com/prometheus/client_model v0.3.0
github.com/sirupsen/logrus v1.9.3
github.com/spf13/cobra v1.7.0
github.com/spf13/viper v1.16.0
Expand Down Expand Up @@ -41,7 +42,6 @@ require (
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.10.1 // indirect
github.com/rogpeppe/go-internal v1.9.0 // indirect
Expand Down
73 changes: 73 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package metrics

import (
"context"
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"

Expand All @@ -10,6 +12,77 @@ import (
"github.com/osrg/gobgp/v3/pkg/server"
)

// fsmTimingsCollector implements FSMTimingsCollector with one pair of
// histograms (handling time, channel wait time) per FSM operation type,
// indexed by server.FSMOperation.
type fsmTimingsCollector struct {
	timingHistograms, waitHistograms []prometheus.Histogram
}

// FSMTimingsCollector is both a server.FSMTimingHook (fed observations by
// the main FSM loop) and a prometheus.Collector (registerable in a
// Prometheus registry).
type FSMTimingsCollector interface {
	server.FSMTimingHook
	prometheus.Collector
}

var (
	// fsmOperationInfixes maps each server.FSMOperation value to the
	// infix used when building the metric name (see BuildFQName below).
	fsmOperationInfixes = [server.FSMOperationTypeCount]string{
		"mgmt_op",
		"accept",
		"event",
		"message",
	}

	// fsmOperationDescriptions maps each server.FSMOperation value to a
	// human-readable phrase used in the histogram help strings.
	fsmOperationDescriptions = [server.FSMOperationTypeCount]string{
		"management operation",
		"TCP accept",
		"event",
		"BGP message",
	}
)

// NewFSMTimingsCollector creates a collector that records FSM loop timings:
// for every FSM operation type it keeps one histogram of handling times
// ("timing_sec") and one of channel wait times ("wait_sec"), both in seconds.
// Register the result in a Prometheus registry and pass it to
// server.TimingHookOption.
func NewFSMTimingsCollector() FSMTimingsCollector {
	const namespace = "fsm_loop"
	c := &fsmTimingsCollector{
		timingHistograms: make([]prometheus.Histogram, server.FSMOperationTypeCount),
		waitHistograms:   make([]prometheus.Histogram, server.FSMOperationTypeCount),
	}

	// Note: the original allocated an extra throwaway slice just to drive
	// this loop; ranging over one of the real histogram slices avoids that.
	for i := range c.timingHistograms {
		c.timingHistograms[i] = prometheus.NewHistogram(prometheus.HistogramOpts{
			Name:    prometheus.BuildFQName(namespace, fsmOperationInfixes[i], "timing_sec"),
			Help:    fmt.Sprintf("Histogram of %s timings", fsmOperationDescriptions[i]),
			Buckets: prometheus.DefBuckets,
		})
		c.waitHistograms[i] = prometheus.NewHistogram(prometheus.HistogramOpts{
			Name:    prometheus.BuildFQName(namespace, fsmOperationInfixes[i], "wait_sec"),
			Help:    fmt.Sprintf("Histogram of %s channel delays", fsmOperationDescriptions[i]),
			Buckets: prometheus.DefBuckets,
		})
	}
	return c
}

// Observe implements server.FSMTimingHook: it records the handling time of
// one operation, and its channel wait time when one was measured (a zero
// tWait means the caller did not measure the wait and is skipped).
func (f *fsmTimingsCollector) Observe(op server.FSMOperation, tOp, tWait time.Duration) {
	opSeconds := tOp.Seconds()
	f.timingHistograms[op].Observe(opSeconds)
	if tWait == 0 {
		return
	}
	f.waitHistograms[op].Observe(tWait.Seconds())
}

// Describe implements prometheus.Collector by forwarding the descriptors of
// every timing and wait histogram.
func (f *fsmTimingsCollector) Describe(descs chan<- *prometheus.Desc) {
	for _, h := range f.timingHistograms {
		h.Describe(descs)
	}
	for _, h := range f.waitHistograms {
		h.Describe(descs)
	}
}

// Collect implements prometheus.Collector by forwarding the current state of
// every timing and wait histogram.
func (f *fsmTimingsCollector) Collect(metrics chan<- prometheus.Metric) {
	for _, h := range f.timingHistograms {
		h.Collect(metrics)
	}
	for _, h := range f.waitHistograms {
		h.Collect(metrics)
	}
}

// bgpCollector is a prometheus.Collector backed by a BgpServer (constructed
// by NewBgpCollector and registered by the gobgpd main).
type bgpCollector struct {
	server *server.BgpServer
}
Expand Down
47 changes: 47 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import (
"time"

"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
apb "google.golang.org/protobuf/types/known/anypb"

api "github.com/osrg/gobgp/v3/api"
Expand Down Expand Up @@ -141,3 +143,48 @@ func TestMetrics(test *testing.T) {
cancel()
<-ch
}

// TestFSMLoopMetrics verifies that the FSM timing collector registers
// cleanly and that a management operation increments the mgmt_op histogram.
func TestFSMLoopMetrics(t *testing.T) {
	assert, require := assert.New(t), require.New(t)

	collector := NewFSMTimingsCollector()
	registry := prometheus.NewRegistry()
	assert.NoError(registry.Register(collector))

	s := server.NewBgpServer(server.TimingHookOption(collector))
	go s.Serve()

	const metricName = "fsm_loop_mgmt_op_timing_sec"

	// Before any management operation the histogram must be empty.
	families, err := registry.Gather()
	require.NoError(err)
	family := getMetric(families, metricName)
	require.NotNil(family)
	assert.Equal(uint64(0), *family.Metric[0].Histogram.SampleCount)

	startReq := &api.StartBgpRequest{
		Global: &api.Global{
			Asn:        2,
			RouterId:   "2.2.2.2",
			ListenPort: -1,
		},
	}
	require.NoError(s.StartBgp(context.Background(), startReq))
	defer s.StopBgp(context.Background(), &api.StopBgpRequest{})

	// StartBgp counts as a single management operation.
	families, err = registry.Gather()
	require.NoError(err)
	family = getMetric(families, metricName)
	require.NotNil(family)
	assert.Equal(uint64(1), *family.Metric[0].Histogram.SampleCount)
}

// getMetric returns the metric family with the given name, or nil when no
// family in metrics matches.
func getMetric(metrics []*dto.MetricFamily, metricName string) *dto.MetricFamily {
	for _, family := range metrics {
		if family.GetName() != metricName {
			continue
		}
		return family
	}
	return nil
}
5 changes: 5 additions & 0 deletions pkg/server/rpki.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ type roaEvent struct {
Src string
Data []byte
conn *net.TCPConn
timestamp time.Time
}

type roaManager struct {
Expand Down Expand Up @@ -149,6 +150,7 @@ func (c *roaClient) lifetimeout() {
c.eventCh <- &roaEvent{
EventType: roaLifetimeout,
Src: c.host,
timestamp: time.Now(),
}
}

Expand Down Expand Up @@ -407,6 +409,7 @@ func (c *roaClient) tryConnect() {
EventType: roaConnected,
Src: c.host,
conn: conn.(*net.TCPConn),
timestamp: time.Now(),
}
return
}
Expand All @@ -419,6 +422,7 @@ func (c *roaClient) established() (err error) {
c.eventCh <- &roaEvent{
EventType: roaDisconnected,
Src: c.host,
timestamp: time.Now(),
}
}()

Expand All @@ -445,6 +449,7 @@ func (c *roaClient) established() (err error) {
EventType: roaRTR,
Src: c.host,
Data: append(header, body...),
timestamp: time.Now(),
}
}
}
45 changes: 44 additions & 1 deletion pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,25 @@ import (
"github.com/osrg/gobgp/v3/pkg/zebra"
)

// FSMOperation classifies the kinds of work items handled by the main FSM
// loop in (*BgpServer).Serve; it indexes per-operation timing metrics.
type FSMOperation uint

const (
	// FSMMgmtOp is a management operation (a mgmtOp from mgmtOperation).
	FSMMgmtOp FSMOperation = iota
	// FSMAccept is an accepted incoming TCP connection.
	FSMAccept
	// FSMROAEvent is an event delivered by the ROA manager.
	FSMROAEvent
	// FSMMessage is an FSM message from a peer.
	FSMMessage

	// FSMOperationTypeCount is the number of FSMOperation values above;
	// use it to size per-operation arrays.
	FSMOperationTypeCount
)

// FSMTimingHook receives one observation per FSM loop iteration: tOp is the
// time spent handling the operation, tWait the time the item waited in its
// channel before the loop received it (0 when the wait was not measured,
// e.g. for TCP accepts).
type FSMTimingHook interface {
	Observe(op FSMOperation, tOp, tWait time.Duration)
}

// nopTimingHook is the default FSMTimingHook; it discards all observations.
type nopTimingHook struct{}

func (n nopTimingHook) Observe(op FSMOperation, tOp, tWait time.Duration) {}

type tcpListener struct {
l *net.TCPListener
ch chan struct{}
Expand Down Expand Up @@ -141,6 +160,7 @@ type options struct {
grpcAddress string
grpcOption []grpc.ServerOption
logger log.Logger
timingHook FSMTimingHook
}

type ServerOption func(*options)
Expand All @@ -163,6 +183,12 @@ func LoggerOption(logger log.Logger) ServerOption {
}
}

// TimingHookOption makes the BgpServer report FSM loop timings to hook.
func TimingHookOption(hook FSMTimingHook) ServerOption {
	return func(opts *options) { opts.timingHook = hook }
}

type BgpServer struct {
apiServer *server
bgpConfig oc.Bgp
Expand All @@ -184,10 +210,13 @@ type BgpServer struct {
roaTable *table.ROATable
uuidMap map[string]uuid.UUID
logger log.Logger
timingHook FSMTimingHook
}

func NewBgpServer(opt ...ServerOption) *BgpServer {
opts := options{}
opts := options{
timingHook: nopTimingHook{},
}
for _, o := range opt {
o(&opts)
}
Expand All @@ -207,6 +236,7 @@ func NewBgpServer(opt ...ServerOption) *BgpServer {
roaManager: newROAManager(roaTable, logger),
roaTable: roaTable,
logger: logger,
timingHook: opts.timingHook,
}
s.bmpManager = newBmpClientManager(s)
s.mrtManager = newMrtManager(s)
Expand Down Expand Up @@ -269,6 +299,7 @@ type mgmtOp struct {
f func() error
errCh chan error
checkActive bool // check BGP global setting is configured before calling f()
timestamp time.Time
}

func (s *BgpServer) handleMGMTOp(op *mgmtOp) {
Expand All @@ -288,6 +319,7 @@ func (s *BgpServer) mgmtOperation(f func() error, checkActive bool) (err error)
f: f,
errCh: ch,
checkActive: checkActive,
timestamp: time.Now(),
}
return
}
Expand Down Expand Up @@ -485,22 +517,33 @@ func (s *BgpServer) Serve() {
}

chosen, value, ok := reflect.Select(cases)
tStart := time.Now()
switch chosen {
case 0:
op := value.Interface().(*mgmtOp)
tWait := tStart.Sub(op.timestamp)
s.handleMGMTOp(op)
s.timingHook.Observe(FSMMgmtOp, time.Since(tStart), tWait)
case 1:
// NOTE: it would be useful to use kernel metrics such as SO_TIMESTAMPING to record time we got
// first SYN packet in TCP connection. For now we skip tWait for accept events, message/mgmt op
// delays should be enough to analyze FSM loop.
conn := value.Interface().(*net.TCPConn)
s.passConnToPeer(conn)
s.timingHook.Observe(FSMAccept, time.Since(tStart), 0)
case 2:
ev := value.Interface().(*roaEvent)
tWait := tStart.Sub(ev.timestamp)
s.roaManager.HandleROAEvent(ev)
s.timingHook.Observe(FSMROAEvent, time.Since(tStart), tWait)
default:
// in the case of dynamic peer, handleFSMMessage closed incoming channel so
// nil fsmMsg can happen here.
if ok {
e := value.Interface().(*fsmMsg)
tWait := tStart.Sub(e.timestamp)
handlefsmMsg(e)
s.timingHook.Observe(FSMMessage, time.Since(tStart), tWait)
}
}
}
Expand Down

0 comments on commit 792b985

Please sign in to comment.