From fb3df2ceef551aa6da9593591e32f56318096b5c Mon Sep 17 00:00:00 2001 From: Dan Gottlieb Date: Thu, 19 Dec 2024 16:15:51 -0500 Subject: [PATCH] RSDK-9566: Monitor CPU/Memory usage of viam-server and modular processes with FTDC. (#4642) --- ftdc/sys/sys.go | 100 +++++++++++++++++++++++++++ go.mod | 4 +- go.sum | 7 +- module/modmanager/manager.go | 48 ++++++++++++- module/modmanager/options/options.go | 4 ++ robot/impl/local_robot.go | 4 ++ robot/impl/resource_manager.go | 1 + robot/impl/resource_manager_test.go | 4 ++ 8 files changed, 165 insertions(+), 7 deletions(-) create mode 100644 ftdc/sys/sys.go diff --git a/ftdc/sys/sys.go b/ftdc/sys/sys.go new file mode 100644 index 00000000000..ae92d72323f --- /dev/null +++ b/ftdc/sys/sys.go @@ -0,0 +1,100 @@ +// Package sys provides functionality for gathering system metrics in an FTDC compliant API. +package sys + +import ( + "os" + "time" + + "github.com/prometheus/procfs" +) + +// On linux, getting the page size is a system call. Cache the page size for the entirety of the +// program lifetime. As opposed to calling it each time we wish to compute the resident memory a +// program is using. +var ( + osPageSize int + machineBootTimeSecsSinceEpoch float64 +) + +func init() { + osPageSize = os.Getpagesize() + + machine, err := procfs.NewDefaultFS() + if err != nil { + return + } + + machineStats, err := machine.Stat() + if err != nil { + return + } + + machineBootTimeSecsSinceEpoch = float64(machineStats.BootTime) +} + +// UsageStatser can be used to get system metrics for a process. +type UsageStatser struct { + proc procfs.Proc +} + +// NewSelfSysUsageStatser will return a `SysUsageStatser` for the current process. +func NewSelfSysUsageStatser() (*UsageStatser, error) { + process, err := procfs.Self() + if err != nil { + return nil, err + } + + return &UsageStatser{process}, nil +} + +// NewPidSysUsageStatser will return a `SysUsageStatser` for the given process id. +func NewPidSysUsageStatser(pid int) (*UsageStatser, error) { + process, err := procfs.NewProc(pid) + if err != nil { + return nil, err + } + + return &UsageStatser{process}, nil +} + +type stats struct { + UserCPUSecs float64 + SystemCPUSecs float64 + ElapsedTimeSecs float64 + VssMB float64 + RssMB float64 +} + +// Stats returns Stats. +func (sys *UsageStatser) Stats() any { + // Stats files refer to time in "clock ticks". The right way to learn of the tick time (on + // linux) is via a system call to `sysconf(_SC_CLK_TCK)`. That system call, however, requires + // cgo. And it's almost universally true that 100hz is the configured value for "modern" + // systems. + // + // We should feel empowered to revisit this decision if the above assumption is not true. It's + // important to have the right value such that we can compute the amount of time a program has + // been running accurately. Without that, computing metrics like CPU percentage are incorrect. + const userHz = 100 + + stat, err := sys.proc.Stat() + if err != nil { + return stats{} + } + + // relativeStartTimeSecs is the time the program started in seconds since the machine was + // booted. + relativeStartTimeSecs := float64(stat.Starttime) / float64(userHz) + + // absoluteStartTimeSecs is the time the program started in seconds since the epoch. + absoluteStartTimeSecs := machineBootTimeSecsSinceEpoch + relativeStartTimeSecs + + const nanosPerSecond = float64(1_000_000_000) + return stats{ + UserCPUSecs: float64(stat.UTime) / float64(userHz), + SystemCPUSecs: float64(stat.STime) / float64(userHz), + ElapsedTimeSecs: float64(time.Now().UnixNano())/nanosPerSecond - absoluteStartTimeSecs, + VssMB: float64(stat.VSize) / 1_000_000.0, + RssMB: float64(stat.RSS*osPageSize) / 1_000_000.0, + } +} diff --git a/go.mod b/go.mod index bf133276621..ae0d6f3459a 100644 --- a/go.mod +++ b/go.mod @@ -60,6 +60,7 @@ require ( github.com/pion/logging v0.2.2 github.com/pion/mediadevices v0.6.4 github.com/pion/rtp v1.8.7 + github.com/prometheus/procfs v0.15.1 github.com/rhysd/actionlint v1.6.24 github.com/rs/cors v1.11.1 github.com/sergi/go-diff v1.3.1 @@ -76,7 +77,7 @@ require ( go.uber.org/zap v1.27.0 go.viam.com/api v0.1.372 go.viam.com/test v1.2.4 - go.viam.com/utils v0.1.116 + go.viam.com/utils v0.1.118 goji.io v2.0.2+incompatible golang.org/x/image v0.19.0 golang.org/x/mobile v0.0.0-20240112133503-c713f31d574b @@ -334,7 +335,6 @@ require ( github.com/prometheus/client_golang v1.12.2 // indirect github.com/prometheus/client_model v0.6.0 // indirect github.com/prometheus/common v0.37.0 // indirect - github.com/prometheus/procfs v0.7.3 // indirect github.com/quasilyte/go-ruleguard v0.4.3-0.20240823090925-0fe6f58b47b1 // indirect github.com/quasilyte/go-ruleguard/dsl v0.3.22 // indirect github.com/quasilyte/gogrep v0.5.0 // indirect diff --git a/go.sum b/go.sum index c28213c2d46..ff9673c7a94 100644 --- a/go.sum +++ b/go.sum @@ -1194,8 +1194,9 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.3.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/procfs v0.7.3 h1:4jVXhlkAyzOScmCkXBTOLRLTz8EeU+eyjrwB/EPq0VU= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= +github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= +github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/quasilyte/go-consistent v0.0.0-20190521200055-c6f3937de18c/go.mod h1:5STLWrekHfjyYwxBRVRXNOSewLJ3PWfDJd1VyTS21fI= github.com/quasilyte/go-ruleguard v0.3.0/go.mod h1:p2miAhLp6fERzFNbcuQ4bevXs8rgK//uCHsUDkumITg= @@ -1516,8 +1517,8 @@ go.viam.com/api v0.1.372 h1:Al9P7yojBDdNVAF7nrr5BAbzCvb+vrSp8N7BitbV0mQ= go.viam.com/api v0.1.372/go.mod h1:g5eipXHNm0rQmW7DWya6avKcmzoypLmxnMlAaIsE5Ls= go.viam.com/test v1.2.4 h1:JYgZhsuGAQ8sL9jWkziAXN9VJJiKbjoi9BsO33TW3ug= go.viam.com/test v1.2.4/go.mod h1:zI2xzosHdqXAJ/kFqcN+OIF78kQuTV2nIhGZ8EzvaJI= -go.viam.com/utils v0.1.116 h1:hoCj3SsV8LZAOEP75TjMeX57axhravS8rNUYmhpTWtM= -go.viam.com/utils v0.1.116/go.mod h1:Z/Njnhd3+CKRAIwAqdogEDxlKYbJVhazg/4ygP2evpc= +go.viam.com/utils v0.1.118 h1:Kp6ebrCBiYReeSC1XnWPTjtBJoTUsQ6YWAomQkQF/mE= +go.viam.com/utils v0.1.118/go.mod h1:g1CaEkf7aJCrSI/Sfkx+6cBS1+Y3fPT2pvMQbp7TTBI= go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2 h1:WJhcL4p+YeDxmZWg141nRm7XC8IDmhz7lk5GpadO1Sg= go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E= gocv.io/x/gocv v0.25.0/go.mod h1:Rar2PS6DV+T4FL+PM535EImD/h13hGVaHhnCu1xarBs= diff --git a/module/modmanager/manager.go b/module/modmanager/manager.go index b2c8ad098cd..849fd63f60d 100644 --- a/module/modmanager/manager.go +++ b/module/modmanager/manager.go @@ -29,6 +29,8 @@ import ( "google.golang.org/protobuf/types/known/structpb" "go.viam.com/rdk/config" + "go.viam.com/rdk/ftdc" + "go.viam.com/rdk/ftdc/sys" rdkgrpc "go.viam.com/rdk/grpc" "go.viam.com/rdk/logging" modlib "go.viam.com/rdk/module" @@ -67,6 +69,7 @@ func NewManager( restartCtx: restartCtx, restartCtxCancel: restartCtxCancel, packagesDir: options.PackagesDir, + ftdc: options.FTDC, } } @@ -99,6 +102,8 @@ type module struct { inStartup atomic.Bool inRecoveryLock sync.Mutex logger logging.Logger + + ftdc *ftdc.FTDC } type addedResource struct { @@ -178,6 +183,8 @@ type Manager struct { removeOrphanedResources func(ctx context.Context, rNames []resource.Name) restartCtx context.Context restartCtxCancel context.CancelFunc + + ftdc *ftdc.FTDC } // Close terminates module connections and processes. @@ -318,6 +325,7 @@ func (mgr *Manager) add(ctx context.Context, conf config.Module, moduleLogger lo dataDir: moduleDataDir, resources: map[resource.Name]*addedResource{}, logger: moduleLogger, + ftdc: mgr.ftdc, } if err := mgr.startModule(ctx, mod); err != nil { @@ -1143,6 +1151,10 @@ func (m *module) startProcess( return errors.WithMessage(err, "module startup failed") } + // Turn on process cpu/memory diagnostics for the module process. If there's an error, we + // continue normally, just without FTDC. + m.registerProcessWithFTDC() + checkTicker := time.NewTicker(100 * time.Millisecond) defer checkTicker.Stop() @@ -1180,9 +1192,21 @@ func (m *module) stopProcess() error { if m.process == nil { return nil } + + m.logger.Infof("Stopping module: %s process", m.cfg.Name) + // Attempt to remove module's .sock file if module did not remove it // already. - defer rutils.RemoveFileNoError(m.addr) + defer func() { + rutils.RemoveFileNoError(m.addr) + + // The system metrics "statser" is resilient to the process dying under the hood. An empty set + // of metrics will be reported. Therefore it is safe to continue monitoring the module process + // while it's in shutdown. + if m.ftdc != nil { + m.ftdc.Remove(m.process.ID()) + } + }() // TODO(RSDK-2551): stop ignoring exit status 143 once Python modules handle // SIGTERM correctly. @@ -1193,7 +1217,7 @@ func (m *module) stopProcess() error { } return err } - m.logger.Infof("Stopping module: %s process", m.cfg.Name) + return nil } @@ -1302,6 +1326,26 @@ func (m *module) getFullEnvironment(viamHomeDir string) map[string]string { return getFullEnvironment(m.cfg, m.dataDir, viamHomeDir) } +func (m *module) registerProcessWithFTDC() { + if m.ftdc == nil { + return + } + + pid, err := m.process.UnixPid() + if err != nil { + m.logger.Warnw("Module process has no pid. Cannot start ftdc.", "err", err) + return + } + + statser, err := sys.NewPidSysUsageStatser(pid) + if err != nil { + m.logger.Warnw("Cannot find /proc files", "err", err) + return + } + + m.ftdc.Add(fmt.Sprintf("modules.%s", m.process.ID()), statser) +} + func getFullEnvironment( cfg config.Module, dataDir string, diff --git a/module/modmanager/options/options.go b/module/modmanager/options/options.go index c848b61b3cb..87a55138605 100644 --- a/module/modmanager/options/options.go +++ b/module/modmanager/options/options.go @@ -4,6 +4,7 @@ package modmanageroptions import ( "context" + "go.viam.com/rdk/ftdc" "go.viam.com/rdk/resource" ) @@ -19,4 +20,7 @@ type Options struct { RemoveOrphanedResources func(ctx context.Context, rNames []resource.Name) // PackagesDir is from Config.PackagesPath. It's used for resolving local tarball module paths. PackagesDir string + // Passing in an FTDC object will let the mod manager add and remove pieces to track diagnostics + // of. Such module process CPU/memory information as they are started and stopped. + FTDC *ftdc.FTDC } diff --git a/robot/impl/local_robot.go b/robot/impl/local_robot.go index 802fb3bed97..10ae93c3b32 100644 --- a/robot/impl/local_robot.go +++ b/robot/impl/local_robot.go @@ -26,6 +26,7 @@ import ( "go.viam.com/rdk/components/sensor" "go.viam.com/rdk/config" "go.viam.com/rdk/ftdc" + "go.viam.com/rdk/ftdc/sys" icloud "go.viam.com/rdk/internal/cloud" "go.viam.com/rdk/logging" "go.viam.com/rdk/operation" @@ -323,6 +324,9 @@ func newWithResources( // constructed to get a valid copy of its stats object (for the schema's sake). Even if // the web service has not been "started". ftdcWorker = ftdc.New(ftdc.DefaultDirectory(config.ViamDotDir, partID), logger.Sublogger("ftdc")) + if statser, err := sys.NewSelfSysUsageStatser(); err == nil { + ftdcWorker.Add("viam-server", statser) + } } closeCtx, cancel := context.WithCancel(ctx) diff --git a/robot/impl/resource_manager.go b/robot/impl/resource_manager.go index 4c11d894321..f767f8e1492 100644 --- a/robot/impl/resource_manager.go +++ b/robot/impl/resource_manager.go @@ -126,6 +126,7 @@ func (manager *resourceManager) startModuleManager( ViamHomeDir: viamHomeDir, RobotCloudID: robotCloudID, PackagesDir: packagesDir, + FTDC: manager.opts.ftdc, } manager.moduleManager = modmanager.NewManager(ctx, parentAddr, logger, mmOpts) } diff --git a/robot/impl/resource_manager_test.go b/robot/impl/resource_manager_test.go index 9ef3bf7961d..06991cdac3a 100644 --- a/robot/impl/resource_manager_test.go +++ b/robot/impl/resource_manager_test.go @@ -1330,6 +1330,10 @@ func (fp *fakeProcess) Status() error { return nil } +func (fp *fakeProcess) UnixPid() (int, error) { + return 0, errors.New("unimplemented") +} + func TestManagerResourceRPCAPIs(t *testing.T) { logger := logging.NewTestLogger(t) injectRobot := &inject.Robot{}