Skip to content

Commit

Permalink
RSDK-9566: Monitor CPU/Memory usage of viam-server and modular proces…
Browse files Browse the repository at this point in the history
…ses with FTDC. (viamrobotics#4642)
  • Loading branch information
dgottlieb authored Dec 19, 2024
1 parent f66b8e3 commit fb3df2c
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 7 deletions.
100 changes: 100 additions & 0 deletions ftdc/sys/sys.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Package sys provides functionality for gathering system metrics in an FTDC compliant API.
package sys

import (
"os"
"time"

"github.com/prometheus/procfs"
)

// On linux, getting the page size is a system call. Cache the page size for the entirety of the
// program lifetime. As opposed to calling it each time we wish to compute the resident memory a
// program is using.
var (
osPageSize int
machineBootTimeSecsSinceEpoch float64
)

func init() {
osPageSize = os.Getpagesize()

machine, err := procfs.NewDefaultFS()
if err != nil {
return
}

machineStats, err := machine.Stat()
if err != nil {
return
}

machineBootTimeSecsSinceEpoch = float64(machineStats.BootTime)
}

// UsageStatser can be used to get system metrics for a process.
type UsageStatser struct {
proc procfs.Proc
}

// NewSelfSysUsageStatser will return a `SysUsageStatser` for the current process.
func NewSelfSysUsageStatser() (*UsageStatser, error) {
process, err := procfs.Self()
if err != nil {
return nil, err
}

return &UsageStatser{process}, nil
}

// NewPidSysUsageStatser will return a `SysUsageStatser` for the given process id.
func NewPidSysUsageStatser(pid int) (*UsageStatser, error) {
process, err := procfs.NewProc(pid)
if err != nil {
return nil, err
}

return &UsageStatser{process}, nil
}

type stats struct {
UserCPUSecs float64
SystemCPUSecs float64
ElapsedTimeSecs float64
VssMB float64
RssMB float64
}

// Stats returns Stats.
func (sys *UsageStatser) Stats() any {
// Stats files refer to time in "clock ticks". The right way to learn of the tick time (on
// linux) is via a system call to `sysconf(_SC_CLK_TCK)`. That system call, however, requires
// cgo. And it's almost universally true that 100hz is the configured value for "modern"
// systems.
//
// We should feel empowered to revisit this decision if the above assumption is not true. It's
// important to have the right value such that we can compute the amount of time a program has
// been running accurately. Without that, computing metrics like CPU percentage are incorrect.
const userHz = 100

stat, err := sys.proc.Stat()
if err != nil {
return stats{}
}

// relativeStartTimeSecs is the time the program started in seconds since the machine was
// booted.
relativeStartTimeSecs := float64(stat.Starttime) / float64(userHz)

// absoluteStartTimeSecs is the time the program started in seconds since the epoch.
absoluteStartTimeSecs := machineBootTimeSecsSinceEpoch + relativeStartTimeSecs

const nanosPerSecond = float64(1_000_000_000)
return stats{
UserCPUSecs: float64(stat.UTime) / float64(userHz),
SystemCPUSecs: float64(stat.STime) / float64(userHz),
ElapsedTimeSecs: float64(time.Now().UnixNano())/nanosPerSecond - absoluteStartTimeSecs,
VssMB: float64(stat.VSize) / 1_000_000.0,
RssMB: float64(stat.RSS*osPageSize) / 1_000_000.0,
}
}
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ require (
github.com/pion/logging v0.2.2
github.com/pion/mediadevices v0.6.4
github.com/pion/rtp v1.8.7
github.com/prometheus/procfs v0.15.1
github.com/rhysd/actionlint v1.6.24
github.com/rs/cors v1.11.1
github.com/sergi/go-diff v1.3.1
Expand All @@ -76,7 +77,7 @@ require (
go.uber.org/zap v1.27.0
go.viam.com/api v0.1.372
go.viam.com/test v1.2.4
go.viam.com/utils v0.1.116
go.viam.com/utils v0.1.118
goji.io v2.0.2+incompatible
golang.org/x/image v0.19.0
golang.org/x/mobile v0.0.0-20240112133503-c713f31d574b
Expand Down Expand Up @@ -334,7 +335,6 @@ require (
github.com/prometheus/client_golang v1.12.2 // indirect
github.com/prometheus/client_model v0.6.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
github.com/quasilyte/go-ruleguard v0.4.3-0.20240823090925-0fe6f58b47b1 // indirect
github.com/quasilyte/go-ruleguard/dsl v0.3.22 // indirect
github.com/quasilyte/gogrep v0.5.0 // indirect
Expand Down
7 changes: 4 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1194,8 +1194,9 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx
github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.3.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.7.3 h1:4jVXhlkAyzOScmCkXBTOLRLTz8EeU+eyjrwB/EPq0VU=
github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
github.com/quasilyte/go-consistent v0.0.0-20190521200055-c6f3937de18c/go.mod h1:5STLWrekHfjyYwxBRVRXNOSewLJ3PWfDJd1VyTS21fI=
github.com/quasilyte/go-ruleguard v0.3.0/go.mod h1:p2miAhLp6fERzFNbcuQ4bevXs8rgK//uCHsUDkumITg=
Expand Down Expand Up @@ -1516,8 +1517,8 @@ go.viam.com/api v0.1.372 h1:Al9P7yojBDdNVAF7nrr5BAbzCvb+vrSp8N7BitbV0mQ=
go.viam.com/api v0.1.372/go.mod h1:g5eipXHNm0rQmW7DWya6avKcmzoypLmxnMlAaIsE5Ls=
go.viam.com/test v1.2.4 h1:JYgZhsuGAQ8sL9jWkziAXN9VJJiKbjoi9BsO33TW3ug=
go.viam.com/test v1.2.4/go.mod h1:zI2xzosHdqXAJ/kFqcN+OIF78kQuTV2nIhGZ8EzvaJI=
go.viam.com/utils v0.1.116 h1:hoCj3SsV8LZAOEP75TjMeX57axhravS8rNUYmhpTWtM=
go.viam.com/utils v0.1.116/go.mod h1:Z/Njnhd3+CKRAIwAqdogEDxlKYbJVhazg/4ygP2evpc=
go.viam.com/utils v0.1.118 h1:Kp6ebrCBiYReeSC1XnWPTjtBJoTUsQ6YWAomQkQF/mE=
go.viam.com/utils v0.1.118/go.mod h1:g1CaEkf7aJCrSI/Sfkx+6cBS1+Y3fPT2pvMQbp7TTBI=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2 h1:WJhcL4p+YeDxmZWg141nRm7XC8IDmhz7lk5GpadO1Sg=
go4.org/unsafe/assume-no-moving-gc v0.0.0-20230525183740-e7c30c78aeb2/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E=
gocv.io/x/gocv v0.25.0/go.mod h1:Rar2PS6DV+T4FL+PM535EImD/h13hGVaHhnCu1xarBs=
Expand Down
48 changes: 46 additions & 2 deletions module/modmanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (
"google.golang.org/protobuf/types/known/structpb"

"go.viam.com/rdk/config"
"go.viam.com/rdk/ftdc"
"go.viam.com/rdk/ftdc/sys"
rdkgrpc "go.viam.com/rdk/grpc"
"go.viam.com/rdk/logging"
modlib "go.viam.com/rdk/module"
Expand Down Expand Up @@ -67,6 +69,7 @@ func NewManager(
restartCtx: restartCtx,
restartCtxCancel: restartCtxCancel,
packagesDir: options.PackagesDir,
ftdc: options.FTDC,
}
}

Expand Down Expand Up @@ -99,6 +102,8 @@ type module struct {
inStartup atomic.Bool
inRecoveryLock sync.Mutex
logger logging.Logger

ftdc *ftdc.FTDC
}

type addedResource struct {
Expand Down Expand Up @@ -178,6 +183,8 @@ type Manager struct {
removeOrphanedResources func(ctx context.Context, rNames []resource.Name)
restartCtx context.Context
restartCtxCancel context.CancelFunc

ftdc *ftdc.FTDC
}

// Close terminates module connections and processes.
Expand Down Expand Up @@ -318,6 +325,7 @@ func (mgr *Manager) add(ctx context.Context, conf config.Module, moduleLogger lo
dataDir: moduleDataDir,
resources: map[resource.Name]*addedResource{},
logger: moduleLogger,
ftdc: mgr.ftdc,
}

if err := mgr.startModule(ctx, mod); err != nil {
Expand Down Expand Up @@ -1143,6 +1151,10 @@ func (m *module) startProcess(
return errors.WithMessage(err, "module startup failed")
}

// Turn on process cpu/memory diagnostics for the module process. If there's an error, we
// continue normally, just without FTDC.
m.registerProcessWithFTDC()

checkTicker := time.NewTicker(100 * time.Millisecond)
defer checkTicker.Stop()

Expand Down Expand Up @@ -1180,9 +1192,21 @@ func (m *module) stopProcess() error {
if m.process == nil {
return nil
}

m.logger.Infof("Stopping module: %s process", m.cfg.Name)

// Attempt to remove module's .sock file if module did not remove it
// already.
defer rutils.RemoveFileNoError(m.addr)
defer func() {
rutils.RemoveFileNoError(m.addr)

// The system metrics "statser" is resilient to the process dying under the hood. An empty set
// of metrics will be reported. Therefore it is safe to continue monitoring the module process
// while it's in shutdown.
if m.ftdc != nil {
m.ftdc.Remove(m.process.ID())
}
}()

// TODO(RSDK-2551): stop ignoring exit status 143 once Python modules handle
// SIGTERM correctly.
Expand All @@ -1193,7 +1217,7 @@ func (m *module) stopProcess() error {
}
return err
}
m.logger.Infof("Stopping module: %s process", m.cfg.Name)

return nil
}

Expand Down Expand Up @@ -1302,6 +1326,26 @@ func (m *module) getFullEnvironment(viamHomeDir string) map[string]string {
return getFullEnvironment(m.cfg, m.dataDir, viamHomeDir)
}

func (m *module) registerProcessWithFTDC() {
if m.ftdc == nil {
return
}

pid, err := m.process.UnixPid()
if err != nil {
m.logger.Warnw("Module process has no pid. Cannot start ftdc.", "err", err)
return
}

statser, err := sys.NewPidSysUsageStatser(pid)
if err != nil {
m.logger.Warnw("Cannot find /proc files", "err", err)
return
}

m.ftdc.Add(fmt.Sprintf("modules.%s", m.process.ID()), statser)
}

func getFullEnvironment(
cfg config.Module,
dataDir string,
Expand Down
4 changes: 4 additions & 0 deletions module/modmanager/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package modmanageroptions
import (
"context"

"go.viam.com/rdk/ftdc"
"go.viam.com/rdk/resource"
)

Expand All @@ -19,4 +20,7 @@ type Options struct {
RemoveOrphanedResources func(ctx context.Context, rNames []resource.Name)
// PackagesDir is from Config.PackagesPath. It's used for resolving local tarball module paths.
PackagesDir string
// Passing in an FTDC object will let the mod manager add and remove pieces to track diagnostics
// of. Such module process CPU/memory information as they are started and stopped.
FTDC *ftdc.FTDC
}
4 changes: 4 additions & 0 deletions robot/impl/local_robot.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"go.viam.com/rdk/components/sensor"
"go.viam.com/rdk/config"
"go.viam.com/rdk/ftdc"
"go.viam.com/rdk/ftdc/sys"
icloud "go.viam.com/rdk/internal/cloud"
"go.viam.com/rdk/logging"
"go.viam.com/rdk/operation"
Expand Down Expand Up @@ -323,6 +324,9 @@ func newWithResources(
// constructed to get a valid copy of its stats object (for the schema's sake). Even if
// the web service has not been "started".
ftdcWorker = ftdc.New(ftdc.DefaultDirectory(config.ViamDotDir, partID), logger.Sublogger("ftdc"))
if statser, err := sys.NewSelfSysUsageStatser(); err == nil {
ftdcWorker.Add("viam-server", statser)
}
}

closeCtx, cancel := context.WithCancel(ctx)
Expand Down
1 change: 1 addition & 0 deletions robot/impl/resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ func (manager *resourceManager) startModuleManager(
ViamHomeDir: viamHomeDir,
RobotCloudID: robotCloudID,
PackagesDir: packagesDir,
FTDC: manager.opts.ftdc,
}
manager.moduleManager = modmanager.NewManager(ctx, parentAddr, logger, mmOpts)
}
Expand Down
4 changes: 4 additions & 0 deletions robot/impl/resource_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,10 @@ func (fp *fakeProcess) Status() error {
return nil
}

func (fp *fakeProcess) UnixPid() (int, error) {
return 0, errors.New("unimplemented")
}

func TestManagerResourceRPCAPIs(t *testing.T) {
logger := logging.NewTestLogger(t)
injectRobot := &inject.Robot{}
Expand Down

0 comments on commit fb3df2c

Please sign in to comment.