Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect GPUs #121

Merged
merged 28 commits into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/master.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ jobs:
mv metal-hammer-initrd.img.lz4* images/metal-hammer/
- name: Upload image tarballs to GCS
run: gsutil -m cp -r -p images/metal-hammer gs://$GCS_BUCKET
- uses: release-drafter/release-drafter@v5
- uses: release-drafter/release-drafter@v6
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ RUN curl -fLsS https://sourceforge.net/projects/e1000/files/ice%20stable/${ICE_V

# ipmitool from bookworm is broken and returns with error on most commands
FROM golang:1.22-bullseye as initrd-builder
ENV UROOT_GIT_SHA_OR_TAG=v0.13.0
ENV UROOT_GIT_SHA_OR_TAG=v0.14.0
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
ca-certificates \
Expand Down
4 changes: 2 additions & 2 deletions REINSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ If only the `imageID` is given it tries to guess the primary disk of the old OS.
After wiping the primary disk, the reinstall procedure continues with the usual installation process, starting from the `installImage` method and eventually ending with the `finalizeAllocation` call, which now includes the previously mentioned `BootInfo` parameters.

**metal-core** passes-through the request to **metal-api**, sets the boot order to HD and power cycles the machine again, which in turn boots the new OS.
**metal-api** removes the `allocation.Reinstall` mark and stores the `BootInfo` details together with the newly installed `imageID` in the `allcation.MachineSetup` struct.

**metal-api** removes the `allocation.Reinstall` mark and stores the `BootInfo` details together with the newly installed `imageID` in the `allocation.MachineSetup` struct.

This was the happy path. But of course, things can go wrong. If the reinstallation process fails for any reason, we are potentially in one of the following two states: either the primary disk (and with it the existing OS) has already been wiped, or it has not. In both cases **metal-hammer** calls **metal-core** via the `/machine/abort-reinstall/<id>` endpoint, delivering the bool value `primaryDiskWiped` that indicates the actual state.
If **metal-core** fails to respond or the OS has already been wiped, the machine reboots. Otherwise it gets the `BootInfo` of the previously installed OS stored in the DS and reboots with these details into the existing OS, just as if nothing had happened at all.
Expand Down
2 changes: 1 addition & 1 deletion cmd/firmware/firmware.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func (f *Firmware) Update() {
}
}

// Run execute a comand with arguments, returns output and error
// run executes a command with arguments and returns its output and an error
func run(log *slog.Logger, command string, args ...string) (string, error) {
path, err := exec.LookPath(command)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion cmd/firmware/intel.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func (r intel) update() error {
if err != nil {
return fmt.Errorf("unable to update intel firmware %w", err)
}
r.log.Info("intel", "updated firware output", output)
r.log.Info("intel", "updated firmware output", output)
return nil
}

Expand Down
9 changes: 6 additions & 3 deletions cmd/metal-client.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

v1 "github.com/metal-stack/metal-api/pkg/api/v1"
metalgo "github.com/metal-stack/metal-go"
"github.com/metal-stack/metal-go/api/client/machine"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"google.golang.org/grpc/keepalive"
Expand All @@ -18,7 +19,7 @@ import (
// MetalAPIClient holds what is used to communicate with metal-api:
// a logger, the gRPC connection and the metal-go client.
type MetalAPIClient struct {
log *slog.Logger
conn grpc.ClientConnInterface
Driver metalgo.Client
driver metalgo.Client
}

// NewMetalAPIClient fetches the address,hmac and certificates from pixie needed to communicate with metal-api,
Expand Down Expand Up @@ -71,10 +72,12 @@ func NewMetalAPIClient(log *slog.Logger, spec *Specification) (*MetalAPIClient,
return &MetalAPIClient{
log: log,
conn: conn,
Driver: driver,
driver: driver,
}, nil
}

// Machine returns the machine client service provided by the underlying metal-go driver.
func (c *MetalAPIClient) Machine() machine.ClientService {
return c.driver.Machine()
}
// Event returns an event service client created on top of the client's gRPC connection.
func (c *MetalAPIClient) Event() v1.EventServiceClient {
return v1.NewEventServiceClient(c.conn)
}
Expand Down
4 changes: 2 additions & 2 deletions cmd/network/ethtool.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
"github.com/metal-stack/metal-hammer/pkg/os/command"
)

// EthtoolCommand to gather ethernet informations
// ethtoolCommand is the command used to gather ethernet information
const ethtoolCommand = command.Ethtool

// Ethtool to query/set ethernet interfaces
Expand Down Expand Up @@ -89,7 +89,7 @@ func (e *Ethtool) disableFirmwareLLDP(ifi string) {

var buggyIntelNicDriverNames = []string{"i40e"}

// stopFirmwareLLDP stop Firmeware LLDP not persistent over reboots, only during runtime.
// stopFirmwareLLDP stops firmware LLDP. This is not persistent across reboots, it only lasts during runtime.
// mount -t debugfs none /sys/kernel/debug
// echo lldp stop > /sys/kernel/debug/i40e/0000:01:00.2/command
// where <0000:01:00.2> is the pci address of the ethernet nic, this can be inspected by lspci,
Expand Down
2 changes: 1 addition & 1 deletion cmd/network/ntpdate.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func getTime(log *slog.Logger, servers []string) (t time.Time, err error) {
return
}

// NtpDate set the system time to the time comming from a ntp source
// NtpDate sets the system time to the time coming from an NTP source
func NtpDate(log *slog.Logger) {
t, err := getTime(log, ntpServers)
if err != nil {
Expand Down
61 changes: 60 additions & 1 deletion cmd/register/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/metal-stack/metal-hammer/cmd/network"
"github.com/metal-stack/metal-hammer/cmd/storage"
"github.com/metal-stack/v"
"github.com/u-root/u-root/pkg/pci"
"github.com/vishvananda/netlink"
)

Expand Down Expand Up @@ -81,6 +82,32 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
if err != nil {
return nil, fmt.Errorf("unable to get system cpu(s) %w", err)
}
r.log.Info("cpu", "processors", cpu.String())
var metalCPUs []*v1.MachineCPU
for _, cpu := range cpu.Processors {
metalCPUs = append(metalCPUs, &v1.MachineCPU{
Vendor: cpu.Vendor,
Model: cpu.Model,
Cores: cpu.NumCores,
Threads: cpu.NumThreads,
})
}

// 0000:bd:00.0: DisplayVGA: NVIDIA Corporation AD102GL [RTX 6000 Ada Generation]

gpus, err := r.detectGPUs()
if err != nil {
return nil, fmt.Errorf("unable to get system gpu(s) %w", err)
}

var metalGPUs []*v1.MachineGPU
for _, g := range gpus {
r.log.Info("found gpu", "gpu", g.String())
metalGPUs = append(metalGPUs, &v1.MachineGPU{
Vendor: g.VendorName,
Model: g.DeviceName,
})
}

// Nics
nics := []*v1.MachineNic{}
Expand Down Expand Up @@ -166,6 +193,8 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
CpuCores: uint32(cpu.TotalCores),
Nics: nics,
Disks: disks,
Cpus: metalCPUs,
Gpus: metalGPUs,
}

// IPMI
Expand All @@ -178,7 +207,7 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
board := r.inband.Board()
b := board.BIOS
if b == nil {
return nil, fmt.Errorf("unable to read bios informations from bmc")
return nil, fmt.Errorf("unable to read bios information from bmc")
}
bios := &v1.MachineBIOS{
Version: b.Version,
Expand All @@ -198,6 +227,36 @@ func (r *Register) readHardwareDetails() (*v1.BootServiceRegisterRequest, error)
return request, nil
}

// detectGPUs reads all devices from the PCI bus and returns those that are
// treated as GPUs.
//
// A device is kept only if its vendor name contains "nvidia" and its device
// name contains "rtx", both compared case-insensitively. Every accepted device
// is logged. Errors from creating the bus reader or from reading the bus are
// returned unchanged together with a nil result.
func (r *Register) detectGPUs() (pci.Devices, error) {
	busReader, err := pci.NewBusReader("*")
	if err != nil {
		return nil, err
	}

	all, err := busReader.Read()
	if err != nil {
		return nil, err
	}

	// Must run before filtering: the checks below read VendorName and
	// DeviceName (presumably resolved from the PCI ids by this call).
	all.SetVendorDeviceName()

	var gpus pci.Devices
	for _, dev := range all {
		// Example of a matching device:
		// "vendor":"NVIDIA Corporation","device":"AD102GL [RTX 6000 Ada Generation]"
		vendor := strings.ToLower(dev.VendorName)
		model := strings.ToLower(dev.DeviceName)

		// TODO if new models must be supported, this code must be refactored
		if strings.Contains(vendor, "nvidia") && strings.Contains(model, "rtx") {
			r.log.Info("add gpu", "vendor", dev.VendorName, "device", dev.DeviceName)
			gpus = append(gpus, dev)
		}
	}

	return gpus, nil
}

// save the content of kernel ring buffer to /var/log/syslog
// by calling the appropriate syscall.
// Only required if Memory is gathered by ghw.Memory()
Expand Down
4 changes: 2 additions & 2 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func Run(log *slog.Logger, spec *Specification, hal hal.InBand) (*event.EventEmi
return eventEmitter, fmt.Errorf("register %w", err)
}

resp, err := metalAPIClient.Driver.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
resp, err := metalAPIClient.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
if err != nil {
return eventEmitter, fmt.Errorf("fetch %w", err)
}
Expand Down Expand Up @@ -143,7 +143,7 @@ func Run(log *slog.Logger, spec *Specification, hal hal.InBand) (*event.EventEmi
if err != nil {
return eventEmitter, fmt.Errorf("wait for installation %w", err)
}
resp, err = metalAPIClient.Driver.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
resp, err = metalAPIClient.Machine().FindMachine(machine.NewFindMachineParams().WithID(spec.MachineUUID), nil)
if err != nil {
return eventEmitter, fmt.Errorf("wait for installation %w", err)
}
Expand Down
10 changes: 8 additions & 2 deletions cmd/supwd.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@ func (h *Hammer) createBmcSuperuser() error {
return fmt.Errorf("failed to fetch SuperUser password %w", err)
}

if resp.FeatureDisabled {
if resp.SuperUserPassword == "" {
h.log.Warn("creation of superuser disabled because password is empty")
return nil
}

bmcConn := h.Hal.BMCConnection()

err = bmcConn.CreateUser(bmcConn.SuperUser(), api.AdministratorPrivilege, resp.SuperUserPassword)
if err != nil {
return fmt.Errorf("failed to create bmc superuser: %s %w", bmcConn.SuperUser().Name, err)
// FIXME: this happens always after the first creation on X12 and newer boards
// return fmt.Errorf("failed to create bmc superuser: %s %w", bmcConn.SuperUser().Name, err)
h.log.Error("failed to create bmc superuser", "user", bmcConn.SuperUser().Name, "error", err)
return nil
}

h.log.Info("created superuser", "user", bmcConn.SuperUser().Name)
return nil
}
Loading
Loading