Skip to content

Commit c7bc4aa

Browse files
committed
feat(bmc): add redfish support
Signed-off-by: Sunil Thaha <sthaha@redhat.com>
1 parent f67206a commit c7bc4aa

31 files changed

+5222
-20
lines changed

cmd/kepler/main.go

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@ import (
1111
"syscall"
1212

1313
"github.com/alecthomas/kingpin/v2"
14+
"k8s.io/utils/ptr"
15+
1416
"github.com/sustainable-computing-io/kepler/config"
1517
"github.com/sustainable-computing-io/kepler/internal/device"
1618
"github.com/sustainable-computing-io/kepler/internal/exporter/prometheus"
1719
"github.com/sustainable-computing-io/kepler/internal/exporter/stdout"
1820
"github.com/sustainable-computing-io/kepler/internal/k8s/pod"
1921
"github.com/sustainable-computing-io/kepler/internal/logger"
2022
"github.com/sustainable-computing-io/kepler/internal/monitor"
23+
"github.com/sustainable-computing-io/kepler/internal/platform/redfish"
2124
"github.com/sustainable-computing-io/kepler/internal/resource"
2225
"github.com/sustainable-computing-io/kepler/internal/server"
2326
"github.com/sustainable-computing-io/kepler/internal/service"
@@ -157,6 +160,8 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
157160
monitor.WithMinTerminatedEnergyThreshold(monitor.Energy(cfg.Monitor.MinTerminatedEnergyThreshold)*monitor.Joule),
158161
)
159162

163+
// NOTE: the experimental Redfish service, if enabled, is created further below (after the API server)
164+
160165
apiServer := server.NewAPIServer(
161166
server.WithLogger(logger),
162167
server.WithListenAddress(cfg.Web.ListenAddresses),
@@ -170,9 +175,20 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
170175
pm,
171176
)
172177

178+
// Add Redfish service if enabled
179+
var redfishService *redfish.Service
180+
if ptr.Deref(cfg.Experimental.Platform.Redfish.Enabled, false) {
181+
rs, err := createRedfishService(logger, cfg)
182+
if err != nil {
183+
return nil, fmt.Errorf("failed to create Redfish service: %w", err)
184+
}
185+
services = append(services, rs)
186+
redfishService = rs
187+
}
188+
173189
// Add Prometheus exporter if enabled
174190
if *cfg.Exporter.Prometheus.Enabled {
175-
promExporter, err := createPrometheusExporter(logger, cfg, apiServer, pm)
191+
promExporter, err := createPrometheusExporter(logger, cfg, apiServer, pm, redfishService)
176192
if err != nil {
177193
return nil, fmt.Errorf("failed to create Prometheus exporter: %w", err)
178194
}
@@ -194,19 +210,51 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
194210
return services, nil
195211
}
196212

197-
func createPrometheusExporter(logger *slog.Logger, cfg *config.Config, apiServer *server.APIServer, pm *monitor.PowerMonitor) (*prometheus.Exporter, error) {
213+
func createRedfishService(logger *slog.Logger, cfg *config.Config) (*redfish.Service, error) {
214+
// Resolve node ID using new priority logic
215+
redfishCfg := cfg.Experimental.Platform.Redfish
216+
217+
nodeID, err := redfish.ResolveNodeID(redfishCfg.NodeID, cfg.Kube.Node)
218+
if err != nil {
219+
return nil, fmt.Errorf("failed to resolve node ID for Redfish service: %w", err)
220+
}
221+
222+
rs, err := redfish.NewService(
223+
cfg.Experimental.Platform.Redfish.ConfigFile,
224+
nodeID,
225+
logger,
226+
)
227+
if err != nil {
228+
return nil, fmt.Errorf("failed to create Redfish service: %w", err)
229+
}
230+
231+
return rs, nil
232+
}
233+
234+
func createPrometheusExporter(
235+
logger *slog.Logger, cfg *config.Config,
236+
apiServer *server.APIServer, pm *monitor.PowerMonitor,
237+
rs *redfish.Service,
238+
) (*prometheus.Exporter, error) {
198239
logger.Debug("Creating Prometheus exporter")
199240

200241
// Use metrics level from configuration (already parsed)
201242
metricsLevel := cfg.Exporter.Prometheus.MetricsLevel
202243

203-
collectors, err := prometheus.CreateCollectors(
204-
pm,
244+
var collectorOpts []prometheus.OptionFn
245+
collectorOpts = append(collectorOpts,
205246
prometheus.WithLogger(logger),
206247
prometheus.WithProcFSPath(cfg.Host.ProcFS),
207248
prometheus.WithNodeName(cfg.Kube.Node),
208249
prometheus.WithMetricsLevel(metricsLevel),
209250
)
251+
252+
// Add platform data provider if Redfish service is available
253+
if rs != nil {
254+
collectorOpts = append(collectorOpts, prometheus.WithPlatformDataProvider(rs))
255+
}
256+
257+
collectors, err := prometheus.CreateCollectors(pm, collectorOpts...)
210258
if err != nil {
211259
return nil, fmt.Errorf("failed to create Prometheus collectors: %w", err)
212260
}

compose/dev/kepler-dev/etc/kepler/config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,11 @@ dev:
7070
fake-cpu-meter:
7171
enabled: false
7272
zones: [] # zones to be enabled, empty enables all default zones
73+
74+
# EXPERIMENTAL FEATURES - These features are experimental and may be unstable
75+
experimental:
76+
platform:
77+
redfish:
78+
enabled: false # Enable experimental Redfish BMC power monitoring
79+
configFile: "/etc/kepler/redfish.yaml" # Path to Redfish BMC configuration file
80+
nodeID: "" # Node identifier to use (overrides Kubernetes node name and hostname fallback)

compose/dev/kepler-latest/etc/kepler/config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,11 @@ dev:
7070
fake-cpu-meter:
7171
enabled: false
7272
zones: [] # zones to be enabled, empty enables all default zones
73+
74+
# EXPERIMENTAL FEATURES - These features are experimental and may be unstable
75+
experimental:
76+
platform:
77+
redfish:
78+
enabled: false # Enable experimental Redfish BMC power monitoring
79+
configFile: "/etc/kepler/redfish.yaml" # Path to Redfish BMC configuration file
80+
nodeID: "" # Node identifier to use (overrides Kubernetes node name and hostname fallback)

config/config.go

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -93,17 +93,34 @@ type (
9393
Node string `yaml:"nodeName"`
9494
}
9595

96-
Config struct {
97-
Log Log `yaml:"log"`
98-
Host Host `yaml:"host"`
99-
Monitor Monitor `yaml:"monitor"`
100-
Rapl Rapl `yaml:"rapl"`
101-
Exporter Exporter `yaml:"exporter"`
102-
Web Web `yaml:"web"`
103-
Debug Debug `yaml:"debug"`
104-
Dev Dev `yaml:"dev"` // WARN: do not expose dev settings as flags
96+
// Platform contains settings for platform power monitoring
97+
Platform struct {
98+
Redfish Redfish `yaml:"redfish"`
99+
}
100+
101+
// Redfish contains settings for Redfish BMC power monitoring
102+
Redfish struct {
103+
Enabled *bool `yaml:"enabled"`
104+
NodeID string `yaml:"nodeID"`
105+
ConfigFile string `yaml:"configFile"`
106+
}
105107

106-
Kube Kube `yaml:"kube"`
108+
// Experimental contains experimental features (no stability guarantees)
109+
Experimental struct {
110+
Platform Platform `yaml:"platform"`
111+
}
112+
113+
Config struct {
114+
Log Log `yaml:"log"`
115+
Host Host `yaml:"host"`
116+
Monitor Monitor `yaml:"monitor"`
117+
Rapl Rapl `yaml:"rapl"`
118+
Exporter Exporter `yaml:"exporter"`
119+
Web Web `yaml:"web"`
120+
Debug Debug `yaml:"debug"`
121+
Dev Dev `yaml:"dev"` // WARN: do not expose dev settings as flags
122+
Experimental Experimental `yaml:"experimental"`
123+
Kube Kube `yaml:"kube"`
107124
}
108125
)
109126

@@ -186,6 +203,11 @@ const (
186203
KubeConfigFlag = "kube.config"
187204
KubeNodeNameFlag = "kube.node-name"
188205

206+
// Experimental Platform flags
207+
ExperimentalPlatformRedfishEnabledFlag = "experimental.platform.redfish.enabled"
208+
ExperimentalPlatformRedfishNodeIDFlag = "experimental.platform.redfish.node-id"
209+
ExperimentalPlatformRedfishConfigFlag = "experimental.platform.redfish.config"
210+
189211
// WARN: dev settings shouldn't be exposed as flags as flags are intended for end users
190212
)
191213

@@ -228,6 +250,15 @@ func DefaultConfig() *Config {
228250
Web: Web{
229251
ListenAddresses: []string{":28282"},
230252
},
253+
Experimental: Experimental{
254+
Platform: Platform{
255+
Redfish: Redfish{
256+
Enabled: ptr.To(false),
257+
NodeID: "",
258+
ConfigFile: "",
259+
},
260+
},
261+
},
231262
Kube: Kube{
232263
Enabled: ptr.To(false),
233264
},
@@ -327,6 +358,11 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
327358
kubeconfig := app.Flag(KubeConfigFlag, "Path to a kubeconfig. Only required if out-of-cluster.").ExistingFile()
328359
nodeName := app.Flag(KubeNodeNameFlag, "Name of kubernetes node on which kepler is running.").String()
329360

361+
// experimental platform
362+
experimentalPlatformRedfishEnabled := app.Flag(ExperimentalPlatformRedfishEnabledFlag, "Enable experimental Redfish BMC power monitoring").Default("false").Bool()
363+
experimentalPlatformRedfishNodeID := app.Flag(ExperimentalPlatformRedfishNodeIDFlag, "Node identifier for experimental Redfish platform power monitoring").String()
364+
experimentalPlatformRedfishConfig := app.Flag(ExperimentalPlatformRedfishConfigFlag, "Path to experimental Redfish BMC configuration file").String()
365+
330366
return func(cfg *Config) error {
331367
// Logging settings
332368
if flagsSet[LogLevelFlag] {
@@ -389,6 +425,19 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
389425
cfg.Kube.Node = *nodeName
390426
}
391427

428+
// experimental platform settings
429+
if flagsSet[ExperimentalPlatformRedfishEnabledFlag] {
430+
cfg.Experimental.Platform.Redfish.Enabled = experimentalPlatformRedfishEnabled
431+
}
432+
433+
if flagsSet[ExperimentalPlatformRedfishNodeIDFlag] {
434+
cfg.Experimental.Platform.Redfish.NodeID = *experimentalPlatformRedfishNodeID
435+
}
436+
437+
if flagsSet[ExperimentalPlatformRedfishConfigFlag] {
438+
cfg.Experimental.Platform.Redfish.ConfigFile = *experimentalPlatformRedfishConfig
439+
}
440+
392441
cfg.sanitize()
393442
return cfg.Validate()
394443
}
@@ -412,6 +461,8 @@ func (c *Config) sanitize() {
412461
c.Exporter.Prometheus.DebugCollectors[i] = strings.TrimSpace(c.Exporter.Prometheus.DebugCollectors[i])
413462
}
414463
c.Kube.Config = strings.TrimSpace(c.Kube.Config)
464+
c.Experimental.Platform.Redfish.NodeID = strings.TrimSpace(c.Experimental.Platform.Redfish.NodeID)
465+
c.Experimental.Platform.Redfish.ConfigFile = strings.TrimSpace(c.Experimental.Platform.Redfish.ConfigFile)
415466
}
416467

417468
// Validate checks for configuration errors
@@ -500,6 +551,17 @@ func (c *Config) Validate(skips ...SkipValidation) error {
500551
}
501552
}
502553
}
554+
{ // Experimental Platform
555+
if ptr.Deref(c.Experimental.Platform.Redfish.Enabled, false) {
556+
if c.Experimental.Platform.Redfish.ConfigFile == "" {
557+
errs = append(errs, fmt.Sprintf("%s not supplied but %s set to true", ExperimentalPlatformRedfishConfigFlag, ExperimentalPlatformRedfishEnabledFlag))
558+
} else {
559+
if err := canReadFile(c.Experimental.Platform.Redfish.ConfigFile); err != nil {
560+
errs = append(errs, fmt.Sprintf("unreadable Redfish config file: %s: %s", c.Experimental.Platform.Redfish.ConfigFile, err.Error()))
561+
}
562+
}
563+
}
564+
}
503565

504566
if len(errs) > 0 {
505567
return fmt.Errorf("invalid configuration: %s", strings.Join(errs, ", "))

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ require (
1414
github.com/prometheus/client_model v0.6.1
1515
github.com/prometheus/exporter-toolkit v0.14.0
1616
github.com/prometheus/procfs v0.15.1
17+
github.com/stmcginnis/gofish v0.15.0
1718
github.com/stretchr/testify v1.10.0
1819
go.uber.org/zap v1.26.0
1920
golang.org/x/sync v0.12.0

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU
135135
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
136136
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
137137
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
138+
github.com/stmcginnis/gofish v0.15.0 h1:8TG41+lvJk/0Nf8CIIYErxbMlQUy80W0JFRZP3Ld82A=
139+
github.com/stmcginnis/gofish v0.15.0/go.mod h1:BLDSFTp8pDlf/xDbLZa+F7f7eW0E/CHCboggsu8CznI=
138140
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
139141
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
140142
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=

hack/config.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,11 @@ dev:
7070
fake-cpu-meter:
7171
enabled: false
7272
zones: [] # zones to be enabled, empty enables all default zones
73+
74+
# EXPERIMENTAL FEATURES - These features are experimental and may be unstable
75+
experimental:
76+
platform:
77+
redfish:
78+
enabled: false # Enable experimental Redfish BMC power monitoring
79+
configFile: "/etc/kepler/redfish.yaml" # Path to Redfish BMC configuration file
80+
nodeID: "" # Node identifier to use (overrides Kubernetes node name and hostname fallback)

0 commit comments

Comments
 (0)