Skip to content

Commit bfb16b2

Browse files
authored
[shim] Fix macOS build (#2958)
1 parent 0fea032 commit bfb16b2

File tree

7 files changed

+117
-97
lines changed

7 files changed

+117
-97
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ jobs:
126126
runs-on: ${{ matrix.os }}
127127
strategy:
128128
matrix:
129-
os: [ubuntu-latest]
129+
os: [ubuntu-latest, macos-latest]
130130
steps:
131131
- uses: actions/checkout@v4
132132
- name: Set up Go

runner/cmd/shim/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error)
204204
}
205205

206206
var dcgmExporter *dcgm.DCGMExporter
207-
var dcgmWrapper *dcgm.DCGMWrapper
207+
var dcgmWrapper dcgm.DCGMWrapperInterface
208208

209209
if common.GetGpuVendor() == common.GpuVendorNvidia {
210210
dcgmExporterPath, err := dcgm.GetDCGMExporterExecPath(ctx)

runner/internal/shim/api/server.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,14 @@ type ShimServer struct {
2929
runner TaskRunner
3030

3131
dcgmExporter *dcgm.DCGMExporter
32-
dcgmWrapper *dcgm.DCGMWrapper
32+
dcgmWrapper dcgm.DCGMWrapperInterface
3333

3434
version string
3535
}
3636

3737
func NewShimServer(
3838
ctx context.Context, address string, version string,
39-
runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, dcgmWrapper *dcgm.DCGMWrapper,
39+
runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, dcgmWrapper dcgm.DCGMWrapperInterface,
4040
) *ShimServer {
4141
r := api.NewRouter()
4242
s := &ShimServer{

runner/internal/shim/dcgm/wrapper.go

Lines changed: 4 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,5 @@
11
package dcgm
22

3-
import (
4-
"errors"
5-
"fmt"
6-
"sync"
7-
8-
godcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm"
9-
)
10-
113
type HealthStatus string
124

135
const (
@@ -30,88 +22,8 @@ type Health struct {
3022
Incidents []HealthIncident `json:"incidents"`
3123
}
3224

33-
// DCGMWrapper is a wrapper around go-dcgm (which, in turn, is a wrapper around libdcgm.so)
34-
type DCGMWrapper struct {
35-
group godcgm.GroupHandle
36-
healthCheckEnabled bool
37-
38-
mu *sync.Mutex
39-
}
40-
41-
// NewDCGMWrapper initializes and starts DCGM in the specific mode:
42-
// - If address is empty, then libdcgm starts embedded hostengine within the current process.
43-
// This is the main mode.
44-
// - If address is not empty, then libdcgm connects to already running nv-hostengine service via TCP.
45-
// This mode is useful for debugging, e.g., one can start nv-hostengine via systemd and inject
46-
// errors via dcgmi:
47-
// - systemctl start nvidia-dcgm.service
48-
// - dcgmi test --inject --gpuid 0 -f 202 -v 99999
49-
//
50-
// Note: embedded hostengine is started in AUTO operation mode, which means that
51-
// the library handles periodic tasks by itself executing them in additional threads.
52-
func NewDCGMWrapper(address string) (*DCGMWrapper, error) {
53-
var err error
54-
if address == "" {
55-
_, err = godcgm.Init(godcgm.Embedded)
56-
} else {
57-
// "address is a unix socket filename (1) or a TCP/IP address (0)"
58-
_, err = godcgm.Init(godcgm.Standalone, address, "0")
59-
}
60-
if err != nil {
61-
return nil, fmt.Errorf("failed to initialize or start DCGM: %w", err)
62-
}
63-
return &DCGMWrapper{
64-
group: godcgm.GroupAllGPUs(),
65-
mu: new(sync.Mutex),
66-
}, nil
67-
}
68-
69-
func (w *DCGMWrapper) Shutdown() error {
70-
if err := godcgm.Shutdown(); err != nil {
71-
return fmt.Errorf("failed to shut down DCGM: %w", err)
72-
}
73-
return nil
74-
}
75-
76-
func (w *DCGMWrapper) EnableHealthChecks() error {
77-
w.mu.Lock()
78-
defer w.mu.Unlock()
79-
if w.healthCheckEnabled {
80-
return errors.New("health check system already enabled")
81-
}
82-
if err := godcgm.HealthSet(w.group, godcgm.DCGM_HEALTH_WATCH_ALL); err != nil {
83-
return fmt.Errorf("failed to configure health watches: %w", err)
84-
}
85-
// "On the first call, stateful information about all of the enabled watches within a group
86-
// is created but no error results are provided. On subsequent calls, any error information
87-
// will be returned."
88-
if _, err := godcgm.HealthCheck(w.group); err != nil {
89-
return fmt.Errorf("failed to initialize health watches state: %w", err)
90-
}
91-
w.healthCheckEnabled = true
92-
return nil
93-
}
94-
95-
func (w *DCGMWrapper) GetHealth() (Health, error) {
96-
health := Health{}
97-
if !w.healthCheckEnabled {
98-
return health, errors.New("health check system is not enabled")
99-
}
100-
response, err := godcgm.HealthCheck(w.group)
101-
if err != nil {
102-
return health, fmt.Errorf("failed to fetch health status: %w", err)
103-
}
104-
health.OverallHealth = int(response.OverallHealth)
105-
health.Incidents = make([]HealthIncident, 0, len(response.Incidents))
106-
for _, incident := range response.Incidents {
107-
health.Incidents = append(health.Incidents, HealthIncident{
108-
System: int(incident.System),
109-
Health: int(incident.Health),
110-
ErrorMessage: incident.Error.Message,
111-
ErrorCode: int(incident.Error.Code),
112-
EntityGroupID: int(incident.EntityInfo.EntityGroupId),
113-
EntityID: int(incident.EntityInfo.EntityId),
114-
})
115-
}
116-
return health, nil
25+
type DCGMWrapperInterface interface {
26+
Shutdown() error
27+
EnableHealthChecks() error
28+
GetHealth() (Health, error)
11729
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//go:build darwin
2+
3+
package dcgm
4+
5+
import "errors"
6+
7+
func NewDCGMWrapper(address string) (DCGMWrapperInterface, error) {
8+
return nil, errors.New("macOS is not supported")
9+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
//go:build linux
2+
3+
package dcgm
4+
5+
import (
6+
"errors"
7+
"fmt"
8+
"sync"
9+
10+
godcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm"
11+
)
12+
13+
// DCGMWrapper is a wrapper around go-dcgm (which, in turn, is a wrapper around libdcgm.so)
14+
type DCGMWrapper struct {
15+
group godcgm.GroupHandle
16+
healthCheckEnabled bool
17+
18+
mu *sync.Mutex
19+
}
20+
21+
// NewDCGMWrapper initializes and starts DCGM in the specific mode:
22+
// - If address is empty, then libdcgm starts embedded hostengine within the current process.
23+
// This is the main mode.
24+
// - If address is not empty, then libdcgm connects to already running nv-hostengine service via TCP.
25+
// This mode is useful for debugging, e.g., one can start nv-hostengine via systemd and inject
26+
// errors via dcgmi:
27+
// - systemctl start nvidia-dcgm.service
28+
// - dcgmi test --inject --gpuid 0 -f 202 -v 99999
29+
//
30+
// Note: embedded hostengine is started in AUTO operation mode, which means that
31+
// the library handles periodic tasks by itself executing them in additional threads.
32+
func NewDCGMWrapper(address string) (*DCGMWrapper, error) {
33+
var err error
34+
if address == "" {
35+
_, err = godcgm.Init(godcgm.Embedded)
36+
} else {
37+
// "address is a unix socket filename (1) or a TCP/IP address (0)"
38+
_, err = godcgm.Init(godcgm.Standalone, address, "0")
39+
}
40+
if err != nil {
41+
return nil, fmt.Errorf("failed to initialize or start DCGM: %w", err)
42+
}
43+
return &DCGMWrapper{
44+
group: godcgm.GroupAllGPUs(),
45+
mu: new(sync.Mutex),
46+
}, nil
47+
}
48+
49+
func (w *DCGMWrapper) Shutdown() error {
50+
if err := godcgm.Shutdown(); err != nil {
51+
return fmt.Errorf("failed to shut down DCGM: %w", err)
52+
}
53+
return nil
54+
}
55+
56+
func (w *DCGMWrapper) EnableHealthChecks() error {
57+
w.mu.Lock()
58+
defer w.mu.Unlock()
59+
if w.healthCheckEnabled {
60+
return errors.New("health check system already enabled")
61+
}
62+
if err := godcgm.HealthSet(w.group, godcgm.DCGM_HEALTH_WATCH_ALL); err != nil {
63+
return fmt.Errorf("failed to configure health watches: %w", err)
64+
}
65+
// "On the first call, stateful information about all of the enabled watches within a group
66+
// is created but no error results are provided. On subsequent calls, any error information
67+
// will be returned."
68+
if _, err := godcgm.HealthCheck(w.group); err != nil {
69+
return fmt.Errorf("failed to initialize health watches state: %w", err)
70+
}
71+
w.healthCheckEnabled = true
72+
return nil
73+
}
74+
75+
func (w *DCGMWrapper) GetHealth() (Health, error) {
76+
health := Health{}
77+
if !w.healthCheckEnabled {
78+
return health, errors.New("health check system is not enabled")
79+
}
80+
response, err := godcgm.HealthCheck(w.group)
81+
if err != nil {
82+
return health, fmt.Errorf("failed to fetch health status: %w", err)
83+
}
84+
health.OverallHealth = int(response.OverallHealth)
85+
health.Incidents = make([]HealthIncident, 0, len(response.Incidents))
86+
for _, incident := range response.Incidents {
87+
health.Incidents = append(health.Incidents, HealthIncident{
88+
System: int(incident.System),
89+
Health: int(incident.Health),
90+
ErrorMessage: incident.Error.Message,
91+
ErrorCode: int(incident.Error.Code),
92+
EntityGroupID: int(incident.EntityInfo.EntityGroupId),
93+
EntityID: int(incident.EntityInfo.EntityId),
94+
})
95+
}
96+
return health, nil
97+
}

runner/internal/shim/dcgm/wrapper_test.go renamed to runner/internal/shim/dcgm/wrapper_linux_test.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
//go:build linux
2+
13
package dcgm
24

35
import (
@@ -73,7 +75,7 @@ func getGpuID(t *testing.T) uint {
7375
return gpuIDs[0]
7476
}
7577

76-
func injectError(t *testing.T, gpuID uint, fieldID godcgm.Short, fieldType uint, value interface{}) {
78+
func injectError(t *testing.T, gpuID uint, fieldID godcgm.Short, fieldType uint, value any) {
7779
t.Helper()
7880
err := godcgm.InjectFieldValue(gpuID, fieldID, fieldType, 0, time.Now().UnixMicro(), value)
7981
require.NoError(t, err)

0 commit comments

Comments
 (0)