
Commit 0511272

[XPU] avoid using xpu-smi to query device info (#72136)

* [XPU] avoid using xpu-smi to query device info
* refine code
* return MB when querying memory
* fix bugs
* add unittest

1 parent bbcdc76 commit 0511272

File tree

8 files changed: +131 -24 lines changed
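Taken together, the C++ helpers and the pybind registrations below let Python read XPU utilization and memory directly from the runtime instead of shelling out to xpu-smi. A minimal usage sketch, assuming a Paddle build with XPU support; device index 0 is only an example:

from paddle.base import core

if core.is_compiled_with_xpu() and core.get_xpu_device_count() > 0:
    dev_id = 0  # example device index
    util = core.get_xpu_device_utilization_rate(dev_id)  # utilization in percent
    total_mb = core.get_xpu_device_total_memory(dev_id)  # total device memory, MB
    used_mb = core.get_xpu_device_used_memory(dev_id)    # used device memory, MB
    print(f"XPU {dev_id}: {util}% busy, {used_mb}/{total_mb} MB used")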

paddle/fluid/pybind/pybind.cc (+4)

@@ -2940,6 +2940,10 @@ All parameter, weight, gradient are variables in Paddle.
 #ifdef PADDLE_WITH_XPU
   m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
   m.def("xpu_empty_cache", platform::EmptyCache);
+  m.def("get_xpu_device_utilization_rate",
+        platform::GetXPUDeviceUtilizationRate);
+  m.def("get_xpu_device_total_memory", platform::GetXPUDeviceTotalMemory);
+  m.def("get_xpu_device_used_memory", platform::GetXPUDeviceUsedMemory);
 #endif
 
   py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())

paddle/phi/backends/xpu/enforce_xpu.h (+11)

@@ -17,6 +17,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <xpu/xpuml.h>
 #endif
 
 #include "paddle/phi/backends/xpu/xpu_header.h"
@@ -128,6 +129,16 @@ DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess);
   }                                                                     \
   } while (0)
 
+// TODO(lijin23): support fine-grained error msg.
+#define PADDLE_ENFORCE_XPUML_SUCCESS(COND)                              \
+  do {                                                                  \
+    auto __cond__ = (COND);                                             \
+    PADDLE_ENFORCE_EQ(                                                  \
+        __cond__,                                                       \
+        XPUML_SUCCESS,                                                  \
+        common::errors::Fatal("XPUML Error, error_code=%d", __cond__)); \
+  } while (0)
+
 }  // namespace xpu
 }  // namespace backends
 }  // namespace phi

paddle/phi/backends/xpu/xpu_info.cc (+46)

@@ -50,6 +50,8 @@ class XPUContext;
 namespace backends {
 namespace xpu {
 
+static std::once_flag xpuml_init_flag;
+
 /**************************** Version Management **************************/
 
 //! Get the version of XPU Driver
@@ -221,6 +223,50 @@ void MemcpySyncD2D(void* dst,
 
 /**************************** Others **************************/
 
+int GetXPUDeviceUtilizationRate(int dev_id) {
+  std::call_once(xpuml_init_flag, xpumlInit);
+  if (dev_id == -1) {
+    dev_id = GetXPUCurrentDeviceId();
+  }
+  xpumlDevice_t dev_handle;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetHandleByIndex(dev_id, &dev_handle));
+  xpumlUtilization_t dev_util;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetUtilizationRates(dev_handle, &dev_util));
+  return dev_util.xpu;
+}
+
+int GetXPUDeviceTotalMemory(int dev_id) {
+  std::call_once(xpuml_init_flag, xpumlInit);
+  if (dev_id == -1) {
+    dev_id = GetXPUCurrentDeviceId();
+  }
+
+  xpumlDevice_t dev_handle;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetHandleByIndex(dev_id, &dev_handle));
+  xpumlMemory_t dev_mem_info;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetMemoryInfo(dev_handle, &dev_mem_info));
+  return dev_mem_info.totalGlobalMemory / 1024 / 1024;  // MB
+}
+
+int GetXPUDeviceUsedMemory(int dev_id) {
+  std::call_once(xpuml_init_flag, xpumlInit);
+  if (dev_id == -1) {
+    dev_id = GetXPUCurrentDeviceId();
+  }
+
+  xpumlDevice_t dev_handle;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetHandleByIndex(dev_id, &dev_handle));
+  xpumlMemory_t dev_mem_info;
+  PADDLE_ENFORCE_XPUML_SUCCESS(
+      xpumlDeviceGetMemoryInfo(dev_handle, &dev_mem_info));
+  return dev_mem_info.usedGlobalMemory / 1024 / 1024;  // MB
+}
+
 XPUVersion get_xpu_version(int dev_id) {
   if (dev_id == -1) {
     dev_id = GetXPUCurrentDeviceId();
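As the commit message notes, the memory queries return whole megabytes; the implementation divides the XPUML counters by 1024 twice, which implies the raw values are in bytes. A small illustrative check of that conversion (the 32 GiB figure is just an example):

# Illustrative only: mirrors the integer byte-to-MB conversion used above,
# assuming the XPUML memory counters are reported in bytes.
total_bytes = 32 * 1024**3             # e.g. a hypothetical 32 GiB device
total_mb = total_bytes // 1024 // 1024
assert total_mb == 32768               # what get_xpu_device_total_memory would return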

paddle/phi/backends/xpu/xpu_info.h (+3)

@@ -114,6 +114,9 @@ XPUVersion get_xpu_version(int dev_id);
 void set_xpu_debug_level(int level);
 
 int get_xpu_max_ptr_size(int dev_id);
+int GetXPUDeviceUtilizationRate(int dev_id);
+int GetXPUDeviceTotalMemory(int dev_id);
+int GetXPUDeviceUsedMemory(int dev_id);
 
 }  // namespace xpu
 }  // namespace backends

paddle/phi/core/platform/device/xpu/xpu_info.cc (+12)

@@ -119,6 +119,18 @@ void EmptyCache() {
   }
 }
 
+int GetXPUDeviceUtilizationRate(int dev_id) {
+  return phi::backends::xpu::GetXPUDeviceUtilizationRate(dev_id);
+}
+
+int GetXPUDeviceTotalMemory(int dev_id) {
+  return phi::backends::xpu::GetXPUDeviceTotalMemory(dev_id);
+}
+
+int GetXPUDeviceUsedMemory(int dev_id) {
+  return phi::backends::xpu::GetXPUDeviceUsedMemory(dev_id);
+}
+
 class RecordedXPUMallocHelper {
  private:
   explicit RecordedXPUMallocHelper(int dev_id, uint64_t limit_size = 0)

paddle/phi/core/platform/device/xpu/xpu_info.h (+4)

@@ -94,6 +94,10 @@ bool IsXPUMallocRecorded(int dev_id);
 
 void EmptyCache(void);
 
+int GetXPUDeviceUtilizationRate(int dev_id);
+int GetXPUDeviceTotalMemory(int dev_id);
+int GetXPUDeviceUsedMemory(int dev_id);
+
 }  // namespace platform
 }  // namespace paddle
 #endif

python/paddle/distributed/launch/utils/nvsmi.py (+8 -24)

@@ -20,6 +20,7 @@
 import time
 
 import paddle
+from paddle.base import core
 
 
 class Info:
@@ -150,40 +151,23 @@ def query_npu_smi(query=None, index=None, dtype=None):
 
 
 def query_xpu_smi(query=None, index=None, dtype=None):
-    if not has_xpu_smi():
-        return []
-
-    cmd = ["xpu-smi"]
-
+    ret = []
     if not isinstance(dtype, list) or len(dtype) != len(query):
        dtype = [str] * len(query)
 
-    output = subprocess.check_output(cmd, timeout=3)
-    lines = output.decode("utf-8").split(os.linesep)
-    ret = []
-    i = 0
-
-    for line in lines:
-        if not line:
-            continue
-        result = re.split(r',|/|\s+|\|', line)
-        length = len(result)
-        if length not in [23] or "XPU" in result:
-            continue
-        result = [item for item in result if item]
-        info = Info()
-        utilization_xpu = float(re.findall(r'\d+\.\d+|\d+', result[9])[0])
-        mem_total = float(re.findall(r'\d+\.\d+|\d+', result[8])[0])
-        mem_used = float(re.findall(r'\d+\.\d+|\d+', result[7])[0])
+    for dev_id in range(core.get_xpu_device_count()):
+        utilization_xpu = core.get_xpu_device_utilization_rate(dev_id)
+        mem_total = core.get_xpu_device_total_memory(dev_id)
+        mem_used = core.get_xpu_device_used_memory(dev_id)
         result = [
-            i,
+            dev_id,
             utilization_xpu,
             mem_total,
             mem_used,
             (mem_total - mem_used),
             time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
         ]
-        i += 1
+        info = Info()
         for k, v, d in zip(query, result, dtype):
             setattr(info, k.replace(".", "_"), d(v))
         ret.append(info)
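For context, a hedged sketch of how the rewritten helper could be invoked; the query field names and dtypes below are illustrative assumptions, not taken from this diff (the launch utility passes its own query list):

# Hypothetical call into the rewritten helper; field names are assumptions
# chosen to match the positional order built in query_xpu_smi.
from paddle.distributed.launch.utils.nvsmi import query_xpu_smi

infos = query_xpu_smi(
    query=["index", "utilization_xpu", "mem_total", "mem_used", "mem_free", "timestamp"],
    dtype=[int, int, int, int, int, str],
)
for info in infos:
    print(info.index, info.utilization_xpu, info.mem_used, info.mem_total)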
New unit test (+43)

@@ -0,0 +1,43 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle.device import core
+
+
+class TestQueryXPUDeviceInfo(unittest.TestCase):
+    def test_dygraph(self):
+        if core.is_compiled_with_xpu():
+            paddle.disable_static()
+            dev_num = core.get_xpu_device_count()
+            self.assertGreater(
+                dev_num,
+                0,
+                "The environment you run this test does not have any xpu device.",
+            )
+            for dev_id in range(dev_num):
+                self.assertGreaterEqual(
+                    core.get_xpu_device_utilization_rate(dev_id), 0
+                )
+                self.assertGreater(core.get_xpu_device_total_memory(dev_id), 0)
+                self.assertGreaterEqual(
+                    core.get_xpu_device_used_memory(dev_id), 0
+                )
+            paddle.enable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
