From 4b5e53ea91f5759c4c0bce66d5ef36a37ece6841 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 Aug 2025 10:33:40 +0000 Subject: [PATCH 1/5] Initial plan From 2d3a7e54875e346a92f1413712415a04ff025743 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 Aug 2025 10:47:00 +0000 Subject: [PATCH 2/5] Implement device ID validation for GPU operators Co-authored-by: ggorman <5394691+ggorman@users.noreply.github.com> --- devito/passes/iet/langbase.py | 43 ++++++++++++++++++++++++++-- tests/test_gpu_openacc.py | 25 +++++++++++++++++ tests/test_gpu_openmp.py | 53 ++++++++++++++++++++++++++++------- 3 files changed, 108 insertions(+), 13 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index a99fcdcb41..7ed8acf90d 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -11,7 +11,7 @@ from devito.mpi.distributed import MPICommObject from devito.passes import is_on_device from devito.passes.iet.engine import iet_pass -from devito.symbolics import Byref, CondNe, SizeOf +from devito.symbolics import Byref, CondNe, CondGe, SizeOf from devito.tools import as_list, is_integer, prod from devito.types import Symbol, QueueID, Wildcard @@ -426,9 +426,30 @@ def _make_setdevice_seq(iet, nodes=()): devicetype = as_list(self.langbb[self.platform]) deviceid = self.deviceid + # Add device validation check + ngpus, call_ngpus = self.langbb._get_num_devices(self.platform) + + # Create validation: if deviceid >= num_devices, print error and exit + validation_check = Conditional( + CondGe(deviceid, ngpus), + List(body=[ + Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' + 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' + 'and container GPU configuration.\\n"', + self.langbb['name'], deviceid, ngpus]), + Call('exit', [1]) + ]) + ) + + device_setup = List(body=[ + call_ngpus, + validation_check, + self.langbb['set-device']([deviceid] + devicetype) + ]) + return list(nodes) + [Conditional( CondNe(deviceid, -1), - self.langbb['set-device']([deviceid] + devicetype) + device_setup )] def _make_setdevice_mpi(iet, objcomm, nodes=()): @@ -441,7 +462,23 @@ def _make_setdevice_mpi(iet, objcomm, nodes=()): ngpus, call_ngpus = self.langbb._get_num_devices(self.platform) - osdd_then = self.langbb['set-device']([deviceid] + devicetype) + # Add device validation check for explicit device ID + validation_check = Conditional( + CondGe(deviceid, ngpus), + List(body=[ + Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' + 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' + 'and container GPU configuration.\\n"', + self.langbb['name'], deviceid, ngpus]), + Call('exit', [1]) + ]) + ) + + osdd_then = List(body=[ + call_ngpus, + validation_check, + self.langbb['set-device']([deviceid] + devicetype) + ]) osdd_else = self.langbb['set-device']([rank % ngpus] + devicetype) return list(nodes) + [Conditional( diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index 8c4813db0b..f85cb61dfa 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -200,6 +200,31 @@ def test_op_apply(self): assert np.all(np.array(u.data[0, :, :, :]) == time_steps) + def test_device_validation_error_message(self): + """Test that OpenACC device validation includes helpful error messages.""" + grid = Grid(shape=(3, 3, 3)) + + u = TimeFunction(name='u', grid=grid, dtype=np.int32) + + op = Operator(Eq(u.forward, u + 1), platform='nvidiaX', language='openacc') + + # Check that the generated code contains device validation with informative error + code = str(op) + + # Should contain device count check + assert 'acc_get_num_devices' in code, "Missing OpenACC device count check" + + # Should contain validation condition + assert 'deviceid >= ngpus' in code, "Missing OpenACC device ID validation condition" + + # Should contain helpful error message components + assert 'does not exist' in code, "Missing 'does not exist' error message" + assert 'CUDA_VISIBLE_DEVICES' in code, "Missing CUDA_VISIBLE_DEVICES guidance" + assert 'container GPU configuration' in code, "Missing container guidance" + + # Should contain exit call to prevent undefined behavior + assert 'exit(1)' in code, "Missing exit call on validation failure" + def iso_acoustic(self, opt): shape = (101, 101) extent = (1000, 1000) diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 7150d66eb2..4ea53fe2fd 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -20,8 +20,13 @@ def test_init_omp_env(self): op = Operator(Eq(u.forward, u.dx+1), language='openmp') - assert str(op.body.init[0].body[0]) ==\ - 'if (deviceid != -1)\n{\n omp_set_default_device(deviceid);\n}' + # With device validation, the generated code now includes validation logic + init_code = str(op.body.init[0].body[0]) + assert 'if (deviceid != -1)' in init_code + assert 'int ngpus = omp_get_num_devices()' in init_code + assert 'if (deviceid >= ngpus)' in init_code + assert 'does not exist' in init_code + assert 'omp_set_default_device(deviceid)' in init_code @pytest.mark.parallel(mode=1) def test_init_omp_env_w_mpi(self, mode): @@ -31,14 +36,42 @@ def test_init_omp_env_w_mpi(self, mode): op = Operator(Eq(u.forward, u.dx+1), language='openmp') - assert str(op.body.init[0].body[0]) ==\ - ('if (deviceid != -1)\n' - '{\n omp_set_default_device(deviceid);\n}\n' - 'else\n' - '{\n int rank = 0;\n' - ' MPI_Comm_rank(comm,&rank);\n' - ' int ngpus = omp_get_num_devices();\n' - ' omp_set_default_device((rank)%(ngpus));\n}') + # With device validation, the MPI case also includes validation for explicit deviceid + init_code = str(op.body.init[0].body[0]) + assert 'if (deviceid != -1)' in init_code + assert 'int ngpus = omp_get_num_devices()' in init_code + # For MPI case with explicit deviceid, should have validation + assert 'if (deviceid >= ngpus)' in init_code + assert 'does not exist' in init_code + # Should still have MPI rank-based assignment in else clause + assert 'int rank = 0' in init_code + assert 'MPI_Comm_rank(comm,&rank)' in init_code + assert '(rank)%(ngpus)' in init_code + + def test_device_validation_error_message(self): + """Test that device validation includes helpful error messages.""" + grid = Grid(shape=(3, 3, 3)) + + u = TimeFunction(name='u', grid=grid) + + op = Operator(Eq(u.forward, u.dx+1), language='openmp') + + # Check that the generated code contains device validation with informative error + code = str(op) + + # Should contain device count check + assert 'omp_get_num_devices()' in code, "Missing device count check" + + # Should contain validation condition + assert 'deviceid >= ngpus' in code, "Missing device ID validation condition" + + # Should contain helpful error message components + assert 'does not exist' in code, "Missing 'does not exist' error message" + assert 'CUDA_VISIBLE_DEVICES' in code, "Missing CUDA_VISIBLE_DEVICES guidance" + assert 'container GPU configuration' in code, "Missing container guidance" + + # Should contain exit call to prevent undefined behavior + assert 'exit(1)' in code, "Missing exit call on validation failure" def test_basic(self): grid = Grid(shape=(3, 3, 3)) From b03b63dd8d18570e21217181fd230bc6ce7c6792 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:16:05 +0000 Subject: [PATCH 3/5] Fix CondGe import error - use Ge from relational module instead Co-authored-by: ggorman <5394691+ggorman@users.noreply.github.com> --- devito/passes/iet/langbase.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 7ed8acf90d..36ed2da6a6 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -11,7 +11,8 @@ from devito.mpi.distributed import MPICommObject from devito.passes import is_on_device from devito.passes.iet.engine import iet_pass -from devito.symbolics import Byref, CondNe, CondGe, SizeOf +from devito.symbolics import Byref, CondNe, SizeOf +from devito.types.relational import Ge from devito.tools import as_list, is_integer, prod from devito.types import Symbol, QueueID, Wildcard @@ -431,7 +432,7 @@ def _make_setdevice_seq(iet, nodes=()): # Create validation: if deviceid >= num_devices, print error and exit validation_check = Conditional( - CondGe(deviceid, ngpus), + Ge(deviceid, ngpus), List(body=[ Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' @@ -464,7 +465,7 @@ def _make_setdevice_mpi(iet, objcomm, nodes=()): # Add device validation check for explicit device ID validation_check = Conditional( - CondGe(deviceid, ngpus), + Ge(deviceid, ngpus), List(body=[ Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' From 66277bddaf5d2ad13d91ae00799f3c165e18e972 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 Aug 2025 11:20:31 +0000 Subject: [PATCH 4/5] Implement _get_num_devices method in LangBB base class Co-authored-by: ggorman <5394691+ggorman@users.noreply.github.com> --- devito/passes/iet/langbase.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index 36ed2da6a6..a6ac9dc2bc 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -3,6 +3,7 @@ from abc import ABC import cgen as c +import numpy as np from devito.data import FULL from devito.ir import (DummyExpr, Call, Conditional, Expression, List, Prodder, @@ -57,11 +58,15 @@ class LangBB(metaclass=LangMeta): """ @classmethod - def _get_num_devices(cls): + def _get_num_devices(cls, platform): """ Get the number of accessible devices. + Returns a tuple of (ngpus_symbol, call_to_get_num_devices). """ - raise NotImplementedError + ngpus = Symbol(name='ngpus', dtype=np.int32) + devicetype = as_list(cls[platform]) + call_ngpus = cls['num-devices'](devicetype, retobj=ngpus) + return ngpus, call_ngpus @classmethod def _map_to(cls, f, imask=None, qid=None): From afbd708600759179402872d7b30394049daf2afc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:49:04 +0000 Subject: [PATCH 5/5] Simplify device validation implementation following Devito patterns Co-authored-by: ggorman <5394691+ggorman@users.noreply.github.com> --- devito/passes/iet/langbase.py | 31 +++++++++++++------------------ tests/test_gpu_openacc.py | 19 +++++++++---------- tests/test_gpu_openmp.py | 23 +++++++++++------------ 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/devito/passes/iet/langbase.py b/devito/passes/iet/langbase.py index a6ac9dc2bc..56672552a4 100644 --- a/devito/passes/iet/langbase.py +++ b/devito/passes/iet/langbase.py @@ -3,7 +3,6 @@ from abc import ABC import cgen as c -import numpy as np from devito.data import FULL from devito.ir import (DummyExpr, Call, Conditional, Expression, List, Prodder, @@ -13,7 +12,7 @@ from devito.passes import is_on_device from devito.passes.iet.engine import iet_pass from devito.symbolics import Byref, CondNe, SizeOf -from devito.types.relational import Ge +from sympy import Ge from devito.tools import as_list, is_integer, prod from devito.types import Symbol, QueueID, Wildcard @@ -63,7 +62,8 @@ def _get_num_devices(cls, platform): Get the number of accessible devices. Returns a tuple of (ngpus_symbol, call_to_get_num_devices). """ - ngpus = Symbol(name='ngpus', dtype=np.int32) + from devito.types import Symbol + ngpus = Symbol(name='ngpus', dtype='int32') devicetype = as_list(cls[platform]) call_ngpus = cls['num-devices'](devicetype, retobj=ngpus) return ngpus, call_ngpus @@ -434,22 +434,19 @@ def _make_setdevice_seq(iet, nodes=()): # Add device validation check ngpus, call_ngpus = self.langbb._get_num_devices(self.platform) - - # Create validation: if deviceid >= num_devices, print error and exit - validation_check = Conditional( + + validation = Conditional( Ge(deviceid, ngpus), List(body=[ - Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' - 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' - 'and container GPU configuration.\\n"', - self.langbb['name'], deviceid, ngpus]), + Call('printf', ['"%s: Error - device %d >= %d devices\\n"', + self.langbb['name'], deviceid, ngpus]), Call('exit', [1]) ]) ) device_setup = List(body=[ call_ngpus, - validation_check, + validation, self.langbb['set-device']([deviceid] + devicetype) ]) @@ -468,21 +465,19 @@ def _make_setdevice_mpi(iet, objcomm, nodes=()): ngpus, call_ngpus = self.langbb._get_num_devices(self.platform) - # Add device validation check for explicit device ID - validation_check = Conditional( + # Add device validation for explicit device ID + validation = Conditional( Ge(deviceid, ngpus), List(body=[ - Call('printf', ['"%s: Error - Requested device ID %d does not exist. ' - 'Only %d device(s) available. Check CUDA_VISIBLE_DEVICES ' - 'and container GPU configuration.\\n"', - self.langbb['name'], deviceid, ngpus]), + Call('printf', ['"%s: Error - device %d >= %d devices\\n"', + self.langbb['name'], deviceid, ngpus]), Call('exit', [1]) ]) ) osdd_then = List(body=[ call_ngpus, - validation_check, + validation, self.langbb['set-device']([deviceid] + devicetype) ]) osdd_else = self.langbb['set-device']([rank % ngpus] + devicetype) diff --git a/tests/test_gpu_openacc.py b/tests/test_gpu_openacc.py index f85cb61dfa..2b5fc7d6fa 100644 --- a/tests/test_gpu_openacc.py +++ b/tests/test_gpu_openacc.py @@ -208,20 +208,19 @@ def test_device_validation_error_message(self): op = Operator(Eq(u.forward, u + 1), platform='nvidiaX', language='openacc') - # Check that the generated code contains device validation with informative error + # Check that the generated code contains device validation code = str(op) - + # Should contain device count check assert 'acc_get_num_devices' in code, "Missing OpenACC device count check" - + # Should contain validation condition - assert 'deviceid >= ngpus' in code, "Missing OpenACC device ID validation condition" - - # Should contain helpful error message components - assert 'does not exist' in code, "Missing 'does not exist' error message" - assert 'CUDA_VISIBLE_DEVICES' in code, "Missing CUDA_VISIBLE_DEVICES guidance" - assert 'container GPU configuration' in code, "Missing container guidance" - + assert 'deviceid >= ngpus' in code, "Missing OpenACC device ID " + \ + "validation condition" + + # Should contain error message + assert 'Error - device' in code, "Missing error message" + # Should contain exit call to prevent undefined behavior assert 'exit(1)' in code, "Missing exit call on validation failure" diff --git a/tests/test_gpu_openmp.py b/tests/test_gpu_openmp.py index 4ea53fe2fd..3ed106859d 100644 --- a/tests/test_gpu_openmp.py +++ b/tests/test_gpu_openmp.py @@ -25,7 +25,7 @@ def test_init_omp_env(self): assert 'if (deviceid != -1)' in init_code assert 'int ngpus = omp_get_num_devices()' in init_code assert 'if (deviceid >= ngpus)' in init_code - assert 'does not exist' in init_code + assert 'Error - device' in init_code assert 'omp_set_default_device(deviceid)' in init_code @pytest.mark.parallel(mode=1) @@ -36,13 +36,14 @@ def test_init_omp_env_w_mpi(self, mode): op = Operator(Eq(u.forward, u.dx+1), language='openmp') - # With device validation, the MPI case also includes validation for explicit deviceid + # With device validation, the MPI case also includes validation for explicit + # deviceid init_code = str(op.body.init[0].body[0]) assert 'if (deviceid != -1)' in init_code assert 'int ngpus = omp_get_num_devices()' in init_code # For MPI case with explicit deviceid, should have validation assert 'if (deviceid >= ngpus)' in init_code - assert 'does not exist' in init_code + assert 'Error - device' in init_code # Should still have MPI rank-based assignment in else clause assert 'int rank = 0' in init_code assert 'MPI_Comm_rank(comm,&rank)' in init_code @@ -56,20 +57,18 @@ def test_device_validation_error_message(self): op = Operator(Eq(u.forward, u.dx+1), language='openmp') - # Check that the generated code contains device validation with informative error + # Check that the generated code contains device validation code = str(op) - + # Should contain device count check assert 'omp_get_num_devices()' in code, "Missing device count check" - + # Should contain validation condition assert 'deviceid >= ngpus' in code, "Missing device ID validation condition" - - # Should contain helpful error message components - assert 'does not exist' in code, "Missing 'does not exist' error message" - assert 'CUDA_VISIBLE_DEVICES' in code, "Missing CUDA_VISIBLE_DEVICES guidance" - assert 'container GPU configuration' in code, "Missing container guidance" - + + # Should contain error message + assert 'Error - device' in code, "Missing error message" + # Should contain exit call to prevent undefined behavior assert 'exit(1)' in code, "Missing exit call on validation failure"