Skip to content

Commit ca86c8e

Browse files
authored
Merge pull request #3512 from vkarak/bugfix/properly-cleanup-buildjobs
[bugfix] Properly cleanup build jobs on abort
2 parents f2abf19 + b06f86b commit ca86c8e

File tree

5 files changed

+28
-26
lines changed

5 files changed

+28
-26
lines changed

reframe/core/pipeline.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1762,12 +1762,14 @@ def _create_job(self, job_type, force_local=False, **job_opts):
17621762

17631763
if job_type == 'build':
17641764
script_name = 'rfm_build.sh'
1765+
job_name = f'rfm_build_{self.short_name}'
17651766
elif job_type == 'run':
17661767
script_name = 'rfm_job.sh'
1768+
job_name = f'rfm_{self.short_name}'
17671769

17681770
return Job.create(scheduler,
17691771
launcher,
1770-
name=f'rfm_{self.short_name}',
1772+
name=job_name,
17711773
script_filename=script_name,
17721774
workdir=self._stagedir,
17731775
sched_access=self._current_partition.access,

reframe/frontend/executors/__init__.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -391,8 +391,8 @@ def __exit__(this, exc_type, exc_value, traceback):
391391
# This practically ignores skipping during the cleanup phase
392392
self.skip()
393393
raise TaskExit from e
394-
except ABORT_REASONS:
395-
self.fail()
394+
except ABORT_REASONS as e:
395+
self.abort(e)
396396
raise
397397
except BaseException as e:
398398
self.fail()
@@ -523,6 +523,11 @@ def skip(self, exc_info=None):
523523
self._notify_listeners('on_task_skip')
524524

525525
def abort(self, cause=None):
526+
def _cancel_job(job):
527+
if job:
528+
with contextlib.suppress(JobNotStartedError):
529+
job.cancel()
530+
526531
if self.failed or self._aborted:
527532
return
528533

@@ -531,11 +536,8 @@ def abort(self, cause=None):
531536
exc.__cause__ = cause
532537
self._aborted = True
533538
try:
534-
if self.check.job:
535-
self.check.job.cancel()
536-
self.check.job.wait()
537-
except JobNotStartedError:
538-
self.fail((type(exc), exc, None), 'on_task_abort')
539+
_cancel_job(self.check.build_job)
540+
_cancel_job(self.check.job)
539541
except BaseException:
540542
self.fail()
541543
else:

unittests/test_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ def test_check_kbd_interrupt(run_reframe):
303303
)
304304
assert 'Traceback' not in stdout
305305
assert 'Traceback' not in stderr
306-
assert 'FAILED' in stdout
306+
assert 'ABORTED' in stdout
307307
assert returncode != 0
308308

309309

unittests/test_policies.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ def test_kbd_interrupt_within_test(make_runner, make_cases, common_exec_ctx):
253253
runner.runall(make_cases([make_kbd_check()]))
254254

255255
stats = runner.stats
256-
assert 1 == len(stats.failed())
256+
assert 1 == len(stats.aborted())
257257
assert_all_dead(runner)
258258

259259

@@ -318,7 +318,7 @@ def test_sigterm_handling(make_runner, make_cases, common_exec_ctx):
318318

319319
assert_all_dead(runner)
320320
assert runner.stats.num_cases() == 1
321-
assert len(runner.stats.failed()) == 1
321+
assert len(runner.stats.aborted()) == 1
322322

323323

324324
def test_reruns(make_runner, make_cases, common_exec_ctx):
@@ -576,16 +576,13 @@ def test_concurrency_none(make_async_runner, make_cases,
576576
def assert_interrupted_run(runner):
577577
assert 4 == runner.stats.num_cases()
578578
assert_runall(runner)
579-
assert 1 == len(runner.stats.failed())
580-
assert 3 == len(runner.stats.aborted())
579+
assert 4 == len(runner.stats.aborted())
581580
assert_all_dead(runner)
582581

583582
# Verify that failure reasons for the different tasks are correct
584583
for t in runner.stats.tasks():
585-
if isinstance(t.check, KeyboardInterruptCheck):
586-
assert t.exc_info[0] == KeyboardInterrupt
587-
else:
588-
assert t.exc_info[0] == AbortTaskError
584+
assert t.exc_info[0] == AbortTaskError
585+
assert isinstance(t.exc_info[1].__cause__, KeyboardInterrupt)
589586

590587

591588
def test_kbd_interrupt_in_wait_with_concurrency(

unittests/test_schedulers.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -777,36 +777,39 @@ def _read_pid(job, attempts=3):
777777
f'{attempts} attempts')
778778

779779

780-
@pytest.mark.flaky(reruns=3)
781780
def test_cancel_with_grace(minimal_job, scheduler, local_only):
782781
# This test emulates a spawned process that ignores the SIGTERM signal
783782
# and also spawns another process:
784783
#
785784
# reframe --- local job script --- sleep 5
786785
# (TERM IGN)
787786
#
788-
# We expect the job not to be cancelled immediately, since it ignores
789-
# the gracious signal we are sending it. However, we expect it to be
790-
# killed immediately after the grace period of 2 seconds expires.
787+
# We expect the job not to be cancelled immediately, since it ignores the
788+
# gracious signal we are sending it. However, we expect it to be killed
789+
# immediately after the grace period of 2 seconds expires. There is a
790+
# little tweak though. Since we do not know if the shell will be killed
791+
# first or the `sleep 5` process, we add an additional `sleep 1` at the
792+
# end to stall the script and make sure that it also get the `TERM`
793+
# signal. Otherwise,if the `sleep 5` is killed first, the script will
794+
# continue and may be fast enough to not get the signal as well.
791795
#
792796
# We also check that the additional spawned process is also killed.
793797
minimal_job.time_limit = '1m'
794798
minimal_job.scheduler.CANCEL_GRACE_PERIOD = 2
795799
prepare_job(minimal_job,
796800
command='sleep 5 &',
797801
pre_run=['trap -- "" TERM'],
798-
post_run=['echo $!', 'wait'],
802+
post_run=['echo $!', 'wait', 'sleep 1'],
799803
prepare_cmds=[''])
800804
submit_job(minimal_job)
801805

802806
# Stall a bit here to let the the spawned process start and install its
803807
# signal handler for SIGTERM
804-
time.sleep(1)
808+
time.sleep(.2)
805809

806810
sleep_pid = _read_pid(minimal_job)
807811
t_grace = time.time()
808812
minimal_job.cancel()
809-
time.sleep(0.1)
810813
minimal_job.wait()
811814
t_grace = time.time() - t_grace
812815

@@ -824,7 +827,6 @@ def test_cancel_with_grace(minimal_job, scheduler, local_only):
824827
assert_process_died(sleep_pid)
825828

826829

827-
# @pytest.mark.flaky(reruns=3)
828830
def test_cancel_term_ignore(minimal_job, scheduler, local_only):
829831
# This test emulates a descendant process of the spawned job that
830832
# ignores the SIGTERM signal:
@@ -855,7 +857,6 @@ def test_cancel_term_ignore(minimal_job, scheduler, local_only):
855857
minimal_job.wait()
856858
t_grace = time.time() - t_grace
857859

858-
# assert t_grace >= 2 and t_grace < 5
859860
assert minimal_job.state == 'FAILURE'
860861
assert minimal_job.signal == signal.SIGTERM
861862
assert_process_died(sleep_pid)

0 commit comments

Comments
 (0)