Skip to content

write files when submitted function fails #734

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0f05d9c
write files when submitted function fails
jan-janssen Jul 15, 2025
54211e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2025
174c1f6
get information from input_dict
jan-janssen Jul 15, 2025
1da66eb
Merge remote-tracking branch 'origin/write_error_file' into write_err…
jan-janssen Jul 15, 2025
4b4e7e5
Format black
pyiron-runner Jul 15, 2025
5b49d66
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2025
2b7fb92
write either error.output or error
jan-janssen Jul 15, 2025
ee2e6dd
Merge remote-tracking branch 'origin/write_error_file' into write_err…
jan-janssen Jul 15, 2025
5bd926c
Format black
pyiron-runner Jul 15, 2025
4f51280
write exception as string
jan-janssen Jul 15, 2025
0f1e8d3
shared function
jan-janssen Jul 16, 2025
a5f05a5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2025
87000cf
move function
jan-janssen Jul 16, 2025
f2a3abf
Format black
pyiron-runner Jul 16, 2025
cf1fef7
bug fix
jan-janssen Jul 16, 2025
8c15eb5
Merge remote-tracking branch 'origin/write_error_file' into write_err…
jan-janssen Jul 16, 2025
e6f7605
check error file is written
jan-janssen Jul 16, 2025
481ba85
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2025
f6e90d9
clean up
jan-janssen Jul 16, 2025
d67055a
Add more details in log
jan-janssen Jul 16, 2025
ac9ceea
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 16, 2025
db57726
fix formatting
jan-janssen Jul 16, 2025
f37eea1
fix info tests
jan-janssen Jul 16, 2025
e9ed692
set write_error_file in resource_dict
jan-janssen Jul 16, 2025
0696ef3
Merge remote-tracking branch 'origin/main' into write_error_file
jan-janssen Jul 16, 2025
da0f1d8
fix test
jan-janssen Jul 16, 2025
985851d
Merge remote-tracking branch 'origin/main' into write_error_file
jan-janssen Jul 16, 2025
db24c95
fix file location
jan-janssen Jul 16, 2025
c8500a3
Drop Python 3.9
jan-janssen Jul 16, 2025
eb85ed1
extend tests
jan-janssen Jul 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ jobs:
run: echo -e "channels:\n - conda-forge\n" > .condarc
- uses: conda-incubator/setup-miniconda@v3
with:
python-version: '3.9'
python-version: '3.10'
miniforge-version: latest
condarc-file: .condarc
environment-file: .ci_support/environment-old.yml
Expand Down
5 changes: 5 additions & 0 deletions executorlib/backend/cache_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import cloudpickle

from executorlib.standalone.error import backend_write_error_file
from executorlib.task_scheduler.file.backend import (
backend_load_file,
backend_write_file,
Expand Down Expand Up @@ -53,6 +54,10 @@ def main() -> None:
output={"error": error},
runtime=time.time() - time_start,
)
backend_write_error_file(
error=error,
apply_dict=apply_dict,
)
else:
if mpi_rank_zero:
backend_write_file(
Expand Down
5 changes: 5 additions & 0 deletions executorlib/backend/interactive_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import cloudpickle
import zmq

from executorlib.standalone.error import backend_write_error_file
from executorlib.standalone.interactive.backend import call_funct, parse_arguments
from executorlib.standalone.interactive.communication import (
interface_connect,
Expand Down Expand Up @@ -82,6 +83,10 @@ def main() -> None:
socket=socket,
result_dict={"error": error},
)
backend_write_error_file(
error=error,
apply_dict=input_dict,
)
else:
# Send output
if mpi_rank_zero:
Expand Down
6 changes: 6 additions & 0 deletions executorlib/backend/interactive_serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from os.path import abspath
from typing import Optional

from executorlib.standalone.error import backend_write_error_file
from executorlib.standalone.interactive.backend import call_funct, parse_arguments
from executorlib.standalone.interactive.communication import (
interface_connect,
Expand All @@ -17,6 +18,7 @@ def main(argument_lst: Optional[list[str]] = None):

Args:
argument_lst (Optional[List[str]]): List of command line arguments. If None, sys.argv will be used.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Returns:
None
Expand Down Expand Up @@ -58,6 +60,10 @@ def main(argument_lst: Optional[list[str]] = None):
socket=socket,
result_dict={"error": error},
)
backend_write_error_file(
error=error,
apply_dict=input_dict,
)
else:
# Send output
interface_send(socket=socket, result_dict={"result": output})
Expand Down
8 changes: 8 additions & 0 deletions executorlib/executor/flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class FluxJobExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -103,6 +104,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.FluxJobExecutor leverages either the message passing interface (MPI), the SLURM workload manager
Expand Down Expand Up @@ -148,6 +150,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -157,6 +160,7 @@ def __init__(
"cwd": None,
"openmpi_oversubscribe": False,
"slurm_cmd_args": [],
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down Expand Up @@ -248,6 +252,7 @@ class FluxClusterExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -285,6 +290,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.FluxClusterExecutor leverages either the message passing interface (MPI), the SLURM workload
Expand Down Expand Up @@ -327,6 +333,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -336,6 +343,7 @@ def __init__(
"cwd": None,
"openmpi_oversubscribe": False,
"slurm_cmd_args": [],
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down
8 changes: 8 additions & 0 deletions executorlib/executor/single.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class SingleNodeExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -93,6 +94,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.SingleNodeExecutor leverages either the message passing interface (MPI), the SLURM workload
Expand Down Expand Up @@ -134,6 +136,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -143,6 +146,7 @@ def __init__(
"cwd": None,
"openmpi_oversubscribe": False,
"slurm_cmd_args": [],
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down Expand Up @@ -220,6 +224,7 @@ class TestClusterExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -256,6 +261,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.api.TestClusterExecutor is designed to test the file based communication used in the
Expand Down Expand Up @@ -291,6 +297,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -299,6 +306,7 @@ def __init__(
"gpus_per_core": 0,
"cwd": None,
"openmpi_oversubscribe": False,
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down
8 changes: 8 additions & 0 deletions executorlib/executor/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class SlurmClusterExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -97,6 +98,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.SlurmClusterExecutor leverages either the message passing interface (MPI), the SLURM workload
Expand Down Expand Up @@ -139,6 +141,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -148,6 +151,7 @@ def __init__(
"cwd": None,
"openmpi_oversubscribe": False,
"slurm_cmd_args": [],
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down Expand Up @@ -244,6 +248,7 @@ class SlurmJobExecutor(BaseExecutor):
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

Examples:
```
Expand Down Expand Up @@ -280,6 +285,7 @@ def __init__(
plot_dependency_graph: bool = False,
plot_dependency_graph_filename: Optional[str] = None,
log_obj_size: bool = False,
write_error_file: bool = False,
):
"""
The executorlib.SlurmJobExecutor leverages either the message passing interface (MPI), the SLURM workload
Expand Down Expand Up @@ -325,6 +331,7 @@ def __init__(
debugging purposes and to get an overview of the specified dependencies.
plot_dependency_graph_filename (str): Name of the file to store the plotted graph in.
log_obj_size (bool): Enable debug mode which reports the size of the communicated objects.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails

"""
default_resource_dict: dict = {
Expand All @@ -334,6 +341,7 @@ def __init__(
"cwd": None,
"openmpi_oversubscribe": False,
"slurm_cmd_args": [],
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down
1 change: 1 addition & 0 deletions executorlib/standalone/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"error": "error",
"runtime": "runtime",
"queue_id": "queue_id",
"write_error_file": "write_error_file",
}


Expand Down
20 changes: 20 additions & 0 deletions executorlib/standalone/error.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import traceback


def backend_write_error_file(error: Exception, apply_dict: dict) -> None:
"""
Write an error to a file if specified in the apply_dict.

Args:
error (Exception): The error to be written.
apply_dict (dict): Dictionary containing additional parameters.

Returns:
None
"""
if apply_dict.get("write_error_file", False):
with open(apply_dict.get("error_file_name", "error.out"), "a") as f:
f.write("function: " + str(apply_dict["fn"]) + "\n")
f.write("args: " + str(apply_dict["args"]) + "\n")
f.write("kwargs: " + str(apply_dict["kwargs"]) + "\n")
traceback.print_exception(error, file=f)
5 changes: 5 additions & 0 deletions executorlib/task_scheduler/file/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import time
from typing import Any

from executorlib.standalone.error import backend_write_error_file
from executorlib.task_scheduler.file.hdf import dump, load
from executorlib.task_scheduler.file.shared import FutureItem

Expand Down Expand Up @@ -77,6 +78,10 @@ def backend_execute_task_in_file(file_name: str) -> None:
}
except Exception as error:
result = {"error": error}
backend_write_error_file(
error=error,
apply_dict=apply_dict,
)

backend_write_file(
file_name=file_name,
Expand Down
4 changes: 4 additions & 0 deletions executorlib/task_scheduler/file/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ def load(file_name: str) -> dict:
data_dict["kwargs"] = cloudpickle.loads(np.void(hdf["/input_kwargs"]))
else:
data_dict["kwargs"] = {}
if "write_error_file" in hdf:
data_dict["write_error_file"] = cloudpickle.loads(
np.void(hdf["/write_error_file"])
)
return data_dict


Expand Down
2 changes: 2 additions & 0 deletions executorlib/task_scheduler/file/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,13 +126,15 @@ def execute_tasks_h5(
)
cache_key = task_resource_dict.pop("cache_key", None)
cache_directory = os.path.abspath(task_resource_dict.pop("cache_directory"))
write_error_file = task_resource_dict.pop("write_error_file", False)
task_key, data_dict = serialize_funct_h5(
fn=task_dict["fn"],
fn_args=task_args,
fn_kwargs=task_kwargs,
resource_dict=task_resource_dict,
cache_key=cache_key,
)
data_dict["write_error_file"] = write_error_file
if task_key not in memory_dict:
if os.path.join(
cache_directory, task_key + "_o.h5"
Expand Down
5 changes: 5 additions & 0 deletions executorlib/task_scheduler/file/task_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
pysqa_config_directory: Optional[str] = None,
backend: Optional[str] = None,
disable_dependencies: bool = False,
write_error_file: bool = False,
):
"""
Initialize the FileExecutor.
Expand All @@ -50,12 +51,14 @@ def __init__(
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
backend (str, optional): name of the backend used to spawn tasks.
disable_dependencies (boolean): Disable resolving future objects during the submission.
write_error_file (boolean): Enable writing error.out files when the computation of a Python function fails
"""
super().__init__(max_cores=None)
default_resource_dict = {
"cores": 1,
"cwd": None,
"cache_directory": "executorlib_cache",
"write_error_file": write_error_file,
}
if resource_dict is None:
resource_dict = {}
Expand Down Expand Up @@ -95,6 +98,7 @@ def create_file_executor(
init_function: Optional[Callable] = None,
disable_dependencies: bool = False,
execute_function: Callable = execute_with_pysqa,
write_error_file: bool = False,
):
if block_allocation:
raise ValueError(
Expand Down Expand Up @@ -123,4 +127,5 @@ def create_file_executor(
disable_dependencies=disable_dependencies,
execute_function=execute_function,
terminate_function=terminate_function,
write_error_file=write_error_file,
)
Loading
Loading