diff --git a/.gitignore b/.gitignore index 1ca9eaef6..48bb9a4be 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ docsrc/source/contributing.md examples/checkpoints/ build docs/ - +_compatibility_data/ # mypy .mypy_cache diff --git a/noxfile.py b/noxfile.py new file mode 100644 index 000000000..0a883baeb --- /dev/null +++ b/noxfile.py @@ -0,0 +1,88 @@ +import nox +import argparse +from pathlib import Path +import os +import tempfile +import shutil + + +def git_rev_parse(session, commit): + print(f"Converting provided commit '{commit}' to Git revision...") + rev = session.run("git", "rev-parse", commit, external=True, silent=True).strip() + return rev + + +@nox.session +def save_and_load(session: nox.Session): + """Save models and outputs to disk and compare outputs between versions. + + This session installs the bayesflow version specified by the `commit` argument, and runs the test suite either in + "save" or in "load" mode. In save mode, results are stored to disk and a within-version load test is performed. + In load mode, the stored models and outputs are loaded from disk, and old and new outputs are compared. + This helps to detect breaking serialization between versions. + + Important: The test code from the current checkout, not from `commit`, is used. + """ + # parse the arguments + parser = argparse.ArgumentParser() + # add subparsers for the two different commands + subparsers = parser.add_subparsers(help="subcommand help", dest="mode") + # save command + parser_save = subparsers.add_parser("save") + parser_save.add_argument("commit", type=str) + # load command, additional "from" argument + parser_load = subparsers.add_parser("load") + parser_load.add_argument("--from", type=str, required=True, dest="from_commit") + parser_load.add_argument("commit", type=str) + + # keep unknown arguments, they will be forwarded to pytest below + args, unknownargs = parser.parse_known_args(session.posargs) + + if args.mode == "load": + if args.from_commit == ".": + from_commit = "local" + else: + from_commit = git_rev_parse(session, args.from_commit) + + from_path = Path("_compatibility_data").absolute() / from_commit + if not from_path.exists(): + raise FileNotFoundError( + f"The directory {from_path} does not exist, cannot load data.\n" + f"Please run 'nox -- save {args.from_commit}' to create it, and then rerun this command." + ) + + print(f"Data will be loaded from path {from_path}.") + + # install dependencies, currently the jax backend is used, but we could add a configuration option for this + repo_path = Path(os.curdir).absolute() + if args.commit == ".": + print("'.' 
provided, installing local state...") + if args.mode == "save": + print("Output will be saved to the alias 'local'") + commit = "local" + session.install(".[test]") + else: + commit = git_rev_parse(session, args.commit) + print("Installing specified revision...") + session.install(f"bayesflow[test] @ git+file://{str(repo_path)}@{commit}") + session.install("jax") + + with tempfile.TemporaryDirectory() as tmpdirname: + # launch in temporary directory, as the local bayesflow would overshadow the installed one + tmpdirname = Path(tmpdirname) + # pass mode and data path to pytest, required for correct save and load behavior + if args.mode == "load": + data_path = from_path + else: + data_path = Path("_compatibility_data").absolute() / commit + if data_path.exists(): + print(f"Removing existing data directory {data_path}...") + shutil.rmtree(data_path) + + cmd = ["pytest", "tests/test_compatibility", f"--mode={args.mode}", f"--data-path={data_path}"] + cmd += unknownargs + + print(f"Copying tests from working directory to temporary directory: {tmpdirname}") + shutil.copytree("tests", tmpdirname / "tests") + with session.chdir(tmpdirname): + session.run(*cmd, env={"KERAS_BACKEND": "jax"}) diff --git a/pyproject.toml b/pyproject.toml index f29938bba..564420ae0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ all = [ "sphinxcontrib-bibtex ~= 2.6", "snowballstemmer ~= 2.2.0", # test + "nox", "pytest", "pytest-cov", "pytest-rerunfailures", @@ -82,6 +83,7 @@ test = [ "nbconvert", "ipython", "ipykernel", + "nox", "pytest", "pytest-cov", "pytest-rerunfailures", diff --git a/tests/conftest.py b/tests/conftest.py index 560b7c59b..a32d71c7a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,11 @@ BACKENDS = ["jax", "numpy", "tensorflow", "torch"] +def pytest_addoption(parser): + parser.addoption("--mode", choices=["save", "load"]) + parser.addoption("--data-path", type=str) + + def pytest_runtest_setup(item): """Skips backends by test markers. 
Unmarked tests are treated as backend-agnostic""" backend = keras.backend.backend() @@ -41,42 +46,42 @@ def pytest_make_parametrize_id(config, val, argname): return f"{argname}={repr(val)}" -@pytest.fixture(params=[2], scope="session") +@pytest.fixture(params=[2]) def batch_size(request): return request.param -@pytest.fixture(params=[None, 2, 3], scope="session") +@pytest.fixture(params=[None, 2, 3]) def conditions_size(request): return request.param -@pytest.fixture(params=[1, 4], scope="session") +@pytest.fixture(params=[1, 4]) def summary_dim(request): return request.param -@pytest.fixture(params=["two_moons"], scope="session") +@pytest.fixture(params=["two_moons"]) def dataset(request): return request.getfixturevalue(request.param) -@pytest.fixture(params=[2, 3], scope="session") +@pytest.fixture(params=[2, 3]) def feature_size(request): return request.param -@pytest.fixture(scope="session") -def random_conditions(batch_size, conditions_size): +@pytest.fixture() +def random_conditions(random_seed, batch_size, conditions_size): if conditions_size is None: return None - return keras.random.normal((batch_size, conditions_size)) + return keras.random.normal((batch_size, conditions_size), seed=10) -@pytest.fixture(scope="session") -def random_samples(batch_size, feature_size): - return keras.random.normal((batch_size, feature_size)) +@pytest.fixture() +def random_samples(random_seed, batch_size, feature_size): + return keras.random.normal((batch_size, feature_size), seed=20) @pytest.fixture(scope="function", autouse=True) @@ -86,11 +91,11 @@ def random_seed(): return seed -@pytest.fixture(scope="session") -def random_set(batch_size, set_size, feature_size): - return keras.random.normal((batch_size, set_size, feature_size)) +@pytest.fixture() +def random_set(random_seed, batch_size, set_size, feature_size): + return keras.random.normal((batch_size, set_size, feature_size), seed=30) -@pytest.fixture(params=[2, 3], scope="session") +@pytest.fixture(params=[2, 3]) def set_size(request): return request.param diff --git a/tests/test_compatibility/conftest.py b/tests/test_compatibility/conftest.py new file mode 100644 index 000000000..ad347cf55 --- /dev/null +++ b/tests/test_compatibility/conftest.py @@ -0,0 +1,200 @@ +import pytest +from pathlib import Path + + +@pytest.fixture(autouse=True, scope="session") +def mode(request): + mode = request.config.getoption("--mode") + if not mode: + return "save" + return mode + + +@pytest.fixture(autouse=True, scope="session") +def data_dir(request, tmp_path_factory): + # read config option to detect "unset" scenario + mode = request.config.getoption("--mode") + path = request.config.getoption("--data-path") + if not mode: + # if mode is unset, save and load from a temporary directory + return Path(tmp_path_factory.mktemp("_compatibility_data")) + elif not path: + pytest.exit(reason="Please provide the --data-path argument for model saving/loading.") + elif mode == "load": + path = Path(path) + if not path.exists(): + pytest.exit(reason=f"Load path '{path}' does not exist. Please specify a valid load path", returncode=1) + return path + + +# reduce number of test configurations +@pytest.fixture(params=[None, 3]) +def conditions_size(request): + return request.param + + +@pytest.fixture(params=[1, 2]) +def summary_dim(request): + return request.param + + +@pytest.fixture(params=[4]) +def feature_size(request): + return request.param + + +# Generic fixtures for use as input to the tested classes. 
+# The classes to test are constructed in the respective subdirectories, to allow for more thorough configuration.
+@pytest.fixture(params=[None, "all"])
+def standardize(request):
+    return request.param
+
+
+@pytest.fixture()
+def adapter(request):
+    import bayesflow as bf
+
+    match request.param:
+        case "summary":
+            return bf.Adapter.create_default("parameters").rename("observables", "summary_variables")
+        case "direct":
+            return bf.Adapter.create_default("parameters").rename("observables", "inference_conditions")
+        case "default":
+            return bf.Adapter.create_default("parameters")
+        case "empty":
+            return bf.Adapter()
+        case None:
+            return None
+        case _:
+            raise ValueError(f"Invalid request parameter for adapter: {request.param}")
+
+
+@pytest.fixture(params=["coupling_flow", "flow_matching"])
+def inference_network(request):
+    match request.param:
+        case "coupling_flow":
+            from bayesflow.networks import CouplingFlow
+
+            return CouplingFlow(depth=2)
+
+        case "flow_matching":
+            from bayesflow.networks import FlowMatching
+
+            return FlowMatching(subnet_kwargs=dict(widths=(32, 32)), use_optimal_transport=False)
+
+        case None:
+            return None
+
+        case _:
+            raise ValueError(f"Invalid request parameter for inference_network: {request.param}")
+
+
+@pytest.fixture(params=["time_series_transformer", "fusion_transformer", "time_series_network", "custom"])
+def summary_network(request):
+    match request.param:
+        case "time_series_transformer":
+            from bayesflow.networks import TimeSeriesTransformer
+
+            return TimeSeriesTransformer(embed_dims=(8, 8), mlp_widths=(16, 8), mlp_depths=(1, 1))
+
+        case "fusion_transformer":
+            from bayesflow.networks import FusionTransformer
+
+            return FusionTransformer(
+                embed_dims=(8, 8), mlp_widths=(8, 16), mlp_depths=(2, 1), template_dim=8, bidirectional=False
+            )
+
+        case "time_series_network":
+            from bayesflow.networks import TimeSeriesNetwork
+
+            return TimeSeriesNetwork(filters=4, skip_steps=2)
+
+        case "deep_set":
+            from bayesflow.networks import DeepSet
+
+            return DeepSet(summary_dim=2, depth=1)
+
+        case "custom":
+            from bayesflow.networks import SummaryNetwork
+            from bayesflow.utils.serialization import serializable
+            import keras
+
+            @serializable("test", disable_module_check=True)
+            class Custom(SummaryNetwork):
+                def __init__(self, **kwargs):
+                    super().__init__(**kwargs)
+                    self.inner = keras.Sequential([keras.layers.LSTM(8), keras.layers.Dense(4)])
+
+                def call(self, x, **kwargs):
+                    return self.inner(x, training=kwargs.get("stage") == "training")
+
+            return Custom()
+
+        case "flatten":
+            # very simple summary network for fast training
+            from bayesflow.networks import SummaryNetwork
+            from bayesflow.utils.serialization import serializable
+            import keras
+
+            @serializable("test", disable_module_check=True)
+            class FlattenSummaryNetwork(SummaryNetwork):
+                def __init__(self, **kwargs):
+                    super().__init__(**kwargs)
+                    self.inner = keras.layers.Flatten()
+
+                def call(self, x, **kwargs):
+                    return self.inner(x, training=kwargs.get("stage") == "training")
+
+            return FlattenSummaryNetwork()
+
+        case "fusion_network":
+            from bayesflow.networks import FusionNetwork, DeepSet
+            import keras
+
+            return FusionNetwork({"a": DeepSet(), "b": keras.layers.Flatten()}, head=keras.layers.Dense(2))
+        case None:
+            return None
+        case _:
+            raise ValueError(f"Invalid request parameter for summary_network: {request.param}")
+
+
+@pytest.fixture(params=["sir", "fusion"])
+def simulator(request):
+    match request.param:
+        case "sir":
+            from bayesflow.simulators import SIR
+
+            return SIR()
+        case
"lotka_volterra": + from bayesflow.simulators import LotkaVolterra + + return LotkaVolterra() + + case "two_moons": + from bayesflow.simulators import TwoMoons + + return TwoMoons() + case "normal": + from tests.utils.normal_simulator import NormalSimulator + + return NormalSimulator() + case "fusion": + from bayesflow.simulators import Simulator + from bayesflow.types import Shape, Tensor + from bayesflow.utils.decorators import allow_batch_size + import numpy as np + + class FusionSimulator(Simulator): + @allow_batch_size + def sample(self, batch_shape: Shape, num_observations: int = 4) -> dict[str, Tensor]: + mean = np.random.normal(0.0, 0.1, size=batch_shape + (2,)) + noise = np.random.standard_normal(batch_shape + (num_observations, 2)) + + x = mean[:, None] + noise + + return dict(mean=mean, a=x, b=x) + + return FusionSimulator() + case None: + return None + case _: + raise ValueError(f"Invalid request parameter for simulator: {request.param}") diff --git a/tests/test_compatibility/test_adapters/__init__.py b/tests/test_compatibility/test_adapters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_adapters/conftest.py b/tests/test_compatibility/test_adapters/conftest.py new file mode 100644 index 000000000..bc8ad90d3 --- /dev/null +++ b/tests/test_compatibility/test_adapters/conftest.py @@ -0,0 +1,89 @@ +import pytest +import numpy as np + + +@pytest.fixture() +def default_adapter(): + from bayesflow import Adapter + + return Adapter.create_default(["x1", "x2"]) + + +@pytest.fixture() +def complete_adapter(): + from bayesflow.adapters import Adapter + import keras + + @keras.saving.register_keras_serializable("custom") + def serializable_fn(x): + return x + + return ( + Adapter() + .to_array() + .as_set(["s1", "s2"]) + .broadcast("t1", to="t2") + .as_time_series(["t1", "t2"]) + .convert_dtype("float64", "float32", exclude="o1") + .concatenate(["x1", "x2"], into="x") + .concatenate(["y1", "y2"], into="y") + .expand_dims(["z1"], axis=2) + .squeeze("z1", axis=2) + .log("p1") + .constrain("p2", lower=0) + .apply(include="p2", forward="exp", inverse="log") + .apply(include="p2", forward="log1p") + .apply_serializable(include="x", forward=serializable_fn, inverse=serializable_fn) + .scale("x", by=[-1, 2]) + .shift("x", by=2) + .split("key_to_split", into=["split_1", "split_2"]) + .drop("d1") + .one_hot("o1", 10) + .keep(["x", "y", "z1", "p1", "p2", "s1", "s2", "s3", "t1", "t2", "o1", "split_1", "split_2"]) + .rename("o1", "o2") + .random_subsample("s3", sample_size=33, axis=0) + .take("s3", indices=np.arange(0, 32), axis=0) + .group(["p1", "p2"], into="ps", prefix="p") + .ungroup("ps", prefix="p") + ) + + +@pytest.fixture(params=["default_adapter", "complete_adapter"]) +def adapter(request): + return request.getfixturevalue(request.param) + + +def get_data(rng): + return { + "x1": rng.standard_normal(size=(32, 1)), + "x2": rng.standard_normal(size=(32, 1)), + "y1": rng.standard_normal(size=(32, 2)), + "y2": rng.standard_normal(size=(32, 2)), + "z1": rng.standard_normal(size=(32, 2)), + "p1": rng.lognormal(size=(32, 2)), + "p2": rng.lognormal(size=(32, 2)), + "p3": rng.lognormal(size=(32, 2)), + "n1": 1 - rng.lognormal(size=(32, 2)), + "s1": rng.standard_normal(size=(32, 3, 2)), + "s2": rng.standard_normal(size=(32, 3, 2)), + "t1": np.zeros((3, 2)), + "t2": np.ones((32, 3, 2)), + "d1": rng.standard_normal(size=(32, 2)), + "d2": rng.standard_normal(size=(32, 2)), + "o1": rng.integers(0, 9, size=(32, 2)), + "s3": 
rng.standard_normal(size=(35, 2)), + "u1": rng.uniform(low=-1, high=2, size=(32, 1)), + "key_to_split": rng.standard_normal(size=(32, 10)), + } + + +@pytest.fixture +def data_1(): + rng = np.random.default_rng(seed=1) + return get_data(rng) + + +@pytest.fixture +def data_2(): + rng = np.random.default_rng(seed=2) + return get_data(rng) diff --git a/tests/test_compatibility/test_adapters/test_adapters.py b/tests/test_compatibility/test_adapters/test_adapters.py new file mode 100644 index 000000000..70cf91a31 --- /dev/null +++ b/tests/test_compatibility/test_adapters/test_adapters.py @@ -0,0 +1,40 @@ +import pytest +from utils import SaveLoadTest, load_from_config, save_config, load_path, dump_path +import numpy as np + + +class TestAdapter(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, adapter, data_1, data_2): + if mode == "save": + _ = adapter(data_1) + save_config(adapter, filepaths["model"]) + + output = self.evaluate(adapter, data_2) + dump_path(output, filepaths["output"]) + + adapter = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return adapter, output + + def evaluate(self, adapter, data): + adapted = adapter(data) + cycled = adapter(adapted, inverse=True) + return {"adapted": adapted, "cycled": cycled} + + def test_output(self, setup, data_2): + adapter, reference = setup + output = self.evaluate(adapter, data_2) + for k, v in reference.items(): + for name, variable in v.items(): + if name == "s3": + continue + np.testing.assert_allclose( + variable, output[k][name], err_msg=f"Values for key '{k}/{name} do not match." + ) diff --git a/tests/test_compatibility/test_approximators/__init__.py b/tests/test_compatibility/test_approximators/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/conftest.py b/tests/test_compatibility/test_approximators/conftest.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/__init__.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py new file mode 100644 index 000000000..1006b2c61 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_continuous_approximator/conftest.py @@ -0,0 +1,10 @@ +import pytest + + +@pytest.fixture +def approximator(adapter, inference_network, summary_network, standardize): + from bayesflow.approximators import ContinuousApproximator + + return ContinuousApproximator( + adapter=adapter, inference_network=inference_network, summary_network=summary_network, standardize=standardize + ) diff --git a/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py new file mode 100644 index 000000000..ddbc9c84b --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_continuous_approximator/test_continuous_approximator.py @@ -0,0 +1,48 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize("inference_network", ["coupling_flow"], 
indirect=True) +@pytest.mark.parametrize( + "summary_network,simulator,adapter,standardize", + [ + ["deep_set", "sir", "summary", ["summary_variables", "inference_variables"]], # use deep_set for speed + [None, "two_moons", "direct", "all"], + [None, "two_moons", "direct", None], + ], + indirect=True, +) +class TestContinuousApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, approximator, adapter, inference_network, summary_network, standardize, simulator): + if mode == "save": + approximator.compile("adamw", run_eagerly=False) + approximator.fit(simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.log_prob(data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximator/__init__.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py new file mode 100644 index 000000000..b11b752e8 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/conftest.py @@ -0,0 +1,75 @@ +import pytest +import numpy as np + + +@pytest.fixture +def simulator(): + from bayesflow import make_simulator + from bayesflow.simulators import ModelComparisonSimulator + + def context(batch_shape, n=None): + if n is None: + n = np.random.randint(2, 5) + return dict(n=n) + + def prior_null(): + return dict(mu=0.0) + + def prior_alternative(): + mu = np.random.normal(loc=0, scale=1) + return dict(mu=mu) + + def likelihood(n, mu): + x = np.random.normal(loc=mu, scale=1, size=n) + return dict(x=x) + + simulator_null = make_simulator([prior_null, likelihood]) + simulator_alternative = make_simulator([prior_alternative, likelihood]) + return ModelComparisonSimulator( + simulators=[simulator_null, simulator_alternative], + use_mixed_batches=True, + shared_simulator=context, + ) + + +@pytest.fixture +def adapter(): + from bayesflow import Adapter + + return ( + Adapter() + .sqrt("n") + .broadcast("n", to="x") + .as_set("x") + .rename("n", "classifier_conditions") + .rename("x", "summary_variables") + .drop("mu") + .convert_dtype("float64", "float32") + ) + + +@pytest.fixture +def classifier_network(): + from bayesflow.networks import MLP + + return MLP(widths=[32, 32]) + + +@pytest.fixture +def approximator(adapter, classifier_network, summary_network, simulator, standardize): + from bayesflow.approximators import ModelComparisonApproximator + + return ModelComparisonApproximator( + num_models=len(simulator.simulators), + classifier_network=classifier_network, + 
adapter=adapter, + summary_network=summary_network, + standardize=standardize, + ) + + +@pytest.fixture( + params=["all", None, "classifier_conditions", "summary_variables", ("classifier_conditions", "summary_variables")] +) +def standardize(request): + return request.param diff --git a/tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py new file mode 100644 index 000000000..e90f2fc36 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_model_comparison_approximator/test_model_comparison_approximator.py @@ -0,0 +1,41 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize("summary_network", ["deep_set"], indirect=True) +class TestModelComparisonApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, simulator, approximator, classifier_network, summary_network): + if mode == "save": + approximator.compile("adamw") + approximator.fit( + adapter=approximator.adapter, simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0 + ) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.predict(conditions=data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/__init__.py b/tests/test_compatibility/test_approximators/test_point_approximator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py new file mode 100644 index 000000000..a0d9583a2 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_point_approximator/conftest.py @@ -0,0 +1,49 @@ +import pytest + + +@pytest.fixture() +def batch_size(): + return 8 + + +@pytest.fixture(params=["single_parametric", "multiple_parametric"]) +def point_inference_network(request): + match request.param: + case "single_parametric": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import NormedDifferenceScore, QuantileScore, MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mean=NormedDifferenceScore(k=2), + quantiles=QuantileScore(q=[0.1, 0.5, 0.9]), + mvn=MultivariateNormalScore(), + ), + subnet="mlp", + subnet_kwargs=dict(widths=(32, 32)), + ) + + case "multiple_parametric": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mvn1=MultivariateNormalScore(), + mvn2=MultivariateNormalScore(), + ), + ) + case _: + raise ValueError(f"Invalid request parameter for 
point_inference_network: {request.param}") + + +@pytest.fixture +def approximator(adapter, point_inference_network, summary_network, standardize): + from bayesflow.approximators import PointApproximator + + return PointApproximator( + adapter=adapter, + inference_network=point_inference_network, + summary_network=summary_network, + standardize=standardize, + ) diff --git a/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py new file mode 100644 index 000000000..a45801e79 --- /dev/null +++ b/tests/test_compatibility/test_approximators/test_point_approximator/test_point_approximator.py @@ -0,0 +1,52 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize( + "summary_network,simulator,adapter,standardize", + [ + ["deep_set", "sir", "summary", "all"], # use deep_set for speed + [None, "two_moons", "direct", None], + ], + indirect=True, +) +class TestPointApproximator(SaveLoadTest): + filenames = { + "approximator": "approximator.keras", + "input": "input.pickle", + "output": "output.pickle", + } + + @pytest.fixture() + def setup( + self, filepaths, mode, approximator, adapter, point_inference_network, summary_network, standardize, simulator + ): + if mode == "save": + approximator.compile("adamw", run_eagerly=False) + approximator.fit(simulator=simulator, epochs=1, batch_size=8, num_batches=2, verbose=0) + keras.saving.save_model(approximator, filepaths["approximator"]) + + input = simulator.sample(4) + output = self.evaluate(approximator, input) + dump_path(input, filepaths["input"]) + dump_path(output, filepaths["output"]) + + approximator = keras.saving.load_model(filepaths["approximator"]) + input = load_path(filepaths["input"]) + output = load_path(filepaths["output"]) + + return approximator, input, output + + def evaluate(self, approximator, data): + return approximator.estimate(data) + + def test_output(self, setup): + approximator, input, reference = setup + output = self.evaluate(approximator, input) + + from keras.tree import flatten + + for ref, out in zip(flatten(reference), flatten(output)): + np.testing.assert_allclose(ref, out) diff --git a/tests/test_compatibility/test_distributions/__init__.py b/tests/test_compatibility/test_distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_distributions/conftest.py b/tests/test_compatibility/test_distributions/conftest.py new file mode 100644 index 000000000..4c77bcf27 --- /dev/null +++ b/tests/test_compatibility/test_distributions/conftest.py @@ -0,0 +1,48 @@ +import pytest + + +@pytest.fixture() +def diagonal_normal(): + from bayesflow.distributions import DiagonalNormal + + return DiagonalNormal(mean=1.0, std=2.0) + + +@pytest.fixture() +def diagonal_student_t(): + from bayesflow.distributions import DiagonalStudentT + + return DiagonalStudentT(df=10, loc=1.0, scale=2.0) + + +@pytest.fixture() +def mixture(): + from bayesflow.distributions import DiagonalNormal, DiagonalStudentT, Mixture + + return Mixture([DiagonalNormal(mean=1.0, std=2.0), DiagonalStudentT(df=25, mean=1.0, std=2.0)]) + + +@pytest.fixture(params=["diagonal_normal", "diagonal_student_t", "mixture"]) +def distribution(request): + name, kwargs = request.param + + match name: + case "diagonal_normal": + from bayesflow.distributions import DiagonalNormal + + return DiagonalNormal(mean=1.0, 
std=2.0, **kwargs) + case "diagonal_student_t": + from bayesflow.distributions import DiagonalStudentT + + return DiagonalStudentT(df=10, loc=1.0, scale=2.0, **kwargs) + case "mixture": + from bayesflow.distributions import DiagonalNormal, DiagonalStudentT, Mixture + + return Mixture( + [ + DiagonalNormal(mean=1.0, std=2.0, trainable_parameters=True), + DiagonalStudentT(df=25, mean=1.0, std=2.0), + ], + **kwargs, + ) + return request.getfixturevalue(request.param) diff --git a/tests/test_compatibility/test_distributions/test_distributions.py b/tests/test_compatibility/test_distributions/test_distributions.py new file mode 100644 index 000000000..c53046e93 --- /dev/null +++ b/tests/test_compatibility/test_distributions/test_distributions.py @@ -0,0 +1,82 @@ +from utils import SaveLoadTest +import numpy as np +import keras +import pytest + + +@pytest.mark.parametrize( + "distribution", + [ + ["diagonal_normal", dict(trainable_parameters=False)], + ["diagonal_normal", dict(trainable_parameters=True)], + ["diagonal_student_t", dict(trainable_parameters=False)], + ["diagonal_student_t", dict(trainable_parameters=True)], + ["mixture", dict(trainable_mixture=False)], + ["mixture", dict(trainable_mixture=True)], + ], + indirect=True, +) +class TestDistribution(SaveLoadTest): + filenames = { + "model": "model.keras", + "output": "output.npy", + } + + @pytest.fixture + def setup(self, filepaths, mode, distribution, random_samples): + from bayesflow.utils.serialization import serialize, deserialize + + class DummyModel(keras.Model): + def __init__(self, distribution, **kwargs): + super().__init__(**kwargs) + self.distribution = distribution + + def call(self, inputs): + return self.distribution.log_prob(inputs) + + def get_config(self): + base_config = super().get_config() + config = {"distribution": self.distribution} + return base_config | serialize(config) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**deserialize(config, custom_objects=custom_objects)) + + if mode == "save": + distribution.build(keras.ops.shape(random_samples)) + + model = DummyModel(distribution) + model.compile(loss=keras.losses.MeanSquaredError()) + fit_kwargs = dict( + x=random_samples, + y=keras.ops.ones(keras.ops.shape(random_samples)[:-1]), + batch_size=keras.ops.shape(random_samples)[0], + epochs=1, + ) + if keras.backend.backend() == "torch": + import torch + + with torch.enable_grad(): + model.fit(**fit_kwargs) + else: + model.fit(**fit_kwargs) + + model.save(filepaths["model"]) + + output = self.evaluate(model.distribution, random_samples) + np.save(filepaths["output"], output, allow_pickle=False) + + distribution = keras.saving.load_model( + filepaths["model"], custom_objects={"DummyModel": DummyModel} + ).distribution + output = np.load(filepaths["output"]) + + return distribution, output + + def evaluate(self, distribution, random_samples): + return keras.ops.convert_to_numpy(distribution.log_prob(random_samples)) + + def test_output(self, setup, random_samples): + distribution, output = setup + np.testing.assert_allclose(self.evaluate(distribution, random_samples), output) diff --git a/tests/test_compatibility/test_links/__init__.py b/tests/test_compatibility/test_links/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_links/conftest.py b/tests/test_compatibility/test_links/conftest.py new file mode 100644 index 000000000..883393db3 --- /dev/null +++ b/tests/test_compatibility/test_links/conftest.py @@ -0,0 +1,32 @@ +import 
keras +import pytest + + +@pytest.fixture() +def batch_size(): + return 16 + + +@pytest.fixture() +def feature_size(): + return 10 + + +@pytest.fixture +def link(request): + name, kwargs = request.param + match name: + case "ordered": + from bayesflow.links import Ordered + + return Ordered(**kwargs) + case "ordered_quantiles": + from bayesflow.links import OrderedQuantiles + + return OrderedQuantiles(**kwargs) + case "cholesky_factor": + from bayesflow.links import CholeskyFactor + + return CholeskyFactor(**kwargs) + case "linear": + return keras.layers.Activation("linear", **kwargs) diff --git a/tests/test_compatibility/test_links/test_links.py b/tests/test_compatibility/test_links/test_links.py new file mode 100644 index 000000000..9343b048d --- /dev/null +++ b/tests/test_compatibility/test_links/test_links.py @@ -0,0 +1,44 @@ +import pytest +from utils import save_config, load_from_config, dump_path, load_path +from utils import SaveLoadTest +import numpy as np + + +@pytest.mark.parametrize( + "link", + [ + ["ordered", dict(axis=1, anchor_index=2)], + ["ordered_quantiles", dict()], + ["cholesky_factor", dict()], + ["linear", dict()], + ], + indirect=True, +) +class TestLink(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, link, random_samples): + if mode == "save": + _ = link(random_samples) + save_config(link, filepaths["model"]) + + output = self.evaluate(link, random_samples) + dump_path(output, filepaths["output"]) + + link = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return link, output + + def evaluate(self, link, data): + return link(data) + + def test_output(self, setup, random_samples): + link, reference = setup + print(reference) + output = self.evaluate(link, random_samples) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_metrics/__init__.py b/tests/test_compatibility/test_metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_metrics/conftest.py b/tests/test_compatibility/test_metrics/conftest.py new file mode 100644 index 000000000..9de8610cc --- /dev/null +++ b/tests/test_compatibility/test_metrics/conftest.py @@ -0,0 +1,28 @@ +import pytest +import keras + + +@pytest.fixture() +def metric(request): + name, kwargs = request.param + + match name: + case "root_mean_squared_error": + from bayesflow.metrics import RootMeanSquaredError + + return RootMeanSquaredError(**kwargs) + case "maximum_mean_discrepancy": + from bayesflow.metrics import MaximumMeanDiscrepancy + + return MaximumMeanDiscrepancy(**kwargs) + raise ValueError(f"unknown name: {name}") + + +@pytest.fixture +def samples_1(): + return keras.random.normal((2, 3), seed=1) + + +@pytest.fixture +def samples_2(): + return keras.random.normal((2, 3), seed=2) diff --git a/tests/test_compatibility/test_metrics/test_metrics.py b/tests/test_compatibility/test_metrics/test_metrics.py new file mode 100644 index 000000000..ce2d7267d --- /dev/null +++ b/tests/test_compatibility/test_metrics/test_metrics.py @@ -0,0 +1,41 @@ +import pytest +from utils import SaveLoadTest, load_from_config, save_config +import numpy as np +import keras + + +@pytest.mark.parametrize( + "metric", + [ + ["root_mean_squared_error", dict(normalize=True, dtype="float32")], + ["root_mean_squared_error", dict(normalize=False)], + ["maximum_mean_discrepancy", dict(kernel="gaussian", unbiased=True, dtype="float32")], + 
["maximum_mean_discrepancy", dict(kernel="inverse_multiquadratic", unbiased=False)], + ], + indirect=True, +) +class TestMetric(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.npy", + } + + @pytest.fixture + def setup(self, filepaths, mode, metric, samples_1, samples_2): + if mode == "save": + save_config(metric, filepaths["model"]) + + output = self.evaluate(metric, samples_1, samples_2) + np.save(filepaths["output"], output, allow_pickle=False) + + metric = load_from_config(filepaths["model"]) + output = np.load(filepaths["output"]) + + return metric, output + + def evaluate(self, metric, samples_1, samples_2): + return keras.ops.convert_to_numpy(metric(samples_1, samples_2)) + + def test_output(self, setup, samples_1, samples_2): + metric, output = setup + np.testing.assert_allclose(self.evaluate(metric, samples_1, samples_2), output) diff --git a/tests/test_compatibility/test_networks/__init__.py b/tests/test_compatibility/test_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/conftest.py b/tests/test_compatibility/test_networks/conftest.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_inference_networks/__init__.py b/tests/test_compatibility/test_networks/test_inference_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_inference_networks/conftest.py b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py new file mode 100644 index 000000000..4d0126da2 --- /dev/null +++ b/tests/test_compatibility/test_networks/test_inference_networks/conftest.py @@ -0,0 +1,39 @@ +import pytest + +from bayesflow.networks import MLP + + +@pytest.fixture +def inference_network(request): + name, kwargs = request.param + from bayesflow.utils.dispatch import find_inference_network + + try: + return find_inference_network(name, **kwargs) + except ValueError: + # network not yet in find_inference_network + pass + match name: + case "diffusion_model": + from bayesflow.experimental import DiffusionModel + + return DiffusionModel(**kwargs) + case "free_form_flow": + from bayesflow.experimental import FreeFormFlow + + return FreeFormFlow(**kwargs) + case "point_inference_network": + from bayesflow.networks import PointInferenceNetwork + from bayesflow.scores import MeanScore, MedianScore, QuantileScore, MultivariateNormalScore + + return PointInferenceNetwork( + scores=dict( + mean=MeanScore(subnets=dict(value=MLP([16, 8]))), + median=MedianScore(subnets=dict(value=MLP([16, 8]))), + quantiles=QuantileScore(subnets=dict(value=MLP([16, 8]))), + mvn=MultivariateNormalScore(subnets=dict(mean=MLP([16, 8]), covariance=MLP([16, 8]))), + ), + **kwargs, + ) + case _: + raise ValueError(f"Invalid request parameter for inference_network: {name}") diff --git a/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py new file mode 100644 index 000000000..99d8a5be6 --- /dev/null +++ b/tests/test_compatibility/test_networks/test_inference_networks/test_inference_networks.py @@ -0,0 +1,111 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize( + "inference_network", + [ + [ + "coupling_flow", + dict( + depth=2, + subnet="mlp", + subnet_kwargs=dict(widths=[8, 8]), + 
transform="affine", + transform_kwargs=dict(clamp=1.8), + ), + ], + [ + "coupling_flow", + dict( + depth=2, + subnet="mlp", + subnet_kwargs=dict(widths=[8, 8]), + transform="spline", + transform_kwargs=dict(bins=8), + ), + ], + ["flow_matching", dict(integrate_kwargs={"method": "rk45", "steps": 10})], + ["consistency_model", dict(total_steps=10)], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="F", integrate_kwargs={"method": "rk45", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="velocity", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="edm", prediction_type="noise", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict(noise_schedule="cosine", prediction_type="F", integrate_kwargs={"method": "euler", "steps": 10}), + ], + [ + "diffusion_model", + dict( + noise_schedule="cosine", prediction_type="velocity", integrate_kwargs={"method": "euler", "steps": 10} + ), + ], + [ + "free_form_flow", + dict(encoder_subnet_kwargs={"widths": [16, 16]}, decoder_subnet_kwargs={"widths": [16, 16]}), + ], + ["point_inference_network", dict(subnet_kwargs={"widths": [8, 8]})], + ], + indirect=True, +) +class TestInferenceNetwork(SaveLoadTest): + filenames = { + "model": "model.keras", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, inference_network, random_samples, random_conditions): + if mode == "save": + xz_shape = keras.ops.shape(random_samples) + conditions_shape = keras.ops.shape(random_conditions) if random_conditions is not None else None + inference_network.build(xz_shape, conditions_shape) + + _ = inference_network.compute_metrics(random_samples, conditions=random_conditions) + keras.saving.save_model(inference_network, filepaths["model"]) + output = self.evaluate(inference_network, random_samples, random_conditions) + + dump_path(output, filepaths["output"]) + + inference_network = keras.saving.load_model(filepaths["model"]) + output = load_path(filepaths["output"]) + + return inference_network, random_samples, random_conditions, output + + def evaluate(self, inference_network, samples, conditions): + import bayesflow as bf + + if isinstance(inference_network, bf.networks.ConsistencyModel): + # not invertible, but inverse with steps=1 is deterministic + return keras.tree.map_structure( + keras.ops.convert_to_numpy, inference_network._inverse(samples, conditions, steps=1) + ) + if isinstance(inference_network, bf.networks.PointInferenceNetwork) and conditions is None: + pytest.skip("PointInferenceNetwork requires condition") + try: + return keras.tree.map_structure( + keras.ops.convert_to_numpy, inference_network.log_prob(samples, conditions=conditions) + ) + except NotImplementedError: + pytest.skip("log_prob not available") + + def test_output(self, setup): + approximator, samples, conditions, reference = setup + output = self.evaluate(approximator, samples, conditions) + print(reference) + from keras.tree import flatten + + for ref, out in zip(flatten(reference), flatten(output)): + print(ref, out) + np.testing.assert_allclose(ref, out) diff --git a/tests/test_compatibility/test_networks/test_summary_networks/__init__.py b/tests/test_compatibility/test_networks/test_summary_networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_networks/test_summary_networks/conftest.py 
b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py new file mode 100644 index 000000000..078486c59 --- /dev/null +++ b/tests/test_compatibility/test_networks/test_summary_networks/conftest.py @@ -0,0 +1,62 @@ +import pytest + + +@pytest.fixture(scope="function") +def time_series_network(summary_dim): + from bayesflow.networks import TimeSeriesNetwork + + return TimeSeriesNetwork(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def time_series_transformer(summary_dim): + from bayesflow.networks import TimeSeriesTransformer + + return TimeSeriesTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def fusion_transformer(summary_dim): + from bayesflow.networks import FusionTransformer + + return FusionTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def set_transformer(summary_dim): + from bayesflow.networks import SetTransformer + + return SetTransformer(summary_dim=summary_dim) + + +@pytest.fixture(scope="function") +def deep_set(summary_dim): + from bayesflow.networks import DeepSet + + return DeepSet(summary_dim=summary_dim) + + +@pytest.fixture( + params=[ + "time_series_network", + "time_series_transformer", + "fusion_transformer", + "set_transformer", + "deep_set", + ], + scope="function", +) +def summary_network(request, summary_dim): + from bayesflow.utils.dispatch import find_summary_network + + name, kwargs = request.param + print(name) + try: + return find_summary_network(name, summary_dim=summary_dim, **kwargs) + except ValueError: + # network not in dispatch + pass + + match name: + case _: + raise ValueError(f"Invalid request parameter for summary_network: {name}") diff --git a/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py new file mode 100644 index 000000000..05aec6793 --- /dev/null +++ b/tests/test_compatibility/test_networks/test_summary_networks/test_summary_networks.py @@ -0,0 +1,48 @@ +import pytest +from utils import SaveLoadTest, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize( + "summary_network", + [ + ["time_series_network", dict()], + ["time_series_transformer", dict()], + ["fusion_transformer", dict()], + ["set_transformer", dict()], + ["deep_set", dict()], + ], + indirect=True, +) +class TestSummaryNetwork(SaveLoadTest): + filenames = { + "model": "model.keras", + "output": "output.pickle", + } + + @pytest.fixture() + def setup(self, filepaths, mode, summary_network, summary_dim, random_set): + if mode == "save": + shape = keras.ops.shape(random_set) + summary_network.build(shape) + + _ = summary_network(random_set) + keras.saving.save_model(summary_network, filepaths["model"]) + output = self.evaluate(summary_network, random_set) + + dump_path(output, filepaths["output"]) + + summary_network = keras.saving.load_model(filepaths["model"]) + output = load_path(filepaths["output"]) + + return summary_network, random_set, output + + def evaluate(self, summary_network, data): + return keras.ops.convert_to_numpy(summary_network(data)) + + def test_output(self, setup): + approximator, data, reference = setup + output = self.evaluate(approximator, data) + + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/test_scores/__init__.py b/tests/test_compatibility/test_scores/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_compatibility/test_scores/conftest.py 
b/tests/test_compatibility/test_scores/conftest.py new file mode 100644 index 000000000..db3800f98 --- /dev/null +++ b/tests/test_compatibility/test_scores/conftest.py @@ -0,0 +1,31 @@ +import pytest + + +@pytest.fixture( + params=["median_score", "mean_score", "normed_diff_score", "quantile_score", "multivariate_normal_score"], +) +def scoring_rule(request): + name, kwargs = request.param + match name: + case "median_score": + from bayesflow.scores import MedianScore + + return MedianScore(**kwargs) + case "mean_score": + from bayesflow.scores import MeanScore + + return MeanScore(**kwargs) + case "normed_diff_score": + from bayesflow.scores import NormedDifferenceScore + + return NormedDifferenceScore(**kwargs) + case "quantile_score": + from bayesflow.scores import QuantileScore + + return QuantileScore(**kwargs) + case "multivariate_normal_score": + from bayesflow.scores import MultivariateNormalScore + + return MultivariateNormalScore(**kwargs) + case _: + raise ValueError(f"Invalid request parameter for scoring_rule: {name}") diff --git a/tests/test_compatibility/test_scores/test_scores.py b/tests/test_compatibility/test_scores/test_scores.py new file mode 100644 index 000000000..11be69c49 --- /dev/null +++ b/tests/test_compatibility/test_scores/test_scores.py @@ -0,0 +1,56 @@ +import pytest +from utils import SaveLoadTest, save_config, load_from_config, dump_path, load_path +import numpy as np +import keras + + +@pytest.mark.parametrize( + "scoring_rule", + [ + ["median_score", {}], + ["mean_score", {}], + ["normed_diff_score", dict(k=3)], + ["quantile_score", {}], + ["multivariate_normal_score", {}], + ], + indirect=True, +) +class TestScore(SaveLoadTest): + filenames = { + "model": "model.pickle", + "output": "output.pickle", + } + + @pytest.fixture + def setup(self, filepaths, mode, scoring_rule, random_samples, request): + if mode == "save": + save_config(scoring_rule, filepaths["model"]) + + output = self.evaluate(scoring_rule, random_samples) + dump_path(output, filepaths["output"]) + + scoring_rule = load_from_config(filepaths["model"]) + output = load_path(filepaths["output"]) + + return scoring_rule, output + + def evaluate(self, scoring_rule, data): + # Using random data also as targets for the purpose of this test. 
+ head_shapes = scoring_rule.get_head_shapes_from_target_shape(data.shape) + estimates = {} + for key, output_shape in head_shapes.items(): + link = scoring_rule.get_link(key) + if hasattr(link, "compute_input_shape"): + link_input_shape = link.compute_input_shape(output_shape) + else: + link_input_shape = output_shape + dummy_input = keras.ops.ones((data.shape[0],) + link_input_shape) + estimates[key] = link(dummy_input) + + score = scoring_rule.score(estimates, data) + return score + + def test_output(self, setup, random_samples): + scoring_rule, reference = setup + output = self.evaluate(scoring_rule, random_samples) + np.testing.assert_allclose(reference, output) diff --git a/tests/test_compatibility/utils/__init__.py b/tests/test_compatibility/utils/__init__.py new file mode 100644 index 000000000..3f5c7efaa --- /dev/null +++ b/tests/test_compatibility/utils/__init__.py @@ -0,0 +1,2 @@ +from .io import * +from .helpers import * diff --git a/tests/test_compatibility/utils/helpers.py b/tests/test_compatibility/utils/helpers.py new file mode 100644 index 000000000..f42ef46ec --- /dev/null +++ b/tests/test_compatibility/utils/helpers.py @@ -0,0 +1,33 @@ +import pytest +import hashlib +import inspect +from pathlib import Path + + +class SaveLoadTest: + filenames = {} + + @pytest.fixture(autouse=True) + def filepaths(self, data_dir, mode, request): + # this name contains the config for the test and is therefore a unique identifier + test_config_str = request._pyfuncitem.name + # hash it, as it could be too long for the file system + prefix = hashlib.sha1(test_config_str.encode("utf-8")).hexdigest() + # use path to test file as base, remove ".py" suffix + base_path = Path(inspect.getsourcefile(type(self))[:-3]) + # add class name + directory = base_path / type(self).__name__ + # only keep the path relative to the tests directory + directory = directory.relative_to(Path("tests").absolute()) + directory = Path(data_dir) / directory + + if mode == "save": + directory.mkdir(parents=True, exist_ok=True) + + files = {} + for label, filename in self.filenames.items(): + path = directory / f"{prefix}__{filename}" + if mode == "load" and not path.exists(): + pytest.skip(f"Required file not available: {path}") + files[label] = path + return files diff --git a/tests/test_compatibility/utils/io.py b/tests/test_compatibility/utils/io.py new file mode 100644 index 000000000..618b4e26a --- /dev/null +++ b/tests/test_compatibility/utils/io.py @@ -0,0 +1,22 @@ +from keras.saving import deserialize_keras_object, serialize_keras_object +import pickle +from pathlib import Path + + +def dump_path(object, filepath: Path | str): + with open(filepath, "wb") as f: + pickle.dump(object, f) + + +def load_path(filepath: Path | str): + with open(filepath, "rb") as f: + return pickle.load(f) + + +def save_config(object, filepath: Path | str): + dump_path(serialize_keras_object(object), filepath) + + +def load_from_config(filepath: Path | str, custom_objects=None): + config = load_path(filepath) + return deserialize_keras_object(config, custom_objects=custom_objects)
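The serialization round trip exercised throughout these tests reduces to the helpers in tests/test_compatibility/utils/io.py above. Below is a minimal, self-contained sketch of that pattern, assuming any Keras-serializable object; the name round_trip is illustrative and not part of the diff:

    # sketch: pickle a serialized Keras config to disk, then rebuild the object from it
    import pickle
    from pathlib import Path
    from keras.saving import serialize_keras_object, deserialize_keras_object

    def round_trip(obj, filepath: Path):
        # "save" pass: serialize_keras_object -> pickle (mirrors save_config / dump_path)
        with open(filepath, "wb") as f:
            pickle.dump(serialize_keras_object(obj), f)
        # "load" pass: unpickle -> deserialize_keras_object (mirrors load_path / load_from_config)
        with open(filepath, "rb") as f:
            config = pickle.load(f)
        return deserialize_keras_object(config)

The noxfile session drives the same save/load split across two installed versions; judging from its error message, the invocation is presumably along the lines of "nox -- save <old-commit>" followed by "nox -- load --from <old-commit> .", with artifacts stored under _compatibility_data/<revision>.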