diff --git a/bin/config.py b/bin/config.py index 026cb9651..b10b0dfef 100644 --- a/bin/config.py +++ b/bin/config.py @@ -72,9 +72,7 @@ *KNOWN_TESTCASE_EXTENSIONS, *KNOWN_SAMPLE_TESTCASE_EXTENSIONS, ".interaction", - ".hint", - ".desc", - #'.args', + ".yaml", ] KNOWN_DATA_EXTENSIONS: Final[Sequence[str]] = [ diff --git a/bin/export.py b/bin/export.py index d54dd1d0c..7e6dfd321 100644 --- a/bin/export.py +++ b/bin/export.py @@ -213,7 +213,7 @@ def add_testcase(in_file: Path) -> None: # substitute constants. if problem.settings.constants: constants_supported = [ - "data/**/testdata.yaml", + "data/**/test_group.yaml", f"{InputValidator.source_dir}/**/*", f"{AnswerValidator.source_dir}/**/*", f"{OutputValidator.source_dir}/**/*", @@ -298,7 +298,7 @@ def add_testcase(in_file: Path) -> None: ryaml_filter(limits, "time_limit") # validator_flags validator_flags = " ".join( - problem.get_testdata_yaml( + problem.get_test_case_yaml( problem.path / "data", OutputValidator.args_key, PrintBar("Getting validator_flags for legacy export"), @@ -325,13 +325,6 @@ def add_testcase(in_file: Path) -> None: else: util.error(f"{f}: no name set for language {lang}.") - # rename output_validator dir - if (export_dir / OutputValidator.source_dir).exists(): - (export_dir / "output_validators").mkdir(parents=True) - (export_dir / OutputValidator.source_dir).rename( - export_dir / "output_validators" / OutputValidator.source_dir - ) - # rename statement dirs if (export_dir / "statement").exists(): (export_dir / "statement").rename(export_dir / "problem_statement") @@ -352,6 +345,18 @@ def add_testcase(in_file: Path) -> None: add_file(out, f) shutil.rmtree(export_dir / d) + # rename output_validator dir + if (export_dir / OutputValidator.source_dir).exists(): + (export_dir / "output_validators").mkdir(parents=True) + (export_dir / OutputValidator.source_dir).rename( + export_dir / "output_validators" / OutputValidator.source_dir + ) + + # rename test_group.yaml back to testdata.yaml + for f in (export_dir / "data").rglob("test_group.yaml"): + f.rename(f.with_name("testdata.yaml")) + # TODO potentially, some keys also need to be renamed, but we don't use this often enough for this to matter (I hope) + # handle yaml updates yaml_path.unlink() write_yaml(yaml_data, yaml_path) diff --git a/bin/generate.py b/bin/generate.py index ce58261ba..587b1ac0b 100644 --- a/bin/generate.py +++ b/bin/generate.py @@ -306,11 +306,11 @@ def __init__(self, generator_config): "retries", "count", ] + [e[1:] for e in config.KNOWN_TEXT_DATA_EXTENSIONS] -RESERVED_TESTCASE_KEYS: Final[Sequence[str]] = ["data", "testdata.yaml", "include"] +RESERVED_TESTCASE_KEYS: Final[Sequence[str]] = ["data", "test_group.yaml", "include"] KNOWN_DIRECTORY_KEYS: Final[Sequence[str]] = [ "type", "data", - "testdata.yaml", + "test_group.yaml", "include", "solution", "random_salt", @@ -394,7 +394,14 @@ def __init__(self, problem, key, name, yaml, parent): class TestcaseRule(Rule): def __init__( - self, problem: Problem, generator_config, key, name: str, yaml, parent, count_index + self, + problem: Problem, + generator_config, + key, + name: str, + yaml: dict[str, Any], + parent, + count_index, ): assert is_testcase(yaml) @@ -450,12 +457,17 @@ def __init__( # root in /data self.root = self.path.parts[0] - if not config.COMPILED_FILE_NAME_REGEX.fullmatch(name + ".in"): - raise ParseException("Testcase does not have a valid name.") - # files to consider for hashing hashes = {} try: + if not config.COMPILED_FILE_NAME_REGEX.fullmatch(name + ".in"): + raise 
ParseException("Test case does not have a valid name.") + + if name == "test_group": + raise ParseException( + "Test case must not be named 'test_group', this clashes with the group-level 'test_group.yaml'." + ) + if yaml is None: raise ParseException( "Empty yaml entry (Testcases must be generated not only mentioned)." @@ -558,11 +570,16 @@ def __init__( if self.copy.with_suffix(ext).is_file(): hashes[ext] = hash_file_content(self.copy.with_suffix(ext)) - # 3. hardcoded + # 3. hardcoded strings (or, for the Test Case Configuration, a yaml mapping) for ext in config.KNOWN_TEXT_DATA_EXTENSIONS: if ext[1:] in yaml: value = yaml[ext[1:]] - assert_type(ext, value, str) + if ext == ".yaml": + assert_type(ext, value, dict) + value = write_yaml(value) + assert value is not None + else: + assert_type(ext, value, str) if len(value) > 0 and value[-1] != "\n": value += "\n" self.hardcoded[ext] = value @@ -1045,7 +1062,7 @@ def use_feedback_image(feedbackdir: Path, source: str) -> None: use_feedback_image(feedbackdir, "validator") return True - visualizer_args = testcase.testdata_yaml_args(visualizer, bar) + visualizer_args = testcase.test_case_yaml_args(visualizer, bar) visualizer_hash = { "visualizer_hash": visualizer.hash, "visualizer_args": visualizer_args, @@ -1155,11 +1172,11 @@ def copy_generated(): # both source and target do not exist pass - def add_testdata_to_cache(): - # Used to identify generated testcases + def add_test_case_to_cache(): + # Used to identify generated test cases generator_config.hashed_in.add(hash_file_content(infile)) - # Store the generated testdata for deduplication test cases. + # Store the hashes of the generated files for this test case to detect duplicate test cases. hashes = {} # consider specific files for the uniqueness of this testcase @@ -1180,11 +1197,11 @@ def add_testdata_to_cache(): test_hash = combine_hashes_dict(hashes) # check for duplicates - if test_hash not in generator_config.generated_testdata: - generator_config.generated_testdata[test_hash] = t + if test_hash not in generator_config.generated_test_cases: + generator_config.generated_test_cases[test_hash] = t else: bar.warn( - f"Testcase {t.path} is equal to {generator_config.generated_testdata[test_hash].path}." + f"Testcase {t.path} is equal to {generator_config.generated_test_cases[test_hash].path}." ) # Step 1: handle non unique generate entry @@ -1231,7 +1248,7 @@ def add_testdata_to_cache(): # Note that we set this to true even if not all files were overwritten -- a different log/warning message will be displayed for that. t.generate_success = True if infile.is_file(): - add_testdata_to_cache() + add_test_case_to_cache() if config.args.action != "generate": bar.logged = True # Disable redundant 'up to date' message in run mode. bar.done(message="SKIPPED: up to date") @@ -1299,11 +1316,7 @@ def __init__( color_type=MessageType.LOG, ) - if "testdata.yaml" in yaml: - self.testdata_yaml = yaml["testdata.yaml"] - else: - self.testdata_yaml = False - + self.test_group_yaml: Any = yaml.get("test_group.yaml", False) self.numbered = False # List of child TestcaseRule/Directory objects, filled by parse(). @@ -1388,7 +1401,7 @@ def walk(self, testcase_f=None, dir_f=True, *, dir_last=False): def generate(d, problem, generator_config, bar): # Generate the current directory: # - Create the directory. - # - Write testdata.yaml. + # - Write test_group.yaml. # - Link included testcases. # - Input of included testcases are re-validated with the # directory-specific input validator flags. 
@@ -1398,29 +1411,29 @@ def generate(d, problem, generator_config, bar): dir_path = problem.path / "data" / d.path dir_path.mkdir(parents=True, exist_ok=True) - # Write the testdata.yaml, or remove it when the key is set but empty. - testdata_yaml_path = dir_path / "testdata.yaml" - if d.testdata_yaml: - generator_config.known_files.add(testdata_yaml_path) - yaml_text = write_yaml(dict(d.testdata_yaml)) + # Write the test_group.yaml, or remove it when the key is set but empty. + test_group_yaml_path = dir_path / "test_group.yaml" + if d.test_group_yaml: + generator_config.known_files.add(test_group_yaml_path) + yaml_text = write_yaml(dict(d.test_group_yaml)) - if testdata_yaml_path.is_file(): - if yaml_text == testdata_yaml_path.read_text(): + if test_group_yaml_path.is_file(): + if yaml_text == test_group_yaml_path.read_text(): # identical -> skip pass else: # different -> overwrite - generator_config.remove(testdata_yaml_path) - testdata_yaml_path.write_text(yaml_text) - bar.log("CHANGED: testdata.yaml") + generator_config.remove(test_group_yaml_path) + test_group_yaml_path.write_text(yaml_text) + bar.log("CHANGED: test_group.yaml") else: # new file -> create it - testdata_yaml_path.write_text(yaml_text) - bar.log("NEW: testdata.yaml") - elif d.testdata_yaml == "" and testdata_yaml_path.is_file(): + test_group_yaml_path.write_text(yaml_text) + bar.log("NEW: test_group.yaml") + elif d.test_group_yaml == "" and test_group_yaml_path.is_file(): # empty -> remove it - generator_config.remove(testdata_yaml_path) - bar.log("REMOVED: testdata.yaml") + generator_config.remove(test_group_yaml_path) + bar.log("REMOVED: test_group.yaml") bar.done() def generate_includes(d, problem, generator_config, bar): @@ -1470,7 +1483,7 @@ def generate_includes(d, problem, generator_config, bar): # Returns the numbered name -def numbered_testcase_name(base_name, i, n): +def numbered_test_case_name(base_name, i, n): width = len(str(n)) number_prefix = f"{i:0{width}}" if base_name: @@ -1521,25 +1534,25 @@ def __init__(self, problem, restriction=None): yaml_path = self.problem.path / "generators" / "generators.yaml" self.n_parse_error = 0 - # A map of paths `secret/testgroup/testcase` to their canonical TestcaseRule. + # A map of paths `secret/test_group/test_case` to their canonical TestcaseRule. # For generated cases this is the rule itself. - # For included cases, this is the 'resolved' location of the testcase that is included. + # For included cases, this is the 'resolved' location of the test case that is included. self.known_cases = dict() - # A map of paths `secret/testgroup` to Directory rules. + # A map of paths `secret/test_group` to Directory rules. self.known_directories = dict() # Used for cleanup self.known_files = set() - # A map from key to (is_included, list of testcases and directories), + # A map from key to (is_included, list of test cases and directories), # used for `include` statements. self.known_keys = collections.defaultdict[str, tuple[bool, list[TestcaseRule | Directory]]]( lambda: (False, []) ) # A set of testcase rules, including seeds. self.rules_cache = dict() - # The set of generated testcases keyed by hash(testdata). - self.generated_testdata = dict() + # The set of generated test cases keyed by hash(test_case). 
+ self.generated_test_cases = dict() # Path to the trash directory for this run - self.trashdir: Optional[Path] = None + self.trash_dir: Optional[Path] = None # Set of hash(.in) for all generated testcases self.hashed_in = set() # Files that should be processed @@ -1602,8 +1615,8 @@ def add_known(obj): color_type=MessageType.ERROR, ) - num_numbered_testcases = 0 - testcase_id = 0 + num_numbered_test_cases = 0 + test_case_id = 0 def parse_count(yaml, warn_for=None): if not has_count(yaml): @@ -1639,7 +1652,7 @@ def parse_count(yaml, warn_for=None): # Count the number of testcases in the given directory yaml. # This parser is quite forgiving, def count(yaml): - nonlocal num_numbered_testcases + nonlocal num_numbered_test_cases ds = yaml.get("data") if isinstance(ds, dict): ds = [ds] @@ -1652,7 +1665,7 @@ def count(yaml): if isinstance(elem, dict): for key in elem: if is_testcase(elem[key]) and numbered: - num_numbered_testcases += parse_count(elem[key]) + num_numbered_test_cases += parse_count(elem[key]) elif is_directory(elem[key]): count(elem[key]) @@ -1670,7 +1683,7 @@ def parse(key: str, name_gen: Callable[[], str], yaml: dict, parent: AnyDirector if is_testcase(yaml): if isinstance(parent, RootDirectory): - raise ParseException("Testcase must be inside Directory", name) + raise ParseException("Test case must be inside a Directory.", name) count = parse_count(yaml, parent.path / name) @@ -1706,23 +1719,23 @@ def parse(key: str, name_gen: Callable[[], str], yaml: dict, parent: AnyDirector raise ParseException("Duplicate entry", d.path) add_known(d) - # Parse child directories/testcases. + # Parse child test cases/groups. if "data" in yaml and yaml["data"]: data = yaml["data"] if isinstance(yaml["data"], list) else [yaml["data"]] - # Count the number of child testgroups. - num_testgroups = 0 + # Count the number of child test groups. + num_test_groups = 0 for dictionary in data: assert_type("Elements of data", dictionary, dict, d.path) for key in dictionary.keys(): assert_type("Key of data", key, [type(None), str], d.path / str(key)) for child_name, child_yaml in sorted(dictionary.items()): if is_directory(child_yaml): - num_testgroups += 1 + num_test_groups += 1 - testgroup_id = 0 + test_group_id = 0 for dictionary in data: for key in dictionary: - assert_type("Testcase/directory name", key, [type(None), str], d.path) + assert_type("Test case/group name", key, [type(None), str], d.path) # Process named children alphabetically, but not in the root directory. # There, process in the 'natural order'. @@ -1759,24 +1772,24 @@ def parse(key: str, name_gen: Callable[[], str], yaml: dict, parent: AnyDirector if d.numbered: if is_directory(child_yaml): - def next_testgroup_name(): - nonlocal testgroup_id - testgroup_id += 1 - return numbered_testcase_name( - child_key, testgroup_id, num_testgroups + def next_test_group_name(): + nonlocal test_group_id + test_group_id += 1 + return numbered_test_case_name( + child_key, test_group_id, num_test_groups ) - child_name = next_testgroup_name + child_name = next_test_group_name elif is_testcase(child_yaml): - def next_testcase_name(): - nonlocal testcase_id - testcase_id += 1 - return numbered_testcase_name( - child_key, testcase_id, num_numbered_testcases + def next_test_case_name(): + nonlocal test_case_id + test_case_id += 1 + return numbered_test_case_name( + child_key, test_case_id, num_numbered_test_cases ) - child_name = next_testcase_name + child_name = next_test_case_name else: # Use error will be given inside parse(child). 
child_name = lambda: "" # noqa: E731 # TODO this can probably be prettier @@ -1785,7 +1798,7 @@ def next_testcase_name(): child_name = lambda: child_key # noqa: E731 # TODO this can probably be prettier if not child_name(): raise ParseException( - "Unnumbered testcases must not have an empty key", + "Unnumbered test cases must not have an empty key", d.path, ) c = parse(child_key, child_name, child_yaml, d) @@ -1826,7 +1839,7 @@ def add_included_case(t: TestcaseRule): assert_type("include", include, str, d.path) if "/" in include: message( - f"Include {include} should be a testcase/testgroup key, not a path.", + f"Include {include} should be a test case/group key, not a path.", "generators.yaml", d.path, color_type=MessageType.ERROR, @@ -1856,7 +1869,7 @@ def add_included_case(t: TestcaseRule): pass else: message( - f"Unknown include key {include} does not refer to a previous testcase.", + f"Unknown include key {include} does not refer to a previous test case.", "generators.yaml", d.path, color_type=MessageType.ERROR, @@ -1995,9 +2008,9 @@ def generate_copies_and_includes(d): # move a file or into the trash directory def remove(self, src): - if self.trashdir is None: - self.trashdir = self.problem.tmpdir / "trash" / secrets.token_hex(4) - dst = self.trashdir / src.absolute().relative_to((self.problem.path / "data").absolute()) + if self.trash_dir is None: + self.trash_dir = self.problem.tmpdir / "trash" / secrets.token_hex(4) + dst = self.trash_dir / src.absolute().relative_to((self.problem.path / "data").absolute()) dst.parent.mkdir(parents=True, exist_ok=True) shutil.move(src, dst) @@ -2034,8 +2047,8 @@ def clean_up(self): bar = ProgressBar("Clean Up", max_len=-1) self._remove_unknown(self.problem.path / "data", bar) - if self.trashdir is not None: - bar.warn("Some files were changed/removed.", f"-> {self.trashdir}") + if self.trash_dir is not None: + bar.warn("Some files were changed/removed.", f"-> {self.trash_dir}") bar.finalize() # write a gitignore file to ignore everything in data/ except data/sample/ @@ -2187,22 +2200,22 @@ def reorder(self): assert "data" in d.yaml assert isinstance(d.yaml["data"], list) - # dont move unknown cases/directories or testcases with count - testnodes = { + # don't move unknown test cases/groups, or test cases with count + test_nodes = { id(c.yaml): str(c.path) for c in d.data if c.path in testcase_paths and not has_count(c.yaml) } - others = [e for e in d.yaml["data"] if id(next(iter(e.values()))) not in testnodes] + others = [e for e in d.yaml["data"] if id(next(iter(e.values()))) not in test_nodes] class TestcaseResult: def __init__(self, yaml): self.yaml = yaml - self.testnode = testnodes[id(next(iter(yaml.values())))] + self.test_node = test_nodes[id(next(iter(yaml.values())))] self.scores = [] self.result = [] for i in range(len(submissions)): - verdict = verdict_table.results[i][self.testnode] + verdict = verdict_table.results[i][self.test_node] # moving TLE cases to the front is most important to save resources # RTE are less reliable and therefore less important than WA if verdict == Verdict.TIME_LIMIT_EXCEEDED: @@ -2211,10 +2224,10 @@ def __init__(self, yaml): self.scores.append((i, 4)) elif verdict == Verdict.RUNTIME_ERROR: self.scores.append((i, 3)) - self.result.append(verdict_table._get_verdict(i, self.testnode)) + self.result.append(verdict_table._get_verdict(i, self.test_node)) def __str__(self): - return f"{Fore.CYAN}Reorder{Style.RESET_ALL}: {self.testnode:<{max_testcase_len}} {''.join(self.result)}" + return 
f"{Fore.CYAN}Reorder{Style.RESET_ALL}: {self.test_node:<{max_testcase_len}} {''.join(self.result)}" def score(self, weights): return sum(weights[i] * x for i, x in self.scores) @@ -2234,7 +2247,9 @@ def update(self, weights): return weights todo = [ - TestcaseResult(e) for e in d.yaml["data"] if id(next(iter(e.values()))) in testnodes + TestcaseResult(e) + for e in d.yaml["data"] + if id(next(iter(e.values()))) in test_nodes ] # TODO: ProgressBar? diff --git a/bin/interactive.py b/bin/interactive.py index 2ba533cc3..759bd63dd 100644 --- a/bin/interactive.py +++ b/bin/interactive.py @@ -56,7 +56,7 @@ def get_validator_command(): run.testcase.ans_path.resolve(), run.feedbackdir.resolve(), ] - + run.testcase.testdata_yaml_args( + + run.testcase.test_case_yaml_args( output_validator, bar or PrintBar("Run interactive test case"), ) diff --git a/bin/problem.py b/bin/problem.py index 0f4772784..e62c3a407 100644 --- a/bin/problem.py +++ b/bin/problem.py @@ -283,7 +283,9 @@ def __init__( self.limits = ProblemLimits(parse_setting(yaml_data, "limits", {}), problem, self) parse_deprecated_setting( - yaml_data, "validator_flags", f"{validate.OutputValidator.args_key}' in 'testdata.yaml" + yaml_data, + "validator_flags", + f"{validate.OutputValidator.args_key}' in 'test_group.yaml", ) self.keywords: list[str] = parse_optional_list_setting(yaml_data, "keywords", str) @@ -362,9 +364,9 @@ def __init__(self, path: Path, tmpdir: Path, label: Optional[str] = None): self._programs = dict[Path, "Program"]() self._program_callbacks = dict[Path, list[Callable[["Program"], None]]]() # Dictionary from path to parsed file contents. - # TODO #102: Add type for testdata.yaml (typed Namespace?) - self._testdata_yamls = dict[Path, dict[str, Any]]() - self._testdata_lock = threading.Lock() + # TODO #102: Add type for test_group.yaml (typed Namespace?) + self._test_case_yamls = dict[Path, dict[str, Any]]() + self._test_group_lock = threading.Lock() # The label for the problem: A, B, A1, A2, X, ... self.label = label @@ -457,84 +459,81 @@ def _read_settings(self): self.multi_pass: bool = self.settings.multi_pass self.custom_output: bool = self.settings.custom_output - # TODO #102 move to TestData class - def _parse_testdata_yaml(p, path, bar): + # TODO #102 move to a new TestGroup class + def _parse_test_case_and_groups_yaml(p, path: Path, bar: BAR_TYPE): assert path.is_relative_to(p.path / "data") - for dir in [path] + list(path.parents): + for f in [path] + list(path.parents): # Do not go above the data directory. 
- if dir == p.path: + if f == p.path: return - f = dir / "testdata.yaml" - if not f.is_file() or f in p._testdata_yamls: - continue - with p._testdata_lock: - if f not in p._testdata_yamls: - raw = substitute( - f.read_text(), - p.settings.constants, - pattern=config.CONSTANT_SUBSTITUTE_REGEX, - ) - p._testdata_yamls[f] = flags = parse_yaml(raw, path=f, plain=True) + if f.is_dir(): + f = f / "test_group.yaml" + with p._test_group_lock: + if not f.is_file() or f in p._test_case_yamls: + continue + raw = substitute( + f.read_text(), + p.settings.constants, + pattern=config.CONSTANT_SUBSTITUTE_REGEX, + ) + p._test_case_yamls[f] = flags = parse_yaml(raw, path=f, plain=True) - parse_deprecated_setting( - flags, "output_validator_flags", validate.OutputValidator.args_key - ) - parse_deprecated_setting( - flags, "input_validator_flags", validate.InputValidator.args_key - ) + parse_deprecated_setting( + flags, "output_validator_flags", validate.OutputValidator.args_key + ) + parse_deprecated_setting( + flags, "input_validator_flags", validate.InputValidator.args_key + ) - # Verify testdata.yaml - for k in flags: - match k: - case ( - validate.OutputValidator.args_key - | validate.AnswerValidator.args_key - | visualize.TestCaseVisualizer.args_key - | visualize.OutputVisualizer.args_key - ): - if not isinstance(flags[k], list): - bar.error( - f"{k} must be a list of strings", - resume=True, - print_item=False, - ) - case validate.InputValidator.args_key: - if not isinstance(flags[k], (list, dict)): - bar.error( - f"{k} must be list or map", - resume=True, - print_item=False, - ) - if isinstance(flags[k], dict): - input_validator_names = set( - val.name for val in p.validators(validate.InputValidator) - ) - for name in set(flags[k]) - input_validator_names: - bar.warn( - f"Unknown input validator {name}; expected {input_validator_names}", - print_item=False, - ) - case ( - "args" - | "description" - | "full_feedback" - | "hint" - | "scoring" - | "static_validation" - ): - bar.warn( - f"{k} in testdata.yaml not implemented in BAPCtools", - print_item=False, + # Use variable kwargs so the type checker does not complain when passing them to a PrintBar (nothing happens in that case anyway) + bar_kwargs = {"resume": True, "print_item": False} + + # Verify test_group.yaml + for k in flags: + match k: + case ( + validate.OutputValidator.args_key + | validate.AnswerValidator.args_key + | visualize.TestCaseVisualizer.args_key + | visualize.OutputVisualizer.args_key + ): + if not isinstance(flags[k], list): + bar.error( + f"{k} must be a list of strings", + None, + **bar_kwargs, ) - case _: - path = f.relative_to(p.path / "data") - bar.warn(f'Unknown key "{k}" in {path}', print_item=False) - # Do not go above the data directory. 
- if dir == p.path / "data": - break - - def get_testdata_yaml( + case validate.InputValidator.args_key: + if not isinstance(flags[k], (list, dict)): + bar.error( + f"{k} must be list or map", + None, + **bar_kwargs, + ) + if isinstance(flags[k], dict): + input_validator_names = set( + val.name for val in p.validators(validate.InputValidator) + ) + for name in set(flags[k]) - input_validator_names: + bar.warn( + f"Unknown input validator {name}; expected {input_validator_names}", + None, + **bar_kwargs, + ) + case "description" | "hint": + pass # We don't do anything with hint or description in BAPCtools, but no need to warn about this + case "args" | "full_feedback" | "scoring" | "static_validation": + bar.warn( + f"{k} in test_group.yaml not implemented in BAPCtools", + None, + **bar_kwargs, + ) + case _: + path = f.relative_to(p.path / "data") + bar.warn(f'Unknown key "{k}" in {path}', None, **bar_kwargs) + + def get_test_case_yaml( p, path: Path, key: str, @@ -542,20 +541,20 @@ def get_testdata_yaml( name: Optional[str] = None, ) -> list[str]: """ - Find the testdata flags applying at the given path for the given key. - If necessary, walk up from `path` looking for the first testdata.yaml file that applies, + Find the value of the given test_group.yaml key applying at the given path. + If necessary, walk up from `path` looking for the first test_group.yaml file that applies. Side effects: parses and caches the file. Arguments --------- path: absolute path (a file or a directory) - key: The testdata.yaml key to look for (TODO: 'grading' is not yet implemented) + key: The test_group.yaml key to look for (TODO: 'grading' is not yet implemented) name: If key == 'input_validator_args', optionally the name of the input validator. Returns: -------- - A list of string arguments, which is empty if no testdata.yaml is found. + A list of string arguments, which is empty if no test_group.yaml is found. TODO: when 'grading' is supported, it also can return dict """ known_args_keys = [ @@ -572,19 +571,21 @@ def get_testdata_yaml( f"Only input validators support flags by validator name, got {key} and {name}" ) - # parse and cache testdata.yaml - p._parse_testdata_yaml(path, bar) + # parse and cache .yaml and test_group.yaml + path = path.with_suffix(".yaml") + p._parse_test_case_and_groups_yaml(path, bar) # extract the flags - for dir in [path] + list(path.parents): + for f in [path] + list(path.parents): # Do not go above the data directory. - if dir == p.path: + if f == p.path: return [] - f = dir / "testdata.yaml" - if f not in p._testdata_yamls: + if f.suffix != ".yaml": + f = f / "test_group.yaml" + if f not in p._test_case_yamls: continue - flags = p._testdata_yamls[f] + flags = p._test_case_yamls[f] if key in flags: args = flags[key] if key == validate.InputValidator.args_key: @@ -611,6 +612,15 @@ def get_testdata_yaml( return [] + # Because Problem.testcases() may be called multiple times (e.g. validating multiple modes, or with `bt all`), + # this cache makes sure that some warnings (like malformed test case names) only appear once. + _warned_for_test_case = set[str]() + + def _warn_once(p, test_name, msg): + if test_name not in p._warned_for_test_case: + p._warned_for_test_case.add(test_name) + warn(msg) + def testcases( p, *, @@ -659,6 +669,15 @@ def testcases( testcases = [] for f in in_paths: t = testcase.Testcase(p, f) + if not config.COMPILED_FILE_NAME_REGEX.fullmatch(f.name): + p._warn_once(t.name, f"Test case name {t.name} is not valid. 
Skipping.") + continue + if f.with_suffix("").name == "test_group": + p._warn_once( + t.name, + "Test case must not be named 'test_group', this clashes with the group-level 'test_group.yaml'. Skipping.", + ) + continue if ( (p.interactive or p.multi_pass) and mode in [validate.Mode.INVALID, validate.Mode.VALID_OUTPUT] @@ -670,7 +689,7 @@ def testcases( continue if needans and not t.ans_path.is_file(): if t.root != "invalid_input": - warn(f"Found input file {f} without a .ans file. Skipping.") + p._warn_once(t.name, f"Found input file {f} without a .ans file. Skipping.") continue if mode == validate.Mode.VALID_OUTPUT: if t.out_path is None: @@ -1331,7 +1350,7 @@ def validate_valid_extra_data(p) -> bool: if not p.validators(validate.OutputValidator, strict=True, print_warn=False): return True - args = p.get_testdata_yaml( + args = p.get_test_case_yaml( p.path / "data" / "valid_output", "output_validator_args", PrintBar("Generic Output Validation"), @@ -1492,7 +1511,7 @@ def run_all(select_verdict, select): return None, None, None def get_slowest(result): - slowest_pair = result.slowest_testcase() + slowest_pair = result.slowest_test_case() assert slowest_pair is not None return slowest_pair diff --git a/bin/run.py b/bin/run.py index 84932e730..377a77391 100644 --- a/bin/run.py +++ b/bin/run.py @@ -228,7 +228,7 @@ def _validate_output(self, bar: BAR_TYPE) -> Optional[ExecResult]: return output_validator.run( self.testcase, self, - args=self.testcase.testdata_yaml_args(output_validator, bar), + args=self.testcase.test_case_yaml_args(output_validator, bar), ) def _visualize_output(self, bar: BAR_TYPE) -> Optional[ExecResult]: @@ -242,7 +242,7 @@ def _visualize_output(self, bar: BAR_TYPE) -> Optional[ExecResult]: self.testcase.ans_path.resolve(), self.out_path if not self.problem.interactive else None, self.feedbackdir, - args=self.testcase.testdata_yaml_args(output_visualizer, bar), + args=self.testcase.test_case_yaml_args(output_visualizer, bar), ) @@ -501,7 +501,7 @@ def process_run(run: Run): else: color = Fore.GREEN if self.verdict in self.expected_verdicts else Fore.RED - (salient_testcase, salient_duration) = verdicts.salient_testcase() + (salient_testcase, salient_duration) = verdicts.salient_test_case() salient_print_verdict = self.verdict salient_duration_style = Style.BRIGHT if salient_duration >= self.limits["timeout"] else "" @@ -509,7 +509,7 @@ def process_run(run: Run): message = f"{color}{salient_print_verdict.short():>3}{salient_duration_style}{salient_duration:6.3f}s{Style.RESET_ALL} {Style.DIM}@ {salient_testcase:{max_testcase_len}}{Style.RESET_ALL}" if verdicts.run_until in [RunUntil.DURATION, RunUntil.ALL]: - slowest_pair = verdicts.slowest_testcase() + slowest_pair = verdicts.slowest_test_case() assert slowest_pair is not None (slowest_testcase, slowest_duration) = slowest_pair slowest_verdict = verdicts[slowest_testcase] diff --git a/bin/skel.py b/bin/skel.py index f38df4380..46707c4db 100644 --- a/bin/skel.py +++ b/bin/skel.py @@ -122,7 +122,7 @@ def new_problem() -> None: "author": author, "type": problem_type, OutputValidator.args_key: output_validator_args, - "testdata_yaml_comment": "#" if output_validator_args[0] == "#" else "", + "test_group_yaml_comment": "#" if output_validator_args[0] == "#" else "", } source_name = ask_variable_string( diff --git a/bin/testcase.py b/bin/testcase.py index 26f061566..2a2de206b 100644 --- a/bin/testcase.py +++ b/bin/testcase.py @@ -22,6 +22,7 @@ import problem +# TODO #102: Consistently separate the compound noun "test case", e.g. 
"TestCase" or "test_case" class Testcase: """ A single test case. It consists of files with matching base names, typically @@ -59,8 +60,8 @@ class Testcase: ans_path: Path Like `hamiltonicity/data/secret/cubic/petersen.ans`. - testdata_yaml: dict - The YAML-parsed test data flags that apply to this test case. + out_path: Path + Like `hamiltonicity/data/secret/cubic/petersen.out`. """ @@ -86,7 +87,6 @@ def __init__( self.problem = base_problem - # TODO add self.out_path if short_path is None: try: self.short_path: Path = path.relative_to(self.problem.path / "data") @@ -115,13 +115,13 @@ def __repr__(self) -> str: def with_suffix(self, ext: str) -> Path: return self.in_path.with_suffix(ext) - def testdata_yaml_args( + def test_case_yaml_args( self, program: "validate.AnyValidator | visualize.AnyVisualizer", bar: BAR_TYPE, ) -> list[str]: """ - The flags specified in testdata.yaml for the given validator applying to this testcase. + The flags specified in test_group.yaml for the given validator applying to this testcase. Returns ------- @@ -130,9 +130,8 @@ def testdata_yaml_args( or ["--max_N", "50"] or even [""]. """ - path = self.problem.path / "data" / self.short_path - return self.problem.get_testdata_yaml( - path, + return self.problem.get_test_case_yaml( + self.problem.path / "data" / self.short_path, type(program).args_key, bar, name=program.name if isinstance(program, validate.InputValidator) else None, @@ -156,7 +155,7 @@ def validator_hashes( d = dict() for validator in validators: - flags = self.testdata_yaml_args(validator, bar) + flags = self.test_case_yaml_args(validator, bar) flags_string = " ".join(flags) h = combine_hashes_dict( { @@ -287,7 +286,7 @@ def _run_validators( if isinstance(validator, validate.OutputValidator) and mode == validate.Mode.ANSWER: args += ["case_sensitive", "space_change_sensitive"] name = f"{name} (ans)" - flags = self.testdata_yaml_args(validator, bar) + flags = self.test_case_yaml_args(validator, bar) flags = flags + args ret = validator.run(self, mode=mode, constraints=constraints, args=flags) diff --git a/bin/upgrade.py b/bin/upgrade.py index cd69e61b1..0305eb160 100644 --- a/bin/upgrade.py +++ b/bin/upgrade.py @@ -1,5 +1,6 @@ import config import generate +from collections import defaultdict from util import * from validate import InputValidator, AnswerValidator, OutputValidator @@ -96,6 +97,8 @@ def upgrade_data(problem_path: Path, bar: ProgressBar) -> None: bar.log(f"renaming '{old_name}' to '{new_name}'") old_path.rename(new_path) + # Move test cases in 'bad' to either 'invalid_input' or 'invalid_answer', whichever applies + def rename_testcase(old_base: Path, new_dir: Path) -> None: new_dir.mkdir(parents=True, exist_ok=True) new_base = new_dir / old_base.name @@ -124,14 +127,52 @@ def rename_testcase(old_base: Path, new_dir: Path) -> None: if bad_dir.is_dir() and not any(bad_dir.iterdir()): bad_dir.rmdir() + # Move .hint and .desc files to the Test Case Configuration .yaml file + + test_case_yamls = defaultdict[Path, CommentedMap](CommentedMap) + for f in (problem_path / "data").rglob("*.yaml"): + if f.with_suffix(".in").exists(): # Prevent reading test_group.yaml, which has no *.in file + test_case_yamls[f] = read_yaml(f) -def upgrade_testdata_yaml(problem_path: Path, bar: ProgressBar) -> None: + for f in (problem_path / "data").rglob("*.desc"): + test_case_yaml = test_case_yamls[f.with_suffix(".yaml")] + if "description" in test_case_yaml: + bar.warn(f"can't move '{f}' to '*.yaml', it already contains the key 'description'") + else: + 
bar.log(f"moving '{f}' to 'description' key in '*.yaml'") + test_case_yaml["description"] = f.read_text() + write_yaml(test_case_yaml, f.with_suffix(".yaml")) + f.unlink() + + for f in (problem_path / "data").rglob("*.hint"): + test_case_yaml = test_case_yamls[f.with_suffix(".yaml")] + if "hint" in test_case_yaml: + bar.warn(f"can't move '{f}' to '*.yaml', it already contains the key 'hint'") + else: + bar.log(f"moving '{f}' to 'hint' key in '*.yaml'") + test_case_yaml["hint"] = f.read_text() + write_yaml(test_case_yaml, f.with_suffix(".yaml")) + f.unlink() + + +def rename_testdata_to_test_group_yaml(problem_path: Path, bar: ProgressBar) -> None: + for f in (problem_path / "data").rglob("testdata.yaml"): + new_name = f.with_name("test_group.yaml") + rename_log = f"'{f.relative_to(problem_path)}' to '{new_name.relative_to(problem_path)}'" + if new_name.exists(): + bar.error(f"can't rename {rename_log}, target already exists", resume=True) + continue + bar.log(f"renaming {rename_log}") + f.rename(new_name) + + +def upgrade_test_group_yaml(problem_path: Path, bar: ProgressBar) -> None: rename = [ ("output_validator_flags", OutputValidator.args_key), ("input_validator_flags", InputValidator.args_key), ] - for f in (problem_path / "data").rglob("testdata.yaml"): + for f in (problem_path / "data").rglob("test_group.yaml"): data = cast(CommentedMap, read_yaml(f)) for old, new in rename: @@ -217,10 +258,33 @@ def move_testcase(name: str, value: Any, new_parent: str) -> None: ryaml_filter(data, "bad") changed = True - def upgrade_generated_testdata_yaml(data: dict[str, Any], path: str) -> bool: + def apply_recursively( + operation: Callable[[dict[str, Any], str], bool], data: dict[str, Any], path="" + ) -> bool: + changed = operation(data, path) + if "data" in data and data["data"]: + children = data["data"] if isinstance(data["data"], list) else [data["data"]] + for dictionary in children: + for child_name, child_data in sorted(dictionary.items()): + if not child_name: + child_name = '""' + if generate.is_directory(child_data): + changed |= apply_recursively(operation, child_data, path + "." 
+ child_name) + return changed + + def rename_testdata_to_test_group_yaml(data: dict[str, Any], path: str) -> bool: + old, new = "testdata.yaml", "test_group.yaml" + if old in data: + print_path = f" ({path[1:]})" if len(path) > 1 else "" + bar.log(f"changing '{old}' to '{new}' in generators.yaml{print_path}") + ryaml_replace(data, old, new) + return True + return False + + def upgrade_generated_test_group_yaml(data: dict[str, Any], path: str) -> bool: changed = False - if "testdata.yaml" in data: - testdata = cast(CommentedMap, data["testdata.yaml"]) + if "test_group.yaml" in data: + test_group_yaml = cast(CommentedMap, data["test_group.yaml"]) print_path = f" ({path[1:]})" if len(path) > 1 else "" rename = [ @@ -228,27 +292,46 @@ def upgrade_generated_testdata_yaml(data: dict[str, Any], path: str) -> bool: ("input_validator_flags", InputValidator.args_key), ] for old, new in rename: - if old in testdata: - if new in testdata: + if old in test_group_yaml: + if new in test_group_yaml: bar.error( f"can't change '{old}', '{new}' already exists in generators.yaml{print_path}", resume=True, ) continue - bar.log(f"change '{old}' to '{new}' in generators.yaml{print_path}") - ryaml_replace(testdata, old, new) + bar.log(f"changing '{old}' to '{new}' in generators.yaml{print_path}") + ryaml_replace(test_group_yaml, old, new) changed = True + return changed + + def replace_hint_desc_in_test_cases(data: dict[str, Any], path: str) -> bool: + changed = False if "data" in data and data["data"]: children = data["data"] if isinstance(data["data"], list) else [data["data"]] for dictionary in children: for child_name, child_data in sorted(dictionary.items()): - if generate.is_directory(child_data): - changed |= upgrade_generated_testdata_yaml( - child_data, path + "." 
+ child_name - ) + if not child_name: + child_name = '""' + if generate.is_testcase(child_data): + if "desc" in child_data: + ryaml_get_or_add(child_data, "yaml")["description"] = child_data["desc"] + ryaml_filter(child_data, "desc") + bar.log( + f"moving 'desc' inside 'yaml' in generators.yaml ({path}.{child_name})" + ) + changed = True + if "hint" in child_data: + ryaml_get_or_add(child_data, "yaml")["hint"] = child_data["hint"] + ryaml_filter(child_data, "hint") + bar.log( + f"moving 'hint' inside 'yaml' in generators.yaml ({path}.{child_name})" + ) + changed = True return changed - changed |= upgrade_generated_testdata_yaml(yaml_data, "") + changed |= apply_recursively(rename_testdata_to_test_group_yaml, yaml_data, "") + changed |= apply_recursively(upgrade_generated_test_group_yaml, yaml_data, "") + changed |= apply_recursively(replace_hint_desc_in_test_cases, yaml_data, "") if changed: write_yaml(yaml_data, generators_yaml) @@ -414,11 +497,11 @@ def upgrade_problem_yaml(problem_path: Path, bar: ProgressBar) -> None: def add_args(new_data: dict[str, Any]) -> bool: if OutputValidator.args_key in new_data: bar.error( - f"can't change 'validator_flags', '{OutputValidator.args_key}' already exists in testdata.yaml", + f"can't change 'validator_flags', '{OutputValidator.args_key}' already exists in test_group.yaml", resume=True, ) return False - bar.log(f"change 'validator_flags' to '{OutputValidator.args_key}' in testdata.yaml") + bar.log(f"change 'validator_flags' to '{OutputValidator.args_key}' in test_group.yaml") validator_flags = data["validator_flags"] new_data[OutputValidator.args_key] = ( validator_flags.split() if isinstance(validator_flags, str) else validator_flags @@ -432,26 +515,26 @@ def add_args(new_data: dict[str, Any]) -> bool: if generators_path.exists(): generators_data = cast(CommentedMap, read_yaml(generators_path)) - if "testdata.yaml" not in generators_data: + if "test_group.yaml" not in generators_data: if "data" in generators_data: # insert before data pos = list(generators_data.keys()).index("data") - generators_data.insert(pos, "testdata.yaml", CommentedMap()) + generators_data.insert(pos, "test_group.yaml", CommentedMap()) else: # insert at end - generators_data["testdata.yaml"] = CommentedMap() - if add_args(generators_data["testdata.yaml"]): + generators_data["test_group.yaml"] = CommentedMap() + if add_args(generators_data["test_group.yaml"]): write_yaml(generators_data, generators_path) else: - testdata_path = problem_path / "data" / "testdata.yaml" - testdata_data = ( - cast(CommentedMap, read_yaml(testdata_path)) - if testdata_path.exists() + test_group_path = problem_path / "data" / "test_group.yaml" + test_group_data = ( + cast(CommentedMap, read_yaml(test_group_path)) + if test_group_path.exists() else CommentedMap() ) - if add_args(testdata_data): - write_yaml(testdata_data, testdata_path) + if add_args(test_group_data): + write_yaml(test_group_data, test_group_path) else: ryaml_filter(data, "validator_flags") @@ -498,7 +581,8 @@ def _upgrade(problem_path: Path, bar: ProgressBar) -> None: bar.start(problem_path) upgrade_data(problem_path, bar) - upgrade_testdata_yaml(problem_path, bar) + rename_testdata_to_test_group_yaml(problem_path, bar) + upgrade_test_group_yaml(problem_path, bar) upgrade_generators_yaml(problem_path, bar) upgrade_statement(problem_path, bar) upgrade_format_validators(problem_path, bar) diff --git a/bin/verdicts.py b/bin/verdicts.py index 34f422d3f..fb988457b 100644 --- a/bin/verdicts.py +++ b/bin/verdicts.py @@ -4,7 +4,7 @@ 
import threading from enum import Enum from pathlib import Path -from typing import Literal +from typing import Literal, TYPE_CHECKING from colorama import Fore, Style @@ -12,9 +12,12 @@ import testcase from util import ProgressBar +if TYPE_CHECKING: + import run + class Verdict(Enum): - """The verdict of a testcase or testgroup""" + """The verdict of a test case or test group""" ACCEPTED = 1 WRONG_ANSWER = 2 @@ -69,7 +72,7 @@ def color(self): class RunUntil(Enum): # Run until the lexicographically first error is known. FIRST_ERROR = 1 - # Run until the lexicographically first timeout testcase is known. + # Run until the lexicographically first timeout test case is known. DURATION = 2 # Run all cases. ALL = 3 @@ -135,14 +138,14 @@ def from_string_domjudge(s: str) -> Verdict: class Verdicts: """The verdicts of a submission. - Testcases and testgroups are identified by strings. In particular, - * the testcase whose input file is 'a/b/1.in' is called 'a/b/1' - * the two topmost testgroups are 'sample', 'secret' + Test cases and test groups are identified by strings. In particular, + * the test case whose input file is 'a/b/1.in' is called 'a/b/1' + * the two topmost test groups are 'sample', 'secret' * the root is called '.' - Initialised with all testcases. Individual verdicts are registered + Initialised with all test cases. Individual verdicts are registered with set(), which infers verdicts upwards in the tree as they become - available (and returns the topmost inferred testgroup). + available (and returns the topmost inferred test group). Verdicts (registered and inferred) are accessed with __getitem__ >>> V = Verdicts(["a/b/1", "a/b/2", "a/c/1", "a/d/1", "b/3"], timeout=1) @@ -152,26 +155,25 @@ class Verdicts: ACCEPTED None Attributes: - - run_until: Which testcases to run. - - children[testgroup]: the lexicographically sorted list of direct children (testgroups and testcases) of the given testnode - - - verdict[testnode]: the verdict at the given testnode, or None. In particular, + - run_until: Which test cases to run. + - children[test_group]: the lexicographically sorted list of direct children (test groups and test cases) of the given test node + - verdict[test_node]: the verdict at the given test node, or None. In particular, verdict['.'] is the root verdict, sometimes called final verdict or submission verdict. Should not be directly set; use __setitem__ on the Verdict object instead. None: not computed yet. False: determined to be unneeded. - - duration[testcase]: the duration of the testcase + - duration[test_case]: the duration of the test case """ def __init__( self, - testcases_list: list[testcase.Testcase], + test_cases_list: list[testcase.Testcase], timeout: int, run_until: RunUntil = RunUntil.FIRST_ERROR, ): - testcases: set[str] = set(t.name for t in testcases_list) - testgroups: set[str] = set(str(path) for tc in testcases for path in Path(tc).parents) + test_cases: set[str] = set(t.name for t in test_cases_list) + test_groups: set[str] = set(str(path) for tc in test_cases for path in Path(tc).parents) # Lock operations reading/writing non-static data. # Private methods assume the lock is already locked when entering a public method. 
@@ -180,16 +182,16 @@ def __init__( self.run_until = run_until self.timeout = timeout - # (testcase | testgroup) -> Verdict | None | Literal[False] + # (test_case | test_group) -> Verdict | None | Literal[False] self.verdict: dict[str, Verdict | None | Literal[False]] = { - g: None for g in testcases | testgroups + g: None for g in test_cases | test_groups } - # testcase -> float | None - self.duration: dict[str, float | None] = {g: None for g in testcases} + # test_case -> float | None + self.duration: dict[str, float | None] = {g: None for g in test_cases} - # const testgroup -> [testgroup | testcase] - self.children: dict[str, list[str]] = {node: [] for node in testgroups} - for node in testcases | testgroups: + # const test_group -> [test_group | test_case] + self.children: dict[str, list[str]] = {node: [] for node in test_groups} + for node in test_cases | test_groups: if node != ".": parent = str(Path(node).parent) self.children[parent].append(node) @@ -203,20 +205,20 @@ def __enter__(self): def __exit__(self, *args): self.lock.__exit__(*args) - def is_testgroup(self, node: str) -> bool: - """Is the given testnode name a testgroup (rather than a testcase)? - This assumes nonempty testgroups. + def is_test_group(self, node: str) -> bool: + """Is the given test node name a test group (rather than a test case)? + This assumes nonempty test groups. """ return node in self.children - def is_testcase(self, node: str) -> bool: - """Is the given testnode name a testcase (rather than a testgroup)? - This assumes nonempty testgroups. + def is_test_case(self, node: str) -> bool: + """Is the given test node name a test case (rather than a test group)? + This assumes nonempty test groups. """ return node not in self.children - def set(self, testcase: str, verdict: str | Verdict, duration: float): - """Set the verdict and duration of the given testcase (implying possibly others) + def set(self, test_case: str, verdict: str | Verdict, duration: float): + """Set the verdict and duration of the given test case (implying possibly others) verdict can be given as a Verdict or as a string using either long or short form ('ACCEPTED', 'AC', or Verdict.ACCEPTED). @@ -224,23 +226,25 @@ def set(self, testcase: str, verdict: str | Verdict, duration: float): with self: if isinstance(verdict, str): verdict = from_string(verdict) - self.duration[testcase] = duration - self._set_verdict_for_node(testcase, verdict, duration >= self.timeout) + self.duration[test_case] = duration + self._set_verdict_for_node(test_case, verdict, duration >= self.timeout) - def __getitem__(self, testnode) -> Verdict | None | Literal[False]: + def __getitem__(self, test_node) -> Verdict | None | Literal[False]: with self: - return self.verdict[testnode] + return self.verdict[test_node] - def salient_testcase(self) -> tuple[str, float]: - """The testcase most salient to the root verdict. - If self['.'] is Verdict.ACCEPTED, then this is the slowest testcase. - Otherwise, it is the lexicographically first testcase that was rejected.""" + def salient_test_case(self) -> tuple[str, float]: + """The test case most salient to the root verdict. + If self['.'] is Verdict.ACCEPTED, then this is the slowest test case. 
+ Otherwise, it is the lexicographically first test case that was rejected.""" with self: match self["."]: case None: - raise ValueError("Salient testcase called before submission verdict determined") + raise ValueError( + "Salient test case called before submission verdict determined" + ) case Verdict.ACCEPTED: - # This implicitly assumes there is at least one testcase. + # This implicitly assumes there is at least one test case. return max( ((tc, d) for tc, d in self.duration.items() if d is not None), key=lambda x: x[1], @@ -249,14 +253,14 @@ def salient_testcase(self) -> tuple[str, float]: tc = min( tc for tc, v in self.verdict.items() - if self.is_testcase(tc) and v != Verdict.ACCEPTED + if self.is_test_case(tc) and v != Verdict.ACCEPTED ) duration = self.duration[tc] assert duration is not None return (tc, duration) - def slowest_testcase(self) -> None | tuple[str, float]: - """The slowest testcase, if all cases were run or a timeout occurred.""" + def slowest_test_case(self) -> None | tuple[str, float]: + """The slowest test case, if all cases were run or a timeout occurred.""" with self: tc, d = max( ((tc, d) for tc, d in self.duration.items() if d is not None), @@ -270,8 +274,8 @@ def slowest_testcase(self) -> None | tuple[str, float]: return tc, d - def aggregate(self, testgroup: str) -> Verdict: - """The aggregate verdict at the given testgroup. + def aggregate(self, test_group: str) -> Verdict: + """The aggregate verdict at the given test group. Computes the lexicographically first non-accepted verdict. Raises: @@ -280,29 +284,29 @@ def aggregate(self, testgroup: str) -> Verdict: [AC, None, RTE] is not (the first error cannot be determined). """ with self: - child_verdicts = list(self.verdict[c] for c in self.children[testgroup]) + child_verdicts = list(self.verdict[c] for c in self.children[test_group]) if all(v == Verdict.ACCEPTED for v in child_verdicts): return Verdict.ACCEPTED else: first_error = next(v for v in child_verdicts if v != Verdict.ACCEPTED) if first_error in [None, False]: raise ValueError( - f"Verdict aggregation at {testgroup} with unknown child verdicts" + f"Verdict aggregation at {test_group} with unknown child verdicts" ) assert first_error is not None assert first_error is not False return first_error - def _set_verdict_for_node(self, testnode: str, verdict: Verdict, timeout: bool): + def _set_verdict_for_node(self, test_node: str, verdict: Verdict, timeout: bool): # This assumes self.lock is already held. # Note that `False` verdicts can be overwritten if they were already started before being set to False. - if self.verdict[testnode] not in [None, False]: + if self.verdict[test_node] not in [None, False]: raise ValueError( - f"Overwriting verdict of {testnode} to {verdict} (was {self.verdict[testnode]})" + f"Overwriting verdict of {test_node} to {verdict} (was {self.verdict[test_node]})" ) - self.verdict[testnode] = verdict - if testnode != ".": - parent = str(Path(testnode).parent) + self.verdict[test_node] = verdict + if test_node != ".": + parent = str(Path(test_node).parent) # Possibly mark sibling cases as unneeded. match self.run_until: @@ -310,14 +314,14 @@ def _set_verdict_for_node(self, testnode: str, verdict: Verdict, timeout: bool): # On error, set all later siblings to False. 
if verdict != Verdict.ACCEPTED: for sibling in self.children[parent]: - if sibling > testnode and self.verdict[sibling] is None: + if sibling > test_node and self.verdict[sibling] is None: self.verdict[sibling] = False case RunUntil.DURATION: # On timeout, set all later siblings to False. if timeout: for sibling in self.children[parent]: - if sibling > testnode and self.verdict[sibling] is None: + if sibling > test_node and self.verdict[sibling] is None: self.verdict[sibling] = False case RunUntil.ALL: @@ -333,29 +337,29 @@ def _set_verdict_for_node(self, testnode: str, verdict: Verdict, timeout: bool): # parent verdict cannot be determined yet pass - def run_is_needed(self, testcase: str) -> bool: + def run_is_needed(self, test_case: str) -> bool: """ There are 3 modes for running cases: - default: run until the lexicographically first error is known - duration: run until the slowest case is known - all: run all cases - Testcases/groups have their verdict set to `False` as soon as it is determined they are not needed. + Test cases/groups have their verdict set to `False` as soon as it is determined they are not needed. """ with self: - if self.verdict[testcase] is not None: + if self.verdict[test_case] is not None: return False match self.run_until: case RunUntil.FIRST_ERROR: # Run only if parents do not have known verdicts yet. return all( - self.verdict[str(parent)] is None for parent in Path(testcase).parents + self.verdict[str(parent)] is None for parent in Path(test_case).parents ) case RunUntil.DURATION: # Run only if not explicitly marked as unneeded. return all( - self.verdict[str(parent)] is not False for parent in Path(testcase).parents + self.verdict[str(parent)] is not False for parent in Path(test_case).parents ) case RunUntil.ALL: # Run all cases. 
@@ -374,16 +378,16 @@ def __iter__(self): def __init__( self, submissions, - testcases: list[testcase.Testcase], + test_cases: list[testcase.Testcase], width: int = ProgressBar.columns, height: int = shutil.get_terminal_size().lines, max_name_width: int = 50, ): self.submissions: list[str] = [s.name for s in submissions] - self.testcases: list[str] = [t.name for t in testcases] - self.samples: set[str] = set(t.name for t in testcases if t.root == "sample") + self.test_cases: list[str] = [t.name for t in test_cases] + self.samples: set[str] = set(t.name for t in test_cases if t.root == "sample") self.results: list[Verdicts] = [] - self.current_testcases: set[str] = set() + self.current_test_cases: set[str] = set() self.last_printed: list[int] = [] self.width: int self.print_without_force: bool @@ -407,11 +411,11 @@ def __init__( lines = [f"{Style.DIM}{Fore.CYAN}{name}{Fore.WHITE}:"] verdicts = [] - for t, testcase in enumerate(self.testcases): + for t, test_case in enumerate(self.test_cases): if t % 10 == 0: verdicts.append(VerdictTable.Group(0, "")) verdicts[-1].length += 1 - verdicts[-1].text += "s" if testcase in self.samples else "-" + verdicts[-1].text += "s" if test_case in self.samples else "-" printed = self.name_width + 1 for length, tmp in verdicts: @@ -439,14 +443,14 @@ def __init__( def next_submission(self, verdicts: Verdicts): self.results.append(verdicts) - self.current_testcases = set() + self.current_test_cases = set() - def add_testcase(self, testcase: str): - self.current_testcases.add(testcase) + def add_test_case(self, test_case: str): + self.current_test_cases.add(test_case) - def update_verdicts(self, testcase: str, verdict: str | Verdict, duration: float): - self.results[-1].set(testcase, verdict, duration) - self.current_testcases.discard(testcase) + def update_verdicts(self, test_case: str, verdict: str | Verdict, duration: float): + self.results[-1].set(test_case, verdict, duration) + self.current_test_cases.discard(test_case) def _clear(self, *, force: bool = True): if force or self.print_without_force: @@ -466,11 +470,11 @@ def _clear(self, *, force: bool = True): self.last_printed = [] - def _get_verdict(self, s: int, testcase: str, check_sample: bool = True) -> str: + def _get_verdict(self, s: int, test_case: str, check_sample: bool = True) -> str: res = f"{Style.DIM}-{Style.RESET_ALL}" - if s < len(self.results) and self.results[s][testcase] not in [None, False]: - res = to_char(self.results[s][testcase], check_sample and testcase in self.samples) - elif s + 1 == len(self.results) and testcase in self.current_testcases: + if s < len(self.results) and self.results[s][test_case] not in [None, False]: + res = to_char(self.results[s][test_case], check_sample and test_case in self.samples) + elif s + 1 == len(self.results) and test_case in self.current_test_cases: res = Style.DIM + to_char(None) return res @@ -518,7 +522,7 @@ def _print_tree( first = True verdicts = [] for child in reversed(self.results[-1].children[node]): - if self.results[-1].is_testgroup(child): + if self.results[-1].is_test_group(child): if first: stack.append((child, indent + pipe + " ", "└─", True)) first = False @@ -603,11 +607,11 @@ def _print_table( # group verdicts in parts of length at most ten verdicts = [] - for t, testcase in enumerate(self.testcases): + for t, test_case in enumerate(self.test_cases): if t % 10 == 0: verdicts.append(VerdictTable.Group(0, "")) verdicts[-1].length += 1 - verdicts[-1].text += self._get_verdict(s, testcase) + verdicts[-1].text += self._get_verdict(s, 
test_case) for length, tmp in verdicts: if self.width >= 0 and printed + 1 + length > self.width: @@ -680,8 +684,8 @@ def _print(self, *objects, sep="", end="\n", file=sys.stderr, flush=True): print(*objects, sep=sep, end=end, file=file, flush=False) # TODO #102: item has type `str` in the base class, but type `run.Run` here. - def start(self, item): # type: ignore[override] - self.table.add_testcase(item.testcase.name) + def start(self, item: "run.Run"): # type: ignore[override] + self.table.add_test_case(item.testcase.name) return super().start(item) def done(self, success=True, message="", data="", print_item=True): diff --git a/doc/commands.md b/doc/commands.md index ec100e55e..7e0d27ad7 100644 --- a/doc/commands.md +++ b/doc/commands.md @@ -306,7 +306,7 @@ See the [implementation notes](implementation_notes.md#constraints-checking) for **Verify testcase** -Validators that accept the `--constraints_file ` option are run on all testcases to check whether the bounds specified in the validator are actually reached by the testdata. A warning is raised when this is not the case. +Validators that accept the `--constraints_file ` option are run on all testcases to check whether the bounds specified in the validator are actually reached by the test data. A warning is raised when this is not the case. E.g. when an `input_validator` based on [headers/validation.h](../headers/validation.h) does `v.read_integer("n", 1, 1000)` (on line `7`) and the maximum value of `n` over all testcases is `999`, the following warning will be raised: ``` diff --git a/doc/generators.md b/doc/generators.md index 572ed2700..60e627b86 100644 --- a/doc/generators.md +++ b/doc/generators.md @@ -26,7 +26,7 @@ The two main object types are `directory` and `generator`. The root of `generato **Directory objects** take the following keys: -- `testdata.yaml`: Optional yaml configuration that will be copied to `testdata.yaml` in this directory. +- `test_group.yaml`: Optional yaml configuration that will be copied to `test_group.yaml` in this directory. - `solution`: Optional invocation of a solution to be used to generate `.ans` files. Set to empty to disable generating `.ans`. (Useful for e.g. the `data/samples/` directory.) This must be an absolute path relative to the problem root. - `random_salt`: Optional string that will be prepended to each command before computing its `{seed}`. May be used to regenerate all random cases and to prevent predictable seeds. - `data`: The test cases / test groups contained in this directory. This may take two forms: diff --git a/doc/generators.yaml b/doc/generators.yaml index fb78d06d0..a38be444b 100644 --- a/doc/generators.yaml +++ b/doc/generators.yaml @@ -15,8 +15,8 @@ solution: /submissions/accepted/sol.py # the command being run. random_salt: abcd -# The top level may contain a testdata.yaml that will be written to data/ as specified. -testdata.yaml: +# The top level may contain a test_group.yaml that will be written to data/ as specified. +test_group.yaml: output_validator_args: [] # We support three types of generators: @@ -74,19 +74,20 @@ data: "2": in: 23 foo # generates the test case input file data/2.in with contents "23 foo" - # The copy key indicates a manual testcase that will be copied - # from the given directory into the target testcase. The given directory + # The copy key indicates a manual test case that will be copied + # from the given directory into the target test case. 
The given directory
       # must not start with a /, not include an extension and will be relative to generators/.
       "3":
         copy: manual_cases/sample/3
-      # Small testcases can be specified explictly:
+      # Small test cases can be specified explicitly:
       "4":
         in: 1 0 # Values must be a strings, so `1` is wrapped in quotes.
         ans: "1"
-        desc: Right identity for addition
-        hint: Make sure addition with zero also works
-      # Use YAML multiline syntax for multiline testcases
+        yaml:
+          description: Right identity for addition
+          hint: Make sure addition with zero also works
+      # Use YAML multiline syntax for multiline test cases
       # The pipe | preserves newlines, but strips indentation whitespace.
       # See also https://yaml-multiline.info/
       "5":
@@ -98,13 +99,13 @@ data:
           23 0
           -4
-      # Every testcase present in the directory must be listed.
-      # TOOLING: may still allow unlisted testcases and warn about them.
+      # Every test case present in the directory must be listed.
+      # TOOLING: may still allow unlisted test cases and warn about them.
       #'6':
   secret:
     include:
-      # You can include other testcroups by there yaml name
+      # You can include other test groups by their yaml name
       - "sample" # This will include "1", "2", "3", "4", and "5" from sample
     data:
@@ -137,9 +138,9 @@ data:
       #11-random-4: graph {seed} {seed:2} # Not allowed because the regex matches twice.
       12-counted:
         generate: graph {seed:3} {count}
-        count: 2 # generate two testcases at once
+        count: 2 # generate two test cases at once
-      # No key (testcase or testgroup) may be a prefix of another key.
+      # No key (test case or test group) may be a prefix of another key.
       #01-second: graph 6 # Collision with rule 01 above.
       #12-counted-1: graph 7 # Collision with the first rule of 12-counted above
       #12-counted-2: graph 8 # Collision with the second rule of 12-counted above
@@ -147,17 +148,17 @@ data:
       # Commands are only allowed to read and write files of the form
       # `testcase.<ext>`, where <ext> is a known file extension in
-      # .in, .ans, .hint, .desc, .png, .jpg, .svg.
+      # .in, .ans, .out, .yaml, .png, .jpg, .svg.
       # Any such written files will be saved.
       #
       # In case a generator program writes testcase.in, its stdout will be ignored.
-      # In case testcase.in is not created, stdout will be used as the input for the testcase.
+      # In case testcase.in is not created, stdout will be used as the input for the test case.
       #
       # The generator below generates and writes both testcase.in and testcase.ans, and
       # the optionally specified `solution:` will not be called.
       "13": write_in_and_ans.py
-      # To override the global/testgroup configuration on a per-testcase basis,
+      # To override the global/test group configuration on a per-test-case basis,
       # a dictionary may be used. This allows the solution: key,
       # as well as the generate: key which contains the command to execute.
       14_override:
@@ -169,25 +170,26 @@ data:
       # either by using 'in', 'copy', or 'generate'
       # 14_no_input_produced: # this is an error
       #   solution: /submissions/accepted/foo.py
-      #   desc: add two numbers
-      #   hint: check for maxint!
+      #   yaml:
+      #     description: add two numbers
+      #     hint: check for maxint!
-  # Introduce a testgroup.
+  # Introduce a test group.
   # The top-level `data:` key is always assumed to be a directory.
   hard_cases_group:
-    # Directories may contain a testdata.yaml that will be written as specified.
-    testdata.yaml:
+    # Directories may contain a test_group.yaml that will be written as specified.
+    test_group.yaml:
       output_validator_args: [space_change_sensitive]
-  # To enable automatic numbering of testcases, data: may also contain a list of
+  # To enable automatic numbering of test cases, data: may also contain a list of
   # single-element dictionaries instead of a single dictionary. In this case,
-  # testcases and/or groups will be numbered in the order they appear, starting at
+  # test cases and/or groups will be numbered in the order they appear, starting at
   # 1. The system will determine the required number of digits to use and numbers
   # will be zero-padded accordingly, using a dash as separator from the given name
   # (when the given name is not empty). Each dictionary in the list must contain a
   # single item.
   #
-  # Numbering is per directory. Testcases/testgroups are ordered by the order of lists
+  # Numbering is per directory. Test cases/test groups are ordered by the order of lists
   # and alphabetical for dictionaries.
   data:
     # 15.in
@@ -206,18 +208,18 @@ data:
       - j: tree j
       # 24-h
       - k: tree k
-      # When mixing testcases and testgroups within a testgroup, testgroups
+      # When mixing test cases and test groups within a test group, test groups
       # must be last.
-      # Testgroup numbers are always prefixed with g when they are numbered.
-      # g1-numbered_testgroup
-      - numbered_testgroup:
+      # Test group numbers are no longer prefixed with g when they are numbered.
+      # 1-numbered_test_group
+      - numbered_test_group:
          data:
            # 18-c
            - c: tree c
            # 19-d
            - d: tree d
-      # g2-numbered_testgroup
-      - numbered_testgroup:
+      # 2-numbered_test_group
+      - numbered_test_group:
          data:
            # e
            e: tree e
@@ -229,11 +231,11 @@ data:
 # 15: tree empty
 # 16-a: tree a
 # 17-b: tree b
-# g1-numbered_testgroup:
+# 1-numbered_test_group:
 #   data:
 #     18-c: tree c
 #     19-d: tree d
-# g2-numbered_testgroup:
+# 2-numbered_test_group:
 #   data:
 #     e: tree e
 #     f: tree f
diff --git a/doc/validation.md b/doc/validation.md
index 0613252ac..87f98eff1 100644
--- a/doc/validation.md
+++ b/doc/validation.md
@@ -10,7 +10,7 @@
 Input and answer validation run on the _files_ in `data/*`; their purpose is to
 Output validation runs on the output of the author submissions in `submissions` (and eventually on solver submissions when the problem is hosted on a judge system); the purpose of output validation is to check correctness of _submissions_.
-The testcases in `/data/sample` and `/data/secret` must pass each of input, answer, and output validation;
+The test cases in `/data/sample` and `/data/secret` must pass each of input, answer, and output validation;
 whereas submission output must only pass output validation.
@@ -18,7 +18,7 @@ whereas submission output must only pass output validation.
 These are some things that hold for all types of validation mentioned below.
-- For each testcase, all validators of the same type are run in lexicographic order. If one
+- For each test case, all validators of the same type are run in lexicographic order. If one
   fails, later ones are skipped.
 - In BAPCtools, the current working directory is always a temporary `.feedbackdir` directory.
@@ -30,22 +30,22 @@ These are some things that hold for all types of validation mentioned below.
 - The return code must be `43` for failed validation.
   (Note that the spec is slightly more lenient and allows any non-`42` return
   code for input format validation. BAPCtools expects a code of exactly `43` when validating
-  invalid testcases (see below).)
+  invalid test cases (see below).)
- For input and answer validation, the out-of-spec `--constraints-file ` flag is set when running `bt constraints`. The validator can write some
-  statistics on the testcase to this file. See the [implementation
+  statistics on the test case to this file. See the [implementation
   notes](implementation_notes.md#constraints-checking).
 - `<{input,output}_validator_args>` are either empty, or the value of the
-  `{input,output}_validator_args` key in the first `testdata.yaml` file that is found
-  in the directory (testgroup) of the current testcase or its parents.
+  `{input,output}_validator_args` key in the first `test_group.yaml` file that is found
+  in the directory (test group) of the current test case or its parents.

 ## Input validation

 `bt validate --input`

-Test if the testcase input file `testcase.in` file passes the 'input validators'. Each file or
+Test if the test case input file `testcase.in` passes the 'input validators'. Each file or
 directory in `/input_validators/` is an input validator.

-Input validators receive the testcase on standard input, as
+Input validators receive the test case on standard input, as

 ```
 input_validator [input_validator_args] < testcase.in
@@ -55,15 +55,15 @@ input_validator [input_validator_args] < testcase.in

 `bt validate --answer`

-BAPCtools allows (in fact, encourages) the validation of the `.ans`-file of each testcase.
+BAPCtools allows (in fact, encourages) the validation of the `.ans`-file of each test case.
 As for input validation, every program in `answer_validators` is a validator, and all validators must pass.
-Answer validators receive the testcase answer file on standard input, as
+Answer validators receive the test case answer file on standard input, as
 ```
 answer_validator /path/to/testcase.in [output_validator_args] < testcase.ans
 ```
 Answer validation can be as simple as checking that standard input contains a single integer (and nothing else).
-A more advanced use case would be to read an integer `n` from the testcase input file `testcase.in` file provided as the first argument,
+A more advanced use case would be to read an integer `n` from the test case input file `testcase.in` provided as the first argument,
 followed by verifying that the standard input contains `n` newline-separated integers.
 BAPCtools assumes that all answer files are also valid outputs and therefore also checks that the `.ans` files pass output validation.
@@ -110,7 +110,7 @@ Examples:
 Invalid answers are test cases in `data/invalid_answer`.
 Such a test case consist of input and answer files (`.in` and `.ans`), just like a normal test case.
 The input file must pass input validation (i.e., all input validators must accept).
-The testcase must fail answer validation, i.e., at least one answer validator or the output validator must reject it.
+The test case must fail answer validation, i.e., at least one answer validator or the output validator must reject it.
 The output validator is run in strict mode, i.e., with the flags `case_sensitive` and `space_change_sensitive`;
 to ensure maximum conformity of answer files in the test data.
diff --git a/headers/validation.h b/headers/validation.h
index 4b1904fa7..25c4de183 100644
--- a/headers/validation.h
+++ b/headers/validation.h
@@ -10,8 +10,9 @@
 // This strict checking mode is used for *.in and *.ans files.
// When validating submission outputs, the checking is more lenient,
 // but the case_sensitive and space_change_sensitive flags can be passed
-// via the output_validator_args in testdata.yaml to enable strict checking behaviour
-// for submission outputs regarding case and whitespace, respectively.
+// via the output_validator_args in test_group.yaml or <testcase>.yaml
+// to enable strict checking behaviour for submission outputs
+// regarding case and whitespace, respectively.

 #include
 #include
diff --git a/skel/problem/generators/generators.yaml b/skel/problem/generators/generators.yaml
index fa2e7209f..1e3eafae4 100644
--- a/skel/problem/generators/generators.yaml
+++ b/skel/problem/generators/generators.yaml
@@ -1,7 +1,7 @@
 #solution: /submissions/accepted/submission.py
-version: 2025-04 # use this version of the generators framework
+version: 2025-08 # use this version of the generators framework

-{%testdata_yaml_comment%}testdata.yaml:
+{%test_group_yaml_comment%}test_group.yaml:
   # One or more of:
   #   case_sensitive
   #   space_change_sensitive
diff --git a/support/schemas/generators.cue b/support/schemas/generators.cue
index 5dce093ee..bd94e0da2 100644
--- a/support/schemas/generators.cue
+++ b/support/schemas/generators.cue
@@ -27,51 +27,53 @@ import "strings"
	random_salt?: string
 }

-#testgroup_config: {
+#test_group_config: {
	#config
-	"testdata.yaml": #testdata_settings
+	"test_group.yaml": #test_group_settings
 }

-#testcase:
+#test_case:
	#command & !~"^/" |
	{
		generate?: #command & !~"^/"
		count?: int & >=1 & <=100
-		// The "copy" key uses a path relative to "/generators/" ending in a testcase name,
+		// The "copy" key uses a path relative to "/generators/" ending in a test case name,
		// such as "manual/samples/3".
		copy?: #dirpath
		["in" | "in.statement" | "in.download" | "ans" | "ans.statement" | "ans.download" |
-			"out" | "desc" | "hint"]: string
+			"out"]: string
		interaction?: =~"^([<>][^\\n]*\\n)+$"
+		yaml?: #test_case_settings
		#config
	}

-#data_dict: {[#name]: #testgroup | #testcase}
-#data_list: {[#name | ""]: #testgroup | #testcase} & struct.MinFields(1) & struct.MaxFields(1)
+#data_dict: {[#name]: #test_group | #test_case}
+#data_list: {[#name | ""]: #test_group | #test_case} & struct.MinFields(1) & struct.MaxFields(1)

-#testgroup: {
+#test_group: {
	data?: #data_dict | [...#data_list]
	include?: [...#dirpath]
-	#testgroup_config
+	#test_group_config
 }

 #Generators: {
-	// Generators are named like files or testcases, like "tree.py" or "a".
+	// Generators are named like files or test cases, like "tree.py" or "a".
	// Each consists of a nonempty list of paths relative to "/generators/",
	// such as ["tree_generator/tree.py", "lib.py"].
	generators?: [#name]: [...(#path & !~"^/")] & [_, ...]
	data: close({
-		sample!: #testgroup
-		secret!: #testgroup
-		invalid_input?: #testgroup
-		invalid_answer?: #testgroup
-		invalid_output?: #testgroup
-		valid_output?: #testgroup
+		sample!: #test_group
+		secret!: #test_group
+		invalid_input?: #test_group
+		invalid_answer?: #test_group
+		invalid_output?: #test_group
+		valid_output?: #test_group
	})
-	#testgroup_config
-	version: =~"^[0-9]{4}-[0-9]{2}$" | *"2025-04"
+	#test_group_config
+	version: =~"^[0-9]{4}-[0-9]{2}$" | *"2025-08"
	...
// Do allow unknown_key at top level for tooling
 }
diff --git a/support/schemas/generators_yaml_schema.json b/support/schemas/generators_yaml_schema.json
index 3377488a3..70dc2b74a 100644
--- a/support/schemas/generators_yaml_schema.json
+++ b/support/schemas/generators_yaml_schema.json
@@ -2,7 +2,7 @@
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://json.schemastore.org/problem_package_generators.json",
   "$defs": {
-    "testgroup": {
+    "test_group": {
       "type": "object",
       "title": "Test Group",
       "description": "A test group",
@@ -37,30 +37,24 @@
           "type": "string"
         }
       },
-      "testdata.yaml": {
-        "$ref": "#/$defs/testdata_settings"
+      "test_group.yaml": {
+        "$ref": "#/$defs/test_group_settings"
       },
       "solution": {
         "$ref": "#/$defs/solution"
-      },
+      }
     },
     "additionalProperties": false
   },
-    "testdata_settings": {
+    "test_case_or_group_settings": {
      "type": "object",
-      "title": "Test data settings",
-      "description": "The settings that apply to the test data for this test group. Will be copied to this test group's `testdata.yaml`.",
      "properties": {
-        "on_reject": {
-          "enum": ["break", "continue"],
-          "default": "break"
-        },
-        "grading": {
-          "enum": ["default", "custom"]
-        },
-        "grader_flags": {
-          "type": "string",
-          "examples": ["min", "sum"]
+        "args": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Defines arguments passed to the submission for the test case/group."
         },
        "input_validator_args": {
          "oneOf": [
@@ -114,18 +108,101 @@
            "type": "string",
            "deprecated": true,
            "description": "With 'problem_format_version: 2023-07-draft' in problem.yaml, use output_validator_args instead."
-        },
-        "accept_score": {
-          "type": "string"
-        },
-        "reject_score": {
-          "type": "string"
-        },
-        "range": {
-          "type": "string"
        }
      }
    },
+    "test_case_settings": {
+      "title": "Test case settings",
+      "description": "The settings that apply to this test case.",
+      "allOf": [
+        {
+          "$ref": "#/$defs/test_case_or_group_settings"
+        },
+        {
+          "type": "object",
+          "properties": {
+            "hint": {
+              "type": "string",
+              "description": "A hint provides feedback for solving a test case, e.g., to somebody whose submission didn't pass."
+            },
+            "description": {
+              "type": "string",
+              "description": "A description conveys the purpose of a test case. It is an explanation of what aspect or edge case of the solution the input file is meant to test."
+            }
+          }
+        }
+      ]
+    },
+    "test_group_settings": {
+      "title": "Test group settings",
+      "description": "The settings that apply to the test data for this test group. Will be copied to this test group's `test_group.yaml`.",
+      "allOf": [
+        {"$ref": "#/$defs/test_case_or_group_settings"},
+        {
+          "type": "object",
+          "properties": {
+            "scoring": {
+              "type": "object",
+              "title": "Scoring settings",
+              "description": "For scoring problems, submissions are given a non-negative score instead of a verdict. The goal of each submission is to maximize this score. 
The scoring behavior is configured for `secret` and each test data group using the `scoring` object.",
+              "properties": {
+                "score": {
+                  "oneOf": [
+                    {
+                      "type": "integer",
+                      "minimum": 1
+                    },
+                    {
+                      "enum": ["unbounded"]
+                    }
+                  ]
+                },
+                "aggregation": {
+                  "enum": ["pass-fail", "sum", "min"]
+                },
+                "require_pass": {
+                  "oneOf": [
+                    {
+                      "type": "string"
+                    },
+                    {
+                      "type": "array",
+                      "items": {
+                        "type": "string"
+                      }
+                    }
+                  ]
+                }
+              }
+            },
+            "static_validation": {
+              "oneOf": [
+                {
+                  "type": "boolean"
+                },
+                {
+                  "type": "object",
+                  "properties": {
+                    "args": {
+                      "type": "string",
+                      "description": "Represents the additional arguments passed to the static validator in this group's static validation test case."
+                    },
+                    "score": {
+                      "type": "integer",
+                      "description": "The maximum score of the static validation test case."
+                    }
+                  }
+                }
+              ]
+            },
+            "full_feedback": {
+              "description": "Defaults to `false` in `secret` and `true` in `sample`.\nWhen `full_feedback` is `true`, somebody whose submission didn't pass the test case should be shown:\n- the given input,\n- the produced output (stdout),\n- any error messages (stderr),\n- the illustration created by the output visualizer (if applicable),\n- the expected output.",
+              "type": "boolean"
+            }
+          }
+        }
+      ]
+    },
    "data_dict": {
      "title": "Data Dictionary",
      "description": "Defines the contents of a test group",
@@ -134,10 +211,10 @@
        "^([A-Za-z0-9][A-Za-z0-9_-]*[A-Za-z0-9]|[A-Za-z0-9]|)$": {
          "oneOf": [
            {
-              "$ref": "#/$defs/testgroup"
+              "$ref": "#/$defs/test_group"
            },
            {
-              "$ref": "#/$defs/testcase"
+              "$ref": "#/$defs/test_case"
            }
          ]
        }
@@ -145,7 +222,7 @@
      "additionalProperties": false,
      "minProperties": 1
    },
-    "testcase": {
+    "test_case": {
      "title": "Test Case",
      "description": "A test case, i.e., a single instance to the problem.",
      "oneOf": [
@@ -215,15 +292,8 @@
        "type": "string",
        "pattern": "^([<>][^\\n]*\\n)+$"
      },
-      "desc": {
-        "type": "string",
-        "title": "Description",
-        "description": "Privileged information explaining the purpose of this test case given as a string"
-      },
-      "hint": {
-        "type": "string",
-        "title": "Hint",
-        "description": "Feedback shown to the solver about this test case given as a string"
+      "yaml": {
+        "$ref": "#/$defs/test_case_settings"
      },
      "random_salt": {
        "$ref": "#/$defs/random_salt"
@@ -280,7 +350,7 @@
    }
  },
  "additionalProperties": true,
-  "description": "Generate test data for this problem. Version 2025-04.",
+  "description": "Generate test data for this problem. Version 2025-08.",
  "properties": {
    "solution": {
      "$ref": "#/$defs/solution"
@@ -309,37 +379,37 @@
      "type": "object",
      "properties": {
        "sample": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Sample inputs",
          "description": "Test cases shown to the solver in the problem statement"
        },
        "secret": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Secret inputs",
          "description": "The test cases against which submissions are validated"
        },
        "invalid_input": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Invalid inputs",
          "description": "Test cases whose input files are invalid"
        },
        "invalid_answer": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Invalid answers",
          "description": "Test cases whose answer files are invalid. Inputs must be valid."
        },
        "invalid_output": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Invalid outputs",
          "description": "Valid test cases for which the `.out` string is rejected by output validation."
},
        "valid_output": {
-          "$ref": "#/$defs/testgroup",
+          "$ref": "#/$defs/test_group",
          "title": "Valid outputs",
          "description": "Valid test cases for which the `.out` must pass output validation."
        },
-        "testdata.yaml": {
-          "$ref": "#/$defs/testdata_settings"
+        "test_group.yaml": {
+          "$ref": "#/$defs/test_group_settings"
        }
      },
      "additionalProperties": false,
diff --git a/support/schemas/problemformat.cue b/support/schemas/problemformat.cue
index 8f5fca3d5..747115cff 100644
--- a/support/schemas/problemformat.cue
+++ b/support/schemas/problemformat.cue
@@ -1,6 +1,6 @@
 package problemformat

-// Directory names, as well as names of testcases and generators are
+// Directory names, as well as names of test cases and generators are
 // alphanumerical with internal underscores and hyphens; such as
 // "huge", "make_tree", "3", "a", or "connected_graph-01";
 // but not "huge_" or "-2" or "bapc.24" or ".." or "".
@@ -16,21 +16,35 @@ let filename = "[A-Za-z0-9][A-Za-z0-9_.-]{0,253}[A-Za-z0-9]"

 #filepath: =~"^/?(\(dirname)/)*\(filename)$"

-// Paths can both refer to objects like the testgroup "data/secret/huge" or
+// Paths can both refer to objects like the test group "data/secret/huge" or
 // a program file like "/submissions/accepted/x.cpp"
-
 #path: #dirpath | #filepath

-// Test data settings
-#testdata_settings: {
+// Test data settings
+#test_case_or_group_settings: {
+	args?: *[] | [string]
	input_validator_args?: *[] | [string] | {[string]: [string]}
	output_validator_args?: *[] | [string]
	test_case_visualizer_args?: *[] | [string]
	output_visualizer_args?: *[] | [string]
-	grading?: {
-		score?: >0
-		max_score?: >0
+	full_feedback?: bool
+}
+
+#test_case_settings: {
+	#test_case_or_group_settings
+	hint?: string
+	description?: string
+}
+
+#test_group_settings: {
+	scoring?: {
+		score?: >0 | "unbounded"
		aggregation?: "sum" | "min"
-		// run_samples?: bool
+		require_pass?: string | [string]
+	}
+	#test_case_or_group_settings
+	static_validation?: *false | true | {
+		args?: string
+		score?: int
	}
 }
diff --git a/test/problems/divsort/generators/generators.yaml b/test/problems/divsort/generators/generators.yaml
index dd1d0f5b1..9b3f9e38f 100644
--- a/test/problems/divsort/generators/generators.yaml
+++ b/test/problems/divsort/generators/generators.yaml
@@ -6,12 +6,12 @@ data:
     in: 9.0 3.0 ab cd
     ans: 3.0 abcd
   secret:
-    testdata.yaml:
+    test_group.yaml:
      input_validator_args:
        integers: [small]
    data:
      integers:
-        testdata.yaml:
+        test_group.yaml:
          input_validator_args: [--integer]
          #grading: foo
        data:
@@ -19,7 +19,7 @@
          in: 10.0 2.0 ba cd
          ans: 5.0 abcd
      sorted:
-        testdata.yaml:
+        test_group.yaml:
          input_validator_args:
            strings: [--sorted]
        data:
@@ -29,31 +29,33 @@
      general:
        data:
          nested_1:
-            testdata.yaml:
+            test_group.yaml:
              input_validator_args: [--small]
            data:
              small_floats:
                in: 10 3.5 ab cd
          nested_2:
-            testdata.yaml:
+            test_group.yaml:
              input_validator_args:
-                integers: [] # hides the input_validator_args in secret/testdata.yaml
+                integers: [] # hides the input_validator_args in secret/test_group.yaml
            data:
              - tiny_floats:
                  in: 10.0 3.5 ab dc
              - large_integers:
                  in: 102.0 2.0 ab cd
-                  desc: Must validate, because `secret/testdata.yaml` hidden by `secret/general/nested_2/testdata.yaml`
+                  yaml:
+                    description: Must validate, because `secret/general/nested_2/test_group.yaml` shadows `secret/test_group.yaml`
          tolerant:
-            testdata.yaml:
+            test_group.yaml:
              output_validator_args: [float_tolerance, "1e-2"]
            data:
              - tiny_floats:
                  in: 10.0 3.0 ab dc
                  ans: 3.33 abcd
-                  desc: |
-                    Must be AC. 
Compare invalid_output/imprecise, which is (there) - invalid because of float_tolerance + yaml: + description: | + Must be AC. Compare invalid_output/imprecise, which is (there) + invalid because of float_tolerance invalid_input: data: @@ -61,14 +63,14 @@ data: data: too_many_tokens: {in: 10.0 2.5 ab cd ef} integers: - testdata.yaml: + test_group.yaml: input_validator_args: [--integer] data: ints_expected: {in: 10.0 2.5 ab cd} include: - small_floats sorted: - testdata.yaml: + test_group.yaml: input_validator_args: [--sorted] include: - unsorted # invalid here because of --sorted flag (valid input in invalid_answers/no_output_validator_args) @@ -83,7 +85,7 @@ data: in: 10.0 2.0 cba cd ans: 5.0 Abccd with_output_validator_args: - testdata.yaml: + test_group.yaml: output_validator_args: [--forbid_abcd] include: - imprecise # must reject because its ans includes abcd @@ -94,7 +96,7 @@ data: ans: 3.333333333 abcd out: 3.33 abcd valid_output: - testdata.yaml: + test_group.yaml: output_validator_args: [float_tolerance, "1e-2"] data: valid: diff --git a/test/problems/fltcmp/data/testdata.yaml b/test/problems/fltcmp/data/test_group.yaml similarity index 100% rename from test/problems/fltcmp/data/testdata.yaml rename to test/problems/fltcmp/data/test_group.yaml diff --git a/test/problems/generatorincludes/generators/generators.yaml b/test/problems/generatorincludes/generators/generators.yaml index 139deb187..3a1a83451 100644 --- a/test/problems/generatorincludes/generators/generators.yaml +++ b/test/problems/generatorincludes/generators/generators.yaml @@ -10,7 +10,7 @@ data: secret: data: - small: - testdata.yaml: + test_group.yaml: output_validator_args: [space_change_sensitive] input_validator_args: connected: [--small] diff --git a/test/problems/identity/data/sample/testdata.yaml b/test/problems/identity/data/sample/test_group.yaml similarity index 100% rename from test/problems/identity/data/sample/testdata.yaml rename to test/problems/identity/data/sample/test_group.yaml diff --git a/test/problems/identity/generators/generators.yaml b/test/problems/identity/generators/generators.yaml index e6090d53c..92682dcf1 100644 --- a/test/problems/identity/generators/generators.yaml +++ b/test/problems/identity/generators/generators.yaml @@ -72,7 +72,7 @@ data: "6": in.statement: "6" ans.statement: "6" - testdata.yaml: + test_group.yaml: output_visualizer_args: [--draw-please] secret: @@ -82,7 +82,7 @@ data: stdoutpy: stdout.py 200 stdoutcpp: stdout.cpp 201 inans: write_in_and_ans.py 202 - hintdesc: hint_desc.py 203 + hint_desc_yaml: hint_desc_yaml.py 203 main_py: main_py 204 main_c: main_c 205 main_cpp: main_cpp 206 diff --git a/test/problems/identity/generators/hint_desc.py b/test/problems/identity/generators/hint_desc_yaml.py similarity index 58% rename from test/problems/identity/generators/hint_desc.py rename to test/problems/identity/generators/hint_desc_yaml.py index 8c503fe59..c842f91b2 100644 --- a/test/problems/identity/generators/hint_desc.py +++ b/test/problems/identity/generators/hint_desc_yaml.py @@ -5,5 +5,4 @@ n = sys.argv[1] Path("testcase.in").write_text(n + "\n") Path("testcase.ans").write_text(n + "\n") -Path("testcase.hint").write_text("hint: " + n + "\n") -Path("testcase.desc").write_text("description: " + n + "\n") +Path("testcase.yaml").write_text("hint: " + n + "\ndescription: " + n + "\n") diff --git a/test/test_default_output_validator.py b/test/test_default_output_validator.py index 220b12d9f..fae741e87 100644 --- a/test/test_default_output_validator.py +++ 
b/test/test_default_output_validator.py @@ -66,10 +66,10 @@ class MockRun: @pytest.mark.usefixtures("validator") class TestDefaultOutputValidator: - @pytest.mark.parametrize("testdata", read_tests()) - def test_default_output_validator(self, validator, testdata): + @pytest.mark.parametrize("test_data", read_tests()) + def test_default_output_validator(self, validator, test_data): problem, validator = validator - flags, ans, out, exp = testdata + flags, ans, out, exp = test_data flags = flags.split() (problem.tmpdir / "data").mkdir(exist_ok=True, parents=True) @@ -91,7 +91,7 @@ def test_default_output_validator(self, validator, testdata): result = validator.run(t, r, args=flags) if result.status != exp: - print(testdata) + print(test_data) for k in vars(result): print(k, " -> ", getattr(result, k)) assert result.status == exp diff --git a/test/test_generators_yaml.py b/test/test_generators_yaml.py index a35919c66..480742a31 100644 --- a/test/test_generators_yaml.py +++ b/test/test_generators_yaml.py @@ -30,25 +30,27 @@ def __init__(self, problem, restriction=None): self.problem = problem self.n_parse_error = 0 - # A map of paths `secret/testgroup/testcase` to their canonical TestcaseRule. + # A map of paths `secret/test_group/test_case` to their canonical TestcaseRule. # For generated cases this is the rule itself. - # For included cases, this is the 'resolved' location of the testcase that is included. + # For included cases, this is the 'resolved' location of the test case that is included. self.known_cases = dict() - # A set of paths `secret/testgroup`. + # A set of paths `secret/test_group`. # Used for cleanup. self.known_directories = dict() # Used for cleanup self.known_files = set() - # A map from key to (is_included, list of testcases and directories), + # A map from key to (is_included, list of test cases and directories), # used for `include` statements. self.known_keys = collections.defaultdict(lambda: [False, []]) # A set of testcase rules, including seeds. self.rules_cache = dict() - # The set of generated testcases keyed by testdata. + # The set of generated test cases keyed by hash(test_case). # Used to delete duplicated unlisted cases. 
- self.generated_testdata = dict() + self.generated_test_cases = dict() # Path to the trash directory for this run - self.trashdir = None + self.trash_dir = None + # Set of hash(.in) for all generated testcases + self.hashed_in = set() # Files that should be processed self.restriction = restriction diff --git a/test/test_problem_yaml.py b/test/test_problem_yaml.py index 4f6b4117c..0b90b799a 100644 --- a/test/test_problem_yaml.py +++ b/test/test_problem_yaml.py @@ -54,20 +54,20 @@ class MockProblem: class TestProblemYaml: - @pytest.mark.parametrize("testdata", read_tests("valid")) - def test_valid(self, testdata): + @pytest.mark.parametrize("test_data", read_tests("valid")) + def test_valid(self, test_data): config.n_error = 0 config.n_warn = 0 - p = problem.ProblemSettings(testdata["yaml"], cast(problem.Problem, MockProblem())) + p = problem.ProblemSettings(test_data["yaml"], cast(problem.Problem, MockProblem())) assert config.n_error == 0 and config.n_warn == 0, ( f"Expected zero errors and warnings, got {config.n_error} and {config.n_warn}" ) - if "eq" in testdata: - assert_equal(p, testdata["eq"]) + if "eq" in test_data: + assert_equal(p, test_data["eq"]) - @pytest.mark.parametrize("testdata", read_tests("invalid")) - def test_invalid(self, monkeypatch, testdata): + @pytest.mark.parametrize("test_data", read_tests("invalid")) + def test_invalid(self, monkeypatch, test_data): config.n_error = 0 config.n_warn = 0 @@ -85,16 +85,16 @@ def test_invalid(self, monkeypatch, testdata): ) try: - problem.ProblemSettings(testdata["yaml"], cast(problem.Problem, MockProblem())) + problem.ProblemSettings(test_data["yaml"], cast(problem.Problem, MockProblem())) except SystemExit as e: assert e.code == -42 - assert ([call(testdata["fatal"])] if "fatal" in testdata else []) == fatal.mock_calls + assert ([call(test_data["fatal"])] if "fatal" in test_data else []) == fatal.mock_calls - if isinstance(testdata.get("error", None), str): - testdata["error"] = [testdata["error"]] - assert [call(x) for x in testdata.get("error", [])] == error.mock_calls + if isinstance(test_data.get("error", None), str): + test_data["error"] = [test_data["error"]] + assert [call(x) for x in test_data.get("error", [])] == error.mock_calls - if isinstance(testdata.get("warn", None), str): - testdata["warn"] = [testdata["warn"]] - assert [call(x) for x in testdata.get("warn", [])] == warn.mock_calls + if isinstance(test_data.get("warn", None), str): + test_data["warn"] = [test_data["warn"]] + assert [call(x) for x in test_data.get("warn", [])] == warn.mock_calls diff --git a/test/test_verdicts.py b/test/test_verdicts.py index dafa57cb2..f81aff3c0 100644 --- a/test/test_verdicts.py +++ b/test/test_verdicts.py @@ -77,5 +77,5 @@ def test_slowest_testcase(self): verds.set("secret/a/1", "TLE", 2.9) verds.set("secret/a/2", "RTE", 3.5) verds.set("secret/a/3", "TLE", 3.2) - assert verds.salient_testcase() == ("secret/a/1", 2.9) - assert verds.slowest_testcase() == ("secret/a/2", 3.5) + assert verds.salient_test_case() == ("secret/a/1", 2.9) + assert verds.slowest_test_case() == ("secret/a/2", 3.5) diff --git a/test/yaml/generators/invalid_yaml/invalid.generators.yaml b/test/yaml/generators/invalid_yaml/invalid.generators.yaml index d4618cc0c..6fc27a90b 100644 --- a/test/yaml/generators/invalid_yaml/invalid.generators.yaml +++ b/test/yaml/generators/invalid_yaml/invalid.generators.yaml @@ -7,7 +7,7 @@ data: {sample: {data: []}} # missing sample: data: {secret: {data: []}} --- -# invalid testgroup below root +# invalid test group below root 
data: {sample: {data: []}, secret: {data: []}, public: {data: []}}
---
# solution must be null or string
@@ -276,7 +276,7 @@ data: {sample: {data: []}, secret: {data: []}}
 #generate: xyz
 #data: {sample: {data: []}, secret: {data: []}}
 #---
-# No testgroup generate
+# No generate in test group
 data:
   sample: {data: []}
   secret:
@@ -381,7 +381,7 @@ data:
       generate: my_generator {count}
       count: 101
 ---
-# No testdata.yaml on testcase level
+# No test_group.yaml on test case level
 # TODO Not picked up by JSON schema
 data:
   sample: {}
@@ -389,5 +389,5 @@ data:
     data:
       - '':
          in: '1 2'
-          testdata.yaml: # this is not ok
+          test_group.yaml: # this is not ok
            input_validator_args: [connected]
diff --git a/test/yaml/generators/test_schemata.sh b/test/yaml/generators/test_schemata.sh
index 10922f680..59a64382d 100644
--- a/test/yaml/generators/test_schemata.sh
+++ b/test/yaml/generators/test_schemata.sh
@@ -23,7 +23,7 @@ for dir in "${all_valid_yaml[@]}"; do
   for file in $(find "$dir" -type f -name '*generators.yaml'); do
     echo -n "cue vet "$file" $schemadir/*.cue -d \"#Generators\" "
     tmp="$(mktemp --suffix .yaml)"
-    sed "s/{%testdata_yaml_comment%}/#/" "$file" | sed "s/{%output_validator_args%}//" > "$tmp"
+    sed "s/{%test_group_yaml_comment%}/#/" "$file" | sed "s/{%output_validator_args%}//" > "$tmp"
     output_cue=$(cue vet "$tmp" $schemadir/*.cue -d "#Generators" 2>&1)
     rm "$tmp"
     exit_code_cue=$?
diff --git a/test/yaml/generators/valid_yaml/rich-generators.yaml b/test/yaml/generators/valid_yaml/rich-generators.yaml
index 4e32ed274..26d62cbbd 100644
--- a/test/yaml/generators/valid_yaml/rich-generators.yaml
+++ b/test/yaml/generators/valid_yaml/rich-generators.yaml
@@ -21,17 +21,24 @@ data:
     'explicit':
       in: "-1 2"
       ans: "1"
-      desc: "Negative numbers"
-      hint: "Remember that a can be negative"
+      yaml:
+        description: "Negative numbers"
+        hint: "Remember that a can be negative"
     'curlies': my_generator {seed:1} --name {name}
     'morecurlies':
       generate: my_generator {seed:1} --name {name} --ctr {count} --arg {count}
       count: 5
-    'group_with_testdata':
-      testdata.yaml:
+    'group_with_test_group_yaml':
+      test_group.yaml:
         input_validator_args: [--connected, --max_n, "2000"]
       data:
         'a': my_generator
+    # A test case may not be called 'test_group', but it may appear as part of the name.
+    'test_group_': my_generator
+    'group_with_numbers':
+      data:
+        # A numbered test case may be called 'X-test_group'.
+        - 'test_group': my_generator
   invalid_input:
     data:
       - '':