Commit fd6d518
Add checkpoint selection option (#573)
Implement a checkpoint selection option through a tag named `context`, and update relevant features.

Co-authored-by: SanggyuChong <sanggyu.chong@epfl.ch>
1 parent bf06e78 commit fd6d518

22 files changed: +163 -70 lines

docs/src/advanced-concepts/auto-restarting.rst (+1 -1)

@@ -5,7 +5,7 @@ When restarting multiple times (for example, when training an expensive model
 or running on an HPC cluster with short time limits), it is useful to be able
 to train and restart multiple times with the same command.

-In ``metatrain``, this functionality is provided via the ``--continue auto``
+In ``metatrain``, this functionality is provided via the ``--restart auto``
 (or ``-c auto``) flag of ``mtt train``. This flag will automatically restart
 the training from the last checkpoint, if one is found in the ``outputs/``
 of the current directory. If no checkpoint is found, the training will start

docs/src/dev-docs/new-architecture.rst (+24 -2)

@@ -105,7 +105,18 @@ method.
         self.dataset_info = dataset_info

     @classmethod
-    def load_checkpoint(cls, checkpoint: Dict[str, Any]) -> "ModelInterface":
+    def load_checkpoint(
+        cls,
+        checkpoint: Dict[str, Any],
+        context: Literal["restart", "finetune", "export"],
+    ) -> "ModelInterface":
+        """Create a model from a checkpoint's state dictionary.
+
+        :param checkpoint: Checkpoint's state dictionary.
+        :param context: Purpose of the model to load from the checkpoint file.
+            Required values are "restart", "finetune" and "export", but the set
+            can be extended to other values.
+        """
         pass

     def restart(cls, dataset_info: DatasetInfo) -> "ModelInterface":

@@ -168,8 +179,19 @@ methods for ``train()``, ``save_checkpoint()`` and ``load_checkpoint()``.

     @classmethod
     def load_checkpoint(
-        cls, checkpoint: Dict[str, Any], train_hypers: Dict[str, Any]
+        cls,
+        checkpoint: Dict[str, Any],
+        train_hypers: Dict[str, Any],
+        context: Literal["restart", "finetune"],
     ) -> "TrainerInterface":
+        """Create a trainer from a checkpoint's state dictionary.
+
+        :param checkpoint: Checkpoint's state dictionary.
+        :param context: Purpose of the model to load from the checkpoint file.
+            Required values are "restart" and "finetune" but can be
+            extended to other values.
+        :param train_hypers: Hyperparameters used to create the trainer.
+        """
         pass

 The format of checkpoints is not defined by ``metatrain`` and can be any format that
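For orientation, here is a minimal sketch of how an architecture could satisfy the updated ``load_checkpoint`` signature. It is not part of this commit; ``ToyModel`` and the checkpoint keys are hypothetical, loosely mirroring the PET/NanoPET changes further down.

from typing import Any, Dict, Literal

import torch


class ToyModel(torch.nn.Module):
    def __init__(self, hidden: int = 16) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 1)

    @classmethod
    def load_checkpoint(
        cls,
        checkpoint: Dict[str, Any],
        context: Literal["restart", "finetune", "export"],
    ) -> "ToyModel":
        # "restart" resumes from the latest weights; "finetune" and "export"
        # pick the best weights, as in the PET/NanoPET changes below.
        model = cls(**checkpoint["model_data"])  # hypothetical checkpoint layout
        key = "model_state_dict" if context == "restart" else "best_model_state_dict"
        model.load_state_dict(checkpoint[key])
        return model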

docs/src/getting-started/checkpoints.rst (+1 -1)

@@ -18,7 +18,7 @@ The sub-command to continue training from a checkpoint is

 .. code-block:: bash

-    mtt train options.yaml --continue model.ckpt
+    mtt train options.yaml --restart model.ckpt

 or
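The same restart can also be driven from Python through the ``train_model`` entry point changed below in ``src/metatrain/cli/train.py``; a rough sketch (the options file name is just an example):

from omegaconf import OmegaConf

from metatrain.cli.train import train_model

# Equivalent of `mtt train options.yaml --restart model.ckpt`
options = OmegaConf.load("options.yaml")
train_model(options, restart_from="model.ckpt")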

pyproject.toml (+2 -3)

@@ -4,7 +4,7 @@ dynamic = ["version"]
 requires-python = ">=3.9"

 readme = "README.rst"
-license = {text = "BSD-3-Clause"}
+license = "BSD-3-Clause"
 description = "Training and evaluating machine learning models for atomistic systems."
 authors = [{name = "metatrain developers"}]

@@ -24,7 +24,6 @@ keywords = ["machine learning", "molecular modeling"]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: BSD License",
     "Operating System :: POSIX",
     "Operating System :: MacOS :: MacOS X",
     "Operating System :: Microsoft :: Windows",

@@ -52,7 +51,7 @@ mtt = "metatrain.__main__:main"

 [build-system]
 requires = [
-    "setuptools >= 68",
+    "setuptools >= 77",
     "setuptools_scm>=8",
     "wheel",
 ]
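The packaging tweaks go together: with an SPDX-style ``license = "BSD-3-Clause"`` string (PEP 639), the ``License :: OSI Approved`` classifier becomes redundant, and setuptools supports this form natively from around version 77, hence the bumped build requirement. This reading is an inference; the commit message does not state it.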

src/metatrain/cli/train.py (+23 -19)

@@ -88,12 +88,14 @@ def _add_train_model_parser(subparser: argparse._SubParsersAction) -> None:
         ),
     )
     parser.add_argument(
-        "-c",
-        "--continue",
-        dest="continue_from",
-        type=_process_continue_from,
+        "--restart",
+        dest="restart_from",
+        type=_process_restart_from,
         required=False,
-        help="Checkpoint file (.ckpt) to continue training from.",
+        help=(
+            "Checkpoint file (.ckpt) to continue interrupted training. "
+            "Set to `'auto'` to use latest checkpoint from the outputs directory."
+        ),
     )
     parser.add_argument(
         "-r",

@@ -115,9 +117,9 @@ def _prepare_train_model_args(args: argparse.Namespace) -> None:
     args.options = OmegaConf.merge(args.options, override_options)


-def _process_continue_from(continue_from: str) -> Optional[str]:
-    # covers the case where `continue_from` is `auto`
-    if continue_from == "auto":
+def _process_restart_from(restart_from: str) -> Optional[str]:
+    # covers the case where `restart_from` is `auto`
+    if restart_from == "auto":
         # try to find the `outputs` directory; if it doesn't exist
         # then we are not continuing from a previous run
         if Path("outputs/").exists():

@@ -129,12 +131,12 @@ def _process_continue_from(continue_from: str) -> Optional[str]:
             # `sorted` because some checkpoint files are named with
             # the epoch number (e.g. `epoch_10.ckpt` would be before
             # `epoch_8.ckpt`). We therefore sort by file creation time.
-            new_continue_from = str(
+            new_restart_from = str(
                 sorted(dir.glob("*.ckpt"), key=lambda f: f.stat().st_ctime)[-1]
             )
-            logging.info(f"Auto-continuing from `{new_continue_from}`")
+            logging.info(f"Auto-continuing from `{new_restart_from}`")
         else:
-            new_continue_from = None
+            new_restart_from = None
             logging.info(
                 "Auto-continuation did not find any previous runs, "
                 "training from scratch"

@@ -145,17 +147,17 @@ def _process_continue_from(continue_from: str) -> Optional[str]:
         # still executing this function
         time.sleep(3)
     else:
-        new_continue_from = continue_from
+        new_restart_from = restart_from

-    return new_continue_from
+    return new_restart_from


 def train_model(
     options: Union[DictConfig, Dict],
     output: str = "model.pt",
     extensions: str = "extensions/",
     checkpoint_dir: Union[str, Path] = ".",
-    continue_from: Optional[str] = None,
+    restart_from: Optional[str] = None,
 ) -> None:
     """Train an atomistic machine learning model using provided ``options``.

@@ -169,7 +171,7 @@ def train_model(
     :param output: Path to save the final model
     :param checkpoint_dir: Path to save checkpoints and other intermediate output files
         like the fully expanded training options for a later restart.
-    :param continue_from: File to continue training from.
+    :param restart_from: File to continue training from.
     """
     ###########################
     # VALIDATE BASE OPTIONS ###

@@ -439,10 +441,12 @@ def train_model(

     logging.info("Setting up model")
     try:
-        if continue_from is not None:
-            logging.info(f"Loading checkpoint from `{continue_from}`")
-            trainer = trainer_from_checkpoint(continue_from, hypers["training"])
-            model = model_from_checkpoint(continue_from)
+        if restart_from is not None:
+            logging.info(f"Restarting training from `{restart_from}`")
+            trainer = trainer_from_checkpoint(
+                path=restart_from, context="restart", hypers=hypers["training"]
+            )
+            model = model_from_checkpoint(path=restart_from, context="restart")
             model = model.restart(dataset_info)
         else:
             model = Model(hypers["model"], dataset_info)
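A standalone sketch of the "latest checkpoint" selection that ``--restart auto`` performs, assuming an ``outputs/`` layout like the one described in the function above; this helper is illustrative and not part of the commit.

from pathlib import Path
from typing import Optional


def latest_checkpoint(outputs: Path = Path("outputs/")) -> Optional[str]:
    # Sort by file creation time, not by name, so that e.g. `epoch_10.ckpt`
    # is correctly considered newer than `epoch_8.ckpt`.
    ckpts = sorted(outputs.glob("**/*.ckpt"), key=lambda f: f.stat().st_ctime)
    return str(ckpts[-1]) if ckpts else None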

src/metatrain/deprecated/pet/model.py (+6 -2)

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional

 import metatensor.torch
 import torch

@@ -243,7 +243,11 @@ def forward(
         return output_quantities

     @classmethod
-    def load_checkpoint(cls, checkpoint: Dict[str, Any]) -> "PET":
+    def load_checkpoint(
+        cls,
+        checkpoint: Dict[str, Any],
+        context: Literal["restart", "finetune", "export"],
+    ) -> "PET":
         hypers = checkpoint["hypers"]
         model_hypers = hypers["ARCHITECTURAL_HYPERS"]
         dataset_info = checkpoint["dataset_info"]

src/metatrain/deprecated/pet/trainer.py (+5 -2)

@@ -5,7 +5,7 @@
 import time
 import warnings
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Literal, Union

 import numpy as np
 import torch

@@ -784,7 +784,10 @@ def save_checkpoint(self, model, path: Union[str, Path]):

     @classmethod
     def load_checkpoint(
-        cls, checkpoint: Dict[str, Any], train_hypers: Dict[str, Any]
+        cls,
+        checkpoint: Dict[str, Any],
+        train_hypers: Dict,
+        context: Literal["restart", "finetune"],
     ) -> "Trainer":
         # This function takes a metatrain PET checkpoint and returns a Trainer
         # instance with the hypers, while also saving the checkpoint in the

src/metatrain/experimental/nanopet/model.py (+13 -3)

@@ -1,6 +1,6 @@
 import warnings
 from math import prod
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional

 import metatensor.torch
 import torch

@@ -551,9 +551,19 @@ def requested_neighbor_lists(
         return [self.requested_nl]

     @classmethod
-    def load_checkpoint(cls, checkpoint: Dict[str, Any]) -> "NanoPET":
+    def load_checkpoint(
+        cls,
+        checkpoint: Dict[str, Any],
+        context: Literal["restart", "finetune", "export"],
+    ) -> "NanoPET":
         model_data = checkpoint["model_data"]
-        model_state_dict = checkpoint["model_state_dict"]
+
+        if context == "restart":
+            model_state_dict = checkpoint["model_state_dict"]
+        elif context == "finetune" or context == "export":
+            model_state_dict = checkpoint["best_model_state_dict"]
+        else:
+            raise ValueError("Unknown context tag for checkpoint loading!")

         # Create the model
         model = cls(**model_data)
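The branching above reduces to a small mapping from ``context`` to a checkpoint key; a condensed sketch of the same logic (the helper name is hypothetical, the keys are taken from the diff):

from typing import Any, Dict, Literal


def select_state_dict(
    checkpoint: Dict[str, Any],
    context: Literal["restart", "finetune", "export"],
) -> Dict[str, Any]:
    if context == "restart":
        return checkpoint["model_state_dict"]       # latest weights, to resume training
    if context in ("finetune", "export"):
        return checkpoint["best_model_state_dict"]  # best weights seen so far
    raise ValueError("Unknown context tag for checkpoint loading!")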

src/metatrain/experimental/nanopet/tests/test_continue.py (+1 -1)

@@ -70,7 +70,7 @@ def test_continue(monkeypatch, tmp_path):

     trainer.save_checkpoint(model, "tmp.ckpt")

-    model_after = model_from_checkpoint("tmp.ckpt")
+    model_after = model_from_checkpoint("tmp.ckpt", context="restart")
     assert isinstance(model_after, NanoPET)
     model_after.restart(dataset_info)

src/metatrain/experimental/nanopet/trainer.py (+5 -2)

@@ -1,7 +1,7 @@
 import copy
 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Literal, Union

 import torch
 import torch.distributed

@@ -494,7 +494,10 @@ def save_checkpoint(self, model, path: Union[str, Path]):

     @classmethod
     def load_checkpoint(
-        cls, checkpoint: Dict[str, Any], train_hypers: Dict[str, Any]
+        cls,
+        checkpoint: Dict[str, Any],
+        train_hypers: Dict[str, Any],
+        context: Literal["restart", "finetune"],  # not used at the moment
     ) -> "Trainer":
         epoch = checkpoint["epoch"]
         optimizer_state_dict = checkpoint["optimizer_state_dict"]

src/metatrain/gap/trainer.py (+5 -2)

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Literal, Union

 import metatensor
 import metatensor.torch

@@ -145,6 +145,9 @@ def save_checkpoint(self, model, checkpoint_dir: str):

     @classmethod
     def load_checkpoint(
-        cls, checkpoint: Dict[str, Any], hypers_train: Dict[str, Any]
+        cls,
+        checkpoint: Dict[str, Any],
+        hypers_train: Dict[str, Any],
+        context: Literal["restart", "finetune"],
     ) -> "GAP":
         raise ValueError("GAP does not allow restarting training")

src/metatrain/pet/model.py (+14 -3)

@@ -1,6 +1,6 @@
 import warnings
 from math import prod
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Literal, Optional

 import metatensor.torch
 import torch

@@ -656,9 +656,20 @@ def forward(
         return return_dict

     @classmethod
-    def load_checkpoint(cls, checkpoint: Dict[str, Any]) -> "PET":
+    def load_checkpoint(
+        cls,
+        checkpoint: Dict[str, Any],
+        context: Literal["restart", "finetune", "export"],
+    ) -> "PET":
         model_data = checkpoint["model_data"]
-        model_state_dict = checkpoint["model_state_dict"]
+
+        if context == "restart":
+            model_state_dict = checkpoint["model_state_dict"]
+        elif context == "finetune" or context == "export":
+            model_state_dict = checkpoint["best_model_state_dict"]
+        else:
+            raise ValueError("Unknown context tag for checkpoint loading!")
+
         finetune_config = checkpoint["train_hypers"].get("finetune", {})

         # Create the model

src/metatrain/pet/tests/test_continue.py (+1 -1)

@@ -70,7 +70,7 @@ def test_continue(monkeypatch, tmp_path):

     trainer.save_checkpoint(model, "tmp.ckpt")

-    model_after = model_from_checkpoint("tmp.ckpt")
+    model_after = model_from_checkpoint("tmp.ckpt", context="restart")
     assert isinstance(model_after, PET)
     model_after.restart(dataset_info)

src/metatrain/pet/tests/test_finetuning.py (+3 -3)

@@ -111,7 +111,7 @@ def test_finetuning_restart(monkeypatch, tmp_path):

     hypers = DEFAULT_HYPERS.copy()

-    hypers["training"]["num_epochs"] = 0
+    hypers["training"]["num_epochs"] = 1

     # Pre-training
     trainer = Trainer(hypers["training"])

@@ -126,7 +126,7 @@ def test_finetuning_restart(monkeypatch, tmp_path):
     trainer.save_checkpoint(model, "tmp.ckpt")

     # Finetuning
-    model_finetune = model_from_checkpoint("tmp.ckpt")
+    model_finetune = model_from_checkpoint("tmp.ckpt", context="finetune")
     assert isinstance(model_finetune, PET)
     model_finetune.restart(dataset_info)

@@ -158,7 +158,7 @@ def test_finetuning_restart(monkeypatch, tmp_path):
     assert any(["lora_" in name for name, _ in model_finetune.named_parameters()])

     # Finetuning restart
-    model_finetune_restart = model_from_checkpoint("finetuned.ckpt")
+    model_finetune_restart = model_from_checkpoint("finetuned.ckpt", context="restart")
     assert isinstance(model_finetune_restart, PET)
     model_finetune_restart.restart(dataset_info)
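The test exercises the intended workflow: start finetuning from the best pre-trained weights, then resume that finetuning run from its latest weights. A compressed sketch (the import path of ``model_from_checkpoint`` is an assumption; the ``.ckpt`` names are the test's temporaries):

from metatrain.utils.io import model_from_checkpoint  # assumed location of the helper

# Finetuning starts from the *best* weights of the pre-trained checkpoint ...
model_finetune = model_from_checkpoint("tmp.ckpt", context="finetune")

# ... while restarting an interrupted finetuning run uses its *latest* weights.
model_restarted = model_from_checkpoint("finetuned.ckpt", context="restart")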

src/metatrain/pet/trainer.py (+5 -2)

@@ -1,7 +1,7 @@
 import copy
 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Literal, Union

 import torch
 from torch.optim.lr_scheduler import LambdaLR

@@ -511,7 +511,10 @@ def save_checkpoint(self, model, path: Union[str, Path]):

     @classmethod
     def load_checkpoint(
-        cls, checkpoint: Dict[str, Any], train_hypers: Dict[str, Any]
+        cls,
+        checkpoint: Dict[str, Any],
+        context: Literal["restart", "finetune", "export"],  # not used at the moment
+        train_hypers: Dict[str, Any],
     ) -> "Trainer":
         epoch = checkpoint["epoch"]
         optimizer_state_dict = checkpoint["optimizer_state_dict"]
