jax-ml · rdyro · Mar 12, 2025 · Jul 25, 2025 · Jul 26, 2025 · Jul 30, 2025
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -32,7 +32,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        model: ["deepseek_r1_jax", "kimi_k2", "llama3", "llama4", "qwen3"]
+        model: ["deepseek_r1_jax", "kimi_k2", "llama3", "llama4", "qwen3", "gpt_oss"]
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - uses: astral-sh/setup-uv@7edac99f961f18b581bbd960d59d049f04c0002f # v6.4.1

diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ Current contents include:
 * [Llama 3](llama3/)
 * [Qwen 3](qwen3/)
 * [Kimi K2](kimi_k2/)
+* [OpenAI GPT OSS](gpt_oss/)
 
 ---
 

diff --git a/deepseek_r1_jax/deepseek_r1_jax/model.py b/deepseek_r1_jax/deepseek_r1_jax/model.py
diff --git a/deepseek_r1_jax/main.ipynb b/deepseek_r1_jax/main.ipynb
diff --git a/deepseek_r1_jax/scripts/convert_hf_r1_checkpoint.py b/deepseek_r1_jax/scripts/convert_hf_r1_checkpoint.py
@@ -19,13 +19,15 @@
 
 import jax
 from jax.sharding import PartitionSpec as P
+from argparse import ArgumentParser
 
-from deepseek_r1_jax.model import ShardingRules, Config
-from deepseek_r1_jax import chkpt_utils as utils
 
-def main():
-    root_path = Path("/mnt/storage/DeepSeek-R1")
-    dest_path = Path("/mnt/storage/deepseek-r1-jax-chkpt")
+def main(root_path, dest_path):
+    from deepseek_r1_jax.model import ShardingRules, Config
+    from deepseek_r1_jax import chkpt_utils as utils
+
+    root_path, dest_path = Path(root_path), Path(dest_path)
+    dest_path.mkdir(exist_ok=True, parents=True)
 
     cfg = Config()
     cfg.quantize_mlp = False
@@ -39,4 +41,15 @@ def main():
     utils.convert_hf_checkpoint(params_map, root_path, dest_path, cfg)
 
 if __name__ == "__main__":
-    main()
+    # Both paths are mandatory, so no defaults are declared: argparse never
+    # falls back to `default` for an option marked `required=True`.
+    parser = ArgumentParser()
+    parser.add_argument("--source-path", required=True, help="HF model directory path")
+    parser.add_argument(
+        "--dest-path",
+        required=True,
+        help="JAX model model directory (to be created).",
+    )
+    args = parser.parse_args()
+    # Run the conversion exactly once with the two parsed paths.
+    main(args.source_path, args.dest_path)
diff --git a/gpt_oss/.gitignore b/gpt_oss/.gitignore
@@ -0,0 +1,14 @@
+poetry.lock
+scratch/**
+
+projects/charformer/data/
+projects/bio/data/
+
+# Python ignores
+__pycache__/
+*.pyc
+*.egg-info
+build/**
+
+.venv
+.vscode
diff --git a/gpt_oss/README.md b/gpt_oss/README.md
@@ -0,0 +1,21 @@
+# Minimal OpenAI GPT OSS inference
+
+**tl;dr: open-source OpenAI GPT OSS inference using JAX, minimal yet performant**
+
+This model is a work in progress, but it should already work well on both TPU and GPU.
+
+<br/>
+
+This is a pure JAX implementation of OpenAI's GPT OSS for inference, including a
+checkpoint converter for the GPT OSS weights. It targets TPU and
+should also work on GPU.
+
+The entire model is defined in [model.py](gpt_oss_jax/model.py) and invoked
+via [main.py](main.py).
+
+## Quickstart
+
+Run:
+```
+$ python3 main.py
+```