
Commit d880de2

Graduate qwen3 from experiment to core (#1860)
As titled. Added CI tests and fixed a minor TP issue after adding attention_mask.
1 parent 304dfc3 commit d880de2

13 files changed: +48 −8 lines changed

tests/integration_tests/models.py

Lines changed: 28 additions & 0 deletions
@@ -76,6 +76,34 @@ def build_model_tests_list() -> list[OverrideDefinitions]:
             "pp+fsdp+tp+ep+etp",
             ngpu=8,
         ),
+        # Integration Test Cases for Qwen3 dense and MoE model
+        OverrideDefinitions(
+            [
+                [
+                    "--model.name qwen3",
+                    "--parallelism.data_parallel_shard_degree 2",
+                    "--parallelism.tensor_parallel_degree 2",
+                ],
+            ],
+            "Qwen3 FSDP+TP",
+            "qwen3_fsdp+tp",
+            ngpu=4,
+        ),
+        OverrideDefinitions(
+            [
+                [
+                    "--model.name qwen3",
+                    "--model.flavor debugmodel_moe",
+                    "--parallelism.data_parallel_shard_degree 2",
+                    "--parallelism.tensor_parallel_degree 2",
+                    "--parallelism.expert_parallel_degree 2",
+                    "--parallelism.expert_tensor_parallel_degree 2",
+                ],
+            ],
+            "Qwen3 FSDP+TP+EP+ETP",
+            "qwen3_fsdp+tp+ep+etp",
+            ngpu=4,
+        ),
     ]
 
     return model_tests
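
For context, each new entry follows the same shape as the existing integration tests: a list of CLI override lists (one per training run), a human-readable description, a short test name, and the number of GPUs the test needs. Below is a minimal sketch of how such an entry might be consumed; the `OverrideDefinitions` field names and the `run_test` helper are assumptions for illustration, not the repository's exact API.

# Hypothetical sketch of the test-entry shape and how a runner might consume it.
# Field names and run_test are illustrative assumptions, not torchtitan's actual code.
import subprocess
from dataclasses import dataclass


@dataclass
class OverrideDefinitions:
    override_args: list[list[str]]  # one list of CLI overrides per training run
    test_descr: str                 # human-readable description, e.g. "Qwen3 FSDP+TP"
    test_name: str                  # short name used for output folders, e.g. "qwen3_fsdp+tp"
    ngpu: int = 8                   # GPUs required by the test


def run_test(test: OverrideDefinitions, base_cmd: list[str]) -> None:
    # Launch one training run per override list, appending the overrides to the base command.
    for overrides in test.override_args:
        cmd = base_cmd + " ".join(overrides).split()
        print(f"[{test.test_name}] {test.test_descr} on {test.ngpu} GPUs")
        subprocess.run(cmd, check=True)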

torchtitan/experiments/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@
 # LICENSE file in the root directory of this source tree.
 
 _supported_experiments = frozenset(
-    ["flux", "llama4", "qwen3", "simple_fsdp.llama3", "simple_fsdp.deepseek_v3", "vlm"]
+    ["flux", "llama4", "simple_fsdp.llama3", "simple_fsdp.deepseek_v3", "vlm"]
 )

torchtitan/models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-_supported_models = frozenset(["llama3", "llama3_ft", "deepseek_v3"])
+_supported_models = frozenset(["llama3", "llama3_ft", "deepseek_v3", "qwen3"])
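
Taken together, the two frozensets act as allowlists: dropping "qwen3" from `_supported_experiments` and adding it to `_supported_models` is what makes `--model.name qwen3` resolve to the core model package instead of the experimental one. A rough sketch of how such an allowlist check might gate imports is below; the `register_model_by_name` helper and error text are illustrative assumptions, not the repository's actual code.

# Illustrative allowlist check, assuming models are imported dynamically by name.
import importlib

_supported_models = frozenset(["llama3", "llama3_ft", "deepseek_v3", "qwen3"])


def register_model_by_name(name: str):
    # Hypothetical helper: only names in the allowlist resolve to a core model package.
    if name not in _supported_models:
        raise ValueError(
            f"Model {name!r} is not a core model; choose from {sorted(_supported_models)}"
        )
    return importlib.import_module(f"torchtitan.models.{name}")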

torchtitan/experiments/qwen3/README.md renamed to torchtitan/models/qwen3/README.md

Lines changed: 0 additions & 1 deletion
@@ -23,7 +23,6 @@ eg, for Qwen3 0.6B model, the HF repo name is `Qwen/Qwen3-0.6B`. For 1.7B model,
 ## To be added
 - Modeling
   - CP is not supported currently because of RoPE embedding implementation details.
-  - `StateDictAdapter` support for MoE model
 
 - Testing
   - Learning rate verifying: verify learning rate and schedule with real training jobs (eg, 3k stps), or find official references.

torchtitan/experiments/qwen3/__init__.py renamed to torchtitan/models/qwen3/__init__.py

Lines changed: 16 additions & 3 deletions
@@ -30,6 +30,19 @@
 # Adding different variants of the model
 
 qwen3_configs = {
+    "debugmodel": Qwen3ModelArgs(
+        vocab_size=2048,
+        max_seq_len=4096,
+        head_dim=128,
+        dim=256,
+        n_layers=8,
+        n_heads=16,
+        n_kv_heads=8,
+        qk_norm=True,
+        hidden_dim=3072,
+        rope_theta=1000000,
+        enable_weight_tying=True,
+    ),
     "0.6B": Qwen3ModelArgs(
         vocab_size=151936,
         max_seq_len=4096,
@@ -107,11 +120,11 @@
     ),
     # Qwen3-MoE models
     "debugmodel_moe": Qwen3ModelArgs(
-        vocab_size=151936,
+        vocab_size=2048,
         max_seq_len=4096,
         head_dim=128,
-        dim=1024,
-        n_layers=28,
+        dim=256,
+        n_layers=8,
         n_heads=16,
         n_kv_heads=8,
         qk_norm=True,
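
The new "debugmodel" flavor and the shrunken "debugmodel_moe" flavor (vocab_size=2048, dim=256, n_layers=8) keep the 4-GPU FSDP+TP and FSDP+TP+EP+ETP integration tests above small and fast, while the real 0.6B+ flavors keep the full 151936-token vocabulary. A minimal sketch of how a `--model.flavor` value might select one of these configs is below; the stand-in `Qwen3ModelArgs` fields and the `get_model_args` helper are assumptions for illustration.

# Minimal sketch of flavor selection; the dataclass below is a stand-in with only a
# few of the fields shown in the diff, and get_model_args is a hypothetical helper.
from dataclasses import dataclass


@dataclass
class Qwen3ModelArgs:
    vocab_size: int = 2048
    dim: int = 256
    n_layers: int = 8
    moe_enabled: bool = False  # hypothetical flag distinguishing dense vs. MoE flavors


qwen3_configs = {
    "debugmodel": Qwen3ModelArgs(),
    "debugmodel_moe": Qwen3ModelArgs(moe_enabled=True),
}


def get_model_args(flavor: str) -> Qwen3ModelArgs:
    # e.g. flavor comes from --model.flavor on the test command line.
    if flavor not in qwen3_configs:
        raise ValueError(f"Unknown Qwen3 flavor {flavor!r}; available: {sorted(qwen3_configs)}")
    return qwen3_configs[flavor]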

torchtitan/experiments/qwen3/infra/parallelize.py renamed to torchtitan/models/qwen3/infra/parallelize.py

Lines changed: 2 additions & 2 deletions
@@ -239,8 +239,8 @@ def apply_non_moe_tp(
     layer_plan = {
         "attention_norm": SequenceParallel(),
         "attention": prepare_module_input(
-            input_layouts=(Shard(1), Replicate()),
-            desired_input_layouts=(Replicate(), Replicate()),
+            input_layouts=(Shard(1), Replicate(), None),
+            desired_input_layouts=(Replicate(), Replicate(), None),
         ),
         "attention.wq": colwise_parallel(use_local_output=False),
         "attention.wk": colwise_parallel(use_local_output=False),
Remaining files renamed without changes.
