From 0bd842e858db339001358ce23cf2d8e69367503d Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:47:40 -0700
Subject: [PATCH 1/6] change 32b config

---
 apps/grpo/qwen3_32b.yaml      | 18 +++++++++---------
 apps/mast/qwen3_32b_mast.yaml | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index ca88b349a..064b585f1 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,10 +3,10 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability
 
 # Global configuration
-group_size: 2
-batch_size: 8
-max_req_tokens: 512
-max_res_tokens: 512
+group_size: 8
+batch_size: 32
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+rollout_threads: 8
 
 # Observability configuration
 metric_logging:
@@ -69,8 +69,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: -1
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -90,7 +90,7 @@ replay_buffer:
   batch_size: ${batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
-  dp_size: 8
+  dp_size: 1
 
 # Reference model configuration
 ref_model:
@@ -119,7 +119,7 @@ ref_model:
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 1
+    num_replicas: 4
     hosts: 1
     with_gpus: true
   ref_model:
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 47368becd..65381a387 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -3,9 +3,9 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
-max_req_tokens: 512
-max_res_tokens: 512
+batch_size: 32
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 launcher: mast
@@ -71,8 +71,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 8
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -129,13 +129,13 @@ ref_model:
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 2
+    num_replicas: 4
     with_gpus: true
     mesh_name: policy
     hosts: 1
   ref_model:
-    procs: 4
-    num_replicas: 2
+    procs: ${ref_model.parallelism.tensor_parallel_degree}
+    num_replicas: 1
     with_gpus: true
     mesh_name: ref_model
     hosts: 1

From cb66e7d717c12011c3deab558fba0416dd507567 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:54:32 -0700
Subject: [PATCH 2/6] group size 16

---
 apps/grpo/qwen3_32b.yaml      | 2 +-
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 064b585f1..34e686bac 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,7 +3,7 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability
 
 # Global configuration
-group_size: 8
+group_size: 16
 batch_size: 32
 max_req_tokens: 1024
 max_res_tokens: 1024
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 65381a387..8f68c8f1e 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -2,7 +2,7 @@
 # >>> python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml
 
 # Global configuration
-group_size: 8
+group_size: 16
 batch_size: 32
 max_req_tokens: 1024
 max_res_tokens: 1024

From 9ddbef156d1abfe7ae7a2b0ee26d213d0c914d1c Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:55:34 -0700
Subject: [PATCH 3/6] increase rollout threads for 32b

---
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 8f68c8f1e..c4063f20c 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -13,7 +13,7 @@ job_name: forge-qwen3-32b
 checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/
 
 # Main loop configuration
-rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
+rollout_threads: 8
 
 # Observability configuration
 metric_logging:

From 5a3ef1d2c519bc0e6a23da0ddec029d19c70c745 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:07:24 -0400
Subject: [PATCH 4/6] Update apps/grpo/qwen3_32b.yaml

Co-authored-by: casteryh <57782783+casteryh@users.noreply.github.com>
---
 apps/grpo/qwen3_32b.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 34e686bac..f2467fc73 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 8
+rollout_threads: 32 # setting this equal to the batch size seems to work well
 
 # Observability configuration
 metric_logging:

From 061dbdcfc4a815e1bdb2ad7d11e2b0af3b37c5e5 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 12:22:27 -0700
Subject: [PATCH 5/6] update rollout thread sizing

---
 apps/grpo/qwen3_32b.yaml      | 2 +-
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index f2467fc73..4129facbf 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 32 # setting this equal to the batch size seems to work well
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well
 
 # Observability configuration
 metric_logging:
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index c4063f20c..841d4f9ad 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -13,7 +13,7 @@ job_name: forge-qwen3-32b
 checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/
 
 # Main loop configuration
-rollout_threads: 8
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well
 
 # Observability configuration
 metric_logging:

From 90542b053955aa19f1f81de3d52508758313975c Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 12:26:30 -0700
Subject: [PATCH 6/6] use local_batch_size in all mast configs

---
 apps/mast/qwen3_14b_mast.yaml  | 6 +++---
 apps/mast/qwen3_1_7b_mast.yaml | 6 +++---
 apps/mast/qwen3_32b_mast.yaml  | 6 +++---
 apps/mast/qwen3_4b_mast.yaml   | 6 +++---
 apps/mast/qwen3_8b_mast.yaml   | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml
index 484a71538..1d5300838 100644
--- a/apps/mast/qwen3_14b_mast.yaml
+++ b/apps/mast/qwen3_14b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-14B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml
index 58d879579..92d27da16 100644
--- a/apps/mast/qwen3_1_7b_mast.yaml
+++ b/apps/mast/qwen3_1_7b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-1.7B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 841d4f9ad..3fa79b955 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 16
-batch_size: 32
+local_batch_size: 32 # per-device batch size
 max_req_tokens: 1024
 max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml
index 92119055a..a7e44e069 100644
--- a/apps/mast/qwen3_4b_mast.yaml
+++ b/apps/mast/qwen3_4b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-4B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml
index 7f2f99694..953f809b7 100644
--- a/apps/mast/qwen3_8b_mast.yaml
+++ b/apps/mast/qwen3_8b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-8B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
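
Reviewer note (not part of the series): the configs above wire the replay buffer to the trainer through ${...} interpolation, so replay_buffer.batch_size tracks the per-rank local_batch_size and replay_buffer.dp_size tracks the trainer's data-parallel shard degree. The sketch below is a minimal sanity check of those relationships plus the trainer GPU accounting the final 32b config implies; it assumes the yamls resolve with stock OmegaConf (suggested by the ${...} syntax, but not confirmed by this series), and sanity_check.py is a hypothetical helper, not a file in the repo.

# sanity_check.py -- hypothetical helper, not part of this patch series
from omegaconf import OmegaConf  # assumption: configs use OmegaConf-style interpolation

cfg = OmegaConf.load("apps/mast/qwen3_32b_mast.yaml")

# The replay buffer must hand the trainer one local batch per data-parallel
# rank; both equalities hold by construction via the ${...} references, so
# these asserts document the invariant rather than guard a single file.
assert cfg.replay_buffer.batch_size == cfg.local_batch_size
assert cfg.replay_buffer.dp_size == cfg.trainer.parallelism.data_parallel_shard_degree

# GPU accounting after PATCH 1: the trainer spans dp_shard * tp = 1 * 8 = 8
# GPUs, and each policy replica spans one tensor-parallel group.
trainer_gpus = (cfg.trainer.parallelism.data_parallel_shard_degree
                * cfg.trainer.parallelism.tensor_parallel_degree)
policy_gpus = (cfg.services.policy.num_replicas
               * cfg.policy.engine_config.tensor_parallel_size)
print(f"trainer GPUs: {trainer_gpus}, policy GPUs: {policy_gpus}, "
      f"rollout threads: {cfg.rollout_threads}")

One arithmetic observation: with num_replicas: 4, the final rollout_threads: 32 is 8x the replica count rather than the 4x the inline comment suggests, so either the comment or the value may deserve a follow-up patch.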