From 0bd842e858db339001358ce23cf2d8e69367503d Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:47:40 -0700
Subject: [PATCH 1/6] change 32b config

---
 apps/grpo/qwen3_32b.yaml      | 18 +++++++++---------
 apps/mast/qwen3_32b_mast.yaml | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index ca88b349a..064b585f1 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,10 +3,10 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability
 
 # Global configuration
-group_size: 2
-batch_size: 8
-max_req_tokens: 512
-max_res_tokens: 512
+group_size: 8
+batch_size: 32
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+rollout_threads: 8
 
 # Observability configuration
 metric_logging:
@@ -69,8 +69,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: -1
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -90,7 +90,7 @@ replay_buffer:
   batch_size: ${batch_size}
   max_policy_age: ${off_by_n}
   # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
-  dp_size: 8
+  dp_size: 1
 
 # Reference model configuration
 ref_model:
@@ -119,7 +119,7 @@ ref_model:
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 1
+    num_replicas: 4
     hosts: 1
     with_gpus: true
   ref_model:
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 47368becd..65381a387 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -3,9 +3,9 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
-max_req_tokens: 512
-max_res_tokens: 512
+batch_size: 32
+max_req_tokens: 1024
+max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
 off_by_n: 1 # Off by one by default
 launcher: mast
@@ -71,8 +71,8 @@ trainer:
     enable: false
   parallelism:
     data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 8
-    tensor_parallel_degree: 1
+    data_parallel_shard_degree: 1
+    tensor_parallel_degree: 8
     pipeline_parallel_degree: 1
     context_parallel_degree: 1
     expert_parallel_degree: 1
@@ -129,13 +129,13 @@ ref_model:
 services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
-    num_replicas: 2
+    num_replicas: 4
     with_gpus: true
     mesh_name: policy
     hosts: 1
   ref_model:
-    procs: 4
-    num_replicas: 2
+    procs: ${ref_model.parallelism.tensor_parallel_degree}
+    num_replicas: 1
     with_gpus: true
     mesh_name: ref_model
     hosts: 1

From cb66e7d717c12011c3deab558fba0416dd507567 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:54:32 -0700
Subject: [PATCH 2/6] group size 16

---
 apps/grpo/qwen3_32b.yaml      | 2 +-
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 064b585f1..34e686bac 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -3,7 +3,7 @@
 # NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability
 
 # Global configuration
-group_size: 8
+group_size: 16
 batch_size: 32
 max_req_tokens: 1024
 max_res_tokens: 1024
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 65381a387..8f68c8f1e 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -2,7 +2,7 @@
 # >>> python -m apps.mast.main --config apps/mast/qwen3_1_7b_mast.yaml
 
 # Global configuration
-group_size: 8
+group_size: 16
 batch_size: 32
 max_req_tokens: 1024
 max_res_tokens: 1024

From 9ddbef156d1abfe7ae7a2b0ee26d213d0c914d1c Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Thu, 9 Oct 2025 12:55:34 -0700
Subject: [PATCH 3/6] increase rollout threads for 32b

---
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 8f68c8f1e..c4063f20c 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -13,7 +13,7 @@ job_name: forge-qwen3-32b
 checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/
 
 # Main loop configuration
-rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
+rollout_threads: 8
 
 # Observability configuration
 metric_logging:

From 5a3ef1d2c519bc0e6a23da0ddec029d19c70c745 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:07:24 -0400
Subject: [PATCH 4/6] Update apps/grpo/qwen3_32b.yaml

Co-authored-by: casteryh <57782783+casteryh@users.noreply.github.com>
---
 apps/grpo/qwen3_32b.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index 34e686bac..f2467fc73 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 8
+rollout_threads: 32 # setting this equal to the batch size seems to work well
 
 # Observability configuration
 metric_logging:

From 061dbdcfc4a815e1bdb2ad7d11e2b0af3b37c5e5 Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 12:22:27 -0700
Subject: [PATCH 5/6] update rollout thread sizing

---
 apps/grpo/qwen3_32b.yaml      | 2 +-
 apps/mast/qwen3_32b_mast.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/grpo/qwen3_32b.yaml b/apps/grpo/qwen3_32b.yaml
index f2467fc73..4129facbf 100644
--- a/apps/grpo/qwen3_32b.yaml
+++ b/apps/grpo/qwen3_32b.yaml
@@ -14,7 +14,7 @@ provisioner:
   launcher: slurm
 
 # Main loop configuration
-rollout_threads: 32 # setting this equal to the batch size seems to work well
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well
 
 # Observability configuration
 metric_logging:
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index c4063f20c..841d4f9ad 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -13,7 +13,7 @@ job_name: forge-qwen3-32b
 checkpoint_folder: /mnt/wsfuse/teamforge/forge_runs/
 
 # Main loop configuration
-rollout_threads: 8
+rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well
 
 # Observability configuration
 metric_logging:

From 90542b053955aa19f1f81de3d52508758313975c Mon Sep 17 00:00:00 2001
From: Allen Wang <9057208+allenwang28@users.noreply.github.com>
Date: Fri, 10 Oct 2025 12:26:30 -0700
Subject: [PATCH 6/6] use local_batch_size in all mast configs

---
 apps/mast/qwen3_14b_mast.yaml  | 6 +++---
 apps/mast/qwen3_1_7b_mast.yaml | 6 +++---
 apps/mast/qwen3_32b_mast.yaml  | 6 +++---
 apps/mast/qwen3_4b_mast.yaml   | 6 +++---
 apps/mast/qwen3_8b_mast.yaml   | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/apps/mast/qwen3_14b_mast.yaml b/apps/mast/qwen3_14b_mast.yaml
index 484a71538..1d5300838 100644
--- a/apps/mast/qwen3_14b_mast.yaml
+++ b/apps/mast/qwen3_14b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-14B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_1_7b_mast.yaml b/apps/mast/qwen3_1_7b_mast.yaml
index 58d879579..92d27da16 100644
--- a/apps/mast/qwen3_1_7b_mast.yaml
+++ b/apps/mast/qwen3_1_7b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-1.7B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_32b_mast.yaml b/apps/mast/qwen3_32b_mast.yaml
index 841d4f9ad..3fa79b955 100644
--- a/apps/mast/qwen3_32b_mast.yaml
+++ b/apps/mast/qwen3_32b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 16
-batch_size: 32
+local_batch_size: 32 # per-device batch size
 max_req_tokens: 1024
 max_res_tokens: 1024
 model: "Qwen/Qwen3-32B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_4b_mast.yaml b/apps/mast/qwen3_4b_mast.yaml
index 92119055a..a7e44e069 100644
--- a/apps/mast/qwen3_4b_mast.yaml
+++ b/apps/mast/qwen3_4b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-4B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
 
diff --git a/apps/mast/qwen3_8b_mast.yaml b/apps/mast/qwen3_8b_mast.yaml
index 7f2f99694..953f809b7 100644
--- a/apps/mast/qwen3_8b_mast.yaml
+++ b/apps/mast/qwen3_8b_mast.yaml
@@ -3,7 +3,7 @@
 
 # Global configuration
 group_size: 8
-batch_size: 16
+local_batch_size: 16 # per-device batch size
 max_req_tokens: 512
 max_res_tokens: 512
 model: "Qwen/Qwen3-8B"
@@ -61,7 +61,7 @@ trainer:
   lr_scheduler:
     warmup_steps: 1
   training:
-    local_batch_size: ${batch_size}
+    local_batch_size: ${local_batch_size}
     seq_len: 2048
     max_norm: 1.0
     steps: 1000000
@@ -95,7 +95,7 @@ trainer:
 
 # Replay buffer configuration
 replay_buffer:
-  batch_size: ${batch_size}
+  batch_size: ${local_batch_size}
   max_policy_age: ${off_by_n}
   dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
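
Reviewer note (not part of the series): the configs above wire the replay buffer to the trainer through ${...} interpolation, so replay_buffer.batch_size tracks the per-rank local_batch_size and replay_buffer.dp_size tracks the trainer's data-parallel shard degree. The sketch below is a minimal sanity check of those relationships plus the trainer GPU accounting the final 32b config implies; it assumes the yamls resolve with stock OmegaConf (suggested by the ${...} syntax, but not confirmed by this series), and sanity_check.py is a hypothetical helper, not a file in the repo.

# sanity_check.py -- hypothetical helper, not part of this patch series
from omegaconf import OmegaConf  # assumption: configs use OmegaConf-style interpolation

cfg = OmegaConf.load("apps/mast/qwen3_32b_mast.yaml")

# The replay buffer must hand the trainer one local batch per data-parallel
# rank; both equalities hold by construction via the ${...} references, so
# these asserts document the invariant rather than guard a single file.
assert cfg.replay_buffer.batch_size == cfg.local_batch_size
assert cfg.replay_buffer.dp_size == cfg.trainer.parallelism.data_parallel_shard_degree

# GPU accounting after PATCH 1: the trainer spans dp_shard * tp = 1 * 8 = 8
# GPUs, and each policy replica spans one tensor-parallel group.
trainer_gpus = (cfg.trainer.parallelism.data_parallel_shard_degree
                * cfg.trainer.parallelism.tensor_parallel_degree)
policy_gpus = (cfg.services.policy.num_replicas
               * cfg.policy.engine_config.tensor_parallel_size)
print(f"trainer GPUs: {trainer_gpus}, policy GPUs: {policy_gpus}, "
      f"rollout threads: {cfg.rollout_threads}")

One arithmetic observation: with num_replicas: 4, the final rollout_threads: 32 is 8x the replica count rather than the 4x the inline comment suggests, so either the comment or the value may deserve a follow-up patch.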