From c6c758a33e8a06a26387fe072c5da2c41a7b0234 Mon Sep 17 00:00:00 2001 From: kkscilife <1658148753@qq.com> Date: Fri, 27 Mar 2026 15:34:21 +0800 Subject: [PATCH] add sp size to avoid OOM --- autotest/config.yaml | 2 +- autotest/config/qwen3_5_recompute.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/autotest/config.yaml b/autotest/config.yaml index 7a2fe274e..563cb4117 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -390,7 +390,7 @@ case: assert_info: base_metric: qwen3-5-sft-recompute/625c0018/tracker.jsonl check_metrics: - grad_norm: 0.000001 + grad_norm: 0.02 loss/reduced_llm_loss: 0.000001 lr: 0 memory/max_memory_GB: 0.2 diff --git a/autotest/config/qwen3_5_recompute.py b/autotest/config/qwen3_5_recompute.py index 4a72b4559..4cac7c514 100644 --- a/autotest/config/qwen3_5_recompute.py +++ b/autotest/config/qwen3_5_recompute.py @@ -43,6 +43,7 @@ model_cfg=moe_cfg, optim_cfg=optim_cfg, fsdp_cfg=fsdp_cfg, + sp_size=4, dataset_cfg=dataset_config, dataloader_cfg=dataloader_config, lr_cfg=lr_cfg,