From d4aa468a916e9874cb323ed8d564288409a41fba Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Thu, 31 Jul 2025 16:28:49 +0800
Subject: [PATCH 1/4] support seed

---
 fastdeploy/config.py                                 |  1 +
 fastdeploy/engine/args_utils.py                      | 11 +++++++++++
 fastdeploy/engine/sampling_params.py                 |  6 +-----
 fastdeploy/model_executor/layers/sample/meta_data.py |  1 +
 fastdeploy/model_executor/layers/sample/sampler.py   |  6 +++++-
 fastdeploy/utils.py                                  |  9 +++++++++
 fastdeploy/worker/gpu_model_runner.py                | 11 ++---------
 7 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index a4f4c307d1..0eb6812555 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -117,6 +117,7 @@ def __init__(
         self.enable_mm = False
         self.enable_redundant_experts = False
         self.redundant_experts_num = 0
+        self.seed = 0
         self.quantization = None
         for key, value in args.items():
             if hasattr(self, key):
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 4a2414304d..b9eb57105d 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -316,6 +316,11 @@ class EngineArgs:
     Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
     """
+    seed: Optional[int] = None
+    """
+    Random seed to use for initialization. If not set, a random seed is used.
+    """
+
     enable_early_stop: bool = False
     """
     Flag to enable early stop. Default is False (disabled).
     """
@@ -484,6 +489,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.enable_logprob,
             help="Enable output of token-level log probabilities.",
         )
+        model_group.add_argument(
+            "--seed",
+            type=int,
+            default=None,
+            help="Random seed for initialization. If not specified, a random seed will be used.",
+        )
         model_group.add_argument(
             "--enable-early-stop",
             action="store_true",
diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py
index 46d9fd8acf..8f90c59d12 100644
--- a/fastdeploy/engine/sampling_params.py
+++ b/fastdeploy/engine/sampling_params.py
@@ -16,7 +16,6 @@

 from __future__ import annotations

-import random
 from dataclasses import dataclass, fields
 from typing import Any, List, Optional, Union

@@ -155,7 +154,7 @@ def from_optional(

     def __post_init__(self):
         if self.seed is None:
-            self.seed = random.randint(0, 922337203685477580)
+            self.seed = 0
         if self.max_tokens is not None and self.reasoning_max_tokens is None:
             self.reasoning_max_tokens = max(int(self.max_tokens * 0.8), 1)
         self._verify_args()
@@ -200,9 +199,6 @@ def _verify_args(self) -> None:
         if self.logprobs is not None and self.logprobs > 20:
             raise ValueError("Invalid value for 'top_logprobs': must be less than or equal to 20.")

-        if not 0 <= self.seed <= 922337203685477580:
-            raise ValueError("seed must be in [0, 922337203685477580], got " f"{self.seed}.")
-
     def update_from_tokenizer(self, tokenizer):
         """Support bad words"""
         if self.bad_words is None:
diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py
index 9cca5af273..06281a5a50 100644
--- a/fastdeploy/model_executor/layers/sample/meta_data.py
+++ b/fastdeploy/model_executor/layers/sample/meta_data.py
@@ -43,6 +43,7 @@ class SamplingMetadata:
     top_p: paddle.Tensor
     top_k: Optional[paddle.Tensor] = None
     min_p: Optional[paddle.Tensor] = None
+    seed: Optional[paddle.Tensor] = None
     max_num_logprobs: Optional[int] = None
     enable_early_stop: Optional[int] = False
     stop_flags: Optional[paddle.Tensor] = None
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index 412a7eda7f..6b4338ef92 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -282,7 +282,11 @@ def forward_cuda(

         probs = min_p_sampling(probs, sampling_metadata.min_p)

-        _, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
+        if paddle.count_nonzero(sampling_metadata.seed) == 0:
+            seed_value = -1
+        else:
+            seed_value = int(sampling_metadata.seed[0, 0])
+        _, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=seed_value)

         logprobs_tensors = (
             None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 9ea25000c7..f7a644200a 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -29,6 +29,8 @@
 from pathlib import Path
 from typing import Literal, TypeVar, Union

+import numpy as np
+import paddle
 import requests
 import yaml
 from aistudio_sdk.snapshot_download import snapshot_download as aistudio_download
@@ -291,6 +293,13 @@ def extract_tar(tar_path, output_dir):
         raise RuntimeError(f"Extraction failed: {e!s}")


+def set_random_seed(seed: int) -> None:
+    if seed is not None:
+        random.seed(seed)
+        np.random.seed(seed)
+        paddle.seed(seed)
+
+
 def download_model(url, output_dir, temp_tar):
     """
     下载模型,并将其解压到指定目录。
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 1fb6235f9a..b6c54c1a0d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -126,11 +126,7 @@ def __init__(

         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)
-        self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
-            fill_value=4,
-            dtype="int64",
-        )
+
         self.restore_chunked_prefill_request = dict()

         # Initialize attention Backend
@@ -795,6 +791,7 @@ def _prepare_inputs(self) -> None:
             top_p=self.share_inputs["top_p"],
             top_k=self.share_inputs["top_k"],
             min_p=self.share_inputs["min_p"],
+            seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],
             prompt_ids=self.share_inputs["prompt_ids"],
@@ -1096,8 +1093,6 @@ def _dummy_run(
             self.proposer.run(share_inputs=self.share_inputs)

         # 7. Updata 'infer_seed' and step_cuda()
-        self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
-        self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
         step_cuda(
             self.share_inputs,
             self.cache_config.block_size,
@@ -1368,8 +1363,6 @@ class at the server level, which is too granular for ModelRunner.
             self.proposer.run(share_inputs=self.share_inputs)

         # 7. Updata 'infer_seed' and step_cuda()
-        self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
-        self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
             step_cuda(
                 self.share_inputs,

From 17903a80e55d426d888af2410b0dc9b208b21c55 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Thu, 31 Jul 2025 18:04:01 +0800
Subject: [PATCH 2/4] fix

---
 fastdeploy/worker/gpu_model_runner.py |  1 -
 fastdeploy/worker/xpu_model_runner.py | 10 ++--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index b6c54c1a0d..34bb7a1e7d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -554,7 +554,6 @@ def _init_share_inputs(self, max_num_seqs: int):
         """
         Initialize all share buffers for model inputs.
         """
-        self.MAX_INFER_SEED = 9223372036854775806
         self.share_inputs = {}

         self.share_inputs["pre_ids"] = paddle.full(
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 53ca380207..6e55510642 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -357,11 +357,6 @@ def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int)

         # Initialize share inputs
         self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
-        self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
-            fill_value=4,
-            dtype="int64",
-        )

         # Initialize attention Backend
         # Note(gonshaotian): Currently, all attention layers share one attention backend instance.
@@ -529,7 +524,6 @@ def _init_share_inputs(self, max_num_seqs: int):
         """Initialize all share buffers for model inputs.
         Note: In the future, we may abandon share buffers.
         """
-        self.MAX_INFER_SEED = 9223372036854775806
         self.share_inputs = {}

         self.share_inputs["pre_ids"] = paddle.full(
@@ -673,6 +667,7 @@ def _prepare_inputs(self, is_dummy_run=False) -> None:
             top_p=self.share_inputs["top_p"],
             top_k=self.share_inputs["top_k"],
             min_p=self.share_inputs["min_p"],
+            seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],
             frequency_penalties=self.share_inputs["frequency_score"],
@@ -911,6 +906,7 @@ class at the server level, which is too granular for ModelRunner.
             )

         # 7. Updata 'infer_seed' and step_paddle()
-        self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
-        self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
+
         step_paddle(
             self.share_inputs,
             self.cache_config.block_size,

From ccc0dbe712684cd97d2abaa7909b8acd65b0ace4 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Sat, 2 Aug 2025 22:32:42 +0800
Subject: [PATCH 3/4] add SamplingMetadata seed test

---
 test/layers/test_sampler.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/test/layers/test_sampler.py b/test/layers/test_sampler.py
index 65a6bfbe68..14a22cf6d6 100644
--- a/test/layers/test_sampler.py
+++ b/test/layers/test_sampler.py
@@ -52,25 +52,43 @@ def _create_default_sampling_metadata(
         pre_token_ids=_create_tokens_tensor(batch_size, max_seq_len),
         frequency_penalties=_create_penalty_tensor(batch_size, 0.0),
         presence_penalties=_create_penalty_tensor(batch_size, 0.0),
+        min_p=paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32"),
         repetition_penalties=_create_penalty_tensor(batch_size, 1.0),
         min_dec_lens=paddle.full(shape=[batch_size, 1], fill_value=min_seq_len, dtype="int64"),
         bad_words_token_ids=paddle.full(shape=[batch_size], fill_value=-1, dtype="int64"),
         eos_token_ids=paddle.full(shape=[batch_size], fill_value=-2, dtype="int64"),
+        seed=paddle.full(shape=[batch_size, 1], fill_value=1, dtype="int64"),
     )
     return fake_sampling_metadata


 def test_sampler():
-    batch_size = 32
-    vocab_size = 1024
+    batch_size = 8
+    vocab_size = 10131
     min_seq_len = 1
     max_seq_len = 1024

     sampler = Sampler()
-    logits = _create_fake_logits(batch_size, vocab_size)
-    sampling_metadata = _create_default_sampling_metadata(batch_size, min_seq_len, max_seq_len)
-    next_tokens = sampler(logits, sampling_metadata)
-    print(next_tokens)
+    reference_tokens = None
+    all_consistent = True
+
+    for i in range(batch_size):
+        logits = _create_fake_logits(batch_size, vocab_size)
+        sampling_metadata = _create_default_sampling_metadata(batch_size, min_seq_len, max_seq_len)
+        next_tokens = sampler(logits, sampling_metadata)
+        print("next_tokens", next_tokens)
+
+        current_tokens = next_tokens.sampled_token_ids.numpy()
+
+        if reference_tokens is None:
+            reference_tokens = current_tokens
+        else:
+            if not (current_tokens == reference_tokens).all():
+                all_consistent = False
+                break
+
+    if not all_consistent:
+        raise AssertionError("输出的 next_tokens 值不一致!")


 if __name__ == "__main__":

From 65e3927d0c298348ea30fc7e3f33c058408776b7 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Sat, 2 Aug 2025 23:45:53 +0800
Subject: [PATCH 4/4] The next_tokens values are inconsistent!

---
 test/layers/test_sampler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/layers/test_sampler.py b/test/layers/test_sampler.py
index 14a22cf6d6..de36d39c61 100644
--- a/test/layers/test_sampler.py
+++ b/test/layers/test_sampler.py
@@ -88,7 +88,7 @@ def test_sampler():
                 break

     if not all_consistent:
-        raise AssertionError("输出的 next_tokens 值不一致!")
+        raise AssertionError("The next_tokens values are inconsistent!")


 if __name__ == "__main__":
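
A minimal standalone sketch (not part of the patches above) of the seed-resolution rule that the Sampler change in PATCH 1 relies on: an all-zero seed tensor is treated as "no fixed seed" and mapped to -1, while a non-zero tensor pins the kernel seed to the first request's value. The helper name resolve_kernel_seed is illustrative only and does not exist in FastDeploy.

import paddle


def resolve_kernel_seed(seed_tensor: paddle.Tensor) -> int:
    """Illustrative mirror of the seed handling added to Sampler.forward_cuda."""
    # An all-zero seed tensor means no request asked for a fixed seed, so the
    # top_k_top_p_sampling kernel keeps its default (non-deterministic) behaviour.
    if paddle.count_nonzero(seed_tensor) == 0:
        return -1
    # Otherwise the first request's seed is applied to the whole batch.
    return int(seed_tensor[0, 0])


if __name__ == "__main__":
    batch_size = 4
    unseeded = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64")
    seeded = paddle.full(shape=[batch_size, 1], fill_value=1, dtype="int64")
    print(resolve_kernel_seed(unseeded))  # -1: sampling stays non-deterministic
    print(resolve_kernel_seed(seeded))    # 1: repeated runs sample the same tokens

This reading is consistent with SamplingParams.__post_init__ now defaulting seed to 0 (zero acting as the "unset" sentinel), and with the test in PATCH 3 filling the seed tensor with 1 rather than 0 when it checks that repeated sampling returns identical tokens.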