[Feature] support seed parameter #3116

Open · wants to merge 4 commits into base: develop
1 change: 1 addition & 0 deletions fastdeploy/config.py
@@ -117,6 +117,7 @@ def __init__(
self.enable_mm = False
self.enable_redundant_experts = False
self.redundant_experts_num = 0
self.seed = 0
self.quantization = None
for key, value in args.items():
if hasattr(self, key):
11 changes: 11 additions & 0 deletions fastdeploy/engine/args_utils.py
@@ -316,6 +316,11 @@ class EngineArgs:
Must be explicitly enabled via the `--enable-logprob` startup parameter to output logprob values.
"""

seed: Optional[int] = None
"""
Random seed to use for initialization. If not set, a random seed is used.
"""

enable_early_stop: bool = False
"""
Flag to enable early stop. Default is False (disabled).
@@ -484,6 +489,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities.",
)
model_group.add_argument(
"--seed",
type=int,
default=None,
help="Random seed for initialization. If not specified, a random seed will be used.",
)
model_group.add_argument(
"--enable-early-stop",
action="store_true",
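For reviewers who want to try the new flag, a minimal usage sketch follows. The server entrypoint and the `model` keyword on `EngineArgs` are assumptions (neither is defined by this diff), so treat those names as placeholders.

```python
# Hypothetical launch command (entrypoint name is an assumption, shown as a comment only):
#   python -m fastdeploy.entrypoints.openai.api_server --model ./my_model --seed 42
from fastdeploy.engine.args_utils import EngineArgs

args = EngineArgs(model="./my_model", seed=42)   # fixed seed -> reproducible sampling
default_args = EngineArgs(model="./my_model")    # seed=None -> a random seed is used
```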
6 changes: 1 addition & 5 deletions fastdeploy/engine/sampling_params.py
@@ -16,7 +16,6 @@

from __future__ import annotations

import random
from dataclasses import dataclass, fields
from typing import Any, List, Optional, Union

@@ -155,7 +154,7 @@ def from_optional(

def __post_init__(self):
if self.seed is None:
self.seed = random.randint(0, 922337203685477580)
self.seed = 0
if self.max_tokens is not None and self.reasoning_max_tokens is None:
self.reasoning_max_tokens = max(int(self.max_tokens * 0.8), 1)
self._verify_args()
@@ -200,9 +199,6 @@ def _verify_args(self) -> None:
if self.logprobs is not None and self.logprobs > 20:
raise ValueError("Invalid value for 'top_logprobs': must be less than or equal to 20.")

if not 0 <= self.seed <= 922337203685477580:
raise ValueError("seed must be in [0, 922337203685477580], got " f"{self.seed}.")

def update_from_tokenizer(self, tokenizer):
"""Support bad words"""
if self.bad_words is None:
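A standalone sketch of the new default in `__post_init__` (simplified, not the real `SamplingParams` class): an unset per-request seed now resolves to 0 instead of a random value, and the range check is dropped accordingly.

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class _SeedDefaultSketch:
    """Simplified stand-in for SamplingParams' seed handling after this PR."""
    seed: Optional[int] = None

    def __post_init__(self):
        # Previously: self.seed = random.randint(0, 922337203685477580)
        # Now: an unset seed collapses to 0, i.e. "no explicit per-request seed".
        if self.seed is None:
            self.seed = 0

assert _SeedDefaultSketch().seed == 0
assert _SeedDefaultSketch(seed=7).seed == 7
```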
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
@@ -43,6 +43,7 @@ class SamplingMetadata:
top_p: paddle.Tensor
top_k: Optional[paddle.Tensor] = None
min_p: Optional[paddle.Tensor] = None
seed: Optional[paddle.Tensor] = None
max_num_logprobs: Optional[int] = None
enable_early_stop: Optional[int] = False
stop_flags: Optional[paddle.Tensor] = None
6 changes: 5 additions & 1 deletion fastdeploy/model_executor/layers/sample/sampler.py
@@ -282,7 +282,11 @@ def forward_cuda(

probs = min_p_sampling(probs, sampling_metadata.min_p)

_, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k)
if paddle.count_nonzero(sampling_metadata.seed) == 0:
seed_value = -1
else:
seed_value = int(sampling_metadata.seed[0, 0])
_, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=seed_value)

logprobs_tensors = (
None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens)
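The seed handed to `top_k_top_p_sampling` is derived as shown above; here is a standalone sketch of that selection logic. Reading `-1` as "unseeded / non-deterministic" is my interpretation of the kernel's convention, not something this diff states.

```python
import paddle

def select_seed(seed_tensor: paddle.Tensor) -> int:
    """Mirror of the selection in Sampler.forward_cuda above.

    seed_tensor has shape [max_num_seqs, 1]; an all-zero tensor means no
    request asked for a seed, and -1 presumably tells the sampling kernel
    to fall back to its own (non-deterministic) seeding.
    """
    if paddle.count_nonzero(seed_tensor) == 0:
        return -1
    return int(seed_tensor[0, 0])

print(select_seed(paddle.zeros([4, 1], dtype="int64")))     # -1
print(select_seed(paddle.full([4, 1], 42, dtype="int64")))  # 42
```

Note that only `seed[0, 0]` is read, so all requests in the batch share a single seed for this call.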
9 changes: 9 additions & 0 deletions fastdeploy/utils.py
@@ -29,6 +29,8 @@
from pathlib import Path
from typing import Literal, TypeVar, Union

import numpy as np
import paddle
import requests
import yaml
from aistudio_sdk.snapshot_download import snapshot_download as aistudio_download
@@ -291,6 +293,13 @@ def extract_tar(tar_path, output_dir):
raise RuntimeError(f"Extraction failed: {e!s}")


def set_random_seed(seed: int) -> None:
if seed is not None:
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)


def download_model(url, output_dir, temp_tar):
"""
Download the model and extract it to the specified directory.
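A quick usage sketch of the new helper: it seeds Python's `random`, NumPy, and Paddle in one call, and the `None` guard makes calling it with an unset seed a no-op.

```python
from fastdeploy.utils import set_random_seed

set_random_seed(2024)   # seeds random, numpy and paddle together
set_random_seed(None)   # no-op: nothing is reseeded
```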
12 changes: 2 additions & 10 deletions fastdeploy/worker/gpu_model_runner.py
@@ -126,11 +126,7 @@ def __init__(

# Initialize share inputs
self._init_share_inputs(self.parallel_config.max_num_seqs)
self.infer_seed_increment = paddle.full(
shape=[self.parallel_config.max_num_seqs, 1],
fill_value=4,
dtype="int64",
)

self.restore_chunked_prefill_request = dict()

# Initialize attention Backend
@@ -558,7 +554,6 @@ def _init_share_inputs(self, max_num_seqs: int):
"""
Initialize all share buffers for model inputs.
"""
self.MAX_INFER_SEED = 9223372036854775806
self.share_inputs = {}

self.share_inputs["pre_ids"] = paddle.full(
@@ -795,6 +790,7 @@ def _prepare_inputs(self) -> None:
top_p=self.share_inputs["top_p"],
top_k=self.share_inputs["top_k"],
min_p=self.share_inputs["min_p"],
seed=self.share_inputs["infer_seed"],
step_idx=self.share_inputs["step_idx"],
pre_token_ids=self.share_inputs["pre_ids"],
prompt_ids=self.share_inputs["prompt_ids"],
@@ -1096,8 +1092,6 @@ def _dummy_run(
self.proposer.run(share_inputs=self.share_inputs)

# 7. Update 'infer_seed' and step_cuda()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_cuda(
self.share_inputs,
self.cache_config.block_size,
@@ -1368,8 +1362,6 @@ class at the server level, which is too granular for ModelRunner.
self.proposer.run(share_inputs=self.share_inputs)

# 7. Update 'infer_seed' and step_cuda()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
step_cuda(
self.share_inputs,
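For context, a standalone sketch (not the runner itself) contrasting the per-step seed drift this PR removes with the fixed-seed flow it introduces; the tensor names mirror the diff, and the reproducibility claim is the intended effect rather than something verified here.

```python
import paddle

max_num_seqs = 4
share_inputs = {"infer_seed": paddle.full([max_num_seqs, 1], 42, dtype="int64")}

# Removed behavior: the seed advanced by 4 (mod MAX_INFER_SEED) every step,
# so identical runs could not replay the same sampling sequence.
# share_inputs["infer_seed"].add_(paddle.full([max_num_seqs, 1], 4, dtype="int64"))
# share_inputs["infer_seed"][:] %= 9223372036854775806

# New behavior: infer_seed is left untouched and simply forwarded to
# SamplingMetadata(seed=...) each step, keeping sampling reproducible.
print(int(share_inputs["infer_seed"][0, 0]))  # 42 on every step
```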
10 changes: 2 additions & 8 deletions fastdeploy/worker/xpu_model_runner.py
@@ -357,11 +357,6 @@ def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int)

# Initialize share inputs
self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
self.infer_seed_increment = paddle.full(
shape=[self.parallel_config.max_num_seqs, 1],
fill_value=4,
dtype="int64",
)

# Initialize attention Backend
# Note(gonshaotian): Currently, all attention layers share one attention backend instance.
@@ -529,7 +524,6 @@ def _init_share_inputs(self, max_num_seqs: int):
"""Initialize all share buffers for model inputs.
Note: In the future, we may abandon share buffers.
"""
self.MAX_INFER_SEED = 9223372036854775806
self.share_inputs = {}

self.share_inputs["pre_ids"] = paddle.full(
@@ -673,6 +667,7 @@ def _prepare_inputs(self, is_dummy_run=False) -> None:
top_p=self.share_inputs["top_p"],
top_k=self.share_inputs["top_k"],
min_p=self.share_inputs["min_p"],
seed=self.share_inputs["infer_seed"],
step_idx=self.share_inputs["step_idx"],
pre_token_ids=self.share_inputs["pre_ids"],
frequency_penalties=self.share_inputs["frequency_score"],
@@ -911,8 +906,7 @@ class at the server level, which is too granular for ModelRunner.
)

# 7. Update 'infer_seed' and step_paddle()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED

step_paddle(
self.share_inputs,
self.cache_config.block_size,
30 changes: 24 additions & 6 deletions test/layers/test_sampler.py
@@ -52,25 +52,43 @@ def _create_default_sampling_metadata(
pre_token_ids=_create_tokens_tensor(batch_size, max_seq_len),
frequency_penalties=_create_penalty_tensor(batch_size, 0.0),
presence_penalties=_create_penalty_tensor(batch_size, 0.0),
min_p=paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32"),
repetition_penalties=_create_penalty_tensor(batch_size, 1.0),
min_dec_lens=paddle.full(shape=[batch_size, 1], fill_value=min_seq_len, dtype="int64"),
bad_words_token_ids=paddle.full(shape=[batch_size], fill_value=-1, dtype="int64"),
eos_token_ids=paddle.full(shape=[batch_size], fill_value=-2, dtype="int64"),
seed=paddle.full(shape=[batch_size, 1], fill_value=1, dtype="int64"),
)
return fake_sampling_metadata


def test_sampler():
batch_size = 32
vocab_size = 1024
batch_size = 8
vocab_size = 10131
min_seq_len = 1
max_seq_len = 1024

sampler = Sampler()
logits = _create_fake_logits(batch_size, vocab_size)
sampling_metadata = _create_default_sampling_metadata(batch_size, min_seq_len, max_seq_len)
next_tokens = sampler(logits, sampling_metadata)
print(next_tokens)
reference_tokens = None
all_consistent = True

for i in range(batch_size):
logits = _create_fake_logits(batch_size, vocab_size)
sampling_metadata = _create_default_sampling_metadata(batch_size, min_seq_len, max_seq_len)
next_tokens = sampler(logits, sampling_metadata)
print("next_tokens", next_tokens)

current_tokens = next_tokens.sampled_token_ids.numpy()

if reference_tokens is None:
reference_tokens = current_tokens
else:
if not (current_tokens == reference_tokens).all():
all_consistent = False
break

if not all_consistent:
raise AssertionError("The next_tokens values are inconsistent!")


if __name__ == "__main__":