@@ -357,11 +357,6 @@ def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int)

        # Initialize share inputs
        self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs)
-        self.infer_seed_increment = paddle.full(
-            shape=[self.parallel_config.max_num_seqs, 1],
-            fill_value=4,
-            dtype="int64",
-        )

        # Initialize attention Backend
        # Note(gonshaotian): Currently, all attention layers share one attention backend instance.
@@ -529,7 +524,6 @@ def _init_share_inputs(self, max_num_seqs: int):
        """Initialize all share buffers for model inputs.
        Note: In the future, we may abandon share buffers.
        """
-        self.MAX_INFER_SEED = 9223372036854775806
        self.share_inputs = {}

        self.share_inputs["pre_ids"] = paddle.full(
@@ -673,6 +667,7 @@ def _prepare_inputs(self, is_dummy_run=False) -> None:
            top_p=self.share_inputs["top_p"],
            top_k=self.share_inputs["top_k"],
            min_p=self.share_inputs["min_p"],
+            seed=self.share_inputs["infer_seed"],
            step_idx=self.share_inputs["step_idx"],
            pre_token_ids=self.share_inputs["pre_ids"],
            frequency_penalties=self.share_inputs["frequency_score"],
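With `seed=self.share_inputs["infer_seed"]` now threaded into the sampling metadata, per-request randomness can be derived where sampling actually happens rather than in the runner. The following is a toy illustration only (NumPy, not FastDeploy's real Sampler) of how a `[batch, 1]` seed column makes token sampling reproducible per request:

# Toy example: per-request seeded categorical sampling; names and shapes are illustrative.
import numpy as np

def sample_with_seeds(probs: np.ndarray, infer_seed: np.ndarray) -> np.ndarray:
    """probs: [batch, vocab] rows that sum to 1; infer_seed: [batch, 1] int64 seeds."""
    next_tokens = np.empty(probs.shape[0], dtype=np.int64)
    for i, row in enumerate(probs):
        rng = np.random.default_rng(int(infer_seed[i, 0]))  # one generator per request
        next_tokens[i] = rng.choice(row.shape[0], p=row)     # same seed -> same token
    return next_tokens

For a fixed `infer_seed`, repeated calls return identical tokens, which is what handing the shared seed tensor to the sampler makes possible.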
@@ -911,8 +906,7 @@ class at the server level, which is too granular for ModelRunner.
        )

        # 7. Update 'infer_seed' and step_paddle()
-        self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
-        self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
+
        step_paddle(
            self.share_inputs,
            self.cache_config.block_size,
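These deleted lines were the only place the runner advanced the rolling seed; since `infer_seed` is now passed into the sampling metadata (third hunk), advancing it presumably becomes the seed consumer's job. For reference, a self-contained sketch of the removed per-step update, using the same names and values as the deleted code (the share-input layout here is an assumption):

# Sketch of the removed update only; not the code path this diff introduces.
import paddle

max_num_seqs = 8                                             # placeholder batch capacity
MAX_INFER_SEED = 9223372036854775806
share_inputs = {"infer_seed": paddle.zeros(shape=[max_num_seqs, 1], dtype="int64")}
infer_seed_increment = paddle.full(shape=[max_num_seqs, 1], fill_value=4, dtype="int64")

share_inputs["infer_seed"].add_(infer_seed_increment)        # each request's seed grows by 4 per step
share_inputs["infer_seed"][:] %= MAX_INFER_SEED              # wrap to stay within the int64 range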