diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index e7443bc1db..c76a6432dd 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -749,10 +749,6 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): """ Insert tasks to engine. """ - for task in tasks: - start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER) - if task.sampling_params.bad_words is not None: - task.sampling_params.update_from_tokenizer(self.data_processor.tokenizer) # TODO 返回至 scheduler if allocated: current_tasks = [] @@ -779,6 +775,11 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz)) return True + for task in tasks: + start_span_request("DEQUEUE", task, trace.SpanKind.CONSUMER) + if task.sampling_params.bad_words is not None: + task.sampling_params.update_from_tokenizer(self.data_processor.tokenizer) + self.resource_manager.check_and_free_block_tables() if not isinstance(tasks, list): diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 306164635b..199a26db81 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -208,7 +208,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta): ) = pre_cache_len_concat( forward_meta.seq_lens_decoder, forward_meta.seq_lens_this_time, - metadata.set_max_lengths[2], + forward_meta.max_len_tensor_cpu[2], self.block_size, ) diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index 2856737ff5..01f81ac13d 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -215,11 +215,13 @@ def load_pre_sharded_checkpoint(model_path: str, local_rank: int, use_fastsafete """ load_pre_sharded_checkpoint """ + from fastdeploy.model_executor.layers.utils import get_tensor + state_dict = {} _, safetensor_files = get_all_safetensors(os.path.join(model_path, f"rank{local_rank}")) weights_iterator = safetensors_weights_iterator(safetensor_files) for name, weight in weights_iterator: - state_dict[name] = weight + state_dict[name] = get_tensor(weight) return state_dict diff --git a/setup.py b/setup.py index 87099104b7..e13e70d07e 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ def get_name(): cmdclass_dict = {"bdist_wheel": CustomBdistWheel} cmdclass_dict["build_ext"] = CMakeBuild -FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.1.0-dev") +FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.1.0") cmdclass_dict["build_optl"] = PostInstallCommand setup(