
Commit f307dee

Merge branch 'develop' into seed_1
2 parents: d4aa468 + 3230728


47 files changed: +1340 −350 lines

.github/workflows/_build_linux.yml

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ jobs:
 -v $(pwd):/workspace -w /workspace \
 -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
 -v "${CACHE_DIR}/.cache:/root/.cache" \
+-v "${CACHE_DIR}/.ccache:/root/.ccache" \
 -v "${CACHE_DIR}/ConfigDir:/root/.config" \
 -e TZ="Asia/Shanghai" \
 -e "COMPILE_ARCH=${compile_arch}" \

.github/workflows/_logprob_test_linux.yml

Lines changed: 3 additions & 2 deletions
@@ -101,11 +101,12 @@ jobs:
 -v "${CACHE_DIR}/ConfigDir:/root/.config" \
 -e TZ="Asia/Shanghai" \
 --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
+# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
 pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
 pip config set install.trusted-host pip.baidu.com
 pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 python -m pip install ${fastdeploy_wheel_url}
 
 wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64

.github/workflows/_unit_test_coverage.yml

Lines changed: 3 additions & 1 deletion
@@ -92,13 +92,15 @@ jobs:
 
 git config --global --add safe.directory /workspace/FastDeploy
 cd FastDeploy
+# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
 pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
 pip config set install.trusted-host pip.baidu.com
 pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 
 python -m pip install coverage
 python -m pip install diff-cover
-python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 python -m pip install ${fd_wheel_url}
 export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
 export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc

.github/workflows/approve.yml

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+name: Approval
+
+on:
+  pull_request:
+    branches:
+      - develop
+      - 'release/*'
+
+jobs:
+  Approval:
+    name: Approval
+    if: ${{ github.repository_owner == 'PaddlePaddle' }}
+    runs-on: ubuntu-latest
+    env:
+      PR_ID: ${{ github.event.pull_request.number }}
+      BRANCH: ${{ github.event.pull_request.base.ref }}
+    steps:
+      - name: Checkout base repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+          fetch-depth: 1000
+
+      - name: Merge PR to test branch
+        run: |
+          git fetch origin pull/${PR_ID}/merge
+          git checkout -b test FETCH_HEAD
+          git log -n 3 --oneline
+          git remote add upstream https://github.com/PaddlePaddle/FastDeploy.git
+          git fetch upstream $BRANCH
+
+      - name: Setup python3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Run approval check script
+        run: |
+          bash scripts/check_approval.sh

custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu

Lines changed: 67 additions & 95 deletions
@@ -195,22 +195,25 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 const paddle::Tensor &seq_lens_encoder,
 const paddle::Tensor &seq_lens_decoder,
 const paddle::Tensor &seq_lens_this_time,
-const int encoder_block_shape_q, const int decoder_block_shape_q,
-const int group_size, const int block_size,
-const int decoder_step_token_num) {
+paddle::Tensor &decoder_batch_ids, // Inplace
+paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
+paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
+paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
+const int encoder_block_shape_q,
+const int decoder_block_shape_q,
+const int group_size,
+const int block_size,
+const int decoder_step_token_num)
+{
 auto stream = seq_lens_encoder.stream();
 int bsz = seq_lens_this_time.shape()[0];
-auto max_len_tensor =
-GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
+
+paddle::Tensor max_len_tensor_gpu = GetEmptyTensor({max_len_tensor_cpu.shape()[0]}, paddle::DataType::INT32, seq_lens_this_time.place());
 GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
-max_len_tensor, bsz);
+max_len_tensor_gpu, bsz);
+max_len_tensor_cpu.copy_(max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
 
-// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
-// max_enc_dec_len_this_time, max_just_dec_len_this_time,
-// max_just_dec_merged_len_this_time, max_system_len,
-// max_just_dec_len_without_system
-auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
-auto max_len_cpu_ptr = max_len_cpu.data<int>();
+auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
 int max_len_this_time = max_len_cpu_ptr[0];
 int max_enc_len_this_time = max_len_cpu_ptr[1];
 int max_dec_len_this_time = max_len_cpu_ptr[2];
@@ -222,14 +225,11 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 
 paddle::Tensor encoder_batch_ids;
 paddle::Tensor encoder_tile_ids_per_batch;
-paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
+paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
 paddle::Tensor kv_batch_ids;
 paddle::Tensor kv_tile_ids_per_batch;
-paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
-paddle::Tensor decoder_batch_ids;
-paddle::Tensor decoder_tile_ids_per_batch;
-paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
-paddle::Tensor max_len_kv_cpu; /*cpu*/
+paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
+paddle::Tensor max_len_kv_cpu; /*cpu*/
 
 auto max_len_kv =
 GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
@@ -291,92 +291,64 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 kv_num_blocks_x_cpu =
 GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
 }
+
 if (max_just_dec_len_this_time > 0) {
-const uint32_t decoder_max_tile_size_per_bs_q =
-div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
+// Clear buffer
+const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
+const uint32_t decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_x_cpu.data<int>(), 0, sizeof(int32_t), stream));
 
-decoder_batch_ids =
-GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
-paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_tile_ids_per_batch =
-GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
-paddle::DataType::INT32, seq_lens_encoder.place());
 auto decoder_num_blocks_x =
 GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
 split_q_block<<<1, 32, 0, stream>>>(
-seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
-decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
-decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
+seq_lens_this_time.data<int>(),
+seq_lens_encoder.data<int>(),
+decoder_batch_ids.data<int>(),
+decoder_tile_ids_per_batch.data<int>(),
+decoder_num_blocks_x.data<int>(),
+bsz,
+decoder_block_shape_q,
 group_size);
-decoder_num_blocks_x_cpu =
-decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
-} else {
-decoder_batch_ids =
-GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_tile_ids_per_batch =
-GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_num_blocks_x_cpu =
-GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
+decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
 }
 
-return {encoder_batch_ids,
-encoder_tile_ids_per_batch,
-encoder_num_blocks_x_cpu, /*cpu*/
-kv_batch_ids,
-kv_tile_ids_per_batch,
-kv_num_blocks_x_cpu, /*cpu*/
-decoder_batch_ids,
-decoder_tile_ids_per_batch,
-decoder_num_blocks_x_cpu, /*cpu*/
-max_len_kv_cpu /*cpu*/,
-max_len_cpu};
-}
-
-std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
-const paddle::DataType &seq_lens_encoder_dtype,
-const paddle::DataType &seq_lens_decoder_dtype,
-const paddle::DataType &seq_lens_this_time_dtype) {
 return {
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32};
-}
-
-std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
-const std::vector<int64_t> &seq_lens_encoder_shape,
-const std::vector<int64_t> &seq_lens_decoder_shape,
-const std::vector<int64_t> &seq_lens_this_time_shape) {
-std::vector<int64_t> dynamic_shape = {-1};
-
-return {dynamic_shape,
-dynamic_shape,
-{1},
-dynamic_shape,
-dynamic_shape,
-{1},
-dynamic_shape,
-dynamic_shape,
-{1},
-{1},
-{8}};
+encoder_batch_ids,
+encoder_tile_ids_per_batch,
+encoder_num_blocks_x_cpu, /*cpu*/
+kv_batch_ids,
+kv_tile_ids_per_batch,
+kv_num_blocks_x_cpu, /*cpu*/
+max_len_kv_cpu, /*cpu*/
+};
 }
 
 PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
-.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time"})
-.Outputs({paddle::Optional("encoder_batch_ids"),
-paddle::Optional("encoder_tile_ids_per_batch"),
-paddle::Optional("encoder_num_blocks"),
-paddle::Optional("kv_batch_ids"),
-paddle::Optional("kv_tile_ids_per_batch"),
-paddle::Optional("kv_num_blocks"),
-paddle::Optional("decoder_batch_ids"),
-paddle::Optional("decoder_tile_ids_per_batch"),
-paddle::Optional("decoder_num_blocks"),
-paddle::Optional("max_len_kv"), "set_max_lengths"})
-.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
-"group_size: int", "block_size: int",
-"decoder_step_token_num: int"})
-.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
-.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
-.SetInferDtypeFn(PD_INFER_DTYPE(GetBlockShapeAndSplitKVBlockInferDtype));
+.Inputs({
+"seq_lens_encoder",
+"seq_lens_decoder",
+"seq_lens_this_time",
+"decoder_batch_ids",
+"decoder_tile_ids_per_batch",
+"decoder_num_blocks_x_cpu",
+"max_len_tensor_cpu"
+})
+.Outputs({
+paddle::Optional("encoder_batch_ids"),
+paddle::Optional("encoder_tile_ids_per_batch"),
+paddle::Optional("encoder_num_blocks_x_cpu"),
+paddle::Optional("kv_batch_ids"),
+paddle::Optional("kv_tile_ids_per_batch"),
+paddle::Optional("kv_num_blocks_x_cpu"),
+"max_len_kv_cpu"
+})
+.Attrs({
+"encoder_block_shape_q: int",
+"decoder_block_shape_q: int",
+"group_size: int",
+"block_size: int",
+"decoder_step_token_num: int"
+})
+.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock));
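The refactor above turns the decoder-side outputs of get_block_shape_and_split_kv_block into in-place arguments: the caller now owns decoder_batch_ids, decoder_tile_ids_per_batch, decoder_num_blocks_x_cpu, and max_len_tensor_cpu, the kernel zeroes the GPU buffers with cudaMemsetAsync, and results land in the two CPU tensors (which the comments mark as pinned memory). A minimal caller-side sketch of that buffer layout follows; the sizes are illustrative and plain CPU tensors stand in for pinned allocations, so treat this as an assumption about usage rather than code from this commit.

```python
import paddle

# Illustrative values only; real values come from the serving configuration.
bsz = 8
decoder_step_token_num, group_size, decoder_block_shape_q = 1, 8, 16
# div_up(decoder_step_token_num * group_size, decoder_block_shape_q)
decoder_max_tile_size_per_bs_q = -(-(decoder_step_token_num * group_size) // decoder_block_shape_q)

# GPU buffers reused across decode steps; the op clears them with cudaMemsetAsync before refilling.
decoder_batch_ids = paddle.zeros([bsz * decoder_max_tile_size_per_bs_q], dtype="int32")
decoder_tile_ids_per_batch = paddle.zeros([bsz * decoder_max_tile_size_per_bs_q], dtype="int32")

# Host-side result buffers the op fills via copy_(); the diff notes they should be
# pinned memory, plain CPU tensors are used here only for illustration.
decoder_num_blocks_x_cpu = paddle.zeros([1], dtype="int32").cpu()
max_len_tensor_cpu = paddle.zeros([8], dtype="int32").cpu()
```

Keeping these buffers alive across decode steps avoids the per-call allocations and the freshly created device-to-host copy that the deleted code performed.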

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 8 additions & 2 deletions
@@ -235,8 +235,14 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 const paddle::Tensor &seq_lens_encoder,
 const paddle::Tensor &seq_lens_decoder,
 const paddle::Tensor &seq_lens_this_time,
-const int encoder_block_shape_q, const int decoder_block_shape_q,
-const int group_size, const int block_size,
+paddle::Tensor &decoder_batch_ids, // Inplace
+paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
+paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
+paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
+const int encoder_block_shape_q,
+const int decoder_block_shape_q,
+const int group_size,
+const int block_size,
 const int decoder_step_token_num);
 
 std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,

docs/features/early_stop.md

Lines changed: 54 additions & 4 deletions
@@ -1,13 +1,13 @@
 
 # Early Stopping
 
-The early stopping is used to prematurely terminate the token generation of the model. Specifically, the early stopping uses different strategies to determine whether the currently generated token sequence meets the early stopping criteria. If so, token generation is terminated prematurely. FastDeploy currently only supports the repetition strategy.
+The early stopping is used to prematurely terminate the token generation of the model. Specifically, the early stopping uses different strategies to determine whether the currently generated token sequence meets the early stopping criteria. If so, token generation is terminated prematurely. FastDeploy currently supports the repetition strategy and stop sequence.
 
-1. Repetition Strategy
+## 1. Repetition Strategy
 * The repetition strategy determines whether to trigger the early stopping function by checking the number of times a high-probability token is generated.
 * Specifically, if the probability of generating a token for a batch exceeds a user-set probability threshold for a specified number of consecutive times, token generation for that batch is terminated prematurely.
 
-## Usage Instructions
+### Usage Instructions
 
 When starting the service, add the early stopping function startup option.
 
@@ -61,7 +61,7 @@ When starting the service, add the early stopping function startup option.
 print(output)
 ```
 
-## Parameter Description
+### Parameter Description
 
 * `enable_early_stop`: (bool) Whether to enable the early stopping. Default False.
 
@@ -70,3 +70,53 @@ When starting the service, add the early stopping function startup option.
 * `window_size`: (int) The upper limit of the number of consecutive high-probability tokens in the repetition strategy. If the number exceeds this limit, the early stopping will be triggered. Default 3000.
 
 * `threshold`: (float) The high-probability threshold in the repetition strategy. Default 0.99.
+
+## 2. Stop Sequence
+* The Stop Sequence strategy determines whether to trigger early stopping by checking whether the generated token sequence contains a user-specified stop sequence.
+
+* Specifically, if the token sequence generated by a batch contains a user-specified stop sequence, token generation for that batch is terminated prematurely.
+
+### Usage Instructions
+Before starting the service, set the following environment variables
+
+```
+FD_STOP_SEQS_MAX_LEN (Maximum length of stop sequences, default is 8)
+
+FD_MAX_STOP_SEQS_NUM (Maximum number of stop sequences, default is 5)
+```
+
+request with stop parameter, it can be str or List[str]
+
+* online serving, set `stop` parameter in request
+```
+# create a chat request with "stop" parameter
+import openai
+ip = "0.0.0.0"
+service_http_port = "8233"
+client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
+response = client.chat.completions.create(
+model="default",
+messages=[
+{"role": "user", "content": '今天天气真好'},
+],
+temperature=1.0,
+top_p=0,
+stream=False,
+stop=["明天", "出去走走"]
+)
+```
+
+* offline LLM, set `stop_seqs` parameter in `SamplingParams`
+```
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.llm import LLM
+
+model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
+
+sampling_params = SamplingParams(temperature=1, top_p=0, stop=["出去走走"])
+llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
+output = llm.chat(messages=[{"role": "user", "content": "今天天气真好"}], use_tqdm=True, sampling_params=sampling_params)
+
+print(output)
+
+```
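For context on the repetition strategy documented in the docs/features/early_stop.md changes above, here is a minimal sketch of the described rule (stop a sequence once the sampled token's probability stays above `threshold` for `window_size` consecutive steps). It only illustrates the documented behaviour and is not FastDeploy's implementation.

```python
# Sketch of the documented repetition rule; illustrative only, not FastDeploy code.
def should_stop_early(token_probs, threshold=0.99, window_size=3000):
    """Return True once `window_size` consecutive sampled-token probabilities exceed `threshold`."""
    consecutive = 0
    for p in token_probs:  # probability of the token actually sampled at each step
        consecutive = consecutive + 1 if p > threshold else 0
        if consecutive >= window_size:
            return True
    return False

# A small window makes the behaviour easy to see.
print(should_stop_early([0.5, 0.995, 0.999, 0.999], threshold=0.99, window_size=3))  # True
```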

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 3 additions & 3 deletions
@@ -25,9 +25,9 @@ Verified platform:
 ```bash
 mkdir Work
 cd Work
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
-ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0 \
+ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
 /bin/bash
 docker exec -it fastdeploy-xpu /bin/bash
 ```
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
 ### Install FastDeploy (**Do NOT install via PyPI source**)
 
 ```bash
-python -m pip install fastdeploy-xpu==2.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 ```
 
 Alternatively, you can install the latest version of FastDeploy (Not recommended)
