
Commit f307dee

Merge branch 'develop' into seed_1
2 parents: d4aa468 + 3230728


47 files changed: +1340 −350 lines

.github/workflows/_build_linux.yml

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ jobs:
 -v $(pwd):/workspace -w /workspace \
 -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
 -v "${CACHE_DIR}/.cache:/root/.cache" \
+-v "${CACHE_DIR}/.ccache:/root/.ccache" \
 -v "${CACHE_DIR}/ConfigDir:/root/.config" \
 -e TZ="Asia/Shanghai" \
 -e "COMPILE_ARCH=${compile_arch}" \

.github/workflows/_logprob_test_linux.yml

Lines changed: 3 additions & 2 deletions
@@ -101,11 +101,12 @@ jobs:
 -v "${CACHE_DIR}/ConfigDir:/root/.config" \
 -e TZ="Asia/Shanghai" \
 --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c '
+# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
 pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
 pip config set install.trusted-host pip.baidu.com
 pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-
-python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 python -m pip install ${fastdeploy_wheel_url}
 
 wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64

.github/workflows/_unit_test_coverage.yml

Lines changed: 3 additions & 1 deletion
@@ -92,13 +92,15 @@ jobs:
 
 git config --global --add safe.directory /workspace/FastDeploy
 cd FastDeploy
+# python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+python -m pip install paddlepaddle-gpu==3.0.0.dev20250729 -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
+
 pip config set global.index-url http://pip.baidu.com/root/baidu/+simple/
 pip config set install.trusted-host pip.baidu.com
 pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 
 python -m pip install coverage
 python -m pip install diff-cover
-python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/
 python -m pip install ${fd_wheel_url}
 export COVERAGE_FILE=/workspace/FastDeploy/coveragedata/.coverage
 export COVERAGE_RCFILE=/workspace/FastDeploy/scripts/.coveragerc

.github/workflows/approve.yml

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+name: Approval
+
+on:
+  pull_request:
+    branches:
+      - develop
+      - 'release/*'
+
+jobs:
+  Approval:
+    name: Approval
+    if: ${{ github.repository_owner == 'PaddlePaddle' }}
+    runs-on: ubuntu-latest
+    env:
+      PR_ID: ${{ github.event.pull_request.number }}
+      BRANCH: ${{ github.event.pull_request.base.ref }}
+    steps:
+      - name: Checkout base repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+          fetch-depth: 1000
+
+      - name: Merge PR to test branch
+        run: |
+          git fetch origin pull/${PR_ID}/merge
+          git checkout -b test FETCH_HEAD
+          git log -n 3 --oneline
+          git remote add upstream https://github.com/PaddlePaddle/FastDeploy.git
+          git fetch upstream $BRANCH
+
+      - name: Setup python3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: 'pip'
+
+      - name: Run approval check script
+        run: |
+          bash scripts/check_approval.sh

custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu

Lines changed: 67 additions & 95 deletions
@@ -195,22 +195,25 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 const paddle::Tensor &seq_lens_encoder,
 const paddle::Tensor &seq_lens_decoder,
 const paddle::Tensor &seq_lens_this_time,
-const int encoder_block_shape_q, const int decoder_block_shape_q,
-const int group_size, const int block_size,
-const int decoder_step_token_num) {
+paddle::Tensor &decoder_batch_ids, // Inplace
+paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
+paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
+paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
+const int encoder_block_shape_q,
+const int decoder_block_shape_q,
+const int group_size,
+const int block_size,
+const int decoder_step_token_num)
+{
 auto stream = seq_lens_encoder.stream();
 int bsz = seq_lens_this_time.shape()[0];
-auto max_len_tensor =
-GetEmptyTensor({8}, paddle::DataType::INT32, seq_lens_encoder.place());
+
+paddle::Tensor max_len_tensor_gpu = GetEmptyTensor({max_len_tensor_cpu.shape()[0]}, paddle::DataType::INT32, seq_lens_this_time.place());
 GetMaxLen(seq_lens_decoder, seq_lens_this_time, seq_lens_encoder,
-max_len_tensor, bsz);
+max_len_tensor_gpu, bsz);
+max_len_tensor_cpu.copy_(max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
 
-// max_len_this_time, max_enc_len_this_time, max_dec_len_this_time,
-// max_enc_dec_len_this_time, max_just_dec_len_this_time,
-// max_just_dec_merged_len_this_time, max_system_len,
-// max_just_dec_len_without_system
-auto max_len_cpu = max_len_tensor.copy_to(paddle::CPUPlace(), false);
-auto max_len_cpu_ptr = max_len_cpu.data<int>();
+auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
 int max_len_this_time = max_len_cpu_ptr[0];
 int max_enc_len_this_time = max_len_cpu_ptr[1];
 int max_dec_len_this_time = max_len_cpu_ptr[2];
@@ -222,14 +225,11 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 
 paddle::Tensor encoder_batch_ids;
 paddle::Tensor encoder_tile_ids_per_batch;
-paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
+paddle::Tensor encoder_num_blocks_x_cpu; /*cpu*/
 paddle::Tensor kv_batch_ids;
 paddle::Tensor kv_tile_ids_per_batch;
-paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
-paddle::Tensor decoder_batch_ids;
-paddle::Tensor decoder_tile_ids_per_batch;
-paddle::Tensor decoder_num_blocks_x_cpu; /*cpu*/
-paddle::Tensor max_len_kv_cpu; /*cpu*/
+paddle::Tensor kv_num_blocks_x_cpu; /*cpu*/
+paddle::Tensor max_len_kv_cpu; /*cpu*/
 
 auto max_len_kv =
 GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_decoder.place());
@@ -291,92 +291,64 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 kv_num_blocks_x_cpu =
 GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
 }
+
 if (max_just_dec_len_this_time > 0) {
-const uint32_t decoder_max_tile_size_per_bs_q =
-div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
+// Clear buffer
+const uint32_t decoder_max_tile_size_per_bs_q = div_up((decoder_step_token_num * group_size), decoder_block_shape_q);
+const uint32_t decoder_batch_shape = bsz * decoder_max_tile_size_per_bs_q;
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_batch_ids.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(), 0, decoder_batch_shape * sizeof(int32_t), stream));
+PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(decoder_num_blocks_x_cpu.data<int>(), 0, sizeof(int32_t), stream));
 
-decoder_batch_ids =
-GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
-paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_tile_ids_per_batch =
-GetEmptyTensor({bsz * decoder_max_tile_size_per_bs_q},
-paddle::DataType::INT32, seq_lens_encoder.place());
 auto decoder_num_blocks_x =
 GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
 split_q_block<<<1, 32, 0, stream>>>(
-seq_lens_this_time.data<int>(), seq_lens_encoder.data<int>(),
-decoder_batch_ids.data<int>(), decoder_tile_ids_per_batch.data<int>(),
-decoder_num_blocks_x.data<int>(), bsz, decoder_block_shape_q,
+seq_lens_this_time.data<int>(),
+seq_lens_encoder.data<int>(),
+decoder_batch_ids.data<int>(),
+decoder_tile_ids_per_batch.data<int>(),
+decoder_num_blocks_x.data<int>(),
+bsz,
+decoder_block_shape_q,
 group_size);
-decoder_num_blocks_x_cpu =
-decoder_num_blocks_x.copy_to(paddle::CPUPlace(), false);
-} else {
-decoder_batch_ids =
-GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_tile_ids_per_batch =
-GetEmptyTensor({0}, paddle::DataType::INT32, seq_lens_encoder.place());
-decoder_num_blocks_x_cpu =
-GetEmptyTensor({0}, paddle::DataType::INT32, paddle::CPUPlace());
+decoder_num_blocks_x_cpu.copy_(decoder_num_blocks_x, decoder_num_blocks_x_cpu.place(), false);
 }
 
-return {encoder_batch_ids,
-encoder_tile_ids_per_batch,
-encoder_num_blocks_x_cpu, /*cpu*/
-kv_batch_ids,
-kv_tile_ids_per_batch,
-kv_num_blocks_x_cpu, /*cpu*/
-decoder_batch_ids,
-decoder_tile_ids_per_batch,
-decoder_num_blocks_x_cpu, /*cpu*/
-max_len_kv_cpu /*cpu*/,
-max_len_cpu};
-}
-
-std::vector<paddle::DataType> GetBlockShapeAndSplitKVBlockInferDtype(
-const paddle::DataType &seq_lens_encoder_dtype,
-const paddle::DataType &seq_lens_decoder_dtype,
-const paddle::DataType &seq_lens_this_time_dtype) {
 return {
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32, paddle::DataType::INT32,
-paddle::DataType::INT32, paddle::DataType::INT32};
-}
-
-std::vector<std::vector<int64_t>> GetBlockShapeAndSplitKVBlockInferShape(
-const std::vector<int64_t> &seq_lens_encoder_shape,
-const std::vector<int64_t> &seq_lens_decoder_shape,
-const std::vector<int64_t> &seq_lens_this_time_shape) {
-std::vector<int64_t> dynamic_shape = {-1};
-
-return {dynamic_shape,
-dynamic_shape,
-{1},
-dynamic_shape,
-dynamic_shape,
-{1},
-dynamic_shape,
-dynamic_shape,
-{1},
-{1},
-{8}};
+encoder_batch_ids,
+encoder_tile_ids_per_batch,
+encoder_num_blocks_x_cpu, /*cpu*/
+kv_batch_ids,
+kv_tile_ids_per_batch,
+kv_num_blocks_x_cpu, /*cpu*/
+max_len_kv_cpu, /*cpu*/
+};
 }
 
 PD_BUILD_STATIC_OP(get_block_shape_and_split_kv_block)
-.Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time"})
-.Outputs({paddle::Optional("encoder_batch_ids"),
-paddle::Optional("encoder_tile_ids_per_batch"),
-paddle::Optional("encoder_num_blocks"),
-paddle::Optional("kv_batch_ids"),
-paddle::Optional("kv_tile_ids_per_batch"),
-paddle::Optional("kv_num_blocks"),
-paddle::Optional("decoder_batch_ids"),
-paddle::Optional("decoder_tile_ids_per_batch"),
-paddle::Optional("decoder_num_blocks"),
-paddle::Optional("max_len_kv"), "set_max_lengths"})
-.Attrs({"encoder_block_shape_q: int", "decoder_block_shape_q: int",
-"group_size: int", "block_size: int",
-"decoder_step_token_num: int"})
-.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock))
-.SetInferShapeFn(PD_INFER_SHAPE(GetBlockShapeAndSplitKVBlockInferShape))
-.SetInferDtypeFn(PD_INFER_DTYPE(GetBlockShapeAndSplitKVBlockInferDtype));
+.Inputs({
+"seq_lens_encoder",
+"seq_lens_decoder",
+"seq_lens_this_time",
+"decoder_batch_ids",
+"decoder_tile_ids_per_batch",
+"decoder_num_blocks_x_cpu",
+"max_len_tensor_cpu"
+})
+.Outputs({
+paddle::Optional("encoder_batch_ids"),
+paddle::Optional("encoder_tile_ids_per_batch"),
+paddle::Optional("encoder_num_blocks_x_cpu"),
+paddle::Optional("kv_batch_ids"),
+paddle::Optional("kv_tile_ids_per_batch"),
+paddle::Optional("kv_num_blocks_x_cpu"),
+"max_len_kv_cpu"
+})
+.Attrs({
+"encoder_block_shape_q: int",
+"decoder_block_shape_q: int",
+"group_size: int",
+"block_size: int",
+"decoder_step_token_num: int"
+})
+.SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock));
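The refactor above turns the decoder-side outputs of get_block_shape_and_split_kv_block into in-place arguments: the caller now owns decoder_batch_ids, decoder_tile_ids_per_batch, decoder_num_blocks_x_cpu, and max_len_tensor_cpu, the kernel zeroes the GPU buffers with cudaMemsetAsync, and results land in the two CPU tensors (which the comments mark as pinned memory). A minimal caller-side sketch of that buffer layout follows; the sizes are illustrative and plain CPU tensors stand in for pinned allocations, so treat this as an assumption about usage rather than code from this commit.

```python
import paddle

# Illustrative values only; real values come from the serving configuration.
bsz = 8
decoder_step_token_num, group_size, decoder_block_shape_q = 1, 8, 16
# div_up(decoder_step_token_num * group_size, decoder_block_shape_q)
decoder_max_tile_size_per_bs_q = -(-(decoder_step_token_num * group_size) // decoder_block_shape_q)

# GPU buffers reused across decode steps; the op clears them with cudaMemsetAsync before refilling.
decoder_batch_ids = paddle.zeros([bsz * decoder_max_tile_size_per_bs_q], dtype="int32")
decoder_tile_ids_per_batch = paddle.zeros([bsz * decoder_max_tile_size_per_bs_q], dtype="int32")

# Host-side result buffers the op fills via copy_(); the diff notes they should be
# pinned memory, plain CPU tensors are used here only for illustration.
decoder_num_blocks_x_cpu = paddle.zeros([1], dtype="int32").cpu()
max_len_tensor_cpu = paddle.zeros([8], dtype="int32").cpu()
```

Keeping these buffers alive across decode steps avoids the per-call allocations and the freshly created device-to-host copy that the deleted code performed.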

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 8 additions & 2 deletions
@@ -235,8 +235,14 @@ std::vector<paddle::Tensor> GetBlockShapeAndSplitKVBlock(
 const paddle::Tensor &seq_lens_encoder,
 const paddle::Tensor &seq_lens_decoder,
 const paddle::Tensor &seq_lens_this_time,
-const int encoder_block_shape_q, const int decoder_block_shape_q,
-const int group_size, const int block_size,
+paddle::Tensor &decoder_batch_ids, // Inplace
+paddle::Tensor &decoder_tile_ids_per_batch, // Inplace
+paddle::Tensor &decoder_num_blocks_x_cpu, // Inplace, Pinned Memory
+paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory
+const int encoder_block_shape_q,
+const int decoder_block_shape_q,
+const int group_size,
+const int block_size,
 const int decoder_step_token_num);
 
 std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,

docs/features/early_stop.md

Lines changed: 54 additions & 4 deletions
@@ -1,13 +1,13 @@
 
 # Early Stopping
 
-The early stopping is used to prematurely terminate the token generation of the model. Specifically, the early stopping uses different strategies to determine whether the currently generated token sequence meets the early stopping criteria. If so, token generation is terminated prematurely. FastDeploy currently only supports the repetition strategy.
+The early stopping is used to prematurely terminate the token generation of the model. Specifically, the early stopping uses different strategies to determine whether the currently generated token sequence meets the early stopping criteria. If so, token generation is terminated prematurely. FastDeploy currently supports the repetition strategy and stop sequence.
 
-1. Repetition Strategy
+## 1. Repetition Strategy
 * The repetition strategy determines whether to trigger the early stopping function by checking the number of times a high-probability token is generated.
 * Specifically, if the probability of generating a token for a batch exceeds a user-set probability threshold for a specified number of consecutive times, token generation for that batch is terminated prematurely.
 
-## Usage Instructions
+### Usage Instructions
 
 When starting the service, add the early stopping function startup option.
 
@@ -61,7 +61,7 @@ When starting the service, add the early stopping function startup option.
 print(output)
 ```
 
-## Parameter Description
+### Parameter Description
 
 * `enable_early_stop`: (bool) Whether to enable the early stopping. Default False.
 
@@ -70,3 +70,53 @@ When starting the service, add the early stopping function startup option.
 * `window_size`: (int) The upper limit of the number of consecutive high-probability tokens in the repetition strategy. If the number exceeds this limit, the early stopping will be triggered. Default 3000.
 
 * `threshold`: (float) The high-probability threshold in the repetition strategy. Default 0.99.
+
+## 2. Stop Sequence
+* The Stop Sequence strategy determines whether to trigger early stopping by checking whether the generated token sequence contains a user-specified stop sequence.
+
+* Specifically, if the token sequence generated by a batch contains a user-specified stop sequence, token generation for that batch is terminated prematurely.
+
+### Usage Instructions
+Before starting the service, set the following environment variables
+
+```
+FD_STOP_SEQS_MAX_LEN (Maximum length of stop sequences, default is 8)
+
+FD_MAX_STOP_SEQS_NUM (Maximum number of stop sequences, default is 5)
+```
+
+request with stop parameter, it can be str or List[str]
+
+* online serving, set `stop` parameter in request
+```
+# create a chat request with "stop" parameter
+import openai
+ip = "0.0.0.0"
+service_http_port = "8233"
+client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")
+response = client.chat.completions.create(
+model="default",
+messages=[
+{"role": "user", "content": '今天天气真好'},
+],
+temperature=1.0,
+top_p=0,
+stream=False,
+stop=["明天", "出去走走"]
+)
+```
+
+* offline LLM, set `stop_seqs` parameter in `SamplingParams`
+```
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.entrypoints.llm import LLM
+
+model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
+
+sampling_params = SamplingParams(temperature=1, top_p=0, stop=["出去走走"])
+llm = LLM(model=model_name_or_path, tensor_parallel_size=1)
+output = llm.chat(messages=[{"role": "user", "content": "今天天气真好"}], use_tqdm=True, sampling_params=sampling_params)
+
+print(output)
+
+```
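For context on the repetition strategy documented in the docs/features/early_stop.md changes above, here is a minimal sketch of the described rule (stop a sequence once the sampled token's probability stays above `threshold` for `window_size` consecutive steps). It only illustrates the documented behaviour and is not FastDeploy's implementation.

```python
# Sketch of the documented repetition rule; illustrative only, not FastDeploy code.
def should_stop_early(token_probs, threshold=0.99, window_size=3000):
    """Return True once `window_size` consecutive sampled-token probabilities exceed `threshold`."""
    consecutive = 0
    for p in token_probs:  # probability of the token actually sampled at each step
        consecutive = consecutive + 1 if p > threshold else 0
        if consecutive >= window_size:
            return True
    return False

# A small window makes the behaviour easy to see.
print(should_stop_early([0.5, 0.995, 0.999, 0.999], threshold=0.99, window_size=3))  # True
```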

docs/get_started/installation/kunlunxin_xpu.md

Lines changed: 3 additions & 3 deletions
@@ -25,9 +25,9 @@ Verified platform:
 ```bash
 mkdir Work
 cd Work
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3
 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
-ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.0 \
+ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.0.3 \
 /bin/bash
 docker exec -it fastdeploy-xpu /bin/bash
 ```
@@ -49,7 +49,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
 ### Install FastDeploy (**Do NOT install via PyPI source**)
 
 ```bash
-python -m pip install fastdeploy-xpu==2.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+python -m pip install fastdeploy-xpu==2.0.3 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 ```
 
 Alternatively, you can install the latest version of FastDeploy (Not recommended)
