diff --git a/custom_ops/iluvatar_ops/flash_attn_unpadded.cu b/custom_ops/iluvatar_ops/flash_attn_unpadded.cu index ffde18ea034..ad677396f8b 100644 --- a/custom_ops/iluvatar_ops/flash_attn_unpadded.cu +++ b/custom_ops/iluvatar_ops/flash_attn_unpadded.cu @@ -24,8 +24,8 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q, int num_heads, int head_dim, int num_kv_heads, - int max_seqlens_q, - int max_seqlens_k, + const paddle::Tensor& max_seqlens_q_, + const paddle::Tensor& max_seqlens_k_, bool causal, float scale, paddle::Tensor& out) { @@ -148,10 +148,15 @@ void FlashAttnUnpaddedKernel(const paddle::Tensor& q, cuinferTensorDescriptor_t lse_desc; CUINFER_CHECK(cuinferCreateTensorDescriptor(&lse_desc)); + PD_CHECK(max_seqlens_q_.is_cpu(), "max_seqlens_q tensor must be on CPU"); + PD_CHECK(max_seqlens_k_.is_cpu(), "max_seqlens_k tensor must be on CPU"); + const int32_t* max_seqlens_q = max_seqlens_q_.data<int32_t>(); + const int32_t* max_seqlens_k = max_seqlens_k_.data<int32_t>(); + FmhaFwdFuncArguments args; args.batch = batch_size; - args.max_seqlen_q = max_seqlens_q; - args.max_seqlen_k = max_seqlens_k; + args.max_seqlen_q = *max_seqlens_q; + args.max_seqlen_k = *max_seqlens_k; args.is_causal = causal; args.scaling = scale; args.window_size_left = -1; @@ -197,8 +202,8 @@ std::vector<paddle::Tensor> FlashAttnUnpadded( const paddle::Tensor& v, const paddle::Tensor& cu_seqlens_q, const paddle::Tensor& cu_seqlens_k, - int max_seqlens_q, - int max_seqlens_k, + const paddle::Tensor& max_seqlens_q, + const paddle::Tensor& max_seqlens_k, bool causal, float scale, bool training) { @@ -248,23 +253,37 @@ std::vector<paddle::Tensor> FlashAttnUnpadded( } std::vector<std::vector<int64_t>> FlashAttnUnpaddedInferShape( - const std::vector<int64_t>& q_shape) { + const std::vector<int64_t>& q_shape, + const std::vector<int64_t>& k_shape, + const std::vector<int64_t>& v_shape, + const std::vector<int64_t>& cu_seqlens_q_shape, + const std::vector<int64_t>& cu_seqlens_k_shape, + const std::vector<int64_t>& max_seqlens_q_shape, + const std::vector<int64_t>& max_seqlens_k_shape) { return {{q_shape[0], q_shape[1], q_shape[2]}}; 
} std::vector<paddle::DataType> FlashAttnUnpaddedInferDtype( - const paddle::DataType& q_dtype) { + const paddle::DataType& q_dtype, + const paddle::DataType& k_dtype, + const paddle::DataType& v_dtype, + const paddle::DataType& cu_seqlens_q_dtype, + const paddle::DataType& cu_seqlens_k_dtype, + const paddle::DataType& max_seqlens_q_dtype, + const paddle::DataType& max_seqlens_k_dtype) { return {q_dtype}; } PD_BUILD_STATIC_OP(cuinfer_flash_attn_unpadded) - .Inputs({"q", "k", "v", "cu_seqlens_q", "cu_seqlens_k"}) + .Inputs({"q", + "k", + "v", + "cu_seqlens_q", + "cu_seqlens_k", + "max_seqlens_q", + "max_seqlens_k"}) .Outputs({"out"}) - .Attrs({"max_seqlens_q:int", - "max_seqlens_k:int", - "causal:bool", - "scale:float", - "training:bool"}) + .Attrs({"causal:bool", "scale:float", "training:bool"}) .SetKernelFn(PD_KERNEL(FlashAttnUnpadded)) .SetInferShapeFn(PD_INFER_SHAPE(FlashAttnUnpaddedInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(FlashAttnUnpaddedInferDtype)); diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index cc6834b56df..ff5e21210a5 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -606,7 +606,7 @@ def find_end_files(directory, end_str): elif paddle.is_compiled_with_xpu(): assert False, "For XPU, please use setup_ops.py in the xpu_ops directory to compile custom ops." 
elif paddle.is_compiled_with_custom_device("iluvatar_gpu"): - _iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE"] + _iluvatar_clang_cuda_flags = ["-Wno-non-pod-varargs", "-DPADDLE_DEV", "-DPADDLE_WITH_CUSTOM_DEVICE", "-std=c++17"] setup( name="fastdeploy_ops", ext_modules=CUDAExtension( diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index a0620789366..7817e1af164 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -23,13 +23,13 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260 ### 3.1 Start Container ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507 -docker exec -it paddle_infer bash +docker run -itd --name fd_iluvatar -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/workspace:/home/workspace -v /usr/local/corex-4.3.8/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex-4.3.8/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex-4.3.8/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex-4.3.8/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --shm-size=64G --net=host --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507 +docker exec -it fd_iluvatar bash ``` Note: Because the 4.3.8 SDK in the image is incompatible with KMD, paddle cannot find the iluvatar 
device. Therefore, it is temporarily necessary to map ixsmi, libcuda.so.1, libixml.so, and libixthunk.so from the host corex-4.3.8 directory into the container. -/home/paddle contains the model files, *.whl packages, and scripts. +/home/workspace contains the model files, *.whl packages, and scripts. ### 3.2 Install paddle @@ -458,14 +458,9 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ### 4.3 PaddleOCR-VL series #### 4.3.1 PaddleOCR-VL-0.9B -- (Optional) Build and install paddleocr from source - -To install the latest `paddleocr`, you can compile it from source. The version in the image is `3.3.2`. - +- install paddleocr ```bash -git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git -cd PaddleOCR -pip3 install -e ".[doc-parser]" +pip3 install paddleocr[doc-parser]==3.3.2 ``` Refer to [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/PaddleOCR-VL-0.9B.md), the command as bellow: @@ -478,17 +473,17 @@ export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection export CUDA_VISIBLE_DEVICES=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model /data1/fastdeploy/PaddleOCR-VL \ - --port 8180 \ - --metrics-port 8471 \ - --engine-worker-queue-port 8472 \ - --cache-queue-port 55660 \ - --max-model-len 16384 \ - --max-num-batched-tokens 16384 \ - --max-num-seqs 64 \ - --workers 2 \ - --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": true}' + --model /data1/fastdeploy/PaddleOCR-VL \ + --port 8180 \ + --metrics-port 8471 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 240 \ + --block-size 16 \ + --workers 2 \ + --gpu-memory-utilization 0.7 \ + --graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' + ``` client: @@ -508,14 +503,14 @@ The output is: **benchmark** -1. 
Download and extract image datasets +1) Download and extract image datasets ```bash wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar tar xvf images.tar ``` -2. Prepare `infer_ocr_vl_benchmark.py` +2) Prepare `infer_ocr_vl_benchmark.py` ```python import os @@ -532,13 +527,80 @@ for file_name in file_list: res.save_to_markdown(save_path="output", pretty=False) ``` -3. execute `infer_ocr_vl_benchmark.py` on client +3) execute `infer_ocr_vl_benchmark.py` on client ```bash python3 infer_ocr_vl_benchmark.py ``` -After each image is inferred, a corresponding `md` file will be generated in the `output` path. Running the entire benchmark (1355 images) takes approximately 1.8 hours. +#### 4.3.2 PaddleOCR-VL-1.6-0.9B + +- install paddleocr + +```bash +pip3 install paddleocr[doc-parser]==3.6.0 +``` + +server: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +export CUDA_VISIBLE_DEVICES=1 +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /data1/fastdeploy/PaddleOCR-VL-1.6 \ + --port 8180 \ + --metrics-port 8471 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 240 \ + --block-size 16 \ + --workers 2 \ + --gpu-memory-utilization 0.7 \ + --graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' + +``` + +client: + +**simple demo** + +```bash +paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png --vl_rec_backend fastdeploy-server --vl_rec_server_url http://127.0.0.1:8180/v1 --device iluvatar_gpu --pipeline_version v1.6 +``` + +**benchmark** + +1) Download and extract image datasets + +```bash +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar +tar xvf images.tar +``` + +2) Prepare `infer_ocr_vl_benchmark.py` + +```python +import os +from paddleocr import 
PaddleOCRVL + +input_path = "./images" +pipeline = PaddleOCRVL(vl_rec_backend="fastdeploy-server", vl_rec_server_url="http://127.0.0.1:8180/v1", device="iluvatar_gpu", pipeline_version="v1.6") +file_list = os.listdir(input_path) +for file_name in file_list: + file_path = os.path.join(input_path, file_name) + output = pipeline.predict(file_path) + for res in output: + res.print() + res.save_to_markdown(save_path="output", pretty=False) +``` + +3) execute `infer_ocr_vl_benchmark.py` on client + +```bash +python3 infer_ocr_vl_benchmark.py +``` ## 5. Quantization Format Support - `W8A16`: `--quantization wint8` diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index 7262d309d88..8afc2245dc8 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -23,13 +23,13 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260 ### 3.1 启动容器 ```bash -docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle -v /usr/local/corex/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v /usr/local/corex/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507 -docker exec -it paddle_infer bash +docker run -itd --name fd_iluvatar -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/workspace:/home/workspace -v /usr/local/corex-4.3.8/bin/ixsmi:/usr/local/corex/bin/ixsmi -v /usr/local/corex-4.3.8/lib64/libcuda.so.1:/usr/local/corex/lib64/libcuda.so.1 -v /usr/local/corex-4.3.8/lib64/libixml.so:/usr/local/corex/lib64/libixml.so -v 
/usr/local/corex-4.3.8/lib64/libixthunk.so:/usr/local/corex/lib64/libixthunk.so --privileged --shm-size=64G --net=host --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:3.3.0-20260507 +docker exec -it fd_iluvatar bash ``` 注意: 由于镜像中的 4.3.8 SDK 与 KMD 不兼容,paddle 无法找到 iluvatar device。因此,暂时需要将宿主机 corex-4.3.8 目录中的 ixsmi、libcuda.so.1、libixml.so 和 libixthunk.so 映射到容器中 -/home/paddle 为模型文件、whl包、脚本所在目录。 +/home/workspace 为模型文件、whl包、脚本所在目录。 ### 3.2 安装paddle @@ -458,14 +458,9 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ### 4.3 PaddleOCR-VL系列 #### 4.3.1 PaddleOCR-VL-0.9B -- (可选) 源码编译安装 paddleocr - -如果想要安装最新的`paddleocr`,可以源码编译。镜像里的版本是`3.3.2` - +- 安装 paddleocr ```bash -git clone -b main https://github.com/PaddlePaddle/PaddleOCR.git -cd PaddleOCR -pip3 install -e ".[doc-parser]" +pip3 install paddleocr[doc-parser]==3.3.2 ``` 参考[gpu文档](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/PaddleOCR-VL-0.9B.md), 命令如下所示: @@ -478,17 +473,17 @@ export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 export FD_SAMPLING_CLASS=rejection export CUDA_VISIBLE_DEVICES=1 python3 -m fastdeploy.entrypoints.openai.api_server \ - --model /data1/fastdeploy/PaddleOCR-VL \ - --port 8180 \ - --metrics-port 8471 \ - --engine-worker-queue-port 8472 \ - --cache-queue-port 55660 \ - --max-model-len 16384 \ - --max-num-batched-tokens 16384 \ - --max-num-seqs 64 \ - --workers 2 \ - --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": true}' + --model /data1/fastdeploy/PaddleOCR-VL \ + --port 8180 \ + --metrics-port 8471 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 240 \ + --block-size 16 \ + --workers 2 \ + --gpu-memory-utilization 0.7 \ + --graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' + ``` 客户端: @@ -505,14 +500,14 @@ paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/ **benchmark** -1. 
下载和解压image数据集 +1) 下载和解压image数据集 ```bash wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar tar xvf images.tar ``` -2. 准备推理脚本`infer_ocr_vl_benchmark.py` +2) 准备推理脚本`infer_ocr_vl_benchmark.py` ```python import os @@ -529,13 +524,80 @@ for file_name in file_list: res.save_to_markdown(save_path="output", pretty=False) ``` -3. 客户端执行`infer_ocr_vl_benchmark.py` +3) 客户端执行`infer_ocr_vl_benchmark.py` ```bash python3 infer_ocr_vl_benchmark.py ``` -每推理完一张图片,会在`output`路径下生成一个对应的`md`文件,跑完整个benchmark(1355张图片)大概需要1.8个小时。 +#### 4.3.2 PaddleOCR-VL-1.6-0.9B + +- 安装paddleocr + +```bash +pip3 install paddleocr[doc-parser]==3.6.0 +``` + +服务端: +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_SAMPLING_CLASS=rejection +export CUDA_VISIBLE_DEVICES=1 +python3 -m fastdeploy.entrypoints.openai.api_server \ + --model /data1/fastdeploy/PaddleOCR-VL-1.6 \ + --port 8180 \ + --metrics-port 8471 \ + --max-model-len 16384 \ + --max-num-batched-tokens 16384 \ + --max-num-seqs 240 \ + --block-size 16 \ + --workers 2 \ + --gpu-memory-utilization 0.7 \ + --graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' + +``` + +客户端: + +**simple demo** + +```bash +paddleocr doc_parser -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png --vl_rec_backend fastdeploy-server --vl_rec_server_url http://127.0.0.1:8180/v1 --device iluvatar_gpu --pipeline_version v1.6 +``` + +**benchmark** + +1) 下载和解压image数据集 + +```bash +wget https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/internal/tmp/images.tar +tar xvf images.tar +``` + +2) 准备推理脚本`infer_ocr_vl_benchmark.py` + +```python +import os +from paddleocr import PaddleOCRVL + +input_path = "./images" +pipeline = PaddleOCRVL(vl_rec_backend="fastdeploy-server", vl_rec_server_url="http://127.0.0.1:8180/v1", device="iluvatar_gpu", pipeline_version="v1.6") +file_list = 
os.listdir(input_path) +for file_name in file_list: + file_path = os.path.join(input_path, file_name) + output = pipeline.predict(file_path) + for res in output: + res.print() + res.save_to_markdown(save_path="output", pretty=False) +``` + +3) 客户端执行`infer_ocr_vl_benchmark.py` + +```bash +python3 infer_ocr_vl_benchmark.py +``` ## 5. 支持的量化策略 - `W8A16`: `--quantization wint8` diff --git a/fastdeploy/model_executor/ops/iluvatar/attention_ops.py b/fastdeploy/model_executor/ops/iluvatar/attention_ops.py index adc87c6fd2a..fb7910d1190 100644 --- a/fastdeploy/model_executor/ops/iluvatar/attention_ops.py +++ b/fastdeploy/model_executor/ops/iluvatar/attention_ops.py @@ -203,6 +203,11 @@ def flash_attn_unpadded( causal=False, training=False, ): + # max_seqlen_q and max_seqlen_k must be a scalar tensor for cinn + if isinstance(max_seqlen_q, int): + max_seqlen_q = paddle.to_tensor(max_seqlen_q, dtype="int32", place="cpu") + if isinstance(max_seqlen_k, int): + max_seqlen_k = paddle.to_tensor(max_seqlen_k, dtype="int32", place="cpu") output = cuinfer_flash_attn_unpadded( query, key, value, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal, scale, training ) diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index ef95aed25d6..d0150f44771 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -87,18 +87,18 @@ def _initialize_attn_backend(self) -> None: ), f"attn_backends should be empty before initialization, got {len(self.attn_backends)} backends" num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size - self.model_config.kv_num_heads = max( - 1, - int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size, - ) + kv_num_heads_per_layer = self._get_kv_num_heads_per_layer() + self.model_config.kv_num_heads = kv_num_heads_per_layer[0] + head_dim = self.model_config.head_dim attn_cls = 
get_attention_backend() - attn_backend = attn_cls( - self.fd_config, - kv_num_heads=self.model_config.kv_num_heads, - num_heads=num_heads, - head_dim=self.model_config.head_dim, - ) - self.attn_backends.append(attn_backend) + for kv_num_heads in kv_num_heads_per_layer: + attn_backend = attn_cls( + self.fd_config, + kv_num_heads=kv_num_heads, + num_heads=num_heads, + head_dim=head_dim, + ) + self.attn_backends.append(attn_backend) def initialize_kv_cache(self, profile: bool = False) -> None: super(IluvatarModelRunner, self).initialize_kv_cache(profile) diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt index aea8372e95f..441813a9ef7 100644 --- a/requirements_iluvatar.txt +++ b/requirements_iluvatar.txt @@ -46,4 +46,3 @@ aistudio_sdk p2pstore py-cpuinfo transformers>=4.55.1,<5.0.0 -paddleocr[doc-parser]==3.3.2 diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh index 180684135a3..3b9cc00fd45 100644 --- a/scripts/run_ci_iluvatar.sh +++ b/scripts/run_ci_iluvatar.sh @@ -309,6 +309,8 @@ for tensor_parallel_size in "${tensor_parallel_sizes[@]}"; do done echo -e "\n============ Online: start to test PaddleOCR-VL ===========" +pip3 install paddleocr[doc-parser]==3.3.2 + clear_message echo "Start server..." python -m fastdeploy.entrypoints.openai.api_server \ @@ -322,7 +324,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-num-seqs 64 \ --workers 2 \ --block-size 16 \ - --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 & + --graph-optimization-config '{"graph_opt_level":2, "use_cudagraph": true}' > server.log 2>&1 & check_server_status