diff --git a/.github/workflows/_accuracy_test.yml b/.github/workflows/_accuracy_test.yml index 40c9d5bd98b..2e112f055c0 100644 --- a/.github/workflows/_accuracy_test.yml +++ b/.github/workflows/_accuracy_test.yml @@ -69,12 +69,27 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz + wget -q --no-proxy ${fd_archive_url} || { + echo "ERROR: Failed to download archive from ${fd_archive_url}" + exit 1 + } + + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -145,7 +160,10 @@ jobs: docker rm -f ${runner_name} || true fi - docker run --rm --ipc=host --pid=host --net=host \ + docker run --rm --net=host \ + --shm-size=64g \ + --sysctl kernel.msgmax=1048576 \ + --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ -v $(pwd):/workspace \ -w /workspace \ @@ -160,10 +178,11 @@ jobs: -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -206,3 +225,10 @@ jobs: fi echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" exit ${TEST_EXIT_CODE} + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index 37fd195c179..99813aa4273 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -81,7 +81,14 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' @@ -111,7 +118,11 @@ jobs: exit 1 fi - tar -xf FastDeploy.tar.gz + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -182,7 +193,10 @@ jobs: docker rm -f ${runner_name} || true fi - docker run --rm --ipc=host --pid=host --net=host \ + docker run --rm --net=host \ + --shm-size=64g \ + --sysctl kernel.msgmax=1048576 \ + --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ -v $(pwd):/workspace \ -w /workspace \ @@ -197,17 +211,18 @@ jobs: -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install ${fastdeploy_wheel_url} python -m pip install pytest - wget https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 + wget --no-proxy https://paddle-qa.bj.bcebos.com/zhengtianyu/tools/llm-deploy-linux-amd64 chmod +x ./llm-deploy-linux-amd64 ./llm-deploy-linux-amd64 -python python3.10 \ -model_name ERNIE-4.5-0.3B-Paddle \ @@ -279,3 +294,10 @@ jobs: fi echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" exit ${TEST_EXIT_CODE} + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_build_linux.yml b/.github/workflows/_build_linux.yml index a43da2cc8ef..8ce0b3fef64 100644 --- a/.github/workflows/_build_linux.yml +++ b/.github/workflows/_build_linux.yml @@ -120,6 +120,7 @@ jobs: git config --global user.name "FastDeployCI" git config --global user.email "fastdeploy_ci@example.com" git log -n 3 --oneline + - name: FastDeploy Build shell: bash env: @@ -150,7 +151,8 @@ jobs: PARENT_DIR=$(dirname "$WORKSPACE") echo "PARENT_DIR:$PARENT_DIR" docker run --rm --net=host \ - --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + --cap-add=SYS_PTRACE --shm-size=64G \ + --name ${runner_name} \ -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ @@ -164,6 +166,7 @@ jobs: -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \ -e "BRANCH_REF=${BRANCH_REF}" \ -e "CCACHE_MAXSIZE=50G" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c ' if [[ -n "${FD_VERSION}" ]]; then export FASTDEPLOY_VERSION=${FD_VERSION} @@ -188,7 +191,7 @@ jobs: else # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ fi pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -237,3 +240,10 @@ jobs: target_path_stripped="${target_path#paddle-github-action/}" WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name} echo "wheel_path=${WHEEL_PATH}" >> $GITHUB_OUTPUT + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_build_linux_rl.yml b/.github/workflows/_build_linux_rl.yml index 38f052473e8..ac83da907cf 100644 --- a/.github/workflows/_build_linux_rl.yml +++ b/.github/workflows/_build_linux_rl.yml @@ -8,7 +8,7 @@ on: description: "Build Images" required: true type: string - default: "iregistry.baidu-int.com/tiangexiao/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-rc2" + default: "iregistry.baidu-int.com/new_rl_infra/base-images:paddlecloud-ubuntu24.04-gcc13.3-cuda12.9-cudnn9.9-bccl1.4.1.4-nccl2.26.5-openmpi4.1.5-FleetY13.0.0-v2.4.0-rc1" FASTDEPLOY_ARCHIVE_URL: description: "URL of the compressed FastDeploy code archive." required: true @@ -52,9 +52,10 @@ on: wheel_path_rl: description: "Output path of the generated wheel" value: ${{ jobs.fd-build-rl.outputs.wheel_path_rl }} + jobs: fd-build-rl: - runs-on: [self-hosted, GPU-Build] + runs-on: [self-hosted, GPU-Build-RL] timeout-minutes: 360 outputs: wheel_path_rl: ${{ steps.set_output.outputs.wheel_path_rl }} @@ -107,6 +108,7 @@ jobs: git config --global user.name "FastDeployCI" git config --global user.email "fastdeploy_ci@example.com" git log -n 3 --oneline + - name: FastDeploy Build shell: bash env: @@ -137,7 +139,8 @@ jobs: PARENT_DIR=$(dirname "$WORKSPACE") echo "PARENT_DIR:$PARENT_DIR" docker run --rm --net=host \ - --cap-add=SYS_PTRACE --privileged --shm-size=64G \ + --cap-add=SYS_PTRACE --shm-size=64G \ + --name ${runner_name} \ -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache_rl:/root/.cache" \ @@ -151,6 +154,7 @@ jobs: -e "PADDLE_WHL_URL=${PADDLE_WHL_URL}" \ -e "BRANCH_REF=${BRANCH_REF}" \ -e "CCACHE_MAXSIZE=50G" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus "\"device=${gpu_id}\"" ${docker_image} /bin/bash -c ' if [[ -n "${FD_VERSION}" ]]; then export FASTDEPLOY_VERSION=${FD_VERSION} @@ -162,6 +166,7 @@ jobs: cd FastDeploy # Avoid using pip cache to ensure the wheel is updated to the latest version + python -m pip uninstall paddlepaddle-gpu -y || true wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Paddle-RL-Compile/release/3.3/latest/paddlepaddle_gpu-3.3.0.dev-cp310-cp310-linux_x86_64.whl python -m pip install paddlepaddle_gpu* @@ -202,3 +207,10 @@ jobs: target_path_stripped="${target_path#paddle-github-action/}" WHEEL_PATH=https://paddle-github-action.bj.bcebos.com/${target_path_stripped}/${fd_wheel_name} echo "wheel_path_rl=${WHEEL_PATH}" >> $GITHUB_OUTPUT + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_gpu_4cards_case_test.yml b/.github/workflows/_gpu_4cards_case_test.yml index d7a095570fc..a393552ddbc 100644 --- a/.github/workflows/_gpu_4cards_case_test.yml +++ b/.github/workflows/_gpu_4cards_case_test.yml @@ -81,12 +81,27 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz + wget -q --no-proxy ${fd_archive_url} || { + echo "ERROR: Failed to download archive from ${fd_archive_url}" + exit 1 + } + + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -166,7 +181,10 @@ jobs: docker rm -f ${runner_name} || true fi - docker run --rm --ipc=host --net=host \ + docker run --rm --net=host \ + --shm-size=64g \ + --sysctl kernel.msgmax=1048576 \ + --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ @@ -183,6 +201,7 @@ jobs: -e "fd_wheel_url=${fd_wheel_url}" \ -e "BASE_REF=${BASE_REF}" \ -e "IS_PR=${IS_PR}" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy @@ -191,8 +210,7 @@ jobs: # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install -r scripts/unittest_requirement.txt @@ -204,3 +222,10 @@ jobs: export CUDA_VISIBLE_DEVICES=0,1,2,3 bash scripts/run_gpu_4cards.sh ' + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_logprob_test_linux.yml b/.github/workflows/_logprob_test_linux.yml index 553a84c1006..1de1f93adac 100644 --- a/.github/workflows/_logprob_test_linux.yml +++ b/.github/workflows/_logprob_test_linux.yml @@ -78,11 +78,27 @@ jobs: if ls /workspace/* >/dev/null 2>&1; then echo "ERROR: Failed to clean /workspace/* after multiple attempts" ls -ld /workspace/* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls /workspace/* >/dev/null 2>&1; then + echo "ERROR: Force cleanup failed. Exiting..." + exit 1 + else + echo "Force cleanup succeeded." + fi fi ' - wget -q --no-proxy ${paddletest_archive_url} - tar -xf PaddleTest.tar.gz + + wget -q --no-proxy ${paddletest_archive_url} || { + echo "ERROR: Failed to download archive from ${paddletest_archive_url}" + exit 1 + } + + tar --no-same-owner -xf PaddleTest.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf PaddleTest.tar.gz cd PaddleTest git config --global user.name "FastDeployCI" @@ -152,7 +168,11 @@ jobs: echo "Removing stale container: ${runner_name}" docker rm -f ${runner_name} || true fi - docker run --rm --ipc=host --pid=host --net=host \ + + docker run --rm --net=host \ + --shm-size=64g \ + --sysctl kernel.msgmax=1048576 \ + --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ -v $(pwd):/workspace \ -w /workspace \ @@ -167,10 +187,11 @@ jobs: -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -221,3 +242,10 @@ jobs: run: | echo "logprob test failed with exit code ${{ env.LOGPROB_EXIT_CODE }}" exit 8 + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_pre_ce_test.yml b/.github/workflows/_pre_ce_test.yml index f73cb6c1ce1..91ba176bcd0 100644 --- a/.github/workflows/_pre_ce_test.yml +++ b/.github/workflows/_pre_ce_test.yml @@ -83,12 +83,27 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz + wget -q --no-proxy ${fd_archive_url} || { + echo "ERROR: Failed to download archive from ${fd_archive_url}" + exit 1 + } + + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -163,6 +178,7 @@ jobs: fi docker run --rm --net=host \ + --shm-size=64G \ --name ${runner_name} \ -v $(pwd):/workspace \ -w /workspace \ @@ -181,14 +197,20 @@ jobs: -e "FD_ZMQ_SEND_RESPONSE_SERVER_PORT=${FD_ZMQ_SEND_RESPONSE_SERVER_PORT}" \ -e "FD_ZMQ_CONTROL_CMD_SERVER_PORTS=${FD_ZMQ_CONTROL_CMD_SERVER_PORTS}" \ -e "fd_wheel_url=${fd_wheel_url}" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ python -m pip install ${fd_wheel_url} bash scripts/run_pre_ce.sh ' + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_stable_test.yml b/.github/workflows/_stable_test.yml index e834816100b..b8b544468d1 100644 --- a/.github/workflows/_stable_test.yml +++ b/.github/workflows/_stable_test.yml @@ -81,12 +81,27 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz + wget -q --no-proxy ${fd_archive_url} || { + echo "ERROR: Failed to download archive from ${fd_archive_url}" + exit 1 + } + + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -160,6 +175,7 @@ jobs: fi docker run --rm --net=host \ + --shm-size=64G \ --name ${runner_name} \ -v $(pwd):/workspace \ -w /workspace \ @@ -175,10 +191,11 @@ jobs: -v "${CACHE_DIR}/.cache:/root/.cache" \ -v "${CACHE_DIR}/ConfigDir:/root/.config" \ -e TZ="Asia/Shanghai" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus '"device='"${DEVICES}"'"' ${docker_image} /bin/bash -xc ' # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -190,6 +207,7 @@ jobs: TEST_EXIT_CODE=0 pushd tests/ce/stable_cases bash launch_model.sh /MODELDATA + TEST_EXIT_CODE=0 bash run.sh || { TEST_EXIT_CODE=1 @@ -211,6 +229,7 @@ jobs: echo "=======================================================" } + popd echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" >> /workspace/FastDeploy/exit_code.env ' @@ -220,3 +239,10 @@ jobs: fi echo "TEST_EXIT_CODE=${TEST_EXIT_CODE}" exit ${TEST_EXIT_CODE} + + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} diff --git a/.github/workflows/_unit_test_coverage.yml b/.github/workflows/_unit_test_coverage.yml index 6b553a036bd..086223b006a 100644 --- a/.github/workflows/_unit_test_coverage.yml +++ b/.github/workflows/_unit_test_coverage.yml @@ -85,12 +85,27 @@ jobs: if ls "${REPO_NAME}"* >/dev/null 2>&1; then echo "ERROR: Failed to clean ${REPO_NAME}* after multiple attempts" ls -ld "${REPO_NAME}"* - exit 1 + echo "Attempting force cleanup with find..." + find /workspace -mindepth 1 -maxdepth 1 -name "${REPO_NAME}*" -type d -exec chmod -R u+rwx {} \; -exec rm -rf {} + 2>/dev/null || true + if ls "${REPO_NAME}"* >/dev/null 2>&1; then + echo "ERROR: Force cleanup still failed" + exit 1 + else + echo "Force cleanup succeeded" + fi fi ' - wget -q --no-proxy ${fd_archive_url} - tar -xf FastDeploy.tar.gz + wget -q --no-proxy ${fd_archive_url} || { + echo "ERROR: Failed to download archive from ${fd_archive_url}" + exit 1 + } + + tar --no-same-owner -xf FastDeploy.tar.gz || { + echo "ERROR: Failed to extract archive" + exit 1 + } + rm -rf FastDeploy.tar.gz cd FastDeploy git config --global user.name "FastDeployCI" @@ -173,12 +188,16 @@ jobs: export RDMA_DEVICES=$(find /dev/infiniband/uverbs* -maxdepth 1 -not -type d | xargs -I{} echo '--device {}:{}') docker run --rm --net=host \ + --sysctl kernel.msgmax=1048576 \ + --sysctl kernel.msgmnb=268435456 \ --name ${runner_name} \ --cap-add=SYS_PTRACE --cap-add=IPC_LOCK \ - --shm-size=64G \ + --shm-size=128G \ ${RDMA_DEVICES} \ --device=/dev/infiniband/rdma_cm \ --ulimit memlock=-1:-1 \ + --ulimit nofile=65536:65536 \ + --ulimit nproc=8192:8192 \ -v $(pwd):/workspace -w /workspace \ -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \ -v "${CACHE_DIR}/.cache:/root/.cache" \ @@ -198,6 +217,7 @@ jobs: -e "fd_wheel_url=${fd_wheel_url}" \ -e "BASE_REF=${BASE_REF}" \ -e "IS_PR=${IS_PR}" \ + -e "no_proxy=localhost,127.0.0.1,0.0.0.0,bcebos.com,.bcebos.com,bj.bcebos.com,su.bcebos.com,paddle-ci.gz.bcebos.com,apiin.im.baidu.com,baidu-int.com,.baidu.com,aliyun.com,gitee.com,pypi.tuna.tsinghua.edu.cn,.tuna.tsinghua.edu.cn" \ --gpus "\"device=${DEVICES}\"" ${docker_image} /bin/bash -c ' git config --global --add safe.directory /workspace/FastDeploy @@ -205,7 +225,7 @@ jobs: git diff origin/${BASE_REF}..HEAD --unified=0 > diff.txt # Avoid using pip cache to ensure the wheel is updated to the latest version wget -q --no-proxy https://paddle-qa.bj.bcebos.com/paddle-pipeline/Release-TagBuild-Training-Linux-Gpu-Cuda12.6-Cudnn9.5-Trt10.5-Mkl-Avx-Gcc11-SelfBuiltPypiUse/latest/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl - python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + python -m pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cu126/ pip config set global.extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple python -m pip install -r scripts/unittest_requirement.txt @@ -380,6 +400,13 @@ jobs: echo "coverage passed" exit 0 + - name: Terminate and delete the container + if: always() + run: | + set +e + docker exec -t ${{ runner.name }} /bin/bash -c 'find /workspace -mindepth 1 -delete' + docker rm -f ${{ runner.name }} + diff_coverage_report: needs: run_tests_with_coverage if: always() diff --git a/.github/workflows/cancel_pr_build_and_test.yml b/.github/workflows/cancel_pr_build_and_test.yml new file mode 100644 index 00000000000..bb488a529ea --- /dev/null +++ b/.github/workflows/cancel_pr_build_and_test.yml @@ -0,0 +1,19 @@ +name: PR Build and Test +on: + pull_request: + types: [closed] + branches: [develop, release/**] +permissions: read-all + +concurrency: + group: ${{ github.event.pull_request.number }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + cancel: + name: Cancel PR Build and Test for ${{ github.event.pull_request.number }} + runs-on: ubuntu-latest + steps: + - name: Cancel PR Build and Test + run: | + exit 0 diff --git a/.github/workflows/ci_hpu.yml b/.github/workflows/ci_hpu.yml index 18c9333e321..857442abf42 100644 --- a/.github/workflows/ci_hpu.yml +++ b/.github/workflows/ci_hpu.yml @@ -4,7 +4,6 @@ on: pull_request: branches: - develop - - 'release/*' workflow_dispatch: concurrency: diff --git a/scripts/run_pre_ce.sh b/scripts/run_pre_ce.sh index aac8e404d9b..a17dbeab22a 100644 --- a/scripts/run_pre_ce.sh +++ b/scripts/run_pre_ce.sh @@ -7,7 +7,11 @@ python -m pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/p python -m pip install -r requirements.txt python -m pip install jsonschema aistudio_sdk==0.3.5 -python -m pip install xgrammar==0.1.19 torch==2.6.0 +# Use prebuilt wheel files to install xgrammar==0.1.19 and torch==2.6.0 specifically for the CI environment +python -m pip install \ + https://paddle-qa.bj.bcebos.com/FastDeploy/torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl \ + https://paddle-qa.bj.bcebos.com/FastDeploy/triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl \ + https://paddle-qa.bj.bcebos.com/FastDeploy/xgrammar-0.1.19-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl failed_files=() run_path="$DIR/../tests/ci_use/" diff --git a/tests/ce/server/core/utils.py b/tests/ce/server/core/utils.py index 92b00ed736b..b32b4afca1f 100644 --- a/tests/ce/server/core/utils.py +++ b/tests/ce/server/core/utils.py @@ -26,14 +26,14 @@ def build_request_payload(template_name: str, case_data: dict) -> dict: return final_payload -def send_request(url, payload, timeout=600, stream=False): +def send_request(url, payload, timeout=60, stream=False): """ 向指定URL发送POST请求,并返回响应结果。 Args: url (str): 请求的目标URL。 payload (dict): 请求的负载数据,应该是一个字典类型。 - timeout (int, optional): 请求的超时时间,默认为600秒。 + timeout (int, optional): 请求的超时时间,默认为60秒。 stream (bool, optional): 是否以流的方式下载响应内容,默认为False。 Returns: diff --git a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py index ec97fbf8ab7..ea70a7e270b 100644 --- a/tests/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/tests/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -23,7 +23,6 @@ import openai import pytest -import requests tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, tests_dir) @@ -159,66 +158,6 @@ def consistent_payload(): } -# ========================== -# Helper function to calculate difference rate between two texts -# ========================== -def calculate_diff_rate(text1, text2): - """ - Calculate the difference rate between two strings - based on the normalized Levenshtein edit distance. - Returns a float in [0,1], where 0 means identical. - """ - if text1 == text2: - return 0.0 - - len1, len2 = len(text1), len(text2) - dp = [[0] * (len2 + 1) for _ in range(len1 + 1)] - - for i in range(len1 + 1): - for j in range(len2 + 1): - if i == 0 or j == 0: - dp[i][j] = i + j - elif text1[i - 1] == text2[j - 1]: - dp[i][j] = dp[i - 1][j - 1] - else: - dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) - - edit_distance = dp[len1][len2] - max_len = max(len1, len2) - return edit_distance / max_len if max_len > 0 else 0.0 - - -# ========================== -# Consistency test for repeated runs with fixed payload -# ========================== -def test_consistency_between_runs(api_url, headers, consistent_payload): - """ - Test that two runs with the same fixed input produce similar outputs. - """ - # First request - resp1 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp1.status_code == 200 - result1 = resp1.json() - content1 = result1["choices"][0]["message"]["content"] - - # Second request - resp2 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp2.status_code == 200 - result2 = resp2.json() - content2 = result2["choices"][0]["message"]["content"] - - # Calculate difference rate - diff_rate = calculate_diff_rate(content1, content2) - - # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" - - -# ========================== -# OpenAI Client chat.completions Test -# ========================== - - @pytest.fixture def openai_client(): ip = "0.0.0.0" @@ -230,896 +169,9 @@ def openai_client(): return client -# Non-streaming test -def test_non_streaming_chat(openai_client): - """ - Test non-streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=1, - max_tokens=1024, - stream=False, - ) - - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "content") - - -# Streaming test -def test_streaming_chat(openai_client, capsys): - """ - Test streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, - {"role": "user", "content": "List 3 countries and their capitals."}, - { - "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra).", - }, - {"role": "user", "content": "OK, tell more."}, - ], - temperature=1, - max_tokens=1024, - stream=True, - ) - - output = [] - for chunk in response: - if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): - output.append(chunk.choices[0].delta.content) - assert len(output) > 2 - - -# ========================== -# OpenAI Client completions Test -# ========================== - - -def test_non_streaming(openai_client): - """ - Test non-streaming chat functionality with the local service - """ - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - stream=False, - ) - - # Assertions to check the response structure - assert hasattr(response, "choices") - assert len(response.choices) > 0 - - -def test_streaming(openai_client, capsys): - """ - Test streaming functionality with the local service - """ - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - stream=True, - ) - - # Collect streaming output - output = [] - for chunk in response: - output.append(chunk.choices[0].text) - assert len(output) > 0 - - # ========================== -# OpenAI Client additional chat/completions test +# Helper functions for structured outputs testing # ========================== - - -def test_non_streaming_with_stop_str(openai_client): - """ - Test non-streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"include_stop_str_in_output": True}, - stream=False, - ) - # Assertions to check the response structure - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert response.choices[0].message.content.endswith("") - - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"include_stop_str_in_output": False}, - stream=False, - ) - # Assertions to check the response structure - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert not response.choices[0].message.content.endswith("") - - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - stream=False, - ) - assert not response.choices[0].text.endswith("") - - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - extra_body={"include_stop_str_in_output": True}, - stream=False, - ) - assert response.choices[0].text.endswith("") - - -def test_streaming_with_stop_str(openai_client): - """ - Test non-streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"min_tokens": 1, "include_stop_str_in_output": True}, - stream=True, - ) - # Assertions to check the response structure - last_token = "" - for chunk in response: - last_token = chunk.choices[0].delta.content - if last_token: - assert last_token.endswith(""), f"last_token did not end with '': {last_token!r}" - else: - print("Warning: empty output received, skipping test_streaming_with_stop_str.") - - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"include_stop_str_in_output": False}, - stream=True, - ) - # Assertions to check the response structure - last_token = "" - for chunk in response: - last_token = chunk.choices[0].delta.content - assert last_token != "" - - response_1 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - max_tokens=10, - stream=True, - ) - last_token = "" - for chunk in response_1: - last_token = chunk.choices[0].text - assert not last_token.endswith("") - - response_1 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - max_tokens=10, - extra_body={"include_stop_str_in_output": True}, - stream=True, - ) - last_token = "" - for chunk in response_1: - last_token = chunk.choices[0].text - assert last_token.endswith("") - - -def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in non-streaming chat functionality with the local service - """ - # enable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": True}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert isinstance(response.choices[0].message.prompt_token_ids, list) - assert hasattr(response.choices[0].message, "completion_token_ids") - assert isinstance(response.choices[0].message.completion_token_ids, list) - - # disable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": False}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert response.choices[0].message.prompt_token_ids is None - assert hasattr(response.choices[0].message, "completion_token_ids") - assert response.choices[0].message.completion_token_ids is None - - -def test_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in streaming chat functionality with the local service - """ - # enable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": True}, - stream=True, - ) - is_first_chunk = True - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - assert isinstance(chunk.choices[0].delta.prompt_token_ids, list) - assert chunk.choices[0].delta.completion_token_ids is None - else: - assert chunk.choices[0].delta.prompt_token_ids is None - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - - # disable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": False}, - stream=True, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert chunk.choices[0].delta.prompt_token_ids is None - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - assert chunk.choices[0].delta.completion_token_ids is None - - -def test_non_streaming_completion_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in non-streaming completion functionality with the local service - """ - # enable return_token_ids - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": True}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "prompt_token_ids") - assert isinstance(response.choices[0].prompt_token_ids, list) - assert hasattr(response.choices[0], "completion_token_ids") - assert isinstance(response.choices[0].completion_token_ids, list) - - # disable return_token_ids - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": False}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "prompt_token_ids") - assert response.choices[0].prompt_token_ids is None - assert hasattr(response.choices[0], "completion_token_ids") - assert response.choices[0].completion_token_ids is None - - -def test_streaming_completion_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in streaming completion functionality with the local service - """ - # enable return_token_ids - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": True}, - stream=True, - ) - is_first_chunk = True - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "prompt_token_ids") - assert hasattr(chunk.choices[0], "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - assert isinstance(chunk.choices[0].prompt_token_ids, list) - assert chunk.choices[0].completion_token_ids is None - else: - assert chunk.choices[0].prompt_token_ids is None - assert isinstance(chunk.choices[0].completion_token_ids, list) - - # disable return_token_ids - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=5, - extra_body={"return_token_ids": False}, - stream=True, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "prompt_token_ids") - assert chunk.choices[0].prompt_token_ids is None - assert hasattr(chunk.choices[0], "completion_token_ids") - assert chunk.choices[0].completion_token_ids is None - - -def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys): - """ - Test prompt_token_ids option in non-streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[], - temperature=1, - max_tokens=5, - extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response, "usage") - assert hasattr(response.usage, "prompt_tokens") - assert response.usage.prompt_tokens == 9 - - -def test_streaming_chat_with_prompt_token_ids(openai_client, capsys): - """ - Test prompt_token_ids option in streaming chat functionality with the local service - """ - response = openai_client.chat.completions.create( - model="default", - messages=[], - temperature=1, - max_tokens=5, - extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, - stream=True, - stream_options={"include_usage": True}, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert hasattr(chunk, "usage") - if len(chunk.choices) > 0: - assert chunk.usage is None - else: - assert hasattr(chunk.usage, "prompt_tokens") - assert chunk.usage.prompt_tokens == 9 - - -def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys): - """ - Test prompt_token_ids option in streaming completion functionality with the local service - """ - response = openai_client.completions.create( - model="default", - prompt="", - temperature=1, - max_tokens=5, - extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response, "usage") - assert hasattr(response.usage, "prompt_tokens") - assert response.usage.prompt_tokens == 9 - - -def test_streaming_completion_with_prompt_token_ids(openai_client, capsys): - """ - Test prompt_token_ids option in non-streaming completion functionality with the local service - """ - response = openai_client.completions.create( - model="default", - prompt="", - temperature=1, - max_tokens=5, - extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]}, - stream=True, - stream_options={"include_usage": True}, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert hasattr(chunk, "usage") - if len(chunk.choices) > 0: - assert chunk.usage is None - else: - assert hasattr(chunk.usage, "prompt_tokens") - assert chunk.usage.prompt_tokens == 9 - - -def test_non_streaming_chat_with_disable_chat_template(openai_client, capsys): - """ - Test disable_chat_template option in chat functionality with the local service. - """ - enabled_response = openai_client.chat.completions.create( - model="default", - messages=[], - max_tokens=10, - temperature=0.0, - top_p=0, - extra_body={ - "disable_chat_template": True, - "prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937], - }, - stream=False, - ) - assert hasattr(enabled_response, "choices") - assert len(enabled_response.choices) > 0 - - enabled_response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - max_tokens=10, - temperature=0.0, - top_p=0, - extra_body={"disable_chat_template": False}, - stream=False, - ) - assert hasattr(enabled_response, "choices") - assert len(enabled_response.choices) > 0 - - # from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer - # tokenizer = Ernie4_5Tokenizer.from_pretrained("PaddlePaddle/ERNIE-4.5-0.3B-Paddle", trust_remote_code=True) - # prompt = tokenizer.apply_chat_template([{"role": "user", "content": "Hello, how are you?"}], tokenize=False) - prompt = "<|begin_of_sentence|>User: Hello, how are you?\nAssistant: " - disabled_response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": prompt}], - max_tokens=10, - temperature=0, - top_p=0, - extra_body={"disable_chat_template": True}, - stream=False, - ) - assert hasattr(disabled_response, "choices") - assert len(disabled_response.choices) > 0 - assert enabled_response.choices[0].message.content == disabled_response.choices[0].message.content - - -def test_non_streaming_chat_with_min_tokens(openai_client, capsys): - """ - Test min_tokens option in non-streaming chat functionality with the local service - """ - min_tokens = 1000 - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - max_tokens=1010, - extra_body={"min_tokens": min_tokens}, - stream=False, - ) - assert hasattr(response, "usage") - assert hasattr(response.usage, "completion_tokens") - assert response.usage.completion_tokens >= min_tokens - - -def test_non_streaming_min_max_token_equals_one(openai_client, capsys): - """ - Test chat/completion when min_tokens equals max_tokens equals 1. - Verify it returns exactly one token. - """ - # Test non-streaming chat - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello"}], - max_tokens=1, - temperature=0.0, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "content") - # Verify usage shows exactly 1 completion token - assert hasattr(response, "usage") - assert response.usage.completion_tokens == 1 - - -def test_non_streaming_chat_with_bad_words(openai_client, capsys): - """ - Test bad_words option in non-streaming chat functionality with the local service - """ - base_path = os.getenv("MODEL_PATH") - if base_path: - model_path = os.path.join(base_path, "ernie-4_5-21b-a3b-bf16-paddle") - else: - model_path = "./ernie-4_5-21b-a3b-bf16-paddle" - response_0 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - stream=False, - extra_body={"return_token_ids": True}, - ) - - assert hasattr(response_0, "choices") - assert len(response_0.choices) > 0 - assert hasattr(response_0.choices[0], "message") - assert hasattr(response_0.choices[0].message, "completion_token_ids") - assert isinstance(response_0.choices[0].message.completion_token_ids, list) - - from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer - - tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True) - output_tokens_0 = [] - output_ids_0 = [] - for ids in response_0.choices[0].message.completion_token_ids: - output_tokens_0.append(tokenizer.decode(ids)) - output_ids_0.append(ids) - - # add bad words - bad_tokens = output_tokens_0[6:10] - bad_token_ids = output_ids_0[6:10] - response_1 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words": bad_tokens, "return_token_ids": True}, - stream=False, - ) - assert hasattr(response_1, "choices") - assert len(response_1.choices) > 0 - assert hasattr(response_1.choices[0], "message") - assert hasattr(response_1.choices[0].message, "completion_token_ids") - assert isinstance(response_1.choices[0].message.completion_token_ids, list) - - response_2 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words_token_ids": bad_token_ids, "return_token_ids": True}, - stream=False, - ) - assert hasattr(response_2, "choices") - assert len(response_2.choices) > 0 - assert hasattr(response_2.choices[0], "message") - assert hasattr(response_2.choices[0].message, "completion_token_ids") - assert isinstance(response_2.choices[0].message.completion_token_ids, list) - - assert not any(ids in response_1.choices[0].message.completion_token_ids for ids in bad_token_ids) - assert not any(ids in response_2.choices[0].message.completion_token_ids for ids in bad_token_ids) - - -def test_streaming_chat_with_bad_words(openai_client, capsys): - """ - Test bad_words option in streaming chat functionality with the local service - """ - response_0 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - stream=True, - extra_body={"return_token_ids": True}, - ) - output_tokens_0 = [] - output_ids_0 = [] - is_first_chunk = True - for chunk in response_0: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "content") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - else: - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - output_tokens_0.append(chunk.choices[0].delta.content) - output_ids_0.extend(chunk.choices[0].delta.completion_token_ids) - - # add bad words - bad_tokens = output_tokens_0[6:10] - bad_token_ids = output_ids_0[6:10] - response_1 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words": bad_tokens, "return_token_ids": True}, - stream=True, - ) - output_tokens_1 = [] - output_ids_1 = [] - is_first_chunk = True - for chunk in response_1: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "content") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - else: - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - output_tokens_1.append(chunk.choices[0].delta.content) - output_ids_1.extend(chunk.choices[0].delta.completion_token_ids) - - response_2 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Hello, how are you?"}], - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words_token_ids": bad_token_ids, "return_token_ids": True}, - stream=True, - ) - output_tokens_2 = [] - output_ids_2 = [] - is_first_chunk = True - for chunk in response_2: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "content") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - else: - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - output_tokens_2.append(chunk.choices[0].delta.content) - output_ids_2.extend(chunk.choices[0].delta.completion_token_ids) - - assert not any(ids in output_ids_1 for ids in bad_token_ids) - assert not any(ids in output_ids_2 for ids in bad_token_ids) - - -def test_non_streaming_completion_with_bad_words(openai_client, capsys): - """ - Test bad_words option in non-streaming completion functionality with the local service - """ - base_path = os.getenv("MODEL_PATH") - if base_path: - model_path = os.path.join(base_path, "ernie-4_5-21b-a3b-bf16-paddle") - else: - model_path = "./ernie-4_5-21b-a3b-bf16-paddle" - - response_0 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - stream=False, - extra_body={"return_token_ids": True}, - ) - assert hasattr(response_0, "choices") - assert len(response_0.choices) > 0 - assert hasattr(response_0.choices[0], "completion_token_ids") - assert isinstance(response_0.choices[0].completion_token_ids, list) - - from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer - - tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path, trust_remote_code=True) - output_tokens_0 = [] - output_ids_0 = [] - for ids in response_0.choices[0].completion_token_ids: - output_tokens_0.append(tokenizer.decode(ids)) - output_ids_0.append(ids) - - # add bad words - bad_tokens = output_tokens_0[6:10] - bad_token_ids = output_ids_0[6:10] - response_1 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words": bad_tokens, "return_token_ids": True}, - stream=False, - ) - assert hasattr(response_1, "choices") - assert len(response_1.choices) > 0 - assert hasattr(response_1.choices[0], "completion_token_ids") - assert isinstance(response_1.choices[0].completion_token_ids, list) - - response_2 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words_token_ids": bad_token_ids, "return_token_ids": True}, - stream=False, - ) - assert hasattr(response_2, "choices") - assert len(response_2.choices) > 0 - assert hasattr(response_2.choices[0], "completion_token_ids") - assert isinstance(response_2.choices[0].completion_token_ids, list) - - assert not any(ids in response_1.choices[0].completion_token_ids for ids in bad_token_ids) - assert not any(ids in response_2.choices[0].completion_token_ids for ids in bad_token_ids) - - -def test_streaming_completion_with_bad_words(openai_client, capsys): - """ - Test bad_words option in streaming completion functionality with the local service - """ - response_0 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - stream=True, - extra_body={"return_token_ids": True}, - ) - output_tokens_0 = [] - output_ids_0 = [] - is_first_chunk = True - for chunk in response_0: - if is_first_chunk: - is_first_chunk = False - else: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "text") - assert hasattr(chunk.choices[0], "completion_token_ids") - output_tokens_0.append(chunk.choices[0].text) - output_ids_0.extend(chunk.choices[0].completion_token_ids) - - # add bad words - bad_token_ids = output_ids_0[6:10] - bad_tokens = output_tokens_0[6:10] - response_1 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words": bad_tokens, "return_token_ids": True}, - stream=True, - ) - output_tokens_1 = [] - output_ids_1 = [] - is_first_chunk = True - for chunk in response_1: - if is_first_chunk: - is_first_chunk = False - else: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "text") - assert hasattr(chunk.choices[0], "completion_token_ids") - output_tokens_1.append(chunk.choices[0].text) - output_ids_1.extend(chunk.choices[0].completion_token_ids) - # add bad words token ids - response_2 = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - top_p=0.0, - max_tokens=20, - extra_body={"bad_words_token_ids": bad_token_ids, "return_token_ids": True}, - stream=True, - ) - output_tokens_2 = [] - output_ids_2 = [] - is_first_chunk = True - for chunk in response_2: - if is_first_chunk: - is_first_chunk = False - else: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "text") - assert hasattr(chunk.choices[0], "completion_token_ids") - output_tokens_2.append(chunk.choices[0].text) - output_ids_2.extend(chunk.choices[0].completion_token_ids) - - assert not any(ids in output_ids_1 for ids in bad_token_ids) - assert not any(ids in output_ids_2 for ids in bad_token_ids) - - -def test_profile_reset_block_num(): - """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" - baseline = 31446 - - if not os.path.exists(log_file): - pytest.fail(f"Log file not found: {log_file}") - - with open(log_file, "r") as f: - log_lines = f.readlines() - - target_line = None - for line in log_lines: - if "Reset block num" in line: - target_line = line.strip() - break - - if target_line is None: - pytest.fail("日志中没有Reset block num信息") - - match = re.search(r"total_block_num:(\d+)", target_line) - if not match: - pytest.fail(f"Failed to extract total_block_num from line: {target_line}") - - try: - actual_value = int(match.group(1)) - except ValueError: - pytest.fail(f"Invalid number format: {match.group(1)}") - - lower_bound = baseline * (1 - 0.05) - upper_bound = baseline * (1 + 0.05) - print(f"Reset total_block_num: {actual_value}. baseline: {baseline}") - - assert lower_bound <= actual_value <= upper_bound, ( - f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内" - f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]" - ) - - def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service @@ -1161,6 +213,9 @@ def non_streaming_chat_base(openai_client, chat_param): return response.choices[0].message.content +# ========================== +# Structured outputs tests +# ========================== @pytest.mark.skip(reason="Temporarily skip this case due to unstable execution") def test_structured_outputs_json_schema(openai_client): """ @@ -1396,8 +451,6 @@ def test_structured_outputs_regex(openai_client): "extra_body": {"guided_regex": r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n"}, } - import re - response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response @@ -1436,6 +489,7 @@ def test_structured_outputs_grammar(openai_client): grammar_param = { "temperature": 1, + "top_p": 0.0, "max_tokens": 1024, "messages": [ { @@ -1446,8 +500,6 @@ def test_structured_outputs_grammar(openai_client): "extra_body": {"guided_grammar": html_h1_grammar}, } - import re - pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index af14bb9f4cf..2f45936f929 100644 --- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -22,7 +22,6 @@ import openai import pytest -import requests tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, tests_dir) @@ -129,97 +128,6 @@ def setup_and_run_server(): print(f"Failed to terminate API server: {e}") -@pytest.fixture(scope="session") -def api_url(request): - """ - Returns the API endpoint URL for chat completions. - """ - return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" - - -@pytest.fixture(scope="session") -def metrics_url(request): - """ - Returns the metrics endpoint URL. - """ - return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" - - -@pytest.fixture -def headers(): - """ - Returns common HTTP request headers. - """ - return {"Content-Type": "application/json"} - - -@pytest.fixture -def consistent_payload(): - """ - Returns a fixed payload for consistency testing, - including a fixed random seed and temperature. - """ - return { - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - } - ], - "temperature": 0.8, - "top_p": 0, # fix top_p to reduce randomness - "seed": 13, # fixed random seed - } - - -# ========================== -# Consistency test for repeated runs with fixed payload -# ========================== -def test_consistency_between_runs(api_url, headers, consistent_payload): - """ - Test that result is same as the base result. - """ - # request - resp1 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp1.status_code == 200 - result1 = resp1.json() - content1 = ( - result1["choices"][0]["message"]["reasoning_content"] - + "" - + result1["choices"][0]["message"]["content"] - ) - file_res_temp = "ernie-4_5-vl" - f_o = open(file_res_temp, "a") - f_o.writelines(content1) - f_o.close() - - # base result - base_path = os.getenv("MODEL_PATH") - if base_path: - base_file = os.path.join(base_path, "ernie-4_5-vl-base-tp2-24-0130") - else: - base_file = "ernie-4_5-vl-base-tp2-24-0130" - with open(base_file, "r") as f: - content2 = f.read() - - # Verify that result is same as the base result - assert content1 == content2 - - -# ========================== -# OpenAI Client Chat Completion Test -# ========================== - - @pytest.fixture def openai_client(): ip = "0.0.0.0" @@ -231,305 +139,9 @@ def openai_client(): return client -# Non-streaming test -def test_non_streaming_chat(openai_client): - """Test non-streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - stream=False, - ) - - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "content") - - -# Streaming test -def test_streaming_chat(openai_client, capsys): - """Test streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - { - "role": "system", - "content": "You are a helpful AI assistant.", - }, # system不是必需,可选 - {"role": "user", "content": "List 3 countries and their capitals."}, - { - "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra).", - }, - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=512, - stream=True, - ) - - output = [] - for chunk in response: - if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): - output.append(chunk.choices[0].delta.content) - assert len(output) > 2 - - # ========================== -# OpenAI Client additional chat/completions test +# Helper functions for structured outputs testing # ========================== - - -def test_non_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in non-streaming chat functionality with the local service - """ - # 设定 return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": True}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert isinstance(response.choices[0].message.prompt_token_ids, list) - assert hasattr(response.choices[0].message, "completion_token_ids") - assert isinstance(response.choices[0].message.completion_token_ids, list) - - # 不设定 return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": False}, - stream=False, - ) - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "prompt_token_ids") - assert response.choices[0].message.prompt_token_ids is None - assert hasattr(response.choices[0].message, "completion_token_ids") - assert response.choices[0].message.completion_token_ids is None - - -def test_streaming_chat_with_return_token_ids(openai_client, capsys): - """ - Test return_token_ids option in streaming chat functionality with the local service - """ - # enable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": True}, - stream=True, - ) - is_first_chunk = True - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - if is_first_chunk: - is_first_chunk = False - assert isinstance(chunk.choices[0].delta.prompt_token_ids, list) - assert chunk.choices[0].delta.completion_token_ids is None - else: - assert chunk.choices[0].delta.prompt_token_ids is None - assert isinstance(chunk.choices[0].delta.completion_token_ids, list) - - # disable return_token_ids - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, # system不是必需,可选 - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", - "detail": "high", - }, - }, - {"type": "text", "text": "请描述图片内容"}, - ], - }, - ], - temperature=1, - max_tokens=53, - extra_body={"return_token_ids": False}, - stream=True, - ) - for chunk in response: - assert hasattr(chunk, "choices") - assert len(chunk.choices) > 0 - assert hasattr(chunk.choices[0], "delta") - assert hasattr(chunk.choices[0].delta, "prompt_token_ids") - assert chunk.choices[0].delta.prompt_token_ids is None - assert hasattr(chunk.choices[0].delta, "completion_token_ids") - assert chunk.choices[0].delta.completion_token_ids is None - - -def test_chat_with_thinking(openai_client, capsys): - """ - Test enable_thinking & reasoning_max_tokens option in non-streaming chat functionality with the local service - """ - # enable thinking, non-streaming - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - stream=False, - max_tokens=10, - extra_body={"chat_template_kwargs": {"enable_thinking": True}}, - ) - assert response.choices[0].message.reasoning_content is not None - - # disable thinking, non-streaming - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - stream=False, - max_tokens=10, - extra_body={"chat_template_kwargs": {"enable_thinking": False}}, - ) - assert response.choices[0].message.reasoning_content is None - assert "" not in response.choices[0].message.content - - # test logic - reasoning_max_tokens = None - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - stream=False, - max_tokens=20, - extra_body={ - "chat_template_kwargs": {"enable_thinking": True}, - "reasoning_max_tokens": reasoning_max_tokens, - }, - ) - assert response.choices[0].message.reasoning_content is not None - - # enable thinking, streaming - reasoning_max_tokens = 3 - response = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - extra_body={ - "chat_template_kwargs": {"enable_thinking": True}, - "reasoning_max_tokens": reasoning_max_tokens, - "return_token_ids": True, - }, - stream=True, - max_tokens=10, - ) - completion_tokens = 0 - reasoning_tokens = 0 - total_tokens = 0 - for chunk_id, chunk in enumerate(response): - if chunk_id == 0: # the first chunk is an extra chunk - continue - delta_message = chunk.choices[0].delta - if delta_message.reasoning_content != "" and delta_message.content == "": - reasoning_tokens += len(delta_message.completion_token_ids) - else: - completion_tokens += len(delta_message.completion_token_ids) - total_tokens += len(delta_message.completion_token_ids) - assert completion_tokens + reasoning_tokens == total_tokens - assert reasoning_tokens <= reasoning_max_tokens - - def streaming_chat_base(openai_client, chat_param): """ Test streaming chat base functionality with the local service @@ -571,6 +183,9 @@ def non_streaming_chat_base(openai_client, chat_param): return response.choices[0].message.content +# ========================== +# Structured outputs tests +# ========================== @pytest.mark.skip(reason="Temporarily skip this case due to unstable execution") def test_structured_outputs_json_schema(openai_client): """ @@ -776,6 +391,7 @@ def test_structured_outputs_choice(openai_client): """ choice_param = { "temperature": 1, + "top_p": 0.0, "max_tokens": 1024, "messages": [{"role": "user", "content": "What is the landmark building in Shenzhen?"}], "extra_body": { @@ -815,8 +431,6 @@ def test_structured_outputs_regex(openai_client): "extra_body": {"guided_regex": r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n"}, } - import re - response = streaming_chat_base(openai_client, regex_param) assert re.fullmatch( r"^https:\/\/www\.[a-zA-Z]+\.com\/?$\n", response @@ -855,6 +469,7 @@ def test_structured_outputs_grammar(openai_client): grammar_param = { "temperature": 1, + "top_p": 0.0, "max_tokens": 1024, "messages": [ { @@ -865,96 +480,8 @@ def test_structured_outputs_grammar(openai_client): "extra_body": {"guided_grammar": html_h1_grammar}, } - import re - pattern = r'^[A-Za-z0-9 ]+$' response = streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected" response = non_streaming_chat_base(openai_client, grammar_param) assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected" - - -def test_profile_reset_block_num(): - """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" - baseline = 40000 - - if not os.path.exists(log_file): - pytest.fail(f"Log file not found: {log_file}") - - with open(log_file, "r") as f: - log_lines = f.readlines() - - target_line = None - for line in log_lines: - if "Reset block num" in line: - target_line = line.strip() - break - - if target_line is None: - pytest.fail("日志中没有Reset block num信息") - - match = re.search(r"total_block_num:(\d+)", target_line) - if not match: - pytest.fail(f"Failed to extract total_block_num from line: {target_line}") - - try: - actual_value = int(match.group(1)) - except ValueError: - pytest.fail(f"Invalid number format: {match.group(1)}") - - lower_bound = baseline * (1 - 0.05) - upper_bound = baseline * (1 + 0.05) - print(f"Reset total_block_num: {actual_value}. baseline: {baseline}") - - assert lower_bound <= actual_value <= upper_bound, ( - f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内" - f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]" - ) - - -def test_thinking_logic_flag(openai_client, capsys): - """ - Test the interaction between token calculation logic and conditional thinking. - This test covers: - 1. Default max_tokens calculation when not provided. - 2. Capping of max_tokens when it exceeds model limits. - 3. Default reasoning_max_tokens calculation when not provided. - 4. Activation of thinking based on the final state of reasoning_max_tokens. - """ - - response_case_1 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity briefly."}], - temperature=1, - stream=False, - extra_body={ - "chat_template_kwargs": {"enable_thinking": True}, - }, - ) - assert response_case_1.choices[0].message.reasoning_content is not None - - response_case_2 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - stream=False, - max_tokens=20, - extra_body={ - "chat_template_kwargs": {"enable_thinking": True}, - "reasoning_max_tokens": 5, - }, - ) - assert response_case_2.choices[0].message.reasoning_content is not None - - response_case_3 = openai_client.chat.completions.create( - model="default", - messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}], - temperature=1, - stream=False, - max_tokens=20, - extra_body={ - "chat_template_kwargs": {"enable_thinking": False}, - }, - ) - assert response_case_3.choices[0].message.reasoning_content is None diff --git a/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py b/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py deleted file mode 100644 index dc6f61c6a9b..00000000000 --- a/tests/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py +++ /dev/null @@ -1,647 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import concurrent.futures -import json -import os -import re -import signal -import subprocess -import sys -import time - -import openai -import pytest -import requests -from jsonschema import validate - -tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -sys.path.insert(0, tests_dir) - -from e2e.utils.serving_utils import ( - FD_API_PORT, - FD_CACHE_QUEUE_PORT, - FD_ENGINE_QUEUE_PORT, - FD_METRICS_PORT, - clean_ports, - is_port_open, -) - - -@pytest.fixture(scope="session", autouse=True) -def setup_and_run_server(): - """ - Pytest fixture that runs once per test session: - - Cleans ports before tests - - Starts the API server as a subprocess - - Waits for server port to open (up to 30 seconds) - - Tears down server after all tests finish - """ - print("Pre-test port cleanup...") - clean_ports() - - base_path = os.getenv("MODEL_PATH") - if base_path: - model_path = os.path.join(base_path, "Qwen2-7B-Instruct") - else: - model_path = "./Qwen2-7B-Instruct" - - log_path = "server.log" - cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - model_path, - "--port", - str(FD_API_PORT), - "--tensor-parallel-size", - "1", - "--engine-worker-queue-port", - str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", - str(FD_METRICS_PORT), - "--cache-queue-port", - str(FD_CACHE_QUEUE_PORT), - "--max-model-len", - "32768", - "--max-num-seqs", - "128", - "--quantization", - "wint8", - ] - - # Start subprocess in new process group - with open(log_path, "w") as logfile: - process = subprocess.Popen( - cmd, - stdout=logfile, - stderr=subprocess.STDOUT, - start_new_session=True, # Enables killing full group via os.killpg - ) - - # Wait up to 300 seconds for API server to be ready - for _ in range(300): - if is_port_open("127.0.0.1", FD_API_PORT): - print(f"API server is up on port {FD_API_PORT}") - break - time.sleep(1) - else: - print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") - try: - os.killpg(process.pid, signal.SIGTERM) - except Exception as e: - print(f"Failed to kill process group: {e}") - raise RuntimeError(f"API server did not start on port {FD_API_PORT}") - - yield # Run tests - - print("\n===== Post-test server cleanup... =====") - try: - os.killpg(process.pid, signal.SIGTERM) - print(f"API server (pid={process.pid}) terminated") - except Exception as e: - print(f"Failed to terminate API server: {e}") - - -@pytest.fixture(scope="session") -def api_url(request): - """ - Returns the API endpoint URL for chat completions. - """ - return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" - - -@pytest.fixture(scope="session") -def metrics_url(request): - """ - Returns the metrics endpoint URL. - """ - return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" - - -@pytest.fixture -def headers(): - """ - Returns common HTTP request headers. - """ - return {"Content-Type": "application/json"} - - -@pytest.fixture -def consistent_payload(): - """ - Returns a fixed payload for consistency testing, - including a fixed random seed and temperature. - """ - return { - "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}], - "temperature": 0.9, - "top_p": 0, # fix top_p to reduce randomness - "seed": 13, # fixed random seed - } - - -# ========================== -# JSON Schema for validating chat API responses -# ========================== -chat_response_schema = { - "type": "object", - "properties": { - "id": {"type": "string"}, - "object": {"type": "string"}, - "created": {"type": "number"}, - "model": {"type": "string"}, - "choices": { - "type": "array", - "items": { - "type": "object", - "properties": { - "message": { - "type": "object", - "properties": { - "role": {"type": "string"}, - "content": {"type": "string"}, - }, - "required": ["role", "content"], - }, - "index": {"type": "number"}, - "finish_reason": {"type": "string"}, - }, - "required": ["message", "index", "finish_reason"], - }, - }, - }, - "required": ["id", "object", "created", "model", "choices"], -} - - -# ========================== -# Helper function to calculate difference rate between two texts -# ========================== -def calculate_diff_rate(text1, text2): - """ - Calculate the difference rate between two strings - based on the normalized Levenshtein edit distance. - Returns a float in [0,1], where 0 means identical. - """ - if text1 == text2: - return 0.0 - - len1, len2 = len(text1), len(text2) - dp = [[0] * (len2 + 1) for _ in range(len1 + 1)] - - for i in range(len1 + 1): - for j in range(len2 + 1): - if i == 0 or j == 0: - dp[i][j] = i + j - elif text1[i - 1] == text2[j - 1]: - dp[i][j] = dp[i - 1][j - 1] - else: - dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) - - edit_distance = dp[len1][len2] - max_len = max(len1, len2) - return edit_distance / max_len if max_len > 0 else 0.0 - - -# ========================== -# Valid prompt test cases for parameterized testing -# ========================== -valid_prompts = [ - [{"role": "user", "content": "你好"}], - [{"role": "user", "content": "用一句话介绍 FastDeploy"}], -] - - -@pytest.mark.parametrize("messages", valid_prompts) -def test_valid_chat(messages, api_url, headers): - """ - Test valid chat requests. - """ - resp = requests.post(api_url, headers=headers, json={"messages": messages}) - - assert resp.status_code == 200 - validate(instance=resp.json(), schema=chat_response_schema) - - -# ========================== -# Consistency test for repeated runs with fixed payload -# ========================== -def test_consistency_between_runs(api_url, headers, consistent_payload): - """ - Test that two runs with the same fixed input produce similar outputs. - """ - # First request - resp1 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp1.status_code == 200 - result1 = resp1.json() - content1 = result1["choices"][0]["message"]["content"] - - # Second request - resp2 = requests.post(api_url, headers=headers, json=consistent_payload) - assert resp2.status_code == 200 - result2 = resp2.json() - content2 = result2["choices"][0]["message"]["content"] - - # Calculate difference rate - diff_rate = calculate_diff_rate(content1, content2) - - # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" - - -# ========================== -# Invalid prompt tests -# ========================== - -invalid_prompts = [ - [], # Empty array - [{}], # Empty object - [{"role": "user"}], # Missing content - [{"content": "hello"}], # Missing role -] - - -@pytest.mark.parametrize("messages", invalid_prompts) -def test_invalid_chat(messages, api_url, headers): - """ - Test invalid chat inputs - """ - resp = requests.post(api_url, headers=headers, json={"messages": messages}) - assert resp.status_code >= 400, "Invalid request should return an error status code" - - -# ========================== -# Test for input exceeding context length -# ========================== - - -def test_exceed_context_length(api_url, headers): - """ - Test case for inputs that exceed the model's maximum context length. - """ - # Construct an overly long message - long_content = "你好," * 20000 - - messages = [{"role": "user", "content": long_content}] - - resp = requests.post(api_url, headers=headers, json={"messages": messages}) - - # Check if the response indicates a token limit error or server error (500) - try: - response_json = resp.json() - except Exception: - response_json = {} - - # Check status code and response content - assert ( - resp.status_code != 200 or "token" in json.dumps(response_json).lower() - ), f"Expected token limit error or similar, but got a normal response: {response_json}" - - -# ========================== -# Multi-turn Conversation Test -# ========================== -def test_multi_turn_conversation(api_url, headers): - """ - Test whether multi-turn conversation context is effective. - """ - messages = [ - {"role": "user", "content": "你是谁?"}, - {"role": "assistant", "content": "我是AI助手"}, - {"role": "user", "content": "你能做什么?"}, - ] - resp = requests.post(api_url, headers=headers, json={"messages": messages}) - assert resp.status_code == 200 - validate(instance=resp.json(), schema=chat_response_schema) - - -# ========================== -# Concurrent Performance Test -# ========================== -def test_concurrent_perf(api_url, headers): - """ - Send concurrent requests to test stability and response time. - """ - prompts = [{"role": "user", "content": "Introduce FastDeploy."}] - - def send_request(): - """ - Send a single request - """ - resp = requests.post(api_url, headers=headers, json={"messages": prompts}) - assert resp.status_code == 200 - return resp.elapsed.total_seconds() - - with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: - futures = [executor.submit(send_request) for _ in range(8)] - durations = [f.result() for f in futures] - - print("\nResponse time for each request:", durations) - - -# ========================== -# Metrics Endpoint Test -# ========================== - - -def test_metrics_endpoint(metrics_url): - """ - Test the metrics monitoring endpoint. - """ - resp = requests.get(metrics_url, timeout=5) - - assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" - assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain" - - # Parse Prometheus metrics data - metrics_data = resp.text - lines = metrics_data.split("\n") - - metric_lines = [line for line in lines if not line.startswith("#") and line.strip() != ""] - - # 断言 具体值 - num_requests_running_found = False - num_requests_waiting_found = False - time_to_first_token_seconds_sum_found = False - time_per_output_token_seconds_sum_found = False - e2e_request_latency_seconds_sum_found = False - request_inference_time_seconds_sum_found = False - request_queue_time_seconds_sum_found = False - request_prefill_time_seconds_sum_found = False - request_decode_time_seconds_sum_found = False - prompt_tokens_total_found = False - generation_tokens_total_found = False - request_prompt_tokens_sum_found = False - request_generation_tokens_sum_found = False - gpu_cache_usage_perc_found = False - request_params_max_tokens_sum_found = False - request_success_total_found = False - cache_config_info_found = False - available_batch_size_found = False - hit_req_rate_found = False - hit_token_rate_found = False - cpu_hit_token_rate_found = False - gpu_hit_token_rate_found = False - - for line in metric_lines: - if line.startswith("fastdeploy:num_requests_running"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "num_requests_running 值错误" - num_requests_running_found = True - elif line.startswith("fastdeploy:num_requests_waiting"): - _, value = line.rsplit(" ", 1) - num_requests_waiting_found = True - assert float(value) >= 0, "num_requests_waiting 值错误" - elif line.startswith("fastdeploy:time_to_first_token_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "time_to_first_token_seconds_sum 值错误" - time_to_first_token_seconds_sum_found = True - elif line.startswith("fastdeploy:time_per_output_token_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "time_per_output_token_seconds_sum 值错误" - time_per_output_token_seconds_sum_found = True - elif line.startswith("fastdeploy:e2e_request_latency_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "e2e_request_latency_seconds_sum_found 值错误" - e2e_request_latency_seconds_sum_found = True - elif line.startswith("fastdeploy:request_inference_time_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_inference_time_seconds_sum 值错误" - request_inference_time_seconds_sum_found = True - elif line.startswith("fastdeploy:request_queue_time_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_queue_time_seconds_sum 值错误" - request_queue_time_seconds_sum_found = True - elif line.startswith("fastdeploy:request_prefill_time_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_prefill_time_seconds_sum 值错误" - request_prefill_time_seconds_sum_found = True - elif line.startswith("fastdeploy:request_decode_time_seconds_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_decode_time_seconds_sum 值错误" - request_decode_time_seconds_sum_found = True - elif line.startswith("fastdeploy:prompt_tokens_total"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "prompt_tokens_total 值错误" - prompt_tokens_total_found = True - elif line.startswith("fastdeploy:generation_tokens_total"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "generation_tokens_total 值错误" - generation_tokens_total_found = True - elif line.startswith("fastdeploy:request_prompt_tokens_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_prompt_tokens_sum 值错误" - request_prompt_tokens_sum_found = True - elif line.startswith("fastdeploy:request_generation_tokens_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_generation_tokens_sum 值错误" - request_generation_tokens_sum_found = True - elif line.startswith("fastdeploy:gpu_cache_usage_perc"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "gpu_cache_usage_perc 值错误" - gpu_cache_usage_perc_found = True - elif line.startswith("fastdeploy:request_params_max_tokens_sum"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_params_max_tokens_sum 值错误" - request_params_max_tokens_sum_found = True - elif line.startswith("fastdeploy:request_success_total"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "request_success_total 值错误" - request_success_total_found = True - elif line.startswith("fastdeploy:cache_config_info"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "cache_config_info 值错误" - cache_config_info_found = True - elif line.startswith("fastdeploy:available_batch_size"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "available_batch_size 值错误" - available_batch_size_found = True - elif line.startswith("fastdeploy:hit_req_rate"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "hit_req_rate 值错误" - hit_req_rate_found = True - elif line.startswith("fastdeploy:hit_token_rate"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "hit_token_rate 值错误" - hit_token_rate_found = True - elif line.startswith("fastdeploy:cpu_hit_token_rate"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "cpu_hit_token_rate 值错误" - cpu_hit_token_rate_found = True - elif line.startswith("fastdeploy:gpu_hit_token_rate"): - _, value = line.rsplit(" ", 1) - assert float(value) >= 0, "gpu_hit_token_rate 值错误" - gpu_hit_token_rate_found = True - assert num_requests_running_found, "缺少 fastdeploy:num_requests_running 指标" - assert num_requests_waiting_found, "缺少 fastdeploy:num_requests_waiting 指标" - assert time_to_first_token_seconds_sum_found, "缺少 fastdeploy:time_to_first_token_seconds_sum 指标" - assert time_per_output_token_seconds_sum_found, "缺少 fastdeploy:time_per_output_token_seconds_sum 指标" - assert e2e_request_latency_seconds_sum_found, "缺少 fastdeploy:e2e_request_latency_seconds_sum_found 指标" - assert request_inference_time_seconds_sum_found, "缺少 fastdeploy:request_inference_time_seconds_sum 指标" - assert request_queue_time_seconds_sum_found, "缺少 fastdeploy:request_queue_time_seconds_sum 指标" - assert request_prefill_time_seconds_sum_found, "缺少 fastdeploy:request_prefill_time_seconds_sum 指标" - assert request_decode_time_seconds_sum_found, "缺少 fastdeploy:request_decode_time_seconds_sum 指标" - assert prompt_tokens_total_found, "缺少 fastdeploy:prompt_tokens_total 指标" - assert generation_tokens_total_found, "缺少 fastdeploy:generation_tokens_total 指标" - assert request_prompt_tokens_sum_found, "缺少 fastdeploy:request_prompt_tokens_sum 指标" - assert request_generation_tokens_sum_found, "缺少 fastdeploy:request_generation_tokens_sum 指标" - assert gpu_cache_usage_perc_found, "缺少 fastdeploy:gpu_cache_usage_perc 指标" - assert request_params_max_tokens_sum_found, "缺少 fastdeploy:request_params_max_tokens_sum 指标" - assert request_success_total_found, "缺少 fastdeploy:request_success_total 指标" - assert cache_config_info_found, "缺少 fastdeploy:cache_config_info 指标" - assert available_batch_size_found, "缺少 fastdeploy:available_batch_size 指标" - assert hit_req_rate_found, "缺少 fastdeploy:hit_req_rate 指标" - assert hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标" - assert cpu_hit_token_rate_found, "缺少 fastdeploy:hit_token_rate 指标" - assert gpu_hit_token_rate_found, "缺少 fastdeploy:gpu_hit_token_rate 指标" - - -# ========================== -# OpenAI Client chat.completions Test -# ========================== - - -@pytest.fixture -def openai_client(): - ip = "0.0.0.0" - service_http_port = str(FD_API_PORT) - client = openai.Client( - base_url=f"http://{ip}:{service_http_port}/v1", - api_key="EMPTY_API_KEY", - ) - return client - - -# Non-streaming test -def test_non_streaming_chat(openai_client): - """Test non-streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, - {"role": "user", "content": "List 3 countries and their capitals."}, - ], - temperature=1, - max_tokens=1024, - stream=False, - ) - - assert hasattr(response, "choices") - assert len(response.choices) > 0 - assert hasattr(response.choices[0], "message") - assert hasattr(response.choices[0].message, "content") - - -# Streaming test -def test_streaming_chat(openai_client, capsys): - """Test streaming chat functionality with the local service""" - response = openai_client.chat.completions.create( - model="default", - messages=[ - {"role": "system", "content": "You are a helpful AI assistant."}, - {"role": "user", "content": "List 3 countries and their capitals."}, - { - "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra).", - }, - {"role": "user", "content": "OK, tell more."}, - ], - temperature=1, - max_tokens=1024, - stream=True, - ) - - output = [] - for chunk in response: - if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): - output.append(chunk.choices[0].delta.content) - assert len(output) > 2 - - -# ========================== -# OpenAI Client completions Test -# ========================== - - -def test_non_streaming(openai_client): - """Test non-streaming chat functionality with the local service""" - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - stream=False, - ) - - # Assertions to check the response structure - assert hasattr(response, "choices") - assert len(response.choices) > 0 - - -def test_streaming(openai_client, capsys): - """Test streaming functionality with the local service""" - response = openai_client.completions.create( - model="default", - prompt="Hello, how are you?", - temperature=1, - max_tokens=1024, - stream=True, - ) - - # Collect streaming output - output = [] - for chunk in response: - output.append(chunk.choices[0].text) - assert len(output) > 0 - - -def test_profile_reset_block_num(): - """测试profile reset_block_num功能,与baseline diff不能超过5%""" - log_file = "./log/config.log" - baseline = 32562 - - if not os.path.exists(log_file): - pytest.fail(f"Log file not found: {log_file}") - - with open(log_file, "r") as f: - log_lines = f.readlines() - - target_line = None - for line in log_lines: - if "Reset block num" in line: - target_line = line.strip() - break - - if target_line is None: - pytest.fail("日志中没有Reset block num信息") - - match = re.search(r"total_block_num:(\d+)", target_line) - if not match: - pytest.fail(f"Failed to extract total_block_num from line: {target_line}") - - try: - actual_value = int(match.group(1)) - except ValueError: - pytest.fail(f"Invalid number format: {match.group(1)}") - - lower_bound = baseline * (1 - 0.05) - upper_bound = baseline * (1 + 0.05) - print(f"Reset total_block_num: {actual_value}. baseline: {baseline}") - - assert lower_bound <= actual_value <= upper_bound, ( - f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内" - f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]" - ) diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 83c4ca43131..fdb8139cfbe 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1388,7 +1388,11 @@ def setUp(self): clean_ports() # 3. 确定模型路径 - self.model_path = "baidu/ERNIE-4.5-0.3B-PT" + base_path = os.getenv("MODEL_PATH") + if base_path: + self.model_path = os.path.join(base_path, "ERNIE-4.5-0.3B-Paddle") + else: + self.model_path = "./ERNIE-4.5-0.3B-Paddle" self.run_batch_command = [sys.executable, "fastdeploy/entrypoints/openai/run_batch.py"] @@ -1520,7 +1524,7 @@ def run_fastdeploy_command(self, input_content, port=None): def test_completions(self): """测试正常的批量chat请求""" - return_code, contents, proc = self.run_fastdeploy_command(INPUT_BATCH, port="2235") + return_code, contents, proc = self.run_fastdeploy_command(INPUT_BATCH, port=str(FD_CACHE_QUEUE_PORT)) print(f"进程输出: {return_code}") self.assertEqual(return_code, 0, f"进程返回非零码: {return_code}, 进程信息: {proc}")