From 5d246a792ddf839e87fb777681dd2a9ac7f7eb5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 24 May 2026 09:51:31 +0200 Subject: [PATCH 01/21] convert : minor fixes for numpy 2.x (#23571) --- examples/convert_legacy_llama.py | 3 ++- gguf-py/gguf/quants.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c4ec5c524e9..5c9305b1237 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None: def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] - if np.uint32(1) == np.uint32(1).newbyteorder("<"): + dummy_val = np.uint32(1) + if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py index 1d9d9ab7d70..80966b6ef15 100644 --- a/gguf-py/gguf/quants.py +++ b/gguf-py/gguf/quants.py @@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray: rows = arr.reshape((-1, arr.shape[-1])) + assert len(rows.shape) osize = 1 for dim in oshape: osize *= dim From 549b9d84330c327e6791fa812a7d60c0cf63572e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 24 May 2026 18:20:10 +0300 Subject: [PATCH 02/21] ci : update build-self-hosted.yml (#23616) --- .github/workflows/build-self-hosted.yml | 86 ++++++++++++++++--- .github/workflows/build.yml | 102 ++++++++++++----------- .github/workflows/server-self-hosted.yml | 77 +++++++++-------- ci/run.sh | 2 +- 4 files changed, 167 insertions(+), 100 deletions(-) diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index 2851c45601f..c247222eb4b 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -57,7 +57,7 @@ env: jobs: determine-tag: name: Determine tag name - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] outputs: tag_name: ${{ steps.tag.outputs.name }} steps: @@ -86,7 +86,7 @@ jobs: HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | nvidia-smi - GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-vulkan-cm: needs: determine-tag @@ -103,7 +103,7 @@ jobs: HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary - GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-vulkan-cm2: needs: determine-tag @@ -120,10 +120,11 @@ jobs: HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary - GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-webgpu: - runs-on: [self-hosted, Linux, NVIDIA] + needs: determine-tag + runs-on: [self-hosted, Linux, NVIDIA, X64] steps: - name: Clone @@ -149,10 +150,11 @@ jobs: GG_BUILD_WEBGPU=1 \ GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \ - bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMX-compatible machine #ggml-ci-cpu-amx: + # needs: determine-tag # runs-on: [self-hosted, Linux, CPU, AMX] # steps: @@ -163,10 +165,11 @@ jobs: # - name: Test # id: ggml-ci # run: | - # bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMD GPU machine # ggml-ci-amd-vulkan: + # needs: determine-tag # runs-on: [self-hosted, Linux, AMD] # steps: @@ -178,10 +181,11 @@ jobs: # id: ggml-ci # run: | # vulkaninfo --summary - # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # TODO: provision AMD GPU machine # ggml-ci-amd-rocm: + # needs: determine-tag # runs-on: [self-hosted, Linux, AMD] # steps: @@ -193,7 +197,7 @@ jobs: # id: ggml-ci # run: | # amd-smi static - # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp + # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-mac-metal: needs: determine-tag @@ -337,4 +341,66 @@ jobs: HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | source ./openvino_toolkit/setupvars.sh - GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-arm64-cpu-low-perf: + needs: determine-tag + runs-on: [self-hosted, Linux, ARM64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + + ggml-ci-arm64-cpu-high-perf: + needs: determine-tag + runs-on: [self-hosted, Linux, ARM64] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Test + id: ggml-ci + run: | + LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp + +# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake: +# CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message): +# ARM -march/-mcpu not found, -mcpu=native will be used +# +# if we resolve this, we should be able to offload these jobs to the self-hosted runners +# +# ggml-ci-arm64-cpu-high-perf-sve: +# needs: determine-tag +# runs-on: [self-hosted, Linux, NVIDIA, ARM64] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: Test +# id: ggml-ci +# run: | +# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp +# +# ggml-ci-arm64-cpu-kleidiai: +# needs: determine-tag +# runs-on: [self-hosted, Linux, NVIDIA, ARM64] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: Test +# id: ggml-ci +# run: | +# GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 65fa24f4468..47b377ff72b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -931,31 +931,32 @@ jobs: run: | LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - ggml-ci-arm64-cpu-low-perf: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-low-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# note: moved to build-self-hosted.yml - can remove from here when everything is stable +# ggml-ci-arm64-cpu-low-perf: +# runs-on: ubuntu-22.04-arm +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: ggml-ci-arm64-cpu-low-perf +# evict-old-files: 1d +# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Test +# id: ggml-ci +# run: | +# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ggml-ci-x64-cpu-high-perf: runs-on: ubuntu-22.04 @@ -983,31 +984,32 @@ jobs: run: | LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt - ggml-ci-arm64-cpu-high-perf: - runs-on: ubuntu-22.04-arm - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ggml-ci-arm64-cpu-high-perf - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Test - id: ggml-ci - run: | - LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# note: moved to build-self-hosted.yml - can remove from here when everything is stable +# ggml-ci-arm64-cpu-high-perf: +# runs-on: ubuntu-22.04-arm +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v6 +# +# - name: ccache +# uses: ggml-org/ccache-action@v1.2.21 +# with: +# key: ggml-ci-arm64-cpu-high-perf +# evict-old-files: 1d +# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Test +# id: ggml-ci +# run: | +# LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ggml-ci-arm64-cpu-high-perf-sve: runs-on: ubuntu-22.04-arm diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 857c72a4619..91e0653943c 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -91,45 +91,44 @@ jobs: export ${{ matrix.extra_args }} pytest -v -x -m "not slow" - # TODO: provision CUDA runner - # server-cuda: - # runs-on: [self-hosted, llama-server, Linux, NVIDIA] - # - # name: server-cuda (${{ matrix.wf_name }}) - # strategy: - # matrix: - # build_type: [Release] - # wf_name: ["GPUx1"] - # include: - # - build_type: Release - # extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" - # wf_name: "GPUx1, backend-sampling" - # fail-fast: false - # - # steps: - # - name: Clone - # id: checkout - # uses: actions/checkout@v6 - # with: - # fetch-depth: 0 - # ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - # - # - name: Build - # id: cmake_build - # run: | - # cmake -B build -DGGML_SCHED_NO_REALLOC=ON - # cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server - # - # - name: Tests - # id: server_integration_tests - # if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} - # run: | - # cd tools/server/tests - # python3 -m venv venv - # source venv/bin/activate - # pip install -r requirements.txt - # export ${{ matrix.extra_args }} - # pytest -v -x -m "not slow" + server-cuda: + runs-on: [self-hosted, llama-server, Linux, NVIDIA] + + name: server-cuda (${{ matrix.wf_name }}) + strategy: + matrix: + build_type: [Release] + wf_name: ["GPUx1"] + include: + - build_type: Release + extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx1, backend-sampling" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" server-kleidiai: runs-on: ah-ubuntu_22_04-c8g_8x diff --git a/ci/run.sh b/ci/run.sh index b096dc23b66..4acf4375267 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -238,7 +238,7 @@ function gg_run_ctest_debug { (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } From 28123a3937f7d056a4dded43752b216507384373 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 May 2026 08:11:19 +0300 Subject: [PATCH 03/21] ci : move most slim jobs to self-hosted runners (#23619) * ci : remove tag from build-self-hosted.yml * ci : slim -> self-hosted * ci : prevent heavy CPU jobs from running on fast runners * ci : prevent cmake pkg to run on dedicated fast runners * ci : try to bump 3.11 -> 3.13 * ci : move lint back to 3.11 * ci : back to 3.11 * ci : add comment about UI jobs * ci : move python requirements check to CPU runners this job is a bit slow for a dedicated "fast" runner * ci : add self-hosted ui workflow * ci : fix UI naming * tmp to check if arm64 fast is compatible with all jobs * revert last commit --- .github/workflows/build-cmake-pkg.yml | 7 +- .github/workflows/build-self-hosted.yml | 58 +-------- .github/workflows/check-vendor.yml | 2 +- .github/workflows/code-style.yml | 2 +- .github/workflows/editorconfig.yml | 2 +- .github/workflows/pre-tokenizer-hashes.yml | 2 +- .../workflows/python-check-requirements.yml | 2 +- .github/workflows/python-lint.yml | 2 +- .github/workflows/python-type-check.yml | 2 +- .github/workflows/ui-build.yml | 3 +- .github/workflows/ui-self-hosted.yml | 118 ++++++++++++++++++ .github/workflows/{ui-ci.yml => ui.yml} | 10 +- .github/workflows/update-ops-docs.yml | 4 +- 13 files changed, 140 insertions(+), 74 deletions(-) create mode 100644 .github/workflows/ui-self-hosted.yml rename .github/workflows/{ui-ci.yml => ui.yml} (95%) diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index b36ac5b8e6d..5becff09c1b 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -5,17 +5,12 @@ on: jobs: linux: - runs-on: ubuntu-slim + runs-on: [self-hosted, Linux, CPU] steps: - uses: actions/checkout@v6 with: fetch-depth: 0 - - name: Install dependencies - run: | - sudo apt update - sudo apt install -y build-essential tcl cmake - - name: Build run: | PREFIX="$(pwd)"/inst diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index c247222eb4b..375e75c5267 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -55,24 +55,7 @@ env: LLAMA_LOG_TIMESTAMPS: 1 jobs: - determine-tag: - name: Determine tag name - runs-on: [self-hosted, fast] - outputs: - tag_name: ${{ steps.tag.outputs.name }} - steps: - - name: Clone - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - ggml-ci-nvidia-cuda: - needs: determine-tag runs-on: [self-hosted, Linux, NVIDIA] steps: @@ -82,14 +65,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | nvidia-smi GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-vulkan-cm: - needs: determine-tag runs-on: [self-hosted, Linux, NVIDIA] steps: @@ -99,14 +79,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-vulkan-cm2: - needs: determine-tag runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2] steps: @@ -116,14 +93,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-nvidia-webgpu: - needs: determine-tag runs-on: [self-hosted, Linux, NVIDIA, X64] steps: @@ -154,7 +128,6 @@ jobs: # TODO: provision AMX-compatible machine #ggml-ci-cpu-amx: - # needs: determine-tag # runs-on: [self-hosted, Linux, CPU, AMX] # steps: @@ -169,7 +142,6 @@ jobs: # TODO: provision AMD GPU machine # ggml-ci-amd-vulkan: - # needs: determine-tag # runs-on: [self-hosted, Linux, AMD] # steps: @@ -185,7 +157,6 @@ jobs: # TODO: provision AMD GPU machine # ggml-ci-amd-rocm: - # needs: determine-tag # runs-on: [self-hosted, Linux, AMD] # steps: @@ -200,7 +171,6 @@ jobs: # GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-mac-metal: - needs: determine-tag runs-on: [self-hosted, macOS, ARM64] steps: @@ -210,13 +180,10 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-mac-webgpu: - needs: determine-tag runs-on: [self-hosted, macOS, ARM64] steps: @@ -239,14 +206,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \ bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-mac-vulkan: - needs: determine-tag runs-on: [self-hosted, macOS, ARM64] steps: @@ -256,14 +220,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-linux-intel-vulkan: - needs: determine-tag runs-on: [self-hosted, Linux, Intel] steps: @@ -275,14 +236,11 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-win-intel-vulkan: - needs: determine-tag runs-on: [self-hosted, Windows, X64, Intel] steps: @@ -297,7 +255,6 @@ jobs: MSYSTEM: UCRT64 CHERE_INVOKING: 1 PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }} - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | vulkaninfo --summary # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create @@ -305,7 +262,6 @@ jobs: LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp ggml-ci-intel-openvino-gpu-low-perf: - needs: determine-tag runs-on: [self-hosted, Linux, Intel, OpenVINO] concurrency: @@ -337,15 +293,12 @@ jobs: - name: Test id: ggml-ci - env: - HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }} run: | source ./openvino_toolkit/setupvars.sh GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-arm64-cpu-low-perf: - needs: determine-tag - runs-on: [self-hosted, Linux, ARM64] + runs-on: [self-hosted, Linux, ARM64, CPU] steps: - name: Clone @@ -358,8 +311,7 @@ jobs: LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp ggml-ci-arm64-cpu-high-perf: - needs: determine-tag - runs-on: [self-hosted, Linux, ARM64] + runs-on: [self-hosted, Linux, ARM64, CPU] steps: - name: Clone @@ -378,8 +330,7 @@ jobs: # if we resolve this, we should be able to offload these jobs to the self-hosted runners # # ggml-ci-arm64-cpu-high-perf-sve: -# needs: determine-tag -# runs-on: [self-hosted, Linux, NVIDIA, ARM64] +# runs-on: [self-hosted, Linux, ARM64, CPU] # # steps: # - name: Clone @@ -392,8 +343,7 @@ jobs: # LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp # # ggml-ci-arm64-cpu-kleidiai: -# needs: determine-tag -# runs-on: [self-hosted, Linux, NVIDIA, ARM64] +# runs-on: [self-hosted, Linux, ARM64, CPU] # # steps: # - name: Clone diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml index 1671ed7b8bd..015629f380c 100644 --- a/.github/workflows/check-vendor.yml +++ b/.github/workflows/check-vendor.yml @@ -19,7 +19,7 @@ on: jobs: check-vendor: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - name: Checkout diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index c88396c0a7d..50b598b84dd 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -15,7 +15,7 @@ concurrency: jobs: model-naming: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - uses: actions/checkout@v6 - name: Check model naming conventions diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index 53f6a0ccfda..59159cd4144 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -15,7 +15,7 @@ concurrency: jobs: editorconfig: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - uses: actions/checkout@v6 - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0 diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml index 7126b62b690..fce70d03e52 100644 --- a/.github/workflows/pre-tokenizer-hashes.yml +++ b/.github/workflows/pre-tokenizer-hashes.yml @@ -12,7 +12,7 @@ on: jobs: pre-tokenizer-hashes: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] steps: - name: Checkout repository diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 1219b874592..2c7fab40b44 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -20,7 +20,7 @@ concurrency: jobs: python-check-requirements: - runs-on: ubuntu-slim + runs-on: [self-hosted, CPU, fast] name: check-requirements steps: - name: Check out source repository diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 1e5d64c1aee..0424f372a14 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -21,7 +21,7 @@ concurrency: jobs: flake8-lint: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] name: Lint steps: - name: Check out source repository diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml index cbeeb39d05b..14edb1a9d17 100644 --- a/.github/workflows/python-type-check.yml +++ b/.github/workflows/python-type-check.yml @@ -22,7 +22,7 @@ concurrency: jobs: python-type-check: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] name: python type-check steps: - name: Check out source repository diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml index 38fba13ad8e..2653afd06c7 100644 --- a/.github/workflows/ui-build.yml +++ b/.github/workflows/ui-build.yml @@ -5,8 +5,7 @@ on: jobs: build: - name: Build static output - runs-on: ubuntu-slim + runs-on: [self-hosted, fast] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml new file mode 100644 index 00000000000..64a4519c618 --- /dev/null +++ b/.github/workflows/ui-self-hosted.yml @@ -0,0 +1,118 @@ +name: UI (self-hosted) + +# these are the same as ui.yml, but with self-hosted runners +# the runners come with pre-installed Playwright browsers version: 1.56.1 +# the jobs are much lighter because they don't need to install node and playwright browsers + +on: + workflow_dispatch: + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + push: + branches: + - master + paths: [ + '.github/workflows/ui-self-hosted.yml', + '.github/workflows/ui-build.yml', + 'tools/ui/**.*', + 'tools/server/tests/**.*' + ] + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/ui-self-hosted.yml', + '.github/workflows/ui-build.yml', + 'tools/ui/**.*', + 'tools/server/tests/**.*' + ] + +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + ui-build: + name: Build static output + uses: ./.github/workflows/ui-build.yml + + ui-checks: + name: Checks + needs: ui-build + runs-on: [self-hosted, PLAYWRIGHT] + continue-on-error: true + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install dependencies + id: setup + run: npm ci + working-directory: tools/ui + + - name: Run type checking + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run check + working-directory: tools/ui + + - name: Run linting + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run lint + working-directory: tools/ui + + - name: Run Client tests + if: ${{ always() }} + run: npm run test:client + working-directory: tools/ui + + - name: Run Unit tests + if: ${{ always() }} + run: npm run test:unit + working-directory: tools/ui + + e2e-tests: + name: E2E Tests + needs: ui-build + runs-on: [self-hosted, PLAYWRIGHT] + steps: + - name: Checkout code + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install dependencies + id: setup + run: npm ci + working-directory: tools/ui + + - name: Build application + if: ${{ always() && steps.setup.conclusion == 'success' }} + run: npm run build + working-directory: tools/ui + + - name: Build Storybook + if: ${{ always() }} + run: npm run build-storybook + working-directory: tools/ui + + - name: Run UI tests + if: ${{ always() }} + run: npm run test:ui -- --testTimeout=60000 + working-directory: tools/ui + + - name: Run E2E tests + if: ${{ always() }} + run: npm run test:e2e + working-directory: tools/ui diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui.yml similarity index 95% rename from .github/workflows/ui-ci.yml rename to .github/workflows/ui.yml index 761a9319414..b5e7457182c 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui.yml @@ -1,4 +1,4 @@ -name: CI (UI) +name: UI on: workflow_dispatch: @@ -11,14 +11,16 @@ on: branches: - master paths: [ - '.github/workflows/ui-ci.yml', + '.github/workflows/ui.yml', + '.github/workflows/ui-build.yml', 'tools/ui/**.*', 'tools/server/tests/**.*' ] pull_request: types: [opened, synchronize, reopened] paths: [ - '.github/workflows/ui-ci.yml', + '.github/workflows/ui.yml', + '.github/workflows/ui-build.yml', 'tools/ui/**.*', 'tools/server/tests/**.*' ] @@ -39,7 +41,7 @@ jobs: uses: ./.github/workflows/ui-build.yml ui-checks: - name: UI Checks + name: Checks needs: ui-build runs-on: ubuntu-latest continue-on-error: true diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml index 2ab06eb9811..6e8bc1aa07c 100644 --- a/.github/workflows/update-ops-docs.yml +++ b/.github/workflows/update-ops-docs.yml @@ -3,18 +3,20 @@ name: Update Operations Documentation on: push: paths: + - '.github/workflows/update-ops-docs.yml' - 'docs/ops.md' - 'docs/ops/**' - 'scripts/create_ops_docs.py' pull_request: paths: + - '.github/workflows/update-ops-docs.yml' - 'docs/ops.md' - 'docs/ops/**' - 'scripts/create_ops_docs.py' jobs: update-ops-docs: - runs-on: ubuntu-slim + runs-on: [self-hosted, fast, ARM64] steps: - name: Checkout repository From 6d57c26ef8cb9a3583993d8d6527858985c82c7b Mon Sep 17 00:00:00 2001 From: fairydreaming <166155368+fairydreaming@users.noreply.github.com> Date: Mon, 25 May 2026 07:12:39 +0200 Subject: [PATCH 04/21] perplexity : fix even more integer overflows (#23623) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Stanisław Szymczyk --- tools/perplexity/perplexity.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index ea9de83e1a9..027cd53f29e 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -157,7 +157,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, break; } lock.unlock(); - const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]); + const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + size_t(i)*nv, tokens[i+1]); local_nll += v; local_nll2 += v*v; } @@ -169,7 +169,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, for (auto & w : workers) { w.join(); } - out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t)); + out.write((const char *)log_probs.data(), size_t(n_token)*nv*sizeof(uint16_t)); } struct kl_divergence_result { @@ -279,7 +279,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens break; } lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + std::pair v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + size_t(i)*nv, tokens[i+1], local_kld); kld_values[i] = (float)v.first; p_diff_values[i] = v.second; } From e2ef8fe42ccef597bfeab901dd6e39589613b71e Mon Sep 17 00:00:00 2001 From: jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com> Date: Mon, 25 May 2026 07:56:18 +0200 Subject: [PATCH 05/21] server: fix checkpoints creation (#22929) * common : add common_chat_split_by_role * cont : fix spans to reach end of message * server: fix checkpoints creation - extract message_spans from chat templates - find the prompt token position before the latest user message - split prompt batching at that position - create a context checkpoint before the latest user input - avoid periodic mid-prompt checkpoints when that position is known - handle multimodal prompts when mapping text/template positions to server prompt tokens - add --checkpoint-min-step to control minimum spacing between checkpoints * cont : clean-up * Support autoparser detection for message barriers * server: fix message span delimiter and update docs --------- Co-authored-by: Alde Rojas Co-authored-by: Georgi Gerganov Co-authored-by: Piotr Wilkin --- common/arg.cpp | 11 +- common/chat-auto-parser-helpers.cpp | 6 +- common/chat-auto-parser.h | 6 + common/chat-diff-analyzer.cpp | 177 +++++++++++++++++++++++++++- common/chat.cpp | 65 ++++++++++ common/chat.h | 16 +++ common/common.cpp | 21 ++++ common/common.h | 3 +- tests/test-chat-auto-parser.cpp | 127 +++++++++++++++++++- tests/test-chat.cpp | 40 ++++++- tools/cli/README.md | 1 - tools/server/README.md | 2 +- tools/server/server-common.cpp | 10 ++ tools/server/server-context.cpp | 135 +++++++++++++++++---- tools/server/server-task.h | 3 + 15 files changed, 586 insertions(+), 37 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 24d9734b934..3df8010a2ec 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1334,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-cpent", "--checkpoint-every-n-tokens"}, "N", - string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt), + {"-cms", "--checkpoint-min-step"}, "N", + string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step), [](common_params & params, int value) { - params.checkpoint_every_nt = value; + if (value < 0) { + throw std::invalid_argument("checkpoint-min-step must be non-negative"); + } + params.checkpoint_min_step = value; } - ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + ).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cram", "--cache-ram"}, "N", string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)" diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp index 2499464cd82..81b17e5e1d2 100644 --- a/common/chat-auto-parser-helpers.cpp +++ b/common/chat-auto-parser-helpers.cpp @@ -310,6 +310,8 @@ std::vector prune_whitespace_segments(const std::vector & segm namespace autoparser { +static const std::string ERR_TMPL = "#**ERROR**#"; + std::string apply_template(const common_chat_template & tmpl, const template_params & params) { generation_params tmpl_params; tmpl_params.messages = params.messages; @@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par return common_chat_template_direct_apply(tmpl, tmpl_params); } catch (const std::exception & e) { LOG_DBG("Template application failed: %s\n", e.what()); - return ""; + return ERR_TMPL; } } @@ -347,7 +349,7 @@ std::optional compare_variants( std::string output_B = apply_template(tmpl, params_B); // Check for template application failures - if (output_A.empty() || output_B.empty()) { + if (output_A == ERR_TMPL || output_B == ERR_TMPL) { return std::nullopt; } diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h index c680e686867..7858f6572f2 100644 --- a/common/chat-auto-parser.h +++ b/common/chat-auto-parser.h @@ -377,6 +377,8 @@ struct analyze_tools : analyze_base { struct autoparser { jinja::caps jinja_caps; + std::string user_start; + std::string assistant_start; analyze_reasoning reasoning; analyze_content content; analyze_tools tools; @@ -387,6 +389,10 @@ struct autoparser { autoparser() = default; + // Find the starting marker for the user message and assistant message + std::string detect_user_start_marker(const common_chat_template & tmpl); + std::string detect_assistant_start_marker(const common_chat_template & tmpl); + // Run full differential analysis on a template void analyze_template(const common_chat_template & tmpl); diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 9c7c9678acd..0875c5347f4 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -8,6 +8,9 @@ #include "peg-parser.h" #include +#include +#include +#include #define ANSI_RESET "\033[0m" #define ANSI_PURPLE "\033[1m\x1b[38;5;126m" @@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S"; static const std::string ARG_FIRST = "AA_ARG_FST_AA"; static const std::string ARG_SECOND = "BB_ARG_SND_BB"; static const std::string USER_MSG = "U_USER_MSG Hello END_U"; +static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V"; static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A"; static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R"; static const std::string CALL_ID_001 = "call00001"; @@ -71,6 +75,7 @@ static std::vector"); analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>"); + analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>"; LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET); } }, @@ -108,7 +113,59 @@ static std::vector void { + if (tmpl.src.find("") != std::string::npos && tmpl.src.find("") != std::string::npos && + tmpl.src.find("") != std::string::npos && tmpl.src.find("") != std::string::npos) { + + analysis.tools.format.mode = tool_format::JSON_NATIVE; + analysis.tools.format.section_start = ""; + analysis.tools.format.section_end = ""; + analysis.tools.format.per_call_start = ""; + analysis.tools.format.per_call_end = ""; + analysis.content.mode = content_mode::PLAIN; + analysis.content.start = ""; + analysis.content.end = ""; + analysis.reasoning.mode = reasoning_mode::TAG_BASED; + analysis.reasoning.start = "\n\n"; + analysis.reasoning.end = ""; + analysis.assistant_start = "Assistant"; + analysis.user_start = "User"; + analysis.preserved_tokens.clear(); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back(""); + analysis.preserved_tokens.push_back(""); + LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET); + } + }, + // Fireworks + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'" + " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) { + analysis.assistant_start = "<|start_header_id|>assistant<|end_header_id|>"; + analysis.user_start = "<|start_header_id|>user<|end_header_id|>"; + LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET); + } + }, + // Solar Open + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) { + analysis.assistant_start = "<|begin|>assistant"; + LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET); + } + }, + // Apriel 1.6 + [](const common_chat_template & tmpl, autoparser & analysis) -> void { + if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) { + analysis.user_start = "<|begin_user|>"; + analysis.assistant_start = "<|begin_assistant|>"; + LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET); + } + }, + }); // Common JSON structures @@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) { reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls); content = analyze_content(tmpl, reasoning); tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools()); + assistant_start = detect_assistant_start_marker(tmpl); + user_start = detect_user_start_marker(tmpl); collect_preserved_tokens(); for (auto & workaround : workarounds) { @@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) { } LOG_DBG("\n--- Reasoning & Content Structure ---\n"); + LOG_DBG("user_msg_start: %s\n", user_start.c_str()); + LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str()); LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str()); LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str()); LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str()); @@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() { add_token(tools.call_id.suffix); } +std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) { + json user_msg = json{ + { "role", "user" }, + { "content", USER_MSG } + }; + + json assistant_no_reasoning = json{ + { "role", "assistant" }, + { "content", ASSISTANT_MSG } + }; + + template_params params; + params.messages = json::array({ user_msg }); + params.add_generation_prompt = false; + params.enable_thinking = true; + + auto comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg, assistant_no_reasoning }); + } + ); + + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__); + return ""; + } + + auto usermsg = comparison->diff.right; + if (usermsg.find(ASSISTANT_MSG) == std::string::npos) { + LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__); + } + + auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG)); + if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) { + ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start))); + } + if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) { + ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end))); + } + return trim_whitespace(ast_prefix); +} + +std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) { + json user_msg = json{ + { "role", "user" }, + { "content", USER_MSG } + }; + + json assistant = json{ + { "role", "assistant" }, + { "content", ASSISTANT_MSG } + }; + + json user_msg_two = json{ + { "role", "user" }, + { "content", USER_MSG_TWO } + }; + + template_params params; + params.messages = json::array({}); + params.add_generation_prompt = false; + params.enable_thinking = true; + + auto comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg }); + } + ); + + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__); + params.messages = json::array({ user_msg_two, assistant }); + comparison = compare_variants( + tmpl, params, [&](template_params & p) { + p.messages = json::array({ user_msg_two, assistant, user_msg }); + } + ); + if (!comparison) { + LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__); + return ""; + } + } + + auto usermsg = comparison->diff.right; + if (usermsg.find(USER_MSG) == std::string::npos) { + LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__); + } + + if (usermsg.find(ASSISTANT_MSG) != std::string::npos) { + usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size()); + } + + auto candidate = usermsg.substr(0, usermsg.find(USER_MSG)); + auto candidate_split = segmentize_markers(candidate); + std::stringstream result; + bool encountered_marker = false; + for (const auto & mrk : candidate_split) { + std::string lower_mrk = std::string(mrk.value); + std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(), + [](unsigned char c) { return std::tolower(c); }); + // heuristic to weed out potential end markers, but only at the start + if (mrk.type == segment_type::MARKER && !encountered_marker && + (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) { + continue; + } + if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) { + continue; + } + encountered_marker |= mrk.type == segment_type::MARKER; + result << mrk.value; + } + return trim_whitespace(result.str()); +} + analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools) : analyze_base(tmpl) { LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET); diff --git a/common/chat.cpp b/common/chat.cpp index 56873e3a1e9..ef151691c38 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const return text; } +std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims) { + if (delims.empty() || prompt.empty()) { + return {}; + } + + auto parser = build_peg_parser([&](common_peg_parser_builder & p) { + std::vector all_delims; + std::vector tagged_messages; + + all_delims.reserve(delims.size()); + tagged_messages.reserve(delims.size()); + for (const auto & d : delims) { + all_delims.push_back(d.delimiter); + } + + auto any_delim = p.until_one_of(all_delims); + for (const auto & d : delims) { + tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); + } + + return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); + }); + + common_peg_parse_context ctx(prompt); + const auto result = parser.parse(ctx); + if (!result.success()) { + return {}; + } + + std::vector spans; + ctx.ast.visit(result, [&](const common_peg_ast_node & node) { + if (!node.tag.empty()) { + spans.push_back({ node.tag, node.start, node.end - node.start }); + } + }); + + return spans; +} + json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const { if (!content.empty() && !content_parts.empty()) { throw std::runtime_error("Cannot specify both content and content_parts"); @@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp data.prompt = prompt; data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages); + data.message_spans = common_chat_split_by_role(prompt, { + { "assistant", "<|start|>assistant" }, + { "user", "<|start|>user" }, + { "system", "<|start|>developer" }, + { "system", "<|start|>system" }, + { "tool", "<|start|>functions" }, + }); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; @@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ data.prompt += data.generation_prompt; } + data.message_spans = common_chat_split_by_role(data.prompt, { + { "user", "<|turn>user\n" }, + { "assistant", "<|turn>model\n" }, + }); + data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4; data.supports_thinking = true; data.thinking_start_tag = "<|channel>thought"; @@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ struct autoparser::autoparser autoparser; autoparser.analyze_template(tmpl); auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser); + + std::vector delimiters; + if (!autoparser.assistant_start.empty()) { + delimiters.push_back({ "assistant", autoparser.assistant_start }); + } + if (!autoparser.user_start.empty()) { + delimiters.push_back({ "user", autoparser.user_start }); + } + + if (!delimiters.empty()) { + auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters); + } + auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE; if (auto_params.supports_thinking) { auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start); diff --git a/common/chat.h b/common/chat.h index b29c627e693..5659cd42a07 100644 --- a/common/chat.h +++ b/common/chat.h @@ -143,6 +143,17 @@ struct common_chat_msg_diff { } }; +struct common_chat_msg_span { + std::string role; + std::size_t pos = 0; + std::size_t len = 0; +}; + +struct common_chat_msg_delimiter { + std::string role; + std::string delimiter; +}; + struct common_chat_tool { std::string name; std::string description; @@ -208,6 +219,7 @@ struct common_chat_params { std::vector preserved_tokens; std::vector additional_stops; std::string parser; + std::vector message_spans; }; // per-message parsing syntax @@ -304,6 +316,7 @@ std::optional common_chat_try_specialized_template( const std::string & src, autoparser::generation_params & params); + // specialized per-task preset struct common_chat_prompt_preset { std::string system; @@ -311,3 +324,6 @@ struct common_chat_prompt_preset { }; common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates); + +std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims); + diff --git a/common/common.cpp b/common/common.cpp index d77ddeda10e..97daf281783 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) { return str.substr(start, end - start); } +std::string string_lcs(std::string_view a, std::string_view b) { + if (a.empty() || b.empty()) return {}; + + std::vector> dp(a.size() + 1, std::vector(b.size() + 1, 0)); + size_t best_len = 0; + size_t best_end_a = 0; + + for (size_t i = 1; i <= a.size(); ++i) { + for (size_t j = 1; j <= b.size(); ++j) { + if (a[i - 1] == b[j - 1]) { + dp[i][j] = dp[i - 1][j - 1] + 1; + if (dp[i][j] > best_len) { + best_len = dp[i][j]; + best_end_a = i; + } + } + } + } + return std::string(a.substr(best_end_a - best_len, best_len)); +} + std::string string_get_sortable_timestamp() { using clock = std::chrono::system_clock; diff --git a/common/common.h b/common/common.h index b0ad7b2ea4e..8a0e5eed5ee 100644 --- a/common/common.h +++ b/common/common.h @@ -594,7 +594,7 @@ struct common_params { bool cache_prompt = true; // whether to enable prompt caching bool cache_idle_slots = true; // save and clear idle slots upon starting a new task int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot - int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill + int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. std::string hostname = "127.0.0.1"; @@ -731,6 +731,7 @@ std::string string_format(const char * fmt, ...); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); +std::string string_lcs(std::string_view a, std::string_view b); std::string string_join(const std::vector & values, const std::string & separator); std::vector string_split(const std::string & str, const std::string & delimiter); diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 1d96de718e2..6f8e957489c 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -81,6 +81,8 @@ static void test_normalize_quotes_with_embedded_quotes(testing & t); // TAG_WITH_TAGGED argument parsing tests static void test_tagged_args_with_embedded_quotes(testing & t); +static void test_role_markers_all_templates(testing & t); + int main(int argc, char * argv[]) { testing t(std::cout); t.verbose = true; @@ -103,6 +105,7 @@ int main(int argc, char * argv[]) { t.test("standard_json_tools", test_standard_json_tools_formats); t.test("normalize_quotes_to_json", test_normalize_quotes_to_json); t.test("tagged_args_embedded_quotes", test_tagged_args_with_embedded_quotes); + t.test("role_markers_all_templates", test_role_markers_all_templates); return t.summary(); } @@ -714,7 +717,7 @@ static void test_compare_variants_both_modifiers(testing & t) { static void test_compare_variants_template_failure(testing & t) { // Test with template that causes failure during application (not construction) // We use a valid template syntax but one that will fail during application - common_chat_template tmpl("{{ messages[0]['nonexistent_field'] }}", "", ""); + common_chat_template tmpl("{{ messages.cahoot()[0]['nonexistent_field'] }}", "", ""); template_params params; params.messages = json::array({ @@ -1848,6 +1851,128 @@ static json build_edit_tool() { }); } +// ============================================================================ +// Role marker detection tests for all autoparser-handled templates +// +// Verifies that detect_user_start_marker / detect_assistant_start_marker +// return the correct boundary text between turns for every template that +// falls through to the differential autoparser (i.e. is not handled by a +// dedicated specialized template in common_chat_try_specialized_template). +// +// Markers were deduced manually from the jinja sources in models/templates/. +// ============================================================================ +struct role_marker_case { + std::string template_file; + std::string expected_user_start; + std::string expected_assistant_start; +}; + +static void test_role_markers_all_templates(testing & t) { + // Each entry is { template filename, user_start, assistant_start } as + // produced when rendering the standard chatml-like sequences. The values + // come from reading each jinja template and tracing what text precedes + // a user/assistant message body once the autoparser strips any reasoning + // markers it detected first. + const std::vector cases = { + // ChatML family: <|im_start|>{role} ... <|im_end|> + { "Bielik-11B-v3.0-Instruct.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "HuggingFaceTB-SmolLM3-3B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "MiMo-VL.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen3.5-4B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen3-Coder.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-Qwen2.5-7B-Instruct.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-Qwen3-0.6B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "Qwen-QwQ-32B.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "StepFun3.5-Flash.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + { "stepfun-ai-Step-3.5-Flash.jinja", "<|im_start|>user", "<|im_start|>assistant" }, + + // DeepSeek family + { "deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja", "<|User|>", "<|Assistant|>" }, + { "deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja", "<|User|>", "<|Assistant|>" }, + { "deepseek-ai-DeepSeek-V3.1.jinja", "<|User|>", "<|Assistant|>" }, + { "llama-cpp-deepseek-r1.jinja", "<|User|>", "<|Assistant|>" }, + + // Llama 3 header family + { "meetkai-functionary-medium-v3.1.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.1-8B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.2-3B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + { "meta-llama-Llama-3.3-70B-Instruct.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + // fireworks-ai forces a trailing assistant header even without add_generation_prompt, + // so the marker is absorbed into the common suffix and assistant_start is detected as empty. + { "fireworks-ai-llama-3-firefunction-v2.jinja", "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" }, + + // Phi/GLM/Apriel-style: <|user|> / <|assistant|> + { "microsoft-Phi-3.5-mini-instruct.jinja", "<|user|>", "<|assistant|>" }, + { "GLM-4.6.jinja", "<|user|>", "<|assistant|>" }, + { "unsloth-Apriel-1.5.jinja", "<|user|>", "<|assistant|>" }, + { "GLM-4.7-Flash.jinja", "<|user|>", "<|assistant|>" }, + + // Gemma 2: {user|model} + { "google-gemma-2-2b-it.jinja", "user", "model" }, + + // IBM Granite + { "ibm-granite-granite-3.3-2B-Instruct.jinja", "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" }, + { "ibm-granite-granite-4.0.jinja", "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" }, + + // Cohere R-series + { "CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja", + "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_RESPONSE|>" }, + { "CohereForAI-c4ai-command-r-plus-tool_use.jinja", + "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" }, + + // Mistral: assistant content follows [/INST] immediately, no header + { "mistralai-Mistral-Nemo-Instruct-2407.jinja", "[INST]", "" }, + { "Mistral-Small-3.2-24B-Instruct-2506.jinja", "[INST]", "" }, + + // Apertus uses <|user_start|> / <|assistant_start|> but the user diff + // carries the preceding <|assistant_end|> from the previous turn. + { "Apertus-8B-Instruct.jinja", "<|user_start|>", "<|assistant_start|>" }, + + // Apriel 1.6 wraps the assistant body with <|begin_assistant|>, but + // <|begin_assistant|> is also the detected reasoning start, so the + // assistant_start is trimmed back to the preceding newline. + { "Apriel-1.6-15b-Thinker-fixed.jinja", "<|begin_user|>", "<|begin_assistant|>" }, + + // ByteDance Seed-OSS: {role} + { "ByteDance-Seed-OSS.jinja", "user", "assistant" }, + + // GigaChat 3.1: {role}<|role_sep|> + { "GigaChat3.1-10B-A1.8B.jinja", "user<|role_sep|>", "assistant<|role_sep|>" }, + + // MiniMax M2: ]~b]{user|ai} + { "MiniMax-M2.jinja", "]~b]user", "]~b]ai" }, + + // Nemotron Nano v2: {User|Assistant}; assistant marker + // is followed by a prefilled block that gets included. + { "NVIDIA-Nemotron-Nano-v2.jinja", "User", "Assistant" }, + + // Reka Edge: "human: " / "assistant: " — but the rendered preamble + // depends on enable_thinking, which currently confuses the user-start + // diff and trims the marker down. Lock in the observed value. + { "Reka-Edge.jinja", "human:", "assistant:" }, + + // RWKV-world chat preset: "User: " / "Assistant: " + { "llama-cpp-rwkv-world.jinja", "User:", "Assistant:" }, + + // Upstage Solar 100B: <|begin|>{role}... but reasoning marker absorbs + // the "<|begin|>assistant" prefix from assistant_start. + { "upstage-Solar-Open-100B.jinja", "<|begin|>user<|content|>", "<|begin|>assistant" }, + }; + + for (const auto & c : cases) { + t.test(c.template_file, [&](testing & t) { + common_chat_template tmpl = load_template(t, "models/templates/" + c.template_file); + struct autoparser ap; + ap.analyze_template(tmpl); + t.assert_equal("user_start", c.expected_user_start, ap.user_start); + t.assert_equal("assistant_start", c.expected_assistant_start, ap.assistant_start); + }); + } +} + // Test that reproduces the Seed-OSS template issue with embedded quotes static void test_tagged_args_with_embedded_quotes(testing & t) { json tools = build_edit_tool(); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a428ef35c18..1a5161cc101 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1548,6 +1548,40 @@ static void test_msgs_oaicompat_json_conversion() { } } +static void test_split_by_role() { + LOG_DBG("%s\n", __func__); + + // Empty inputs + assert_equals(0, common_chat_split_by_role("", {}).size()); + assert_equals(0, common_chat_split_by_role("hello", {}).size()); + assert_equals(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size()); + + // Multi-role conversation, no leading/trailing content + { + const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye"; + const auto splits = common_chat_split_by_role(prompt, { + { "user", "<|user|>" }, + { "assistant", "<|assistant|>" }, + }); + assert_equals(3, splits.size()); + + assert_equals("user", splits[0].role); + assert_equals(0, splits[0].pos); + assert_equals(10, splits[0].len); + assert_equals("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len)); + + assert_equals("assistant", splits[1].role); + assert_equals(10, splits[1].pos); + assert_equals(18, splits[1].len); + assert_equals("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len)); + + assert_equals("user", splits[2].role); + assert_equals(28, splits[2].pos); + assert_equals(11, splits[2].len); + assert_equals("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len)); + } +} + static void test_tools_oaicompat_json_conversion() { LOG_DBG("%s\n", __func__); std::vector tools{ @@ -4338,16 +4372,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) { // Format: [{"name": "func", "arguments": {...}}] { auto tst = peg_tester("models/templates/NVIDIA-Nemotron-Nano-v2.jinja", detailed_debug); - tst.test("[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]") + tst.test("[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]") .tools({ special_function_tool }) .expect(message_assist_call) .run(); // Continuation tests tst.test("world!\nWhat's up?") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) .messages({ message_user, message_assist_prefill_content }) .add_generation_prompt(false) .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT) + .expect_reasoning("I'm thinking") .expect_content("Hello, world!\nWhat's up?") .run(); } @@ -5593,6 +5630,7 @@ int main(int argc, char ** argv) { { test_msg_diffs_compute(); test_msgs_oaicompat_json_conversion(); + test_split_by_role(); test_tools_oaicompat_json_conversion(); test_convert_responses_to_chatcmpl(); test_developer_role_to_system_workaround(); diff --git a/tools/cli/README.md b/tools/cli/README.md index bab65d50556..add4021e2a0 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -147,7 +147,6 @@ | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | -| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | diff --git a/tools/server/README.md b/tools/server/README.md index f2f73f6dc5f..0b7f9f994c2 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -163,7 +163,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | -| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | +| `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)
(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CACHE_IDLE_SLOTS) | diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index dc00edfa82a..fb71792fe6d 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1110,6 +1110,16 @@ json oaicompat_chat_params_parse( llama_params["chat_parser"] = chat_params.parser; } + llama_params["message_spans"] = json::array(); + + for (const auto & span : chat_params.message_spans) { + llama_params["message_spans"].push_back({ + { "role", span.role }, + { "pos", span.pos }, + { "len", span.len }, + }); + } + // Reasoning budget: pass parameters through to sampling layer { int reasoning_budget = opt.reasoning_budget; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c3daafd0d92..9fecc4247f5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1103,6 +1103,13 @@ struct server_context_impl { } SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); + if (params_base.n_ctx_checkpoints > 0) { + SRV_INF("context checkpoints enabled, max = %d, min spacing = %d\n", + params_base.n_ctx_checkpoints, params_base.checkpoint_min_step); + } else { + SRV_INF("%s", "context checkpoints disabled\n"); + } + if (!params_base.model_alias.empty()) { // backward compat: use first alias as model name model_name = *params_base.model_alias.begin(); @@ -2758,8 +2765,6 @@ struct server_context_impl { } if (pos_min >= pos_min_thold) { - SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa); - // search for a context checkpoint const auto it = std::find_if( slot.prompt.checkpoints.rbegin(), @@ -2776,7 +2781,6 @@ struct server_context_impl { if (!do_reset) { // restore the context checkpoint - it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); @@ -2912,6 +2916,9 @@ struct server_context_impl { has_mtmd = true; } + const int32_t n_before_user = slot.task->params.n_before_user; + const bool n_before_user_known = n_before_user > 0; + // add prompt tokens for processing in the current batch while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { // get next token to process @@ -2940,6 +2947,13 @@ struct server_context_impl { slot.n_prompt_tokens_processed++; + // stop the prompt batch exactly before the latest user input, so a checkpoint + // can be created after the previous messages + if (n_before_user_known && + slot.prompt.n_tokens() == n_before_user) { + break; + } + // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. // create checkpoints that many tokens before the end of the prompt: // - 4 + n_ubatch @@ -2965,6 +2979,8 @@ struct server_context_impl { // the number of tokens added to the batch for the current slot const auto n_tokens_cur = batch.n_tokens - n_tokens_prev; + const bool near_prompt_end = slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch; + // entire prompt has been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { slot.state = SLOT_STATE_DONE_PROMPT; @@ -2979,39 +2995,49 @@ struct server_context_impl { slot.init_sampler(); } else { - if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) { - // near the end of the prompt - do_checkpoint = do_checkpoint && true; - } else { - // only do non-end checkpoints if the "checkpoint every n tokens" option is set - do_checkpoint = do_checkpoint && params_base.checkpoint_every_nt > 0; - - if (do_checkpoint) { - llama_pos last_checkpoint = 0; - if (!slot.prompt.checkpoints.empty()) { - last_checkpoint = slot.prompt.checkpoints.back().n_tokens; - } - - do_checkpoint = do_checkpoint && slot.prompt.n_tokens() - batch.n_tokens - last_checkpoint >= params_base.checkpoint_every_nt; - - if (do_checkpoint) { - SLT_INF(slot, "%d tokens since last checkpoint at %d, creating new checkpoint during processing at position %d\n", params_base.checkpoint_every_nt, last_checkpoint, slot.prompt.n_tokens()); - } - } + // skip ordinary mid-prompt checkpoints + if (!n_before_user_known && !near_prompt_end) { + do_checkpoint = false; } } const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id); - // no need for empty or small checkpoints - do_checkpoint = do_checkpoint && (pos_min >= 0 && slot.prompt.n_tokens() >= 64); + // checkpoints are created before the current batch is decoded, so + // their token position is the batch start rather than the prompt end + const int32_t n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur; + + { + const bool is_on_user = + n_before_user_known && + n_tokens_start == n_before_user; + + const bool is_after_user = + n_before_user_known && + n_tokens_start > n_before_user; + + const bool is_allowed = + !n_before_user_known || + is_on_user || + (is_after_user && near_prompt_end); + + if (do_checkpoint && !is_allowed) { + do_checkpoint = false; + } + } + + // nothing to checkpoint yet + // TODO: is this check needed? + if (do_checkpoint && pos_min < 0) { + do_checkpoint = false; + } // do not checkpoint after mtmd chunks do_checkpoint = do_checkpoint && !has_mtmd; // no need to create checkpoints that are too close together - do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64); + do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step); SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max); // note: we create the checkpoint before calling llama_decode(), so the current batch is not @@ -3528,6 +3554,53 @@ void server_context::on_sleeping_changed(std::function callback) { impl->queue_tasks.on_sleeping_state(std::move(callback)); } +// compute the number of tokens before the last user message in the prompt +static int32_t prompt_get_n_before_user( + const json & message_spans, + const std::string & prompt, + const std::vector & files, + const llama_vocab * vocab, + mtmd_context * mctx) { + int32_t result = -1; + int32_t byte_pos = -1; + + for (const auto & span : message_spans) { + const std::string role = json_value(span, "role", std::string()); + + if (role == "user") { + byte_pos = json_value(span, "pos", -1); + } + } + + if (byte_pos >= 0) { + GGML_ASSERT((size_t) byte_pos <= prompt.size()); + + const std::string prefix = prompt.substr(0, (size_t) byte_pos); + + const std::string marker = get_media_marker(); + size_t n_prefix_media = 0; + for (size_t pos = 0; (pos = prefix.find(marker, pos)) != std::string::npos; pos += marker.size()) { + n_prefix_media++; + } + + GGML_ASSERT(n_prefix_media <= files.size()); + + if (mctx != nullptr && n_prefix_media > 0) { + // TODO: this makes a copy - avoid it + std::vector prefix_files(files.begin(), files.begin() + n_prefix_media); + + result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size(); + } else { + result = (int32_t) tokenize_input_prompts(vocab, nullptr, prefix, true, true)[0].size(); + } + + SRV_TRC("message_spans: last user message: byte_pos=%d, media=%zu, n_before_user=%d\n", + byte_pos, n_prefix_media, result); + } + + return result; +} + // // server_routes @@ -3577,6 +3650,18 @@ std::unique_ptr server_routes::handle_completions_impl( meta->slot_n_ctx, meta->logit_bias_eog, data); + + const auto message_spans = json_value(data, "message_spans", json::array()); + if (prompt.is_string() && message_spans.is_array()) { + task.params.n_before_user = + prompt_get_n_before_user( + message_spans, + prompt.get(), + files, + ctx_server.vocab, + ctx_server.mctx); + } + task.id_slot = json_value(data, "id_slot", -1); // OAI-compat diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 0978bb6ff16..60e216e7927 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -61,6 +61,9 @@ struct task_params { int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled) + // number of prompt tokens before the latest user message + int32_t n_before_user = -1; + int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit From 9627d0f5407f87ce7360e6819d39cbc0fbd1f15e Mon Sep 17 00:00:00 2001 From: "Alessandro de Oliveira Faria (A.K.A.CABELO)" Date: Mon, 25 May 2026 03:45:22 -0300 Subject: [PATCH 06/21] vendor : update cpp-httplib to 0.45.1 (#23639) --- scripts/sync_vendor.py | 2 +- vendor/cpp-httplib/httplib.cpp | 65 +++++++++++++++++++++------------- vendor/cpp-httplib/httplib.h | 28 +++++++++------ 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 658f7326b96..5ac8843570b 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import sys import subprocess -HTTPLIB_VERSION = "refs/tags/v0.45.0" +HTTPLIB_VERSION = "refs/tags/v0.45.1" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index b28549607a2..4ac497d0387 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -1567,7 +1567,7 @@ void mmap::close() { #endif size_ = 0; } -int close_socket(socket_t sock) { +int close_socket(socket_t sock) noexcept { #ifdef _WIN32 return closesocket(sock); #else @@ -1794,7 +1794,7 @@ bool process_client_socket( return callback(strm); } -int shutdown_socket(socket_t sock) { +int shutdown_socket(socket_t sock) noexcept { #ifdef _WIN32 return shutdown(sock, SD_BOTH); #else @@ -7149,7 +7149,7 @@ void Server::wait_until_ready() const { } } -void Server::stop() { +void Server::stop() noexcept { if (is_running_) { assert(svr_sock_ != INVALID_SOCKET); std::atomic sock(svr_sock_.exchange(INVALID_SOCKET)); @@ -12290,9 +12290,18 @@ bool enumerate_windows_system_certs(Callback cb) { template bool enumerate_macos_keychain_certs(Callback cb) { bool loaded = false; - CFArrayRef certs = nullptr; - OSStatus status = SecTrustCopyAnchorCertificates(&certs); - if (status == errSecSuccess && certs) { + const SecTrustSettingsDomain domains[] = { + kSecTrustSettingsDomainSystem, + kSecTrustSettingsDomainAdmin, + kSecTrustSettingsDomainUser, + }; + for (auto domain : domains) { + CFArrayRef certs = nullptr; + OSStatus status = SecTrustSettingsCopyCertificates(domain, &certs); + if (status != errSecSuccess || !certs) { + if (certs) CFRelease(certs); + continue; + } CFIndex count = CFArrayGetCount(certs); for (CFIndex i = 0; i < count; i++) { SecCertificateRef cert = @@ -12655,28 +12664,36 @@ bool load_system_certs(ctx_t ctx) { auto store = SSL_CTX_get_cert_store(ssl_ctx); if (!store) return false; - CFArrayRef certs = nullptr; - if (SecTrustCopyAnchorCertificates(&certs) != errSecSuccess || !certs) { - return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1; - } - bool loaded_any = false; - auto count = CFArrayGetCount(certs); - for (CFIndex i = 0; i < count; i++) { - auto cert = reinterpret_cast( - const_cast(CFArrayGetValueAtIndex(certs, i))); - CFDataRef der = SecCertificateCopyData(cert); - if (der) { - const unsigned char *data = CFDataGetBytePtr(der); - auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der)); - if (x509) { - if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; } - X509_free(x509); + const SecTrustSettingsDomain domains[] = { + kSecTrustSettingsDomainSystem, + kSecTrustSettingsDomainAdmin, + kSecTrustSettingsDomainUser, + }; + for (auto domain : domains) { + CFArrayRef certs = nullptr; + if (SecTrustSettingsCopyCertificates(domain, &certs) != errSecSuccess || + !certs) { + if (certs) CFRelease(certs); + continue; + } + auto count = CFArrayGetCount(certs); + for (CFIndex i = 0; i < count; i++) { + auto cert = reinterpret_cast( + const_cast(CFArrayGetValueAtIndex(certs, i))); + CFDataRef der = SecCertificateCopyData(cert); + if (der) { + const unsigned char *data = CFDataGetBytePtr(der); + auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der)); + if (x509) { + if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; } + X509_free(x509); + } + CFRelease(der); } - CFRelease(der); } + CFRelease(certs); } - CFRelease(certs); return loaded_any || SSL_CTX_set_default_verify_paths(ssl_ctx) == 1; #else return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1; diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index af750cdd905..536f0cb4d5e 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.45.0" -#define CPPHTTPLIB_VERSION_NUM "0x002d00" +#define CPPHTTPLIB_VERSION "0.45.1" +#define CPPHTTPLIB_VERSION_NUM "0x002d01" #ifdef _WIN32 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00 @@ -339,16 +339,26 @@ using socket_t = int; #include // On macOS with a TLS backend, enable Keychain root certificates by default -// unless the user explicitly opts out. +// unless the user explicitly opts out. Not enabled on iOS/tvOS/watchOS since +// the SecTrustSettings APIs used to enumerate anchor certificates are macOS +// only; on those platforms the user must provide a CA bundle explicitly. #if defined(__APPLE__) && defined(__clang__) && \ !defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) && \ (defined(CPPHTTPLIB_OPENSSL_SUPPORT) || \ defined(CPPHTTPLIB_MBEDTLS_SUPPORT) || \ defined(CPPHTTPLIB_WOLFSSL_SUPPORT)) +#if TARGET_OS_OSX #ifndef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN #define CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN #endif #endif +#endif + +#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && \ + defined(__APPLE__) && !TARGET_OS_OSX +#error \ + "CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN is only supported on macOS. On iOS/tvOS/watchOS, supply a CA bundle via set_ca_cert_path()." +#endif // On Windows, enable Schannel certificate verification by default // unless the user explicitly opts out. @@ -382,7 +392,7 @@ using socket_t = int; #endif // _WIN32 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN -#if TARGET_OS_MAC +#if TARGET_OS_OSX #include #endif #endif @@ -430,7 +440,7 @@ using socket_t = int; #endif #endif // _WIN32 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN -#if TARGET_OS_MAC +#if TARGET_OS_OSX #include #endif #endif @@ -473,7 +483,7 @@ using socket_t = int; #endif #endif // _WIN32 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN -#if TARGET_OS_MAC +#if TARGET_OS_OSX #include #endif #endif @@ -1597,7 +1607,7 @@ class RegexMatcher final : public MatcherBase { std::regex regex_; }; -int close_socket(socket_t sock); +int close_socket(socket_t sock) noexcept; ssize_t write_headers(Stream &strm, const Headers &headers); @@ -1734,7 +1744,7 @@ class Server { bool is_running() const; void wait_until_ready() const; - void stop(); + void stop() noexcept; void decommission(); std::function new_task_queue; @@ -3028,8 +3038,6 @@ bool parse_range_header(const std::string &s, Ranges &ranges); bool parse_accept_header(const std::string &s, std::vector &content_types); -int close_socket(socket_t sock); - ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags); ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags); From b96487645cb902551bdefb905e29261d0b0971bf Mon Sep 17 00:00:00 2001 From: Saba Fallah <10401143+sfallah@users.noreply.github.com> Date: Mon, 25 May 2026 08:50:41 +0200 Subject: [PATCH 07/21] ui: media attachments before text (#23467) * ui: media attachments before text * fix prettier formatting --- tools/ui/src/lib/constants/image-size.ts | 2 +- tools/ui/src/lib/services/chat.service.ts | 15 +++++++-------- tools/ui/src/lib/utils/cap-img-size.ts | 3 +-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/ui/src/lib/constants/image-size.ts b/tools/ui/src/lib/constants/image-size.ts index 0d0c4e48434..bcc1da08b19 100644 --- a/tools/ui/src/lib/constants/image-size.ts +++ b/tools/ui/src/lib/constants/image-size.ts @@ -1 +1 @@ -export const MEGAPIXELS_TO_PIXELS = 1_000_000; \ No newline at end of file +export const MEGAPIXELS_TO_PIXELS = 1_000_000; diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts index 5b7b7e8229d..3c9ca74796d 100644 --- a/tools/ui/src/lib/services/chat.service.ts +++ b/tools/ui/src/lib/services/chat.service.ts @@ -879,14 +879,6 @@ export class ChatService { }); } - if (message.content) { - contentParts.push({ - type: ContentPartType.TEXT, - text: message.content - }); - } - - // Include images from all messages const imageFiles = message.extra.filter( (extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile => extra.type === AttachmentType.IMAGE @@ -919,6 +911,13 @@ export class ChatService { }); } + if (message.content) { + contentParts.push({ + type: ContentPartType.TEXT, + text: message.content + }); + } + const videoFiles = message.extra.filter( (extra: DatabaseMessageExtra): extra is DatabaseMessageExtraVideoFile => extra.type === AttachmentType.VIDEO diff --git a/tools/ui/src/lib/utils/cap-img-size.ts b/tools/ui/src/lib/utils/cap-img-size.ts index fa2af53f6fd..1ba218316b7 100644 --- a/tools/ui/src/lib/utils/cap-img-size.ts +++ b/tools/ui/src/lib/utils/cap-img-size.ts @@ -14,9 +14,8 @@ export function capImageDataURLSize( ): Promise { return new Promise((resolve, reject) => { try { - const mimeMatch = base64UrlImage.match(BASE64_IMAGE_URI_REGEX); - + if (!mimeMatch) { return reject(new Error('Invalid data URL format.')); } From 826539ce590fe294642db0acd54ea5e0a2fcd739 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 25 May 2026 02:15:46 -0500 Subject: [PATCH 08/21] ggml : Parallelize quant LUT init (#23595) - Use OpenMP to parallelize iq2xs_init_impl and iq3xs_init_impl. - Move the OpenMP detection from ggml-cpu to ggml-base. - Update OpenMP dependencies in ggml-config.cmake.in. --- ggml/cmake/ggml-config.cmake.in | 14 +- ggml/src/CMakeLists.txt | 17 ++ ggml/src/ggml-cpu/CMakeLists.txt | 14 +- ggml/src/ggml-quants.c | 328 ++++++++++++++++++++----------- 4 files changed, 246 insertions(+), 127 deletions(-) diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 91c9d5cd343..23a3066f56d 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -6,6 +6,7 @@ include(CMakeFindDependencyMacro) find_dependency(Threads) if (NOT GGML_SHARED_LIB) + set(GGML_BASE_INTERFACE_LINK_LIBRARIES "") set(GGML_CPU_INTERFACE_LINK_LIBRARIES "") set(GGML_CPU_INTERFACE_LINK_OPTIONS "") @@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB) if (GGML_OPENMP_ENABLED) find_dependency(OpenMP) - list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "") + if (TARGET OpenMP::OpenMP_C) + list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C) + endif() + if (TARGET OpenMP::OpenMP_CXX) + list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX) + endif() + list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES}) + list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES}) endif() if (GGML_CPU_HBM) @@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml) add_library(ggml::ggml-base UNKNOWN IMPORTED) set_target_properties(ggml::ggml-base PROPERTIES - IMPORTED_LOCATION "${GGML_BASE_LIBRARY}") + IMPORTED_LOCATION "${GGML_BASE_LIBRARY}" + INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}") set(_ggml_all_targets "") if (NOT GGML_BACKEND_DL) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 3e48860bfc8..c26c3f1470d 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC) target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC) endif() +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "") + else() + set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") + message(WARNING "OpenMP not found") + endif() +else() + set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") +endif() + +if (GGML_OPENMP_ENABLED) + target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP) + target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) +endif() + add_library(ggml ggml-backend-dl.cpp ggml-backend-reg.cpp) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index f3eccff7d72..8c735a045b3 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() - if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "") - target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - - target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "") - message(WARNING "OpenMP not found") - endif() + if (GGML_OPENMP_ENABLED) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) endif() if (GGML_LLAMAFILE) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15443aa554a..15d231f70c0 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -13,6 +13,10 @@ #include // for qsort #include // for GGML_ASSERT +#ifdef GGML_USE_OPENMP +#include +#endif + #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f @@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) { } kmap_q2xs[index] = i; } - int8_t pos[8]; - int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + // The neighbour search runs in three passes: + // 1. Parallel: for each i, qsort and count its neighbours into n_per_i, + // and reduce the totals (num_neighbors, num_not_in_map). + // 2. Serial: prefix-sum n_per_i into offsets[], so each i has a + // pre-assigned slice of kneighbors_q2xs to write into. + // 3. Parallel: redo the qsort and write each i's neighbour list at + // offsets[i]. + int * n_per_i = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(n_per_i); int num_neighbors = 0, num_not_in_map = 0; - for (int i = 0; i < kmap_size; ++i) { - if (kmap_q2xs[i] >= 0) continue; - ++num_not_in_map; - for (int k = 0; k < 8; ++k) { - int l = (i >> 2*k) & 0x3; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); - int d2 = 0; - for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); - int n = 0; int d2 = dist2[0]; - int nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - ++n; - } - num_neighbors += n; +#ifdef GGML_USE_OPENMP + #pragma omp parallel reduction(+:num_neighbors,num_not_in_map) +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[8]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) { + n_per_i[i] = 0; + continue; + } + ++num_not_in_map; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + n_per_i[i] = n; + num_neighbors += n; + } + free(dist2); } //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq2_data[gindex].neighbours = kneighbors_q2xs; + + int * offsets = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(offsets); int counter = 0; for (int i = 0; i < kmap_size; ++i) { - if (kmap_q2xs[i] >= 0) continue; - for (int k = 0; k < 8; ++k) { - int l = (i >> 2*k) & 0x3; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); - int d2 = 0; - for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); - kmap_q2xs[i] = -(counter + 1); - int d2 = dist2[0]; - uint16_t * start = &kneighbors_q2xs[counter++]; - int n = 0, nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - kneighbors_q2xs[counter++] = dist2[2*j+1]; - ++n; - } - *start = n; - } - free(dist2); + if (kmap_q2xs[i] >= 0) { + offsets[i] = -1; + continue; + } + offsets[i] = counter; + counter += 1 + n_per_i[i]; + } + +#ifdef GGML_USE_OPENMP + #pragma omp parallel +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[8]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q2xs[i] >= 0) continue; + for (int k = 0; k < 8; ++k) { + int l = (i >> 2*k) & 0x3; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + j); + int d2 = 0; + for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func); + int local_counter = offsets[i]; + kmap_q2xs[i] = -(local_counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q2xs[local_counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q2xs[local_counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); + } + free(offsets); + free(n_per_i); } void iq2xs_free_impl(enum ggml_type type) { @@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) { } kmap_q3xs[index] = i; } - int8_t pos[4]; - int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + // See explanation of parallelism in iq2xs_init_impl + int * n_per_i = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(n_per_i); int num_neighbors = 0, num_not_in_map = 0; - for (int i = 0; i < kmap_size; ++i) { - if (kmap_q3xs[i] >= 0) continue; - ++num_not_in_map; - for (int k = 0; k < 4; ++k) { - int l = (i >> 3*k) & 0x7; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); - int d2 = 0; - for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); - int n = 0; int d2 = dist2[0]; - int nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - ++n; - } - num_neighbors += n; +#ifdef GGML_USE_OPENMP + #pragma omp parallel reduction(+:num_neighbors,num_not_in_map) +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[4]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) { + n_per_i[i] = 0; + continue; + } + ++num_not_in_map; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + n_per_i[i] = n; + num_neighbors += n; + } + free(dist2); } //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq3_data[gindex].neighbours = kneighbors_q3xs; + + int * offsets = (int *)malloc(kmap_size*sizeof(int)); + GGML_ASSERT(offsets); int counter = 0; for (int i = 0; i < kmap_size; ++i) { - if (kmap_q3xs[i] >= 0) continue; - for (int k = 0; k < 4; ++k) { - int l = (i >> 3*k) & 0x7; - pos[k] = 2*l + 1; - } - for (int j = 0; j < grid_size; ++j) { - const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); - int d2 = 0; - for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); - dist2[2*j+0] = d2; - dist2[2*j+1] = j; - } - qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); - kmap_q3xs[i] = -(counter + 1); - int d2 = dist2[0]; - uint16_t * start = &kneighbors_q3xs[counter++]; - int n = 0, nhave = 1; - for (int j = 0; j < grid_size; ++j) { - if (dist2[2*j] > d2) { - if (nhave == nwant) break; - d2 = dist2[2*j]; - ++nhave; - } - kneighbors_q3xs[counter++] = dist2[2*j+1]; - ++n; - } - *start = n; - } - free(dist2); + if (kmap_q3xs[i] >= 0) { + offsets[i] = -1; + continue; + } + offsets[i] = counter; + counter += 1 + n_per_i[i]; + } + +#ifdef GGML_USE_OPENMP + #pragma omp parallel +#endif + { + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + GGML_ASSERT(dist2); + int8_t pos[4]; + int i; +#ifdef GGML_USE_OPENMP + #pragma omp for schedule(dynamic, 64) +#endif + for (i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int local_counter = offsets[i]; + kmap_q3xs[i] = -(local_counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q3xs[local_counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q3xs[local_counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); + } + free(offsets); + free(n_per_i); } void iq3xs_free_impl(int grid_size) { From d55fb9717459c4a61ae372d57c5214e1d791e4d2 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Mon, 25 May 2026 03:18:08 -0400 Subject: [PATCH 09/21] ci : install host compiler on android-ndk build (#23630) --- .github/workflows/build-android.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml index 5d88305a4f0..f4c3fd669de 100644 --- a/.github/workflows/build-android.yml +++ b/.github/workflows/build-android.yml @@ -73,6 +73,11 @@ jobs: fetch-depth: 0 lfs: false + - name: Dependencies + run: | + apt-get update + apt-get install -y build-essential + - name: Build id: ndk_build run: | From 314e729347defd9851e857f78084160c5786a7d8 Mon Sep 17 00:00:00 2001 From: Tim Neumann Date: Mon, 25 May 2026 09:29:28 +0200 Subject: [PATCH 10/21] llama : document that only one on-device state can be saved per sequence (#23520) --- include/llama.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 75095b22d08..e8374c53b70 100644 --- a/include/llama.h +++ b/include/llama.h @@ -874,7 +874,8 @@ extern "C" { // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1 -// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load). +// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag. #define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2 typedef uint32_t llama_state_seq_flags; From 062d3115aa12267d34d7ac5b78bd5d0ad48778ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 25 May 2026 10:41:25 +0200 Subject: [PATCH 11/21] ci : fix pre-tokenizer-hashes check (#23651) --- .github/workflows/pre-tokenizer-hashes.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml index fce70d03e52..3e440b67d9b 100644 --- a/.github/workflows/pre-tokenizer-hashes.yml +++ b/.github/workflows/pre-tokenizer-hashes.yml @@ -3,11 +3,11 @@ name: Check Pre-Tokenizer Hashes on: push: paths: - - 'convert_hf_to_gguf.py' + - 'conversion/base.py' - 'convert_hf_to_gguf_update.py' pull_request: paths: - - 'convert_hf_to_gguf.py' + - 'conversion/base.py' - 'convert_hf_to_gguf_update.py' jobs: @@ -30,16 +30,16 @@ jobs: - name: Update pre-tokenizer hashes run: | - cp convert_hf_to_gguf.py /tmp + cp conversion/base.py /tmp .venv/bin/python convert_hf_to_gguf_update.py --check-missing - name: Check if committed pre-tokenizer hashes matches generated version run: | - if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then - echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)." - echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes" + if ! diff -q conversion/base.py /tmp/base.py; then + echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)." + echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes" echo "Differences found:" - diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true + diff conversion/base.py /tmp/base.py || true exit 1 fi echo "Model pre-tokenizer hashes are up to date." From 5fdf07e33bf35b8e6fbd33c393c0260fe208485a Mon Sep 17 00:00:00 2001 From: alex-spacemit Date: Mon, 25 May 2026 16:43:24 +0800 Subject: [PATCH 12/21] ci : update spacemit toolchain url and enhance curl command (#23642) * fix(action): update SpacemiT toolchain URL and version Change-Id: If4cc1c738a855274103f8c3ad52daa33528acd0c * fix(action): add -L flag to curl command for URL redirection Change-Id: I9b6c37390f0c7a733a36308c8fb53d22d234ab06 --- .github/actions/linux-setup-spacemit/action.yml | 2 +- .github/actions/unarchive-tar/action.yml | 2 +- .github/workflows/build-cross.yml | 2 +- docs/build-riscv64-spacemit.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/linux-setup-spacemit/action.yml b/.github/actions/linux-setup-spacemit/action.yml index e2193e8931d..39e405b6779 100644 --- a/.github/actions/linux-setup-spacemit/action.yml +++ b/.github/actions/linux-setup-spacemit/action.yml @@ -15,6 +15,6 @@ runs: id: setup uses: ./.github/actions/unarchive-tar with: - url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz + url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz path: ${{ inputs.path }} strip: 1 diff --git a/.github/actions/unarchive-tar/action.yml b/.github/actions/unarchive-tar/action.yml index b97e402f46a..3d2f9be7bdd 100644 --- a/.github/actions/unarchive-tar/action.yml +++ b/.github/actions/unarchive-tar/action.yml @@ -24,4 +24,4 @@ runs: run: | mkdir -p ${{ inputs.path }} cd ${{ inputs.path }} - curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }} + curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }} diff --git a/.github/workflows/build-cross.yml b/.github/workflows/build-cross.yml index 97c7368387e..845e88fe43b 100644 --- a/.github/workflows/build-cross.yml +++ b/.github/workflows/build-cross.yml @@ -277,7 +277,7 @@ jobs: env: # Make sure this is in sync with build-cache.yml - SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2" + SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4" steps: - uses: actions/checkout@v6 diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md index 08301d51cae..7a9a1f3ad4e 100644 --- a/docs/build-riscv64-spacemit.md +++ b/docs/build-riscv64-spacemit.md @@ -5,7 +5,7 @@ 1. Prepare Toolchain For RISCV ~~~ -wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz +wget https://github.com/spacemit-com/toolchain/releases/download/v1.2.4/spacemit-toolchain-linux-glibc-x86_64-v1.2.4.tar.xz ~~~ 2. Build From 6c4cbdc70b83ac054106e9de3ebc2ecaa82c4b1f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 25 May 2026 16:46:23 +0800 Subject: [PATCH 13/21] server: MTP layer kv-cache should respect draft type ctk (#23646) --- tools/server/server-context.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9fecc4247f5..ae9e0bf60d8 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -822,6 +822,8 @@ struct server_context_impl { auto cparams_dft = common_context_params_to_llama(params_dft); if (spec_mtp) { cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_dft.type_k = params_base.speculative.draft.cache_type_k; + cparams_dft.type_v = params_base.speculative.draft.cache_type_v; } cparams_dft.n_rs_seq = 0; @@ -940,6 +942,8 @@ struct server_context_impl { auto cparams_mtp = common_context_params_to_llama(params_base); cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_mtp.type_k = params_base.speculative.draft.cache_type_k; + cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; cparams_mtp.n_rs_seq = 0; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); From 66efd13375bb0dfb28730d8c4b7d5c57cfd2c22d Mon Sep 17 00:00:00 2001 From: "Gilad S." <7817232+giladgd@users.noreply.github.com> Date: Mon, 25 May 2026 11:33:29 +0200 Subject: [PATCH 14/21] ggml: `gguf_init_from_callback` and `gguf_init_from_buffer` (#22341) * ggml: implement `gguf_init_from_buffer` * test: `gguf_init_from_buffer` * fix: memory breakdown for a model loaded with `no_alloc` from a file is consistent with being loaded from a buffer * fix: use `GGML_UNUSED` Co-authored-by: Copilot * fix: remove `total_size` from `gguf_reader` * fix: file offset calculation, rename `offset` to `data_offset` Co-authored-by: Copilot * refactor: extract model loader bug fixes to another PR * feat: add `gguf_init_from_callback` * fix: always require a max expected size * fix: change `gguf_reader_callback_t`'s `output` type to `void *`, change `max_expected_size` and offsets to `uint64_t` * fix: harden against offset overflow in buffer read * fix: remove seek behavior from the callback * feat: `max_chunk_read == 0` means `SIZE_MAX` * fix: seeking in a gguf file with no tensors --------- Co-authored-by: Copilot --- ggml/include/gguf.h | 10 ++- ggml/src/gguf.cpp | 178 ++++++++++++++++++++++++++++++++++++++------ tests/test-gguf.cpp | 90 ++++++++++++++++++++-- 3 files changed, 248 insertions(+), 30 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 02d5f221c03..67851ba6f16 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -76,10 +76,16 @@ extern "C" { struct ggml_context ** ctx; }; + // callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read + typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len); + GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); + GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params); + + // max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit + GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params); GGML_API void gguf_free(struct gguf_context * ctx); @@ -87,7 +93,7 @@ extern "C" { GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx); GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); - GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx); GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index ab3cc974867..5e198618251 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -228,9 +228,18 @@ struct gguf_context { }; struct gguf_reader { - gguf_reader(FILE * file) : file(file) { - // read the remaining bytes once and update on each read - nbytes_remain = file_remain(file); + gguf_reader( + gguf_reader_callback_t callback, + void * userdata, + size_t max_chunk_read, + uint64_t data_offset = 0, + uint64_t nbytes_remain = 0) + : callback(callback), + userdata(userdata), + max_chunk_read(max_chunk_read), + data_offset(data_offset), + nbytes_remain(nbytes_remain) { + GGML_ASSERT(max_chunk_read > 0); } // helper for remaining bytes in a file @@ -257,12 +266,10 @@ struct gguf_reader { template bool read(T & dst) const { const size_t size = sizeof(dst); - if (nbytes_remain < size) { + if (size > nbytes_remain) { return false; } - const size_t nread = fread(&dst, 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(&dst, size) == size; } template @@ -344,24 +351,71 @@ struct gguf_reader { return false; } dst.resize(static_cast(size)); - const size_t nread = fread(dst.data(), 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(dst.data(), static_cast(size)) == size; } bool read(void * dst, const size_t size) const { if (size > nbytes_remain) { return false; } - const size_t nread = fread(dst, 1, size, file); - nbytes_remain -= nread; - return nread == size; + return read_raw(dst, size) == size; + } + + uint64_t tell() const { + return data_offset; + } + + bool seek(uint64_t absolute_offset) const { + const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain; + if (absolute_offset > end_offset) { + return false; + } + + data_offset = absolute_offset; + nbytes_remain = end_offset - absolute_offset; + + return true; } private: - FILE * file; + size_t read_raw(void * dst, size_t size) const { + if (callback == nullptr || size == 0) { + return 0; + } + + uint8_t * data = static_cast(dst); + size_t total_nread = 0; + bool reached_eof = false; - mutable uint64_t nbytes_remain; + while (total_nread < size) { + const size_t chunk_size = std::min(max_chunk_read, size - total_nread); + if (data_offset + total_nread < data_offset) { + break; + } + const size_t nread = callback(userdata, static_cast(data + total_nread), data_offset + total_nread, chunk_size); + total_nread += nread; + if (nread != chunk_size) { + reached_eof = true; + break; + } + } + + data_offset += total_nread; + GGML_ASSERT(total_nread <= nbytes_remain); + nbytes_remain -= total_nread; + + if (reached_eof) { + nbytes_remain = 0; + } + + return total_nread; + } + + gguf_reader_callback_t callback = nullptr; + void * userdata = nullptr; + size_t max_chunk_read = 0; + mutable uint64_t data_offset = 0; + mutable uint64_t nbytes_remain = 0; }; struct gguf_context * gguf_init_empty(void) { @@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorinfo.size()) == n_tensors); // we require the data section to be aligned, so take into account any padding - if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) { + if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) { GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); gguf_free(ctx); return nullptr; } // store the current file offset - this is where the data section starts - ctx->offset = gguf_ftell(file); + ctx->offset = gr.tell(); // compute the total size of the data section, taking into account the alignment { @@ -844,6 +893,89 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para return ctx; } +struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) { + if (callback == nullptr) { + return nullptr; + } + + const struct gguf_reader gr(callback, userdata, max_chunk_read == 0 ? SIZE_MAX : max_chunk_read, 0, max_expected_size); + return gguf_init_from_reader(gr, params); +} + +struct gguf_file_reader { + FILE * file; + uint64_t offset; +}; + +static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + gguf_file_reader & reader = *static_cast(userdata); + + if (reader.offset != offset) { + if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast(offset), SEEK_SET) != 0) { + return 0; + } + + reader.offset = offset; + } + + const size_t nread = fread(static_cast(output), 1, len, reader.file); + reader.offset += nread; + return nread; +} + +struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) { + if (!file) { + return nullptr; + } + + const int64_t cur = gguf_ftell(file); + if (cur < 0) { + return nullptr; + } + + gguf_file_reader reader = { + /*.file = */ file, + /*.offset = */ static_cast(cur), + }; + const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file)); + return gguf_init_from_reader(gr, params); +} + +struct gguf_buffer_reader { + const uint8_t * data; + size_t size; +}; + +static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + const gguf_buffer_reader & reader = *static_cast(userdata); + + if (offset > reader.size || len > reader.size - offset) { + return 0; + } + + const size_t data_offset = static_cast(offset); + const size_t nread = std::min(len, reader.size - data_offset); + memcpy(static_cast(output), reader.data + data_offset, nread); + return nread; +} + +struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) { + if (data == nullptr || size == 0) { + return nullptr; + } + + gguf_buffer_reader reader = { + /*.data = */ static_cast(data), + /*.size = */ size, + }; + const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size); + return gguf_init_from_reader(gr, params); +} + struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = ggml_fopen(fname, "rb"); diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index ed3070dc4de..1ae468fbd65 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -162,6 +162,42 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) { GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes); } +static std::vector read_file_to_buffer(FILE * file) { + GGML_ASSERT(file != nullptr); + GGML_ASSERT(fseek(file, 0, SEEK_END) == 0); + + const long size = ftell(file); + GGML_ASSERT(size >= 0); + + rewind(file); + + std::vector data(static_cast(size)); + GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size()); + + rewind(file); + return data; +} + +struct callback_reader_data { + const uint8_t * data; + size_t size; +}; + +static size_t read_buffer_callback(void * userdata, void * output, uint64_t offset, size_t len) { + GGML_ASSERT(len > 0); + + const callback_reader_data & reader = *static_cast(userdata); + + if (offset > reader.size || len > reader.size - offset) { + return 0; + } + + const size_t data_offset = static_cast(offset); + const size_t nread = std::min(len, reader.size - data_offset); + memcpy(static_cast(output), reader.data + data_offset, nread); + return nread; +} + static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) { FILE * file = tmpfile(); @@ -1095,10 +1131,29 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml return ok; } -static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { +enum roundtrip_read_mode { + ROUNDTRIP_READ_MODE_FILE, + ROUNDTRIP_READ_MODE_BUFFER, + ROUNDTRIP_READ_MODE_CALLBACK, +}; + +static const char * roundtrip_read_mode_name(const roundtrip_read_mode mode) { + switch (mode) { + case ROUNDTRIP_READ_MODE_FILE: return "file"; + case ROUNDTRIP_READ_MODE_BUFFER: return "buffer"; + case ROUNDTRIP_READ_MODE_CALLBACK: return "callback"; + } + + GGML_ABORT("fatal error"); +} + +static std::pair test_roundtrip( + ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta, + const roundtrip_read_mode read_mode) { ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); - printf("%s: device=%s, backend=%s, only_meta=%s\n", - __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no"); + printf("%s: device=%s, backend=%s, only_meta=%s, read_mode=%s\n", + __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), + only_meta ? "yes" : "no", roundtrip_read_mode_name(read_mode)); int npass = 0; int ntest = 0; @@ -1133,7 +1188,22 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned /*no_alloc =*/ false, /*ctx =*/ only_meta ? nullptr : &ctx_1, }; - struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); + struct gguf_context * gguf_ctx_1 = nullptr; + const std::vector data = read_mode == ROUNDTRIP_READ_MODE_FILE + ? std::vector() + : read_file_to_buffer(file); + + if (read_mode == ROUNDTRIP_READ_MODE_BUFFER) { + gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params); + } else if (read_mode == ROUNDTRIP_READ_MODE_CALLBACK) { + callback_reader_data reader = { + /*.data = */ data.data(), + /*.size = */ data.size(), + }; + gguf_ctx_1 = gguf_init_from_callback(read_buffer_callback, &reader, 4096, 4ull << 30 /* 4GB */, gguf_params); + } else { + gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); + } printf("%s: same_version: ", __func__); if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { @@ -1343,7 +1413,17 @@ int main(int argc, char ** argv) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); for (bool only_meta : {true, false}) { - std::pair result = test_roundtrip(dev, seed, only_meta); + std::pair result = test_roundtrip(dev, seed, only_meta, ROUNDTRIP_READ_MODE_FILE); + npass += result.first; + ntest += result.second; + } + { + std::pair result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_BUFFER); + npass += result.first; + ntest += result.second; + } + { + std::pair result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_CALLBACK); npass += result.first; ntest += result.second; } From ae251b5ff2634108822e0f8bb20ca4cd5c2c5dcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 25 May 2026 11:37:25 +0200 Subject: [PATCH 15/21] TP: fix ggml context size calculation (#22616) * TP: fix ggml context size calculation, memory leak * move split state cache back into the context * revert to constant ggml context size for cgraphs * increase headroom for statically allocated tensors * remove obsolete include --- ggml/src/ggml-backend-meta.cpp | 194 +++++++++++++++++++++++---------- 1 file changed, 137 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp index 5f9ae9c1bc5..d0d64523b4a 100644 --- a/ggml/src/ggml-backend-meta.cpp +++ b/ggml/src/ggml-backend-meta.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type( // meta backend buffer // +// Container to hold the tensor slices per simple ggml backend buffer. +struct ggml_backend_meta_simple_tensor_container { + std::vector ctxs; + std::map> simple_tensors; + + ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) { + ctxs.reserve(n_simple); + for (int i = 0; i < n_simple; i++) { + ctxs.emplace_back(ggml_init(params)); + } + } + ggml_backend_meta_simple_tensor_container() {} +}; + struct ggml_backend_meta_buffer_context { + // FIXME + // Most tensors can simply be stored statically in their own buffer. + // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source. + // If external views are simply using that buffer they will slowly deplete its memory. + // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp. + // Long-term: tie the lifetime of external views to the meta backend executing the graph instead, + // currently not possible due to graph-external operations in the backend scheduler. + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute[2]; + int stc_compute_index = 0; + int stc_compute_index_next = 0; + std::vector bufs; + + // FIXME + // The size of the split state cache is unbounded and can theoretically grow infinitely large. + // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive. static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding); - std::map, std::pair> split_state_cache; - std::map< const ggml_tensor *, std::vector> simple_tensors; - - struct buffer_config { - ggml_context * ctx; - ggml_backend_buffer_t buf; - - buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {} - }; - std::vector buf_configs; int debug; - ggml_backend_meta_buffer_context() { + ggml_backend_meta_buffer_context( + ggml_backend_meta_simple_tensor_container & stc_static, + ggml_backend_meta_simple_tensor_container & stc_compute_0, + ggml_backend_meta_simple_tensor_container & stc_compute_1, + const std::vector & bufs) + : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} { + this->bufs.reserve(bufs.size()); + for (ggml_backend_buffer_t buf : bufs) { + this->bufs.emplace_back(buf); + } const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG"); debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0; } + + ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) { + if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) { + return stc_static; + } + return stc_compute[stc_compute_index]; + } }; static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) { GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - for (auto & [ctx, buf] : buf_ctx->buf_configs) { - ggml_backend_buffer_free(buf); - ggml_free(ctx); - } delete buf_ctx; } static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - return buf_ctx->buf_configs.size(); + return buf_ctx->bufs.size(); } static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); - return buf_ctx->buf_configs[index].buf; + GGML_ASSERT(index < buf_ctx->bufs.size()); + return buf_ctx->bufs[index].get(); } static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) { GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; - GGML_ASSERT(index < buf_ctx->buf_configs.size()); + GGML_ASSERT(index < buf_ctx->bufs.size()); - auto it = buf_ctx->simple_tensors.find(tensor); - if (it == buf_ctx->simple_tensors.end()) { + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor); + auto it = stc.simple_tensors.find(tensor); + if (it == stc.simple_tensors.end()) { return nullptr; } return it->second[index]; } -static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync); + +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state( + ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) { const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; @@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1}; continue; } - src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true); + src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true); GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); } @@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co return ret; } +static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync); +} + static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return (void *) 0x1000000000000000; // FIXME } -static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); - ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; - const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer); +static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context; + const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer); - const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true); + const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true); GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN); GGML_ASSERT(split_state.n_segments <= 16); @@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer std::vector simple_tensors; simple_tensors.reserve(n_simple_bufs); for (size_t j = 0; j < n_simple_bufs; j++) { - ggml_context * simple_ctx = buf_ctx->buf_configs[j].ctx; - ggml_backend_buffer_t simple_buf = buf_ctx->buf_configs[j].buf; + ggml_context * simple_ctx = stc.ctxs[j].get(); + ggml_backend_buffer_t simple_buf = buf_ctx->bufs[j].get(); if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) { // TODO: the following assert fails for llama-parallel even though the results are correct: @@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs; } else if (simple_buf != nullptr) { t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf) - + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer)); + + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer)); } t_ij->extra = tensor->extra; for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer } } - buf_ctx->simple_tensors[tensor] = simple_tensors; + stc.simple_tensors[tensor] = simple_tensors; return GGML_STATUS_SUCCESS; } +static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next; + return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor); +} + static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer); GGML_ASSERT(ggml_is_contiguous(tensor)); @@ -1413,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) { - const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer); - for (size_t i = 0; i < n_buffers; i++) { + GGML_ASSERT(ggml_backend_buffer_is_meta(buffer)); + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context; + for (size_t i = 0; i < buf_ctx->bufs.size(); i++) { ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i)); } } @@ -1440,21 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) { static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + const ggml_init_params params = { + /*.mem_size =*/ 1024*1024*ggml_tensor_overhead(), // FIXME /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + ggml_backend_meta_simple_tensor_container stc_static; + ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts); - ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(); size_t max_size = 0; - buf_ctx->buf_configs.reserve(n_simple_bufts); + std::vector bufs; + bufs.reserve(n_simple_bufts); for (size_t i = 0; i < n_simple_bufts; i++) { - ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size); - GGML_ASSERT(simple_buf != nullptr); - max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf)); - buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf); + bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size)); + GGML_ASSERT(bufs.back() != nullptr); + max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back())); } + ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size); } @@ -1462,26 +1516,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft); - ggml_init_params params = { - /*.mem_size =*/ 1024*1024*1024, // FIXME + constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals. + const ggml_init_params params_static = { + /*.mem_size =*/ ggml_get_mem_size(ctx), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, }; + const ggml_init_params params_compute = { + /*.mem_size =*/ compute_headroom*ggml_get_mem_size(ctx), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + ggml_backend_meta_simple_tensor_container stc_static (params_static, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts); + ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts); - ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(); - meta_buf_ctx->buf_configs.reserve(n_simple_bufts); - for (size_t i = 0; i < n_simple_bufts; i++) { - meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr); - } + std::vector bufs(n_simple_bufts, nullptr); + ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs); ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { t->buffer = meta_buf; - ggml_backend_meta_buffer_init_tensor(meta_buf, t); + ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t); t->data = (void *) 0x2000000000000000; // FIXME } for (size_t i = 0; i < n_simple_bufts; i++) { - ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx; + ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get(); ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i); // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL. @@ -1494,15 +1554,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc } } if (any_nonzero_slice) { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft); + meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft)); } else { - meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0); + meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - t->buffer = meta_buf_ctx->buf_configs[i].buf; + t->buffer = meta_buf_ctx->bufs[i].get(); } } - GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr); - meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf)); + GGML_ASSERT(meta_buf_ctx->bufs[i]); + meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get())); } return meta_buf; } @@ -1724,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, } if (needs_rebuild) { + std::set used_buffers; + for (int i = 0; i < cgraph->n_leafs; i++) { + if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) { + used_buffers.emplace(cgraph->leafs[i]->buffer); + } + } + for (int i = 0; i < cgraph->n_nodes; i++) { + if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) { + used_buffers.emplace(cgraph->nodes[i]->buffer); + } + } + for (ggml_backend_buffer_t buf : used_buffers) { + ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context; + buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1; + ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next]; + for (ggml_context_ptr & ctx : stc.ctxs) { + ggml_reset(ctx.get()); + } + stc.simple_tensors.clear(); + } size_t n_subgraphs = 0; size_t max_tmp_size = 0; @@ -1909,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend, const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads); const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads); const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead(); - ggml_init_params params = { + const ggml_init_params params = { /*.mem_size =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux), /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ true, From fa9704152416df37cd28b308a640378e1414ea8d Mon Sep 17 00:00:00 2001 From: Dev-X25874 <283057883+Dev-X25874@users.noreply.github.com> Date: Thu, 21 May 2026 17:28:08 +0530 Subject: [PATCH 16/21] ggml-alloc: fix out-of-bounds read in ggml_dyn_tallocr_remove_block (ggml/1492) --- ggml/src/ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index a4b01ccf8a1..3bda9abbe03 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) { // shift all elements after idx by 1 to the left, overwriting the element at idx - for (int i = idx; i < chunk->n_free_blocks; i++) { + for (int i = idx; i < chunk->n_free_blocks - 1; i++) { chunk->free_blocks[i] = chunk->free_blocks[i+1]; } chunk->n_free_blocks--; From b251f74f496950e88abc70db40530f85ed83f53f Mon Sep 17 00:00:00 2001 From: Ori Pekelman Date: Thu, 21 May 2026 12:00:16 +0000 Subject: [PATCH 17/21] ggml.h: correct ggml_silu_back arg docstring (a=dy, b=x) (ggml/1500) --- ggml/include/ggml.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 41566d41aef..f6725265504 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1189,8 +1189,8 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - // a - x - // b - dy + // a - dy + // b - x GGML_API struct ggml_tensor * ggml_silu_back( struct ggml_context * ctx, struct ggml_tensor * a, From ce5890b5f7d88fe3408398dfbbada00aec03d352 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 May 2026 12:13:21 +0300 Subject: [PATCH 18/21] ggml : bump version to 0.12.1 (ggml/1508) --- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4aac5094d1c..03020888f97 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -5,7 +5,7 @@ project("ggml" C CXX ASM) ### GGML Version set(GGML_VERSION_MAJOR 0) set(GGML_VERSION_MINOR 12) -set(GGML_VERSION_PATCH 0) +set(GGML_VERSION_PATCH 1) set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") From 22307b3e8b96179a195956056e37b872eb6df475 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 May 2026 12:33:22 +0300 Subject: [PATCH 19/21] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 0fa47782fd9..68a145ca643 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -0ce7ad348a3151e1da9f65d962044546bcaad421 +8fb5f5f931ce615d064aef2183edf618a677bff6 From 45158f460ee1eca215e2dd60c63bc6422d432bf1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 May 2026 12:40:17 +0300 Subject: [PATCH 20/21] ggml : bump version to 0.13.0 (ggml/1510) --- ggml/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 03020888f97..f542f18b6d4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -4,8 +4,8 @@ project("ggml" C CXX ASM) ### GGML Version set(GGML_VERSION_MAJOR 0) -set(GGML_VERSION_MINOR 12) -set(GGML_VERSION_PATCH 1) +set(GGML_VERSION_MINOR 13) +set(GGML_VERSION_PATCH 0) set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") From d161ea707192f9510288561104a8caf8e1829edf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 25 May 2026 12:42:28 +0300 Subject: [PATCH 21/21] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 68a145ca643..a4f87b2b9ae 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -8fb5f5f931ce615d064aef2183edf618a677bff6 +e705c5fed490514458bdd2eaddc43bd098fcce9b