From 5d246a792ddf839e87fb777681dd2a9ac7f7eb5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Sun, 24 May 2026 09:51:31 +0200
Subject: [PATCH 01/21] convert : minor fixes for numpy 2.x (#23571)

---
 examples/convert_legacy_llama.py | 3 ++-
 gguf-py/gguf/quants.py           | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py
index c4ec5c524e9..5c9305b1237 100755
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
@@ -1308,7 +1308,8 @@ def do_dump_model(model_plus: ModelPlus) -> None:
 
 def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
-    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
+    dummy_val = np.uint32(1)
+    if dummy_val == dummy_val.view(dummy_val.dtype.newbyteorder("<")):
         # We currently only support Q8_0 output on little endian systems.
         output_choices.append("q8_0")
     parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
diff --git a/gguf-py/gguf/quants.py b/gguf-py/gguf/quants.py
index 1d9d9ab7d70..80966b6ef15 100644
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@@ -28,6 +28,7 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati
 # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
 def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
     rows = arr.reshape((-1, arr.shape[-1]))
+    assert len(rows.shape)
     osize = 1
     for dim in oshape:
         osize *= dim

From 549b9d84330c327e6791fa812a7d60c0cf63572e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 24 May 2026 18:20:10 +0300
Subject: [PATCH 02/21] ci : update build-self-hosted.yml (#23616)

---
 .github/workflows/build-self-hosted.yml  |  86 ++++++++++++++++---
 .github/workflows/build.yml              | 102 ++++++++++++-----------
 .github/workflows/server-self-hosted.yml |  77 +++++++++--------
 ci/run.sh                                |   2 +-
 4 files changed, 167 insertions(+), 100 deletions(-)

diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index 2851c45601f..c247222eb4b 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -57,7 +57,7 @@ env:
 jobs:
   determine-tag:
     name: Determine tag name
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
     steps:
@@ -86,7 +86,7 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm:
     needs: determine-tag
@@ -103,7 +103,7 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm2:
     needs: determine-tag
@@ -120,10 +120,11 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-webgpu:
-    runs-on: [self-hosted, Linux, NVIDIA]
+    needs: determine-tag
+    runs-on: [self-hosted, Linux, NVIDIA, X64]
 
     steps:
       - name: Clone
@@ -149,10 +150,11 @@ jobs:
           GG_BUILD_WEBGPU=1 \
           GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
           GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMX-compatible machine
   #ggml-ci-cpu-amx:
+  #  needs: determine-tag
   #  runs-on: [self-hosted, Linux, CPU, AMX]
 
   #  steps:
@@ -163,10 +165,11 @@ jobs:
   #    - name: Test
   #      id: ggml-ci
   #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-vulkan:
+  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -178,10 +181,11 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-rocm:
+  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -193,7 +197,7 @@ jobs:
   #       id: ggml-ci
   #       run: |
   #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-mac-metal:
     needs: determine-tag
@@ -337,4 +341,66 @@ jobs:
           HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-low-perf:
+    needs: determine-tag
+    runs-on: [self-hosted, Linux, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  ggml-ci-arm64-cpu-high-perf:
+    needs: determine-tag
+    runs-on: [self-hosted, Linux, ARM64]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
+#         CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
+#           ARM -march/-mcpu not found, -mcpu=native will be used
+#
+#       if we resolve this, we should be able to offload these jobs to the self-hosted runners
+#
+#  ggml-ci-arm64-cpu-high-perf-sve:
+#    needs: determine-tag
+#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+#
+#  ggml-ci-arm64-cpu-kleidiai:
+#    needs: determine-tag
+#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 65fa24f4468..47b377ff72b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -931,31 +931,32 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-low-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+# note: moved to build-self-hosted.yml - can remove from here when everything is stable
+#  ggml-ci-arm64-cpu-low-perf:
+#    runs-on: ubuntu-22.04-arm
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ggml-ci-arm64-cpu-low-perf
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-x64-cpu-high-perf:
     runs-on: ubuntu-22.04
@@ -983,31 +984,32 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ggml-ci-arm64-cpu-high-perf
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+# note: moved to build-self-hosted.yml - can remove from here when everything is stable
+#  ggml-ci-arm64-cpu-high-perf:
+#    runs-on: ubuntu-22.04-arm
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: ggml-ci-arm64-cpu-high-perf
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Test
+#        id: ggml-ci
+#        run: |
+#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-high-perf-sve:
     runs-on: ubuntu-22.04-arm
diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml
index 857c72a4619..91e0653943c 100644
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -91,45 +91,44 @@ jobs:
           export ${{ matrix.extra_args }}
           pytest -v -x -m "not slow"
 
-  # TODO: provision CUDA runner
-  #  server-cuda:
-  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-  #
-  #    name: server-cuda (${{ matrix.wf_name }})
-  #    strategy:
-  #      matrix:
-  #        build_type: [Release]
-  #        wf_name: ["GPUx1"]
-  #        include:
-  #          - build_type: Release
-  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-  #            wf_name:    "GPUx1, backend-sampling"
-  #      fail-fast: false
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #        with:
-  #          fetch-depth: 0
-  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-  #
-  #      - name: Build
-  #        id: cmake_build
-  #        run: |
-  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-  #          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-  #
-  #      - name: Tests
-  #        id: server_integration_tests
-  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-  #        run: |
-  #          cd tools/server/tests
-  #          python3 -m venv venv
-  #          source venv/bin/activate
-  #          pip install -r requirements.txt
-  #          export ${{ matrix.extra_args }}
-  #          pytest -v -x -m "not slow"
+  server-cuda:
+    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+
+    name: server-cuda (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        include:
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name:    "GPUx1, backend-sampling"
+      fail-fast: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"
 
   server-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
diff --git a/ci/run.sh b/ci/run.sh
index b096dc23b66..4acf4375267 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -238,7 +238,7 @@ function gg_run_ctest_debug {
     (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
 
-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
 
     set +e
 }

From 28123a3937f7d056a4dded43752b216507384373 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 May 2026 08:11:19 +0300
Subject: [PATCH 03/21] ci : move most slim jobs to self-hosted runners
 (#23619)

* ci : remove tag from build-self-hosted.yml

* ci : slim -> self-hosted

* ci : prevent heavy CPU jobs from running on fast runners

* ci : prevent cmake pkg to run on dedicated fast runners

* ci : try to bump 3.11 -> 3.13

* ci : move lint back to 3.11

* ci : back to 3.11

* ci : add comment about UI jobs

* ci : move python requirements check to CPU runners

this job is a bit slow for a dedicated "fast" runner

* ci : add self-hosted ui workflow

* ci : fix UI naming

* tmp to check if arm64 fast is compatible with all jobs

* revert last commit
---
 .github/workflows/build-cmake-pkg.yml         |   7 +-
 .github/workflows/build-self-hosted.yml       |  58 +--------
 .github/workflows/check-vendor.yml            |   2 +-
 .github/workflows/code-style.yml              |   2 +-
 .github/workflows/editorconfig.yml            |   2 +-
 .github/workflows/pre-tokenizer-hashes.yml    |   2 +-
 .../workflows/python-check-requirements.yml   |   2 +-
 .github/workflows/python-lint.yml             |   2 +-
 .github/workflows/python-type-check.yml       |   2 +-
 .github/workflows/ui-build.yml                |   3 +-
 .github/workflows/ui-self-hosted.yml          | 118 ++++++++++++++++++
 .github/workflows/{ui-ci.yml => ui.yml}       |  10 +-
 .github/workflows/update-ops-docs.yml         |   4 +-
 13 files changed, 140 insertions(+), 74 deletions(-)
 create mode 100644 .github/workflows/ui-self-hosted.yml
 rename .github/workflows/{ui-ci.yml => ui.yml} (95%)

diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
index b36ac5b8e6d..5becff09c1b 100644
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -5,17 +5,12 @@ on:
 
 jobs:
   linux:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, Linux, CPU]
     steps:
       - uses: actions/checkout@v6
         with:
           fetch-depth: 0
 
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y build-essential tcl cmake
-
       - name: Build
         run: |
           PREFIX="$(pwd)"/inst
diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml
index c247222eb4b..375e75c5267 100644
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -55,24 +55,7 @@ env:
   LLAMA_LOG_TIMESTAMPS: 1
 
 jobs:
-  determine-tag:
-    name: Determine tag name
-    runs-on: [self-hosted, fast]
-    outputs:
-      tag_name: ${{ steps.tag.outputs.name }}
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
   ggml-ci-nvidia-cuda:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, NVIDIA]
 
     steps:
@@ -82,14 +65,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           nvidia-smi
           GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, NVIDIA]
 
     steps:
@@ -99,14 +79,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-vulkan-cm2:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
 
     steps:
@@ -116,14 +93,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-nvidia-webgpu:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, NVIDIA, X64]
 
     steps:
@@ -154,7 +128,6 @@ jobs:
 
   # TODO: provision AMX-compatible machine
   #ggml-ci-cpu-amx:
-  #  needs: determine-tag
   #  runs-on: [self-hosted, Linux, CPU, AMX]
 
   #  steps:
@@ -169,7 +142,6 @@ jobs:
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-vulkan:
-  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -185,7 +157,6 @@ jobs:
 
   # TODO: provision AMD GPU machine
   # ggml-ci-amd-rocm:
-  #   needs: determine-tag
   #   runs-on: [self-hosted, Linux, AMD]
 
   #   steps:
@@ -200,7 +171,6 @@ jobs:
   #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-mac-metal:
-    needs: determine-tag
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -210,13 +180,10 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-mac-webgpu:
-    needs: determine-tag
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -239,14 +206,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
             bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-mac-vulkan:
-    needs: determine-tag
     runs-on: [self-hosted, macOS, ARM64]
 
     steps:
@@ -256,14 +220,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-linux-intel-vulkan:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, Intel]
 
     steps:
@@ -275,14 +236,11 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-win-intel-vulkan:
-    needs: determine-tag
     runs-on: [self-hosted, Windows, X64, Intel]
 
     steps:
@@ -297,7 +255,6 @@ jobs:
           MSYSTEM: UCRT64
           CHERE_INVOKING: 1
           PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           vulkaninfo --summary
           # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
@@ -305,7 +262,6 @@ jobs:
           LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
 
   ggml-ci-intel-openvino-gpu-low-perf:
-    needs: determine-tag
     runs-on: [self-hosted, Linux, Intel, OpenVINO]
 
     concurrency:
@@ -337,15 +293,12 @@ jobs:
 
       - name: Test
         id: ggml-ci
-        env:
-          HF_UI_VERSION: ${{ needs.determine-tag.outputs.tag_name }}
         run: |
           source ./openvino_toolkit/setupvars.sh
           GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-arm64-cpu-low-perf:
-    needs: determine-tag
-    runs-on: [self-hosted, Linux, ARM64]
+    runs-on: [self-hosted, Linux, ARM64, CPU]
 
     steps:
       - name: Clone
@@ -358,8 +311,7 @@ jobs:
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 
   ggml-ci-arm64-cpu-high-perf:
-    needs: determine-tag
-    runs-on: [self-hosted, Linux, ARM64]
+    runs-on: [self-hosted, Linux, ARM64, CPU]
 
     steps:
       - name: Clone
@@ -378,8 +330,7 @@ jobs:
 #       if we resolve this, we should be able to offload these jobs to the self-hosted runners
 #
 #  ggml-ci-arm64-cpu-high-perf-sve:
-#    needs: determine-tag
-#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#    runs-on: [self-hosted, Linux, ARM64, CPU]
 #
 #    steps:
 #      - name: Clone
@@ -392,8 +343,7 @@ jobs:
 #          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
 #
 #  ggml-ci-arm64-cpu-kleidiai:
-#    needs: determine-tag
-#    runs-on: [self-hosted, Linux, NVIDIA, ARM64]
+#    runs-on: [self-hosted, Linux, ARM64, CPU]
 #
 #    steps:
 #      - name: Clone
diff --git a/.github/workflows/check-vendor.yml b/.github/workflows/check-vendor.yml
index 1671ed7b8bd..015629f380c 100644
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -19,7 +19,7 @@ on:
 
 jobs:
   check-vendor:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
 
     steps:
       - name: Checkout
diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml
index c88396c0a7d..50b598b84dd 100644
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   model-naming:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     steps:
       - uses: actions/checkout@v6
       - name: Check model naming conventions
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
index 53f6a0ccfda..59159cd4144 100644
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -15,7 +15,7 @@ concurrency:
 
 jobs:
   editorconfig:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     steps:
       - uses: actions/checkout@v6
       - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
index 7126b62b690..fce70d03e52 100644
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -12,7 +12,7 @@ on:
 
 jobs:
     pre-tokenizer-hashes:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast]
 
         steps:
         - name: Checkout repository
diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 1219b874592..2c7fab40b44 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -20,7 +20,7 @@ concurrency:
 
 jobs:
   python-check-requirements:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, CPU, fast]
     name: check-requirements
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml
index 1e5d64c1aee..0424f372a14 100644
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,7 +21,7 @@ concurrency:
 
 jobs:
   flake8-lint:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     name: Lint
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml
index cbeeb39d05b..14edb1a9d17 100644
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   python-type-check:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     name: python type-check
     steps:
       - name: Check out source repository
diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml
index 38fba13ad8e..2653afd06c7 100644
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -5,8 +5,7 @@ on:
 
 jobs:
   build:
-    name: Build static output
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
     env:
       BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml
new file mode 100644
index 00000000000..64a4519c618
--- /dev/null
+++ b/.github/workflows/ui-self-hosted.yml
@@ -0,0 +1,118 @@
+name: UI (self-hosted)
+
+# these are the same as ui.yml, but with self-hosted runners
+# the runners come with pre-installed Playwright browsers version: 1.56.1
+# the jobs are much lighter because they don't need to install node and playwright browsers
+
+on:
+  workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+
+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ui-build:
+    name: Build static output
+    uses: ./.github/workflows/ui-build.yml
+
+  ui-checks:
+    name: Checks
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    continue-on-error: true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Run type checking
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run check
+        working-directory: tools/ui
+
+      - name: Run linting
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run lint
+        working-directory: tools/ui
+
+      - name: Run Client tests
+        if: ${{ always() }}
+        run: npm run test:client
+        working-directory: tools/ui
+
+      - name: Run Unit tests
+        if: ${{ always() }}
+        run: npm run test:unit
+        working-directory: tools/ui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Build Storybook
+        if: ${{ always() }}
+        run: npm run build-storybook
+        working-directory: tools/ui
+
+      - name: Run UI tests
+        if: ${{ always() }}
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/ui
+
+      - name: Run E2E tests
+        if: ${{ always() }}
+        run: npm run test:e2e
+        working-directory: tools/ui
diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui.yml
similarity index 95%
rename from .github/workflows/ui-ci.yml
rename to .github/workflows/ui.yml
index 761a9319414..b5e7457182c 100644
--- a/.github/workflows/ui-ci.yml
+++ b/.github/workflows/ui.yml
@@ -1,4 +1,4 @@
-name: CI (UI)
+name: UI
 
 on:
   workflow_dispatch:
@@ -11,14 +11,16 @@ on:
     branches:
       - master
     paths: [
-      '.github/workflows/ui-ci.yml',
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
   pull_request:
     types: [opened, synchronize, reopened]
     paths: [
-      '.github/workflows/ui-ci.yml',
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
       'tools/ui/**.*',
       'tools/server/tests/**.*'
     ]
@@ -39,7 +41,7 @@ jobs:
     uses: ./.github/workflows/ui-build.yml
 
   ui-checks:
-    name: UI Checks
+    name: Checks
     needs: ui-build
     runs-on: ubuntu-latest
     continue-on-error: true
diff --git a/.github/workflows/update-ops-docs.yml b/.github/workflows/update-ops-docs.yml
index 2ab06eb9811..6e8bc1aa07c 100644
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,18 +3,20 @@ name: Update Operations Documentation
 on:
     push:
         paths:
+            - '.github/workflows/update-ops-docs.yml'
             - 'docs/ops.md'
             - 'docs/ops/**'
             - 'scripts/create_ops_docs.py'
     pull_request:
         paths:
+            - '.github/workflows/update-ops-docs.yml'
             - 'docs/ops.md'
             - 'docs/ops/**'
             - 'scripts/create_ops_docs.py'
 
 jobs:
     update-ops-docs:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast, ARM64]
 
         steps:
         - name: Checkout repository

From 6d57c26ef8cb9a3583993d8d6527858985c82c7b Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Mon, 25 May 2026 07:12:39 +0200
Subject: [PATCH 04/21] perplexity : fix even more integer overflows (#23623)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
---
 tools/perplexity/perplexity.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp
index ea9de83e1a9..027cd53f29e 100644
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -157,7 +157,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + size_t(i)*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }
@@ -169,7 +169,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
     for (auto & w : workers) {
         w.join();
     }
-    out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
+    out.write((const char *)log_probs.data(), size_t(n_token)*nv*sizeof(uint16_t));
 }
 
 struct kl_divergence_result {
@@ -279,7 +279,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 break;
             }
             lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + size_t(i)*nv, tokens[i+1], local_kld);
             kld_values[i]    = (float)v.first;
             p_diff_values[i] = v.second;
         }

From e2ef8fe42ccef597bfeab901dd6e39589613b71e Mon Sep 17 00:00:00 2001
From: jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com>
Date: Mon, 25 May 2026 07:56:18 +0200
Subject: [PATCH 05/21] server: fix checkpoints creation (#22929)

* common : add common_chat_split_by_role

* cont : fix spans to reach end of message

* server: fix checkpoints creation

- extract message_spans from chat templates
- find the prompt token position before the latest user message
- split prompt batching at that position
- create a context checkpoint before the latest user input
- avoid periodic mid-prompt checkpoints when that position is known
- handle multimodal prompts when mapping text/template positions to server prompt tokens
- add --checkpoint-min-step to control minimum spacing between checkpoints

* cont : clean-up

* Support autoparser detection for message barriers

* server: fix message span delimiter and update docs

---------

Co-authored-by: Alde Rojas <hello@alde.dev>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Piotr Wilkin <piotr.wilkin@syndatis.com>
---
 common/arg.cpp                      |  11 +-
 common/chat-auto-parser-helpers.cpp |   6 +-
 common/chat-auto-parser.h           |   6 +
 common/chat-diff-analyzer.cpp       | 177 +++++++++++++++++++++++++++-
 common/chat.cpp                     |  65 ++++++++++
 common/chat.h                       |  16 +++
 common/common.cpp                   |  21 ++++
 common/common.h                     |   3 +-
 tests/test-chat-auto-parser.cpp     | 127 +++++++++++++++++++-
 tests/test-chat.cpp                 |  40 ++++++-
 tools/cli/README.md                 |   1 -
 tools/server/README.md              |   2 +-
 tools/server/server-common.cpp      |  10 ++
 tools/server/server-context.cpp     | 135 +++++++++++++++++----
 tools/server/server-task.h          |   3 +
 15 files changed, 586 insertions(+), 37 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 24d9734b934..3df8010a2ec 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1334,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
-        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
+        {"-cms", "--checkpoint-min-step"}, "N",
+        string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
         [](common_params & params, int value) {
-            params.checkpoint_every_nt = value;
+            if (value < 0) {
+                throw std::invalid_argument("checkpoint-min-step must be non-negative");
+            }
+            params.checkpoint_min_step = value;
         }
-    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
diff --git a/common/chat-auto-parser-helpers.cpp b/common/chat-auto-parser-helpers.cpp
index 2499464cd82..81b17e5e1d2 100644
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm
 
 namespace autoparser {
 
+static const std::string ERR_TMPL = "#**ERROR**#";
+
 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
     generation_params tmpl_params;
     tmpl_params.messages              = params.messages;
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
         return common_chat_template_direct_apply(tmpl, tmpl_params);
     } catch (const std::exception & e) {
         LOG_DBG("Template application failed: %s\n", e.what());
-        return "";
+        return ERR_TMPL;
     }
 }
 
@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
     std::string output_B = apply_template(tmpl, params_B);
 
     // Check for template application failures
-    if (output_A.empty() || output_B.empty()) {
+    if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
         return std::nullopt;
     }
 
diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h
index c680e686867..7858f6572f2 100644
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -377,6 +377,8 @@ struct analyze_tools : analyze_base {
 
 struct autoparser {
     jinja::caps          jinja_caps;
+    std::string          user_start;
+    std::string          assistant_start;
     analyze_reasoning    reasoning;
     analyze_content      content;
     analyze_tools        tools;
@@ -387,6 +389,10 @@ struct autoparser {
 
     autoparser() = default;
 
+    // Find the starting marker for the user message and assistant message
+    std::string detect_user_start_marker(const common_chat_template & tmpl);
+    std::string detect_assistant_start_marker(const common_chat_template & tmpl);
+
     // Run full differential analysis on a template
     void analyze_template(const common_chat_template & tmpl);
 
diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp
index 9c7c9678acd..0875c5347f4 100644
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -8,6 +8,9 @@
 #include "peg-parser.h"
 
 #include <algorithm>
+#include <cctype>
+#include <ostream>
+#include <sstream>
 
 #define ANSI_RESET  "\033[0m"
 #define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
 static const std::string ARG_FIRST = "AA_ARG_FST_AA";
 static const std::string ARG_SECOND = "BB_ARG_SND_BB";
 static const std::string USER_MSG = "U_USER_MSG Hello END_U";
+static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
 static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
 static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
 static const std::string CALL_ID_001 = "call00001";
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
               analysis.content.end   = "<|END_OF_TURN_TOKEN|>";
               analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
               analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
+              analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
               LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
           }
       },
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
               analysis.tools.function.close        = "```";
               LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
           }
-      }
+      },
+      // Nemotron Nano v2
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
+              tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
+
+              analysis.tools.format.mode           = tool_format::JSON_NATIVE;
+              analysis.tools.format.section_start  = "";
+              analysis.tools.format.section_end    = "";
+              analysis.tools.format.per_call_start = "<TOOLCALL>";
+              analysis.tools.format.per_call_end   = "</TOOLCALL>";
+              analysis.content.mode                = content_mode::PLAIN;
+              analysis.content.start               = "";
+              analysis.content.end                 = "";
+              analysis.reasoning.mode              = reasoning_mode::TAG_BASED;
+              analysis.reasoning.start             = "<think>\n\n";
+              analysis.reasoning.end               = "</think>";
+              analysis.assistant_start             = "<SPECIAL_11>Assistant";
+              analysis.user_start                  = "<SPECIAL_11>User";
+              analysis.preserved_tokens.clear();
+              analysis.preserved_tokens.push_back("<SPECIAL_12>");
+              analysis.preserved_tokens.push_back("<SPECIAL_11>");
+              analysis.preserved_tokens.push_back("</think>");
+              analysis.preserved_tokens.push_back("<TOOLCALL>");
+              analysis.preserved_tokens.push_back("</TOOLCALL>");
+              LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
+          }
+      },
+      // Fireworks
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
+            " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
+              analysis.assistant_start             = "<|start_header_id|>assistant<|end_header_id|>";
+              analysis.user_start                  = "<|start_header_id|>user<|end_header_id|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
+          }
+      },
+      // Solar Open
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
+              analysis.assistant_start             = "<|begin|>assistant";
+              LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
+          }
+      },
+      // Apriel 1.6
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
+              analysis.user_start                  = "<|begin_user|>";
+              analysis.assistant_start             = "<|begin_assistant|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
+          }
+      },
+
     });
 
 // Common JSON structures
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
     reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
     content = analyze_content(tmpl, reasoning);
     tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
+    assistant_start = detect_assistant_start_marker(tmpl);
+    user_start = detect_user_start_marker(tmpl);
     collect_preserved_tokens();
 
     for (auto & workaround : workarounds) {
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
     }
 
     LOG_DBG("\n--- Reasoning & Content Structure ---\n");
+    LOG_DBG("user_msg_start: %s\n", user_start.c_str());
+    LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
     LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
     LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
     LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
     add_token(tools.call_id.suffix);
 }
 
+std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant_no_reasoning = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    template_params params;
+    params.messages              = json::array({ user_msg });
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg, assistant_no_reasoning });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
+        return "";
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
+    }
+
+    auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
+    if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
+    }
+    if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
+    }
+    return trim_whitespace(ast_prefix);
+}
+
+std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    json user_msg_two = json{
+        { "role",    "user"       },
+        { "content", USER_MSG_TWO }
+    };
+
+    template_params params;
+    params.messages              = json::array({});
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
+        params.messages = json::array({ user_msg_two, assistant });
+        comparison = compare_variants(
+            tmpl, params, [&](template_params & p) {
+                p.messages = json::array({ user_msg_two, assistant, user_msg });
+            }
+        );
+        if (!comparison) {
+            LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
+            return "";
+        }
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(USER_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
+    }
+
+    if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
+        usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
+    }
+
+    auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
+    auto candidate_split = segmentize_markers(candidate);
+    std::stringstream result;
+    bool encountered_marker = false;
+    for (const auto & mrk : candidate_split) {
+        std::string lower_mrk = std::string(mrk.value);
+        std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        // heuristic to weed out potential end markers, but only at the start
+        if (mrk.type == segment_type::MARKER && !encountered_marker &&
+            (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
+            continue;
+        }
+        if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
+            continue;
+        }
+        encountered_marker |= mrk.type == segment_type::MARKER;
+        result << mrk.value;
+    }
+    return trim_whitespace(result.str());
+}
+
 analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
     : analyze_base(tmpl) {
     LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
diff --git a/common/chat.cpp b/common/chat.cpp
index 56873e3a1e9..ef151691c38 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -90,6 +90,45 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const
     return text;
 }
 
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
+    if (delims.empty() || prompt.empty()) {
+        return {};
+    }
+
+    auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
+        std::vector<std::string>       all_delims;
+        std::vector<common_peg_parser> tagged_messages;
+
+        all_delims.reserve(delims.size());
+        tagged_messages.reserve(delims.size());
+        for (const auto & d : delims) {
+            all_delims.push_back(d.delimiter);
+        }
+
+        auto any_delim = p.until_one_of(all_delims);
+        for (const auto & d : delims) {
+            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
+        }
+
+        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
+    });
+
+    common_peg_parse_context ctx(prompt);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        return {};
+    }
+
+    std::vector<common_chat_msg_span> spans;
+    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) {
+            spans.push_back({ node.tag, node.start, node.end - node.start });
+        }
+    });
+
+    return spans;
+}
+
 json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
     if (!content.empty() && !content_parts.empty()) {
         throw std::runtime_error("Cannot specify both content and content_parts");
@@ -1042,6 +1081,14 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
 
     data.prompt            = prompt;
     data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
+    data.message_spans = common_chat_split_by_role(prompt, {
+        { "assistant", "<|start|>assistant" },
+        { "user",      "<|start|>user"      },
+        { "system",    "<|start|>developer" },
+        { "system",    "<|start|>system"    },
+        { "tool",      "<|start|>functions" },
+    });
+
     data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
     data.supports_thinking = true;
 
@@ -1181,6 +1228,11 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
         data.prompt += data.generation_prompt;
     }
 
+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "user",      "<|turn>user\n"  },
+        { "assistant", "<|turn>model\n" },
+    });
+
     data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
     data.supports_thinking  = true;
     data.thinking_start_tag = "<|channel>thought";
@@ -2393,6 +2445,19 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
         struct autoparser::autoparser autoparser;
         autoparser.analyze_template(tmpl);
         auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
+
+        std::vector<common_chat_msg_delimiter> delimiters;
+        if (!autoparser.assistant_start.empty()) {
+            delimiters.push_back({ "assistant", autoparser.assistant_start });
+        }
+        if (!autoparser.user_start.empty()) {
+            delimiters.push_back({ "user", autoparser.user_start });
+        }
+
+        if (!delimiters.empty()) {
+            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
+        }
+
         auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
         if (auto_params.supports_thinking) {
             auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
diff --git a/common/chat.h b/common/chat.h
index b29c627e693..5659cd42a07 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -143,6 +143,17 @@ struct common_chat_msg_diff {
     }
 };
 
+struct common_chat_msg_span {
+    std::string role;
+    std::size_t pos = 0;
+    std::size_t len = 0;
+};
+
+struct common_chat_msg_delimiter {
+    std::string role;
+    std::string delimiter;
+};
+
 struct common_chat_tool {
     std::string name;
     std::string description;
@@ -208,6 +219,7 @@ struct common_chat_params {
     std::vector<std::string>            preserved_tokens;
     std::vector<std::string>            additional_stops;
     std::string                         parser;
+    std::vector<common_chat_msg_span>   message_spans;
 };
 
 // per-message parsing syntax
@@ -304,6 +316,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
         const std::string &                   src,
         autoparser::generation_params & params);
 
+
 // specialized per-task preset
 struct common_chat_prompt_preset {
     std::string system;
@@ -311,3 +324,6 @@ struct common_chat_prompt_preset {
 };
 
 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
+
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+
diff --git a/common/common.cpp b/common/common.cpp
index d77ddeda10e..97daf281783 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -445,6 +445,27 @@ std::string string_strip(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+std::string string_lcs(std::string_view a, std::string_view b) {
+    if (a.empty() || b.empty()) return {};
+
+    std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
+    size_t best_len = 0;
+    size_t best_end_a = 0;
+
+    for (size_t i = 1; i <= a.size(); ++i) {
+        for (size_t j = 1; j <= b.size(); ++j) {
+            if (a[i - 1] == b[j - 1]) {
+                dp[i][j] = dp[i - 1][j - 1] + 1;
+                if (dp[i][j] > best_len) {
+                    best_len = dp[i][j];
+                    best_end_a = i;
+                }
+            }
+        }
+    }
+    return std::string(a.substr(best_end_a - best_len, best_len));
+}
+
 std::string string_get_sortable_timestamp() {
     using clock = std::chrono::system_clock;
 
diff --git a/common/common.h b/common/common.h
index b0ad7b2ea4e..8a0e5eed5ee 100644
--- a/common/common.h
+++ b/common/common.h
@@ -594,7 +594,7 @@ struct common_params {
     bool    cache_prompt        = true;  // whether to enable prompt caching
     bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
     int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
+    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
     int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
 
     std::string hostname      = "127.0.0.1";
@@ -731,6 +731,7 @@ std::string string_format(const char * fmt, ...);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
+std::string string_lcs(std::string_view a, std::string_view b);
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator);
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp
index 1d96de718e2..6f8e957489c 100644
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@@ -81,6 +81,8 @@ static void test_normalize_quotes_with_embedded_quotes(testing & t);
 // TAG_WITH_TAGGED argument parsing tests
 static void test_tagged_args_with_embedded_quotes(testing & t);
 
+static void test_role_markers_all_templates(testing & t);
+
 int main(int argc, char * argv[]) {
     testing t(std::cout);
     t.verbose = true;
@@ -103,6 +105,7 @@ int main(int argc, char * argv[]) {
     t.test("standard_json_tools", test_standard_json_tools_formats);
     t.test("normalize_quotes_to_json", test_normalize_quotes_to_json);
     t.test("tagged_args_embedded_quotes", test_tagged_args_with_embedded_quotes);
+    t.test("role_markers_all_templates", test_role_markers_all_templates);
 
     return t.summary();
 }
@@ -714,7 +717,7 @@ static void test_compare_variants_both_modifiers(testing & t) {
 static void test_compare_variants_template_failure(testing & t) {
     // Test with template that causes failure during application (not construction)
     // We use a valid template syntax but one that will fail during application
-    common_chat_template tmpl("{{ messages[0]['nonexistent_field'] }}", "", "");
+    common_chat_template tmpl("{{ messages.cahoot()[0]['nonexistent_field'] }}", "", "");
 
     template_params params;
     params.messages = json::array({
@@ -1848,6 +1851,128 @@ static json build_edit_tool() {
     });
 }
 
+// ============================================================================
+// Role marker detection tests for all autoparser-handled templates
+//
+// Verifies that detect_user_start_marker / detect_assistant_start_marker
+// return the correct boundary text between turns for every template that
+// falls through to the differential autoparser (i.e. is not handled by a
+// dedicated specialized template in common_chat_try_specialized_template).
+//
+// Markers were deduced manually from the jinja sources in models/templates/.
+// ============================================================================
+struct role_marker_case {
+    std::string template_file;
+    std::string expected_user_start;
+    std::string expected_assistant_start;
+};
+
+static void test_role_markers_all_templates(testing & t) {
+    // Each entry is { template filename, user_start, assistant_start } as
+    // produced when rendering the standard chatml-like sequences. The values
+    // come from reading each jinja template and tracing what text precedes
+    // a user/assistant message body once the autoparser strips any reasoning
+    // markers it detected first.
+    const std::vector<role_marker_case> cases = {
+        // ChatML family: <|im_start|>{role} ... <|im_end|>
+        { "Bielik-11B-v3.0-Instruct.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "HuggingFaceTB-SmolLM3-3B.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "MiMo-VL.jinja",                                   "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja", "<|im_start|>user",   "<|im_start|>assistant"      },
+        { "NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja",   "<|im_start|>user",   "<|im_start|>assistant"      },
+        { "NVIDIA-Nemotron-3-Nano-30B-A3B-BF16.jinja",       "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen3.5-4B.jinja",                                "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen3-Coder.jinja",                               "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-Qwen2.5-7B-Instruct.jinja",                  "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-Qwen3-0.6B.jinja",                           "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "Qwen-QwQ-32B.jinja",                              "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "StepFun3.5-Flash.jinja",                          "<|im_start|>user",       "<|im_start|>assistant"      },
+        { "stepfun-ai-Step-3.5-Flash.jinja",                 "<|im_start|>user",       "<|im_start|>assistant"      },
+
+        // DeepSeek family
+        { "deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja",  "<｜User｜>",                "<｜Assistant｜>"             },
+        { "deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja",  "<｜User｜>",                "<｜Assistant｜>"             },
+        { "deepseek-ai-DeepSeek-V3.1.jinja",                 "<｜User｜>",                "<｜Assistant｜>"             },
+        { "llama-cpp-deepseek-r1.jinja",                     "<｜User｜>",                "<｜Assistant｜>"             },
+
+        // Llama 3 header family
+        { "meetkai-functionary-medium-v3.1.jinja",           "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.1-8B-Instruct.jinja",          "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.2-3B-Instruct.jinja",          "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        { "meta-llama-Llama-3.3-70B-Instruct.jinja",         "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+        // fireworks-ai forces a trailing assistant header even without add_generation_prompt,
+        // so the marker is absorbed into the common suffix and assistant_start is detected as empty.
+        { "fireworks-ai-llama-3-firefunction-v2.jinja",      "<|start_header_id|>user<|end_header_id|>", "<|start_header_id|>assistant<|end_header_id|>" },
+
+        // Phi/GLM/Apriel-style: <|user|> / <|assistant|>
+        { "microsoft-Phi-3.5-mini-instruct.jinja",           "<|user|>",               "<|assistant|>"              },
+        { "GLM-4.6.jinja",                                   "<|user|>",               "<|assistant|>"              },
+        { "unsloth-Apriel-1.5.jinja",                        "<|user|>",               "<|assistant|>"              },
+        { "GLM-4.7-Flash.jinja",                             "<|user|>",                 "<|assistant|>"                },
+
+        // Gemma 2: <start_of_turn>{user|model}
+        { "google-gemma-2-2b-it.jinja",                      "<start_of_turn>user",    "<start_of_turn>model"       },
+
+        // IBM Granite
+        { "ibm-granite-granite-3.3-2B-Instruct.jinja",       "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" },
+        { "ibm-granite-granite-4.0.jinja",                   "<|start_of_role|>user<|end_of_role|>", "<|start_of_role|>assistant<|end_of_role|>" },
+
+        // Cohere R-series
+        { "CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja",
+            "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_RESPONSE|>" },
+        { "CohereForAI-c4ai-command-r-plus-tool_use.jinja",
+            "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>", "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" },
+
+        // Mistral: assistant content follows [/INST] immediately, no header
+        { "mistralai-Mistral-Nemo-Instruct-2407.jinja",      "[INST]",                   "" },
+        { "Mistral-Small-3.2-24B-Instruct-2506.jinja",       "[INST]",                   "" },
+
+        // Apertus uses <|user_start|> / <|assistant_start|> but the user diff
+        // carries the preceding <|assistant_end|> from the previous turn.
+        { "Apertus-8B-Instruct.jinja",                       "<|user_start|>", "<|assistant_start|>" },
+
+        // Apriel 1.6 wraps the assistant body with <|begin_assistant|>, but
+        // <|begin_assistant|> is also the detected reasoning start, so the
+        // assistant_start is trimmed back to the preceding newline.
+        { "Apriel-1.6-15b-Thinker-fixed.jinja",              "<|begin_user|>", "<|begin_assistant|>" },
+
+        // ByteDance Seed-OSS: <seed:bos>{role}
+        { "ByteDance-Seed-OSS.jinja",                        "<seed:bos>user",         "<seed:bos>assistant"        },
+
+        // GigaChat 3.1: {role}<|role_sep|>
+        { "GigaChat3.1-10B-A1.8B.jinja",                     "user<|role_sep|>",       "assistant<|role_sep|>"      },
+
+        // MiniMax M2: ]~b]{user|ai}
+        { "MiniMax-M2.jinja",                                "]~b]user",               "]~b]ai"                     },
+
+        // Nemotron Nano v2: <SPECIAL_11>{User|Assistant}; assistant marker
+        // is followed by a prefilled <think> block that gets included.
+        { "NVIDIA-Nemotron-Nano-v2.jinja",                   "<SPECIAL_11>User",       "<SPECIAL_11>Assistant" },
+
+        // Reka Edge: "human: " / "assistant: " — but the rendered preamble
+        // depends on enable_thinking, which currently confuses the user-start
+        // diff and trims the marker down. Lock in the observed value.
+        { "Reka-Edge.jinja",                                 "human:",                     "assistant:"       },
+
+        // RWKV-world chat preset: "User: " / "Assistant: "
+        { "llama-cpp-rwkv-world.jinja",                      "User:",               "Assistant:"              },
+
+        // Upstage Solar 100B: <|begin|>{role}... but reasoning marker absorbs
+        // the "<|begin|>assistant" prefix from assistant_start.
+        { "upstage-Solar-Open-100B.jinja",                   "<|begin|>user<|content|>", "<|begin|>assistant"           },
+    };
+
+    for (const auto & c : cases) {
+        t.test(c.template_file, [&](testing & t) {
+            common_chat_template tmpl = load_template(t, "models/templates/" + c.template_file);
+            struct autoparser ap;
+            ap.analyze_template(tmpl);
+            t.assert_equal("user_start",      c.expected_user_start,      ap.user_start);
+            t.assert_equal("assistant_start", c.expected_assistant_start, ap.assistant_start);
+        });
+    }
+}
+
 // Test that reproduces the Seed-OSS template issue with embedded quotes
 static void test_tagged_args_with_embedded_quotes(testing & t) {
     json tools = build_edit_tool();
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index a428ef35c18..1a5161cc101 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -1548,6 +1548,40 @@ static void test_msgs_oaicompat_json_conversion() {
     }
 }
 
+static void test_split_by_role() {
+    LOG_DBG("%s\n", __func__);
+
+    // Empty inputs
+    assert_equals<size_t>(0, common_chat_split_by_role("", {}).size());
+    assert_equals<size_t>(0, common_chat_split_by_role("hello", {}).size());
+    assert_equals<size_t>(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size());
+
+    // Multi-role conversation, no leading/trailing content
+    {
+        const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye";
+        const auto splits = common_chat_split_by_role(prompt, {
+            { "user",      "<|user|>"      },
+            { "assistant", "<|assistant|>" },
+        });
+        assert_equals<size_t>(3, splits.size());
+
+        assert_equals<std::string>("user", splits[0].role);
+        assert_equals<size_t>(0, splits[0].pos);
+        assert_equals<size_t>(10, splits[0].len);
+        assert_equals<std::string>("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len));
+
+        assert_equals<std::string>("assistant", splits[1].role);
+        assert_equals<size_t>(10, splits[1].pos);
+        assert_equals<size_t>(18, splits[1].len);
+        assert_equals<std::string>("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len));
+
+        assert_equals<std::string>("user", splits[2].role);
+        assert_equals<size_t>(28, splits[2].pos);
+        assert_equals<size_t>(11, splits[2].len);
+        assert_equals<std::string>("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len));
+    }
+}
+
 static void test_tools_oaicompat_json_conversion() {
     LOG_DBG("%s\n", __func__);
     std::vector<common_chat_tool> tools{
@@ -4338,16 +4372,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
     // Format: <TOOLCALL>[{"name": "func", "arguments": {...}}]</TOOLCALL>
     {
         auto tst = peg_tester("models/templates/NVIDIA-Nemotron-Nano-v2.jinja", detailed_debug);
-        tst.test("<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL><SPECIAL_12>")
+        tst.test("<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>")
             .tools({ special_function_tool })
             .expect(message_assist_call)
             .run();
 
         // Continuation tests
         tst.test("world!\nWhat's up?")
+            .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
+            .enable_thinking(true)
             .messages({ message_user, message_assist_prefill_content })
             .add_generation_prompt(false)
             .continue_final_message(COMMON_CHAT_CONTINUATION_CONTENT)
+            .expect_reasoning("I'm thinking")
             .expect_content("Hello, world!\nWhat's up?")
             .run();
     }
@@ -5593,6 +5630,7 @@ int main(int argc, char ** argv) {
     {
         test_msg_diffs_compute();
         test_msgs_oaicompat_json_conversion();
+        test_split_by_role();
         test_tools_oaicompat_json_conversion();
         test_convert_responses_to_chatcmpl();
         test_developer_role_to_system_workaround();
diff --git a/tools/cli/README.md b/tools/cli/README.md
index bab65d50556..add4021e2a0 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -147,7 +147,6 @@
 | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
 | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
 | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
diff --git a/tools/server/README.md b/tools/server/README.md
index f2f73f6dc5f..0b7f9f994c2 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -163,7 +163,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
 | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
+| `-cms, --checkpoint-min-step N` | minimum spacing between context checkpoints in tokens (default: 256, 0 = no minimum)<br/>(env: LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index dc00edfa82a..fb71792fe6d 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1110,6 +1110,16 @@ json oaicompat_chat_params_parse(
         llama_params["chat_parser"] = chat_params.parser;
     }
 
+    llama_params["message_spans"] = json::array();
+
+    for (const auto & span : chat_params.message_spans) {
+        llama_params["message_spans"].push_back({
+            { "role", span.role },
+            { "pos",  span.pos  },
+            { "len",  span.len  },
+        });
+    }
+
     // Reasoning budget: pass parameters through to sampling layer
     {
         int reasoning_budget = opt.reasoning_budget;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index c3daafd0d92..9fecc4247f5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1103,6 +1103,13 @@ struct server_context_impl {
         }
         SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
+        if (params_base.n_ctx_checkpoints > 0) {
+            SRV_INF("context checkpoints enabled, max = %d, min spacing = %d\n",
+                    params_base.n_ctx_checkpoints, params_base.checkpoint_min_step);
+        } else {
+            SRV_INF("%s", "context checkpoints disabled\n");
+        }
+
         if (!params_base.model_alias.empty()) {
             // backward compat: use first alias as model name
             model_name = *params_base.model_alias.begin();
@@ -2758,8 +2765,6 @@ struct server_context_impl {
                                 }
 
                                 if (pos_min >= pos_min_thold) {
-                                    SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
-
                                     // search for a context checkpoint
                                     const auto it = std::find_if(
                                         slot.prompt.checkpoints.rbegin(),
@@ -2776,7 +2781,6 @@ struct server_context_impl {
 
                                     if (!do_reset) {
                                         // restore the context checkpoint
-
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
 
@@ -2912,6 +2916,9 @@ struct server_context_impl {
                         has_mtmd = true;
                     }
 
+                    const int32_t n_before_user = slot.task->params.n_before_user;
+                    const bool n_before_user_known = n_before_user > 0;
+
                     // add prompt tokens for processing in the current batch
                     while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) {
                         // get next token to process
@@ -2940,6 +2947,13 @@ struct server_context_impl {
 
                         slot.n_prompt_tokens_processed++;
 
+                        // stop the prompt batch exactly before the latest user input, so a checkpoint
+                        // can be created after the previous messages
+                        if (n_before_user_known &&
+                            slot.prompt.n_tokens() == n_before_user) {
+                            break;
+                        }
+
                         // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
                         // create checkpoints that many tokens before the end of the prompt:
                         //  - 4 + n_ubatch
@@ -2965,6 +2979,8 @@ struct server_context_impl {
                     // the number of tokens added to the batch for the current slot
                     const auto n_tokens_cur = batch.n_tokens - n_tokens_prev;
 
+                    const bool near_prompt_end = slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch;
+
                     // entire prompt has been processed
                     if (slot.prompt.n_tokens() == slot.task->n_tokens()) {
                         slot.state = SLOT_STATE_DONE_PROMPT;
@@ -2979,39 +2995,49 @@ struct server_context_impl {
 
                         slot.init_sampler();
                     } else {
-                        if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
-                            // near the end of the prompt
-                            do_checkpoint = do_checkpoint && true;
-                        } else {
-                            // only do non-end checkpoints if the "checkpoint every n tokens" option is set
-                            do_checkpoint = do_checkpoint && params_base.checkpoint_every_nt > 0;
-
-                            if (do_checkpoint) {
-                                llama_pos last_checkpoint = 0;
-                                if (!slot.prompt.checkpoints.empty()) {
-                                    last_checkpoint = slot.prompt.checkpoints.back().n_tokens;
-                                }
-
-                                do_checkpoint = do_checkpoint && slot.prompt.n_tokens() - batch.n_tokens - last_checkpoint >= params_base.checkpoint_every_nt;
-
-                                if (do_checkpoint) {
-                                    SLT_INF(slot, "%d tokens since last checkpoint at %d, creating new checkpoint during processing at position %d\n", params_base.checkpoint_every_nt, last_checkpoint, slot.prompt.n_tokens());
-                                }
-                            }
+                        // skip ordinary mid-prompt checkpoints
+                        if (!n_before_user_known && !near_prompt_end) {
+                            do_checkpoint = false;
                         }
                     }
 
                     const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
                     const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id);
 
-                    // no need for empty or small checkpoints
-                    do_checkpoint = do_checkpoint && (pos_min >= 0 && slot.prompt.n_tokens() >= 64);
+                    // checkpoints are created before the current batch is decoded, so
+                    // their token position is the batch start rather than the prompt end
+                    const int32_t n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur;
+
+                    {
+                        const bool is_on_user =
+                            n_before_user_known &&
+                            n_tokens_start == n_before_user;
+
+                        const bool is_after_user =
+                            n_before_user_known &&
+                            n_tokens_start > n_before_user;
+
+                        const bool is_allowed =
+                            !n_before_user_known ||
+                            is_on_user ||
+                            (is_after_user && near_prompt_end);
+
+                        if (do_checkpoint && !is_allowed) {
+                            do_checkpoint = false;
+                        }
+                    }
+
+                    // nothing to checkpoint yet
+                    // TODO: is this check needed?
+                    if (do_checkpoint && pos_min < 0) {
+                        do_checkpoint = false;
+                    }
 
                     // do not checkpoint after mtmd chunks
                     do_checkpoint = do_checkpoint && !has_mtmd;
 
                     // no need to create checkpoints that are too close together
-                    do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || slot.prompt.n_tokens() - n_tokens_cur > slot.prompt.checkpoints.back().n_tokens + 64);
+                    do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step);
                     SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max);
 
                     // note: we create the checkpoint before calling llama_decode(), so the current batch is not
@@ -3528,6 +3554,53 @@ void server_context::on_sleeping_changed(std::function<void(bool)> callback) {
     impl->queue_tasks.on_sleeping_state(std::move(callback));
 }
 
+// compute the number of tokens before the last user message in the prompt
+static int32_t prompt_get_n_before_user(
+        const json & message_spans,
+        const std::string & prompt,
+        const std::vector<raw_buffer> & files,
+        const llama_vocab * vocab,
+        mtmd_context * mctx) {
+    int32_t result = -1;
+    int32_t byte_pos = -1;
+
+    for (const auto & span : message_spans) {
+        const std::string role = json_value(span, "role", std::string());
+
+        if (role == "user") {
+            byte_pos = json_value(span, "pos", -1);
+        }
+    }
+
+    if (byte_pos >= 0) {
+        GGML_ASSERT((size_t) byte_pos <= prompt.size());
+
+        const std::string prefix = prompt.substr(0, (size_t) byte_pos);
+
+        const std::string marker = get_media_marker();
+        size_t n_prefix_media = 0;
+        for (size_t pos = 0; (pos = prefix.find(marker, pos)) != std::string::npos; pos += marker.size()) {
+            n_prefix_media++;
+        }
+
+        GGML_ASSERT(n_prefix_media <= files.size());
+
+        if (mctx != nullptr && n_prefix_media > 0) {
+            // TODO: this makes a copy - avoid it
+            std::vector<raw_buffer> prefix_files(files.begin(), files.begin() + n_prefix_media);
+
+            result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size();
+        } else {
+            result = (int32_t) tokenize_input_prompts(vocab, nullptr, prefix, true, true)[0].size();
+        }
+
+        SRV_TRC("message_spans: last user message: byte_pos=%d, media=%zu, n_before_user=%d\n",
+                byte_pos, n_prefix_media, result);
+    }
+
+    return result;
+}
+
 
 //
 // server_routes
@@ -3577,6 +3650,18 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
                     meta->slot_n_ctx,
                     meta->logit_bias_eog,
                     data);
+
+            const auto message_spans = json_value(data, "message_spans", json::array());
+            if (prompt.is_string() && message_spans.is_array()) {
+                task.params.n_before_user =
+                    prompt_get_n_before_user(
+                        message_spans,
+                        prompt.get<std::string>(),
+                        files,
+                        ctx_server.vocab,
+                        ctx_server.mctx);
+            }
+
             task.id_slot = json_value(data, "id_slot", -1);
 
             // OAI-compat
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 0978bb6ff16..60e216e7927 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -61,6 +61,9 @@ struct task_params {
 
     int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled)
 
+    // number of prompt tokens before the latest user message
+    int32_t n_before_user = -1;
+
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 

From 9627d0f5407f87ce7360e6819d39cbc0fbd1f15e Mon Sep 17 00:00:00 2001
From: "Alessandro de Oliveira Faria (A.K.A.CABELO)" <cabelo@opensuse.org>
Date: Mon, 25 May 2026 03:45:22 -0300
Subject: [PATCH 06/21] vendor : update cpp-httplib to 0.45.1 (#23639)

---
 scripts/sync_vendor.py         |  2 +-
 vendor/cpp-httplib/httplib.cpp | 65 +++++++++++++++++++++-------------
 vendor/cpp-httplib/httplib.h   | 28 +++++++++------
 3 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py
index 658f7326b96..5ac8843570b 100755
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -5,7 +5,7 @@
 import sys
 import subprocess
 
-HTTPLIB_VERSION = "refs/tags/v0.45.0"
+HTTPLIB_VERSION = "refs/tags/v0.45.1"
 
 vendor = {
     "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp
index b28549607a2..4ac497d0387 100644
--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@@ -1567,7 +1567,7 @@ void mmap::close() {
 #endif
   size_ = 0;
 }
-int close_socket(socket_t sock) {
+int close_socket(socket_t sock) noexcept {
 #ifdef _WIN32
   return closesocket(sock);
 #else
@@ -1794,7 +1794,7 @@ bool process_client_socket(
   return callback(strm);
 }
 
-int shutdown_socket(socket_t sock) {
+int shutdown_socket(socket_t sock) noexcept {
 #ifdef _WIN32
   return shutdown(sock, SD_BOTH);
 #else
@@ -7149,7 +7149,7 @@ void Server::wait_until_ready() const {
   }
 }
 
-void Server::stop() {
+void Server::stop() noexcept {
   if (is_running_) {
     assert(svr_sock_ != INVALID_SOCKET);
     std::atomic<socket_t> sock(svr_sock_.exchange(INVALID_SOCKET));
@@ -12290,9 +12290,18 @@ bool enumerate_windows_system_certs(Callback cb) {
 template <typename Callback>
 bool enumerate_macos_keychain_certs(Callback cb) {
   bool loaded = false;
-  CFArrayRef certs = nullptr;
-  OSStatus status = SecTrustCopyAnchorCertificates(&certs);
-  if (status == errSecSuccess && certs) {
+  const SecTrustSettingsDomain domains[] = {
+      kSecTrustSettingsDomainSystem,
+      kSecTrustSettingsDomainAdmin,
+      kSecTrustSettingsDomainUser,
+  };
+  for (auto domain : domains) {
+    CFArrayRef certs = nullptr;
+    OSStatus status = SecTrustSettingsCopyCertificates(domain, &certs);
+    if (status != errSecSuccess || !certs) {
+      if (certs) CFRelease(certs);
+      continue;
+    }
     CFIndex count = CFArrayGetCount(certs);
     for (CFIndex i = 0; i < count; i++) {
       SecCertificateRef cert =
@@ -12655,28 +12664,36 @@ bool load_system_certs(ctx_t ctx) {
   auto store = SSL_CTX_get_cert_store(ssl_ctx);
   if (!store) return false;
 
-  CFArrayRef certs = nullptr;
-  if (SecTrustCopyAnchorCertificates(&certs) != errSecSuccess || !certs) {
-    return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
-  }
-
   bool loaded_any = false;
-  auto count = CFArrayGetCount(certs);
-  for (CFIndex i = 0; i < count; i++) {
-    auto cert = reinterpret_cast<SecCertificateRef>(
-        const_cast<void *>(CFArrayGetValueAtIndex(certs, i)));
-    CFDataRef der = SecCertificateCopyData(cert);
-    if (der) {
-      const unsigned char *data = CFDataGetBytePtr(der);
-      auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der));
-      if (x509) {
-        if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
-        X509_free(x509);
+  const SecTrustSettingsDomain domains[] = {
+      kSecTrustSettingsDomainSystem,
+      kSecTrustSettingsDomainAdmin,
+      kSecTrustSettingsDomainUser,
+  };
+  for (auto domain : domains) {
+    CFArrayRef certs = nullptr;
+    if (SecTrustSettingsCopyCertificates(domain, &certs) != errSecSuccess ||
+        !certs) {
+      if (certs) CFRelease(certs);
+      continue;
+    }
+    auto count = CFArrayGetCount(certs);
+    for (CFIndex i = 0; i < count; i++) {
+      auto cert = reinterpret_cast<SecCertificateRef>(
+          const_cast<void *>(CFArrayGetValueAtIndex(certs, i)));
+      CFDataRef der = SecCertificateCopyData(cert);
+      if (der) {
+        const unsigned char *data = CFDataGetBytePtr(der);
+        auto x509 = d2i_X509(nullptr, &data, CFDataGetLength(der));
+        if (x509) {
+          if (X509_STORE_add_cert(store, x509) == 1) { loaded_any = true; }
+          X509_free(x509);
+        }
+        CFRelease(der);
       }
-      CFRelease(der);
     }
+    CFRelease(certs);
   }
-  CFRelease(certs);
   return loaded_any || SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
 #else
   return SSL_CTX_set_default_verify_paths(ssl_ctx) == 1;
diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h
index af750cdd905..536f0cb4d5e 100644
--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
 
-#define CPPHTTPLIB_VERSION "0.45.0"
-#define CPPHTTPLIB_VERSION_NUM "0x002d00"
+#define CPPHTTPLIB_VERSION "0.45.1"
+#define CPPHTTPLIB_VERSION_NUM "0x002d01"
 
 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@@ -339,16 +339,26 @@ using socket_t = int;
 #include <utility>
 
 // On macOS with a TLS backend, enable Keychain root certificates by default
-// unless the user explicitly opts out.
+// unless the user explicitly opts out. Not enabled on iOS/tvOS/watchOS since
+// the SecTrustSettings APIs used to enumerate anchor certificates are macOS
+// only; on those platforms the user must provide a CA bundle explicitly.
 #if defined(__APPLE__) && defined(__clang__) &&                                \
     !defined(CPPHTTPLIB_DISABLE_MACOSX_AUTOMATIC_ROOT_CERTIFICATES) &&         \
     (defined(CPPHTTPLIB_OPENSSL_SUPPORT) ||                                    \
      defined(CPPHTTPLIB_MBEDTLS_SUPPORT) ||                                    \
      defined(CPPHTTPLIB_WOLFSSL_SUPPORT))
+#if TARGET_OS_OSX
 #ifndef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
 #define CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
 #endif
 #endif
+#endif
+
+#if defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) &&                      \
+    defined(__APPLE__) && !TARGET_OS_OSX
+#error                                                                         \
+    "CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN is only supported on macOS. On iOS/tvOS/watchOS, supply a CA bundle via set_ca_cert_path()."
+#endif
 
 // On Windows, enable Schannel certificate verification by default
 // unless the user explicitly opts out.
@@ -382,7 +392,7 @@ using socket_t = int;
 #endif // _WIN32
 
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -430,7 +440,7 @@ using socket_t = int;
 #endif
 #endif // _WIN32
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -473,7 +483,7 @@ using socket_t = int;
 #endif
 #endif // _WIN32
 #ifdef CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN
-#if TARGET_OS_MAC
+#if TARGET_OS_OSX
 #include <Security/Security.h>
 #endif
 #endif
@@ -1597,7 +1607,7 @@ class RegexMatcher final : public MatcherBase {
   std::regex regex_;
 };
 
-int close_socket(socket_t sock);
+int close_socket(socket_t sock) noexcept;
 
 ssize_t write_headers(Stream &strm, const Headers &headers);
 
@@ -1734,7 +1744,7 @@ class Server {
 
   bool is_running() const;
   void wait_until_ready() const;
-  void stop();
+  void stop() noexcept;
   void decommission();
 
   std::function<TaskQueue *(void)> new_task_queue;
@@ -3028,8 +3038,6 @@ bool parse_range_header(const std::string &s, Ranges &ranges);
 bool parse_accept_header(const std::string &s,
                          std::vector<std::string> &content_types);
 
-int close_socket(socket_t sock);
-
 ssize_t send_socket(socket_t sock, const void *ptr, size_t size, int flags);
 
 ssize_t read_socket(socket_t sock, void *ptr, size_t size, int flags);

From b96487645cb902551bdefb905e29261d0b0971bf Mon Sep 17 00:00:00 2001
From: Saba Fallah <10401143+sfallah@users.noreply.github.com>
Date: Mon, 25 May 2026 08:50:41 +0200
Subject: [PATCH 07/21] ui: media attachments before text (#23467)

* ui: media attachments before text

* fix prettier formatting
---
 tools/ui/src/lib/constants/image-size.ts  |  2 +-
 tools/ui/src/lib/services/chat.service.ts | 15 +++++++--------
 tools/ui/src/lib/utils/cap-img-size.ts    |  3 +--
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tools/ui/src/lib/constants/image-size.ts b/tools/ui/src/lib/constants/image-size.ts
index 0d0c4e48434..bcc1da08b19 100644
--- a/tools/ui/src/lib/constants/image-size.ts
+++ b/tools/ui/src/lib/constants/image-size.ts
@@ -1 +1 @@
-export const MEGAPIXELS_TO_PIXELS = 1_000_000;
\ No newline at end of file
+export const MEGAPIXELS_TO_PIXELS = 1_000_000;
diff --git a/tools/ui/src/lib/services/chat.service.ts b/tools/ui/src/lib/services/chat.service.ts
index 5b7b7e8229d..3c9ca74796d 100644
--- a/tools/ui/src/lib/services/chat.service.ts
+++ b/tools/ui/src/lib/services/chat.service.ts
@@ -879,14 +879,6 @@ export class ChatService {
 			});
 		}
 
-		if (message.content) {
-			contentParts.push({
-				type: ContentPartType.TEXT,
-				text: message.content
-			});
-		}
-
-		// Include images from all messages
 		const imageFiles = message.extra.filter(
 			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
 				extra.type === AttachmentType.IMAGE
@@ -919,6 +911,13 @@ export class ChatService {
 			});
 		}
 
+		if (message.content) {
+			contentParts.push({
+				type: ContentPartType.TEXT,
+				text: message.content
+			});
+		}
+
 		const videoFiles = message.extra.filter(
 			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraVideoFile =>
 				extra.type === AttachmentType.VIDEO
diff --git a/tools/ui/src/lib/utils/cap-img-size.ts b/tools/ui/src/lib/utils/cap-img-size.ts
index fa2af53f6fd..1ba218316b7 100644
--- a/tools/ui/src/lib/utils/cap-img-size.ts
+++ b/tools/ui/src/lib/utils/cap-img-size.ts
@@ -14,9 +14,8 @@ export function capImageDataURLSize(
 ): Promise<string> {
 	return new Promise((resolve, reject) => {
 		try {
-			
 			const mimeMatch = base64UrlImage.match(BASE64_IMAGE_URI_REGEX);
-			
+
 			if (!mimeMatch) {
 				return reject(new Error('Invalid data URL format.'));
 			}

From 826539ce590fe294642db0acd54ea5e0a2fcd739 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Mon, 25 May 2026 02:15:46 -0500
Subject: [PATCH 08/21] ggml : Parallelize quant LUT init (#23595)

- Use OpenMP to parallelize iq2xs_init_impl and iq3xs_init_impl.
- Move the OpenMP detection from ggml-cpu to ggml-base.
- Update OpenMP dependencies in ggml-config.cmake.in.
---
 ggml/cmake/ggml-config.cmake.in  |  14 +-
 ggml/src/CMakeLists.txt          |  17 ++
 ggml/src/ggml-cpu/CMakeLists.txt |  14 +-
 ggml/src/ggml-quants.c           | 328 ++++++++++++++++++++-----------
 4 files changed, 246 insertions(+), 127 deletions(-)

diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
index 91c9d5cd343..23a3066f56d 100644
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -6,6 +6,7 @@
 include(CMakeFindDependencyMacro)
 find_dependency(Threads)
 if (NOT GGML_SHARED_LIB)
+    set(GGML_BASE_INTERFACE_LINK_LIBRARIES "")
     set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
     set(GGML_CPU_INTERFACE_LINK_OPTIONS   "")
 
@@ -20,7 +21,15 @@ if (NOT GGML_SHARED_LIB)
 
     if (GGML_OPENMP_ENABLED)
         find_dependency(OpenMP)
-        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        set(GGML_OPENMP_INTERFACE_LINK_LIBRARIES "")
+        if (TARGET OpenMP::OpenMP_C)
+            list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C)
+        endif()
+        if (TARGET OpenMP::OpenMP_CXX)
+            list(APPEND GGML_OPENMP_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_CXX)
+        endif()
+        list(APPEND GGML_BASE_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
+        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${GGML_OPENMP_INTERFACE_LINK_LIBRARIES})
     endif()
 
     if (GGML_CPU_HBM)
@@ -122,7 +131,8 @@ if(NOT TARGET ggml::ggml)
     add_library(ggml::ggml-base UNKNOWN IMPORTED)
     set_target_properties(ggml::ggml-base
         PROPERTIES
-            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
+            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}"
+            INTERFACE_LINK_LIBRARIES "${GGML_BASE_INTERFACE_LINK_LIBRARIES}")
 
     set(_ggml_all_targets "")
     if (NOT GGML_BACKEND_DL)
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 3e48860bfc8..c26c3f1470d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -222,6 +222,23 @@ if (GGML_SCHED_NO_REALLOC)
     target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
 endif()
 
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
+    else()
+        set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+        message(WARNING "OpenMP not found")
+    endif()
+else()
+    set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
+endif()
+
+if (GGML_OPENMP_ENABLED)
+    target_compile_definitions(ggml-base PRIVATE GGML_USE_OPENMP)
+    target_link_libraries(ggml-base PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+endif()
+
 add_library(ggml
             ggml-backend-dl.cpp
             ggml-backend-reg.cpp)
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index f3eccff7d72..8c735a045b3 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -72,17 +72,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         endif()
     endif()
 
-    if (GGML_OPENMP)
-        find_package(OpenMP)
-        if (OpenMP_FOUND)
-            set(GGML_OPENMP_ENABLED "ON" CACHE INTERNAL "")
-            target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
-
-            target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
-        else()
-            set(GGML_OPENMP_ENABLED "OFF" CACHE INTERNAL "")
-            message(WARNING "OpenMP not found")
-        endif()
+    if (GGML_OPENMP_ENABLED)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP)
+        target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
     endif()
 
     if (GGML_LLAMAFILE)
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 15443aa554a..15d231f70c0 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -13,6 +13,10 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #define GROUP_MAX_EPS 1e-15f
 #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
 #define GROUP_MAX_EPS_IQ2_S 1e-8f
@@ -3064,70 +3068,121 @@ void iq2xs_init_impl(enum ggml_type type) {
         }
         kmap_q2xs[index] = i;
     }
-    int8_t pos[8];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    // The neighbour search runs in three passes:
+    //   1. Parallel: for each i, qsort and count its neighbours into n_per_i,
+    //      and reduce the totals (num_neighbors, num_not_in_map).
+    //   2. Serial: prefix-sum n_per_i into offsets[], so each i has a
+    //      pre-assigned slice of kneighbors_q2xs to write into.
+    //   3. Parallel: redo the qsort and write each i's neighbour list at
+    //      offsets[i].
+    int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(n_per_i);
     int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[8];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q2xs[i] >= 0) {
+                n_per_i[i] = 0;
+                continue;
+            }
+            ++num_not_in_map;
+            for (int k = 0; k < 8; ++k) {
+                int l = (i >> 2*k) & 0x3;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+            int n = 0; int d2 = dist2[0];
+            int nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                ++n;
+            }
+            n_per_i[i] = n;
+            num_neighbors += n;
+        }
+        free(dist2);
     }
     //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
     kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
     iq2_data[gindex].neighbours = kneighbors_q2xs;
+
+    int * offsets = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(offsets);
     int counter = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q2xs[i] >= 0) continue;
-        for (int k = 0; k < 8; ++k) {
-            int l = (i >> 2*k) & 0x3;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
-        kmap_q2xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q2xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q2xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
+        if (kmap_q2xs[i] >= 0) {
+            offsets[i] = -1;
+            continue;
+        }
+        offsets[i] = counter;
+        counter += 1 + n_per_i[i];
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[8];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q2xs[i] >= 0) continue;
+            for (int k = 0; k < 8; ++k) {
+                int l = (i >> 2*k) & 0x3;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
+            int local_counter = offsets[i];
+            kmap_q2xs[i] = -(local_counter + 1);
+            int d2 = dist2[0];
+            uint16_t * start = &kneighbors_q2xs[local_counter++];
+            int n = 0, nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                kneighbors_q2xs[local_counter++] = dist2[2*j+1];
+                ++n;
+            }
+            *start = n;
+        }
+        free(dist2);
+    }
+    free(offsets);
+    free(n_per_i);
 }
 
 void iq2xs_free_impl(enum ggml_type type) {
@@ -3663,70 +3718,115 @@ void iq3xs_init_impl(int grid_size) {
         }
         kmap_q3xs[index] = i;
     }
-    int8_t pos[4];
-    int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+    // See explanation of parallelism in iq2xs_init_impl
+    int * n_per_i = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(n_per_i);
     int num_neighbors = 0, num_not_in_map = 0;
-    for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        ++num_not_in_map;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        int n = 0; int d2 = dist2[0];
-        int nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            ++n;
-        }
-        num_neighbors += n;
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel reduction(+:num_neighbors,num_not_in_map)
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[4];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q3xs[i] >= 0) {
+                n_per_i[i] = 0;
+                continue;
+            }
+            ++num_not_in_map;
+            for (int k = 0; k < 4; ++k) {
+                int l = (i >> 3*k) & 0x7;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+            int n = 0; int d2 = dist2[0];
+            int nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                ++n;
+            }
+            n_per_i[i] = n;
+            num_neighbors += n;
+        }
+        free(dist2);
     }
     //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
     kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
     iq3_data[gindex].neighbours = kneighbors_q3xs;
+
+    int * offsets = (int *)malloc(kmap_size*sizeof(int));
+    GGML_ASSERT(offsets);
     int counter = 0;
     for (int i = 0; i < kmap_size; ++i) {
-        if (kmap_q3xs[i] >= 0) continue;
-        for (int k = 0; k < 4; ++k) {
-            int l = (i >> 3*k) & 0x7;
-            pos[k] = 2*l + 1;
-        }
-        for (int j = 0; j < grid_size; ++j) {
-            const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
-            int d2 = 0;
-            for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
-            dist2[2*j+0] = d2;
-            dist2[2*j+1] = j;
-        }
-        qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
-        kmap_q3xs[i] = -(counter + 1);
-        int d2 = dist2[0];
-        uint16_t * start = &kneighbors_q3xs[counter++];
-        int n = 0, nhave = 1;
-        for (int j = 0; j < grid_size; ++j) {
-            if (dist2[2*j] > d2) {
-                if (nhave == nwant) break;
-                d2 = dist2[2*j];
-                ++nhave;
-            }
-            kneighbors_q3xs[counter++] = dist2[2*j+1];
-            ++n;
-        }
-        *start = n;
-    }
-    free(dist2);
+        if (kmap_q3xs[i] >= 0) {
+            offsets[i] = -1;
+            continue;
+        }
+        offsets[i] = counter;
+        counter += 1 + n_per_i[i];
+    }
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel
+#endif
+    {
+        int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
+        GGML_ASSERT(dist2);
+        int8_t pos[4];
+        int i;
+#ifdef GGML_USE_OPENMP
+        #pragma omp for schedule(dynamic, 64)
+#endif
+        for (i = 0; i < kmap_size; ++i) {
+            if (kmap_q3xs[i] >= 0) continue;
+            for (int k = 0; k < 4; ++k) {
+                int l = (i >> 3*k) & 0x7;
+                pos[k] = 2*l + 1;
+            }
+            for (int j = 0; j < grid_size; ++j) {
+                const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
+                int d2 = 0;
+                for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
+                dist2[2*j+0] = d2;
+                dist2[2*j+1] = j;
+            }
+            qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
+            int local_counter = offsets[i];
+            kmap_q3xs[i] = -(local_counter + 1);
+            int d2 = dist2[0];
+            uint16_t * start = &kneighbors_q3xs[local_counter++];
+            int n = 0, nhave = 1;
+            for (int j = 0; j < grid_size; ++j) {
+                if (dist2[2*j] > d2) {
+                    if (nhave == nwant) break;
+                    d2 = dist2[2*j];
+                    ++nhave;
+                }
+                kneighbors_q3xs[local_counter++] = dist2[2*j+1];
+                ++n;
+            }
+            *start = n;
+        }
+        free(dist2);
+    }
+    free(offsets);
+    free(n_per_i);
 }
 
 void iq3xs_free_impl(int grid_size) {

From d55fb9717459c4a61ae372d57c5214e1d791e4d2 Mon Sep 17 00:00:00 2001
From: Aldehir Rojas <hello@alde.dev>
Date: Mon, 25 May 2026 03:18:08 -0400
Subject: [PATCH 09/21] ci : install host compiler on android-ndk build
 (#23630)

---
 .github/workflows/build-android.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml
index 5d88305a4f0..f4c3fd669de 100644
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -73,6 +73,11 @@ jobs:
           fetch-depth: 0
           lfs: false
 
+      - name: Dependencies
+        run: |
+          apt-get update
+          apt-get install -y build-essential
+
       - name: Build
         id: ndk_build
         run: |

From 314e729347defd9851e857f78084160c5786a7d8 Mon Sep 17 00:00:00 2001
From: Tim Neumann <mail@timnn.me>
Date: Mon, 25 May 2026 09:29:28 +0200
Subject: [PATCH 10/21] llama : document that only one on-device state can be
 saved per sequence (#23520)

---
 include/llama.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index 75095b22d08..e8374c53b70 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -874,7 +874,8 @@ extern "C" {
 // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
 #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
 
-// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
+// Keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load).
+// Getting the state for a seq_id with this flag invalidates all prior states gotten for that seq_id with this flag.
 #define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
 
     typedef uint32_t llama_state_seq_flags;

From 062d3115aa12267d34d7ac5b78bd5d0ad48778ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Mon, 25 May 2026 10:41:25 +0200
Subject: [PATCH 11/21] ci : fix pre-tokenizer-hashes check (#23651)

---
 .github/workflows/pre-tokenizer-hashes.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
index fce70d03e52..3e440b67d9b 100644
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -3,11 +3,11 @@ name: Check Pre-Tokenizer Hashes
 on:
     push:
         paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
             - 'convert_hf_to_gguf_update.py'
     pull_request:
         paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
             - 'convert_hf_to_gguf_update.py'
 
 jobs:
@@ -30,16 +30,16 @@ jobs:
 
         - name: Update pre-tokenizer hashes
           run: |
-              cp convert_hf_to_gguf.py /tmp
+              cp conversion/base.py /tmp
               .venv/bin/python convert_hf_to_gguf_update.py --check-missing
 
         - name: Check if committed pre-tokenizer hashes matches generated version
           run: |
-              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
-                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+              if ! diff -q conversion/base.py /tmp/base.py; then
+                  echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
                   echo "Differences found:"
-                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+                  diff conversion/base.py /tmp/base.py || true
                   exit 1
               fi
               echo "Model pre-tokenizer hashes are up to date."

From 5fdf07e33bf35b8e6fbd33c393c0260fe208485a Mon Sep 17 00:00:00 2001
From: alex-spacemit <jinghui.huang@spacemit.com>
Date: Mon, 25 May 2026 16:43:24 +0800
Subject: [PATCH 12/21] ci : update spacemit toolchain url and enhance curl
 command (#23642)

* fix(action): update SpacemiT toolchain URL and version

Change-Id: If4cc1c738a855274103f8c3ad52daa33528acd0c

* fix(action): add -L flag to curl command for URL redirection

Change-Id: I9b6c37390f0c7a733a36308c8fb53d22d234ab06
---
 .github/actions/linux-setup-spacemit/action.yml | 2 +-
 .github/actions/unarchive-tar/action.yml        | 2 +-
 .github/workflows/build-cross.yml               | 2 +-
 docs/build-riscv64-spacemit.md                  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/actions/linux-setup-spacemit/action.yml b/.github/actions/linux-setup-spacemit/action.yml
index e2193e8931d..39e405b6779 100644
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -15,6 +15,6 @@ runs:
       id: setup
       uses: ./.github/actions/unarchive-tar
       with:
-        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
+        url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
         path: ${{ inputs.path }}
         strip: 1
diff --git a/.github/actions/unarchive-tar/action.yml b/.github/actions/unarchive-tar/action.yml
index b97e402f46a..3d2f9be7bdd 100644
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -24,4 +24,4 @@ runs:
       run: |
         mkdir -p ${{ inputs.path }}
         cd ${{ inputs.path }}
-        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
+        curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
diff --git a/.github/workflows/build-cross.yml b/.github/workflows/build-cross.yml
index 97c7368387e..845e88fe43b 100644
--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -277,7 +277,7 @@ jobs:
 
     env:
       # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
 
     steps:
       - uses: actions/checkout@v6
diff --git a/docs/build-riscv64-spacemit.md b/docs/build-riscv64-spacemit.md
index 08301d51cae..7a9a1f3ad4e 100644
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -5,7 +5,7 @@
 
 1. Prepare Toolchain For RISCV
 ~~~
-wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
+wget https://github.com/spacemit-com/toolchain/releases/download/v1.2.4/spacemit-toolchain-linux-glibc-x86_64-v1.2.4.tar.xz
 ~~~
 
 2. Build

From 6c4cbdc70b83ac054106e9de3ebc2ecaa82c4b1f Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 25 May 2026 16:46:23 +0800
Subject: [PATCH 13/21] server: MTP layer kv-cache should respect draft type
 ctk (#23646)

---
 tools/server/server-context.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 9fecc4247f5..ae9e0bf60d8 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -822,6 +822,8 @@ struct server_context_impl {
                 auto cparams_dft = common_context_params_to_llama(params_dft);
                 if (spec_mtp) {
                     cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
+                    cparams_dft.type_k   = params_base.speculative.draft.cache_type_k;
+                    cparams_dft.type_v   = params_base.speculative.draft.cache_type_v;
                 }
                 cparams_dft.n_rs_seq = 0;
 
@@ -940,6 +942,8 @@ struct server_context_impl {
 
             auto cparams_mtp = common_context_params_to_llama(params_base);
             cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP;
+            cparams_mtp.type_k   = params_base.speculative.draft.cache_type_k;
+            cparams_mtp.type_v   = params_base.speculative.draft.cache_type_v;
             cparams_mtp.n_rs_seq = 0;
 
             ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp));

From 66efd13375bb0dfb28730d8c4b7d5c57cfd2c22d Mon Sep 17 00:00:00 2001
From: "Gilad S." <7817232+giladgd@users.noreply.github.com>
Date: Mon, 25 May 2026 11:33:29 +0200
Subject: [PATCH 14/21] ggml: `gguf_init_from_callback` and
 `gguf_init_from_buffer` (#22341)

* ggml: implement `gguf_init_from_buffer`

* test: `gguf_init_from_buffer`

* fix: memory breakdown for a model loaded with `no_alloc` from a file is consistent with being loaded from a buffer

* fix: use `GGML_UNUSED`

Co-authored-by: Copilot <copilot@github.com>

* fix: remove `total_size` from `gguf_reader`

* fix: file offset calculation, rename `offset` to `data_offset`

Co-authored-by: Copilot <copilot@github.com>

* refactor: extract model loader bug fixes to another PR

* feat: add `gguf_init_from_callback`

* fix: always require a max expected size

* fix: change `gguf_reader_callback_t`'s `output` type to `void *`, change `max_expected_size` and offsets to `uint64_t`

* fix: harden against offset overflow in buffer read

* fix: remove seek behavior from the callback

* feat: `max_chunk_read == 0` means `SIZE_MAX`

* fix: seeking in a gguf file with no tensors

---------

Co-authored-by: Copilot <copilot@github.com>
---
 ggml/include/gguf.h |  10 ++-
 ggml/src/gguf.cpp   | 178 ++++++++++++++++++++++++++++++++++++++------
 tests/test-gguf.cpp |  90 ++++++++++++++++++++--
 3 files changed, 248 insertions(+), 30 deletions(-)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 02d5f221c03..67851ba6f16 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -76,10 +76,16 @@ extern "C" {
         struct ggml_context ** ctx;
     };
 
+    // callback to simulate or wrap a FILE pointer - read up to `len` bytes at `offset` into `output` and return the number of bytes read
+    typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len);
+
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+    GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params);
+
+    // max_chunk_read is the maximum number of bytes that the GGUF code will read at once from the callback, a value of 0 means no limit
+    GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
 
@@ -87,7 +93,7 @@ extern "C" {
 
     GGML_API uint32_t gguf_get_version    (const struct gguf_context * ctx);
     GGML_API size_t   gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);
+    GGML_API size_t   gguf_get_data_offset(const struct gguf_context * ctx);  // padded to gguf_get_alignment if and only if the gguf_context contains at least one tensor
 
     GGML_API int64_t      gguf_get_n_kv(const struct gguf_context * ctx);
     GGML_API int64_t      gguf_find_key(const struct gguf_context * ctx, const char * key); // returns -1 if key is not found
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index ab3cc974867..5e198618251 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -228,9 +228,18 @@ struct gguf_context {
 };
 
 struct gguf_reader {
-    gguf_reader(FILE * file) : file(file) {
-        // read the remaining bytes once and update on each read
-        nbytes_remain = file_remain(file);
+    gguf_reader(
+            gguf_reader_callback_t callback,
+            void * userdata,
+            size_t max_chunk_read,
+            uint64_t data_offset = 0,
+            uint64_t nbytes_remain = 0)
+        : callback(callback),
+          userdata(userdata),
+          max_chunk_read(max_chunk_read),
+          data_offset(data_offset),
+          nbytes_remain(nbytes_remain) {
+        GGML_ASSERT(max_chunk_read > 0);
     }
 
     // helper for remaining bytes in a file
@@ -257,12 +266,10 @@ struct gguf_reader {
     template <typename T>
     bool read(T & dst) const {
         const size_t size = sizeof(dst);
-        if (nbytes_remain < size) {
+        if (size > nbytes_remain) {
             return false;
         }
-        const size_t nread = fread(&dst, 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(&dst, size) == size;
     }
 
     template <typename T>
@@ -344,24 +351,71 @@ struct gguf_reader {
             return false;
         }
         dst.resize(static_cast<size_t>(size));
-        const size_t nread = fread(dst.data(), 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(dst.data(), static_cast<size_t>(size)) == size;
     }
 
     bool read(void * dst, const size_t size) const {
         if (size > nbytes_remain) {
             return false;
         }
-        const size_t nread = fread(dst, 1, size, file);
-        nbytes_remain -= nread;
-        return nread == size;
+        return read_raw(dst, size) == size;
+    }
+
+    uint64_t tell() const {
+        return data_offset;
+    }
+
+    bool seek(uint64_t absolute_offset) const {
+        const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain;
+        if (absolute_offset > end_offset) {
+            return false;
+        }
+
+        data_offset = absolute_offset;
+        nbytes_remain = end_offset - absolute_offset;
+
+        return true;
     }
 
 private:
-    FILE * file;
+    size_t read_raw(void * dst, size_t size) const {
+        if (callback == nullptr || size == 0) {
+            return 0;
+        }
+
+        uint8_t * data = static_cast<uint8_t *>(dst);
+        size_t total_nread = 0;
+        bool reached_eof = false;
 
-    mutable uint64_t nbytes_remain;
+        while (total_nread < size) {
+            const size_t chunk_size = std::min(max_chunk_read, size - total_nread);
+            if (data_offset + total_nread < data_offset) {
+                break;
+            }
+            const size_t nread = callback(userdata, static_cast<void *>(data + total_nread), data_offset + total_nread, chunk_size);
+            total_nread += nread;
+            if (nread != chunk_size) {
+                reached_eof = true;
+                break;
+            }
+        }
+
+        data_offset += total_nread;
+        GGML_ASSERT(total_nread <= nbytes_remain);
+        nbytes_remain -= total_nread;
+
+        if (reached_eof) {
+            nbytes_remain = 0;
+        }
+
+        return total_nread;
+    }
+
+    gguf_reader_callback_t callback = nullptr;
+    void * userdata = nullptr;
+    size_t max_chunk_read = 0;
+    mutable uint64_t data_offset = 0;
+    mutable uint64_t nbytes_remain = 0;
 };
 
 struct gguf_context * gguf_init_empty(void) {
@@ -394,12 +448,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
     return true;
 }
 
-struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
-    if (!file) {
-        return nullptr;
-    }
-
-    const struct gguf_reader gr(file);
+static struct gguf_context * gguf_init_from_reader(const struct gguf_reader & gr, struct gguf_init_params params) {
     struct gguf_context * ctx = new gguf_context;
 
     bool ok = true;
@@ -700,14 +749,14 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors);
 
     // we require the data section to be aligned, so take into account any padding
-    if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) {
+    if (n_tensors > 0 && !gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) {
         GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
         gguf_free(ctx);
         return nullptr;
     }
 
     // store the current file offset - this is where the data section starts
-    ctx->offset = gguf_ftell(file);
+    ctx->offset = gr.tell();
 
     // compute the total size of the data section, taking into account the alignment
     {
@@ -844,6 +893,89 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para
     return ctx;
 }
 
+struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) {
+    if (callback == nullptr) {
+        return nullptr;
+    }
+
+    const struct gguf_reader gr(callback, userdata, max_chunk_read == 0 ? SIZE_MAX : max_chunk_read, 0, max_expected_size);
+    return gguf_init_from_reader(gr, params);
+}
+
+struct gguf_file_reader {
+    FILE * file;
+    uint64_t offset;
+};
+
+static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    gguf_file_reader & reader = *static_cast<gguf_file_reader *>(userdata);
+
+    if (reader.offset != offset) {
+        if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast<int64_t>(offset), SEEK_SET) != 0) {
+            return 0;
+        }
+
+        reader.offset = offset;
+    }
+
+    const size_t nread = fread(static_cast<uint8_t *>(output), 1, len, reader.file);
+    reader.offset += nread;
+    return nread;
+}
+
+struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
+    if (!file) {
+        return nullptr;
+    }
+
+    const int64_t cur = gguf_ftell(file);
+    if (cur < 0) {
+        return nullptr;
+    }
+
+    gguf_file_reader reader = {
+        /*.file   = */ file,
+        /*.offset = */ static_cast<uint64_t>(cur),
+    };
+    const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file));
+    return gguf_init_from_reader(gr, params);
+}
+
+struct gguf_buffer_reader {
+    const uint8_t * data;
+    size_t          size;
+};
+
+static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    const gguf_buffer_reader & reader = *static_cast<gguf_buffer_reader *>(userdata);
+
+    if (offset > reader.size || len > reader.size - offset) {
+        return 0;
+    }
+
+    const size_t data_offset = static_cast<size_t>(offset);
+    const size_t nread = std::min(len, reader.size - data_offset);
+    memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
+    return nread;
+}
+
+struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) {
+    if (data == nullptr || size == 0) {
+        return nullptr;
+    }
+
+    gguf_buffer_reader reader = {
+        /*.data = */ static_cast<const uint8_t *>(data),
+        /*.size = */ size,
+    };
+    const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size);
+    return gguf_init_from_reader(gr, params);
+}
+
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
     FILE * file = ggml_fopen(fname, "rb");
 
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
index ed3070dc4de..1ae468fbd65 100644
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@@ -162,6 +162,42 @@ static void helper_write(FILE * file, const void * data, const size_t nbytes) {
     GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes);
 }
 
+static std::vector<uint8_t> read_file_to_buffer(FILE * file) {
+    GGML_ASSERT(file != nullptr);
+    GGML_ASSERT(fseek(file, 0, SEEK_END) == 0);
+
+    const long size = ftell(file);
+    GGML_ASSERT(size >= 0);
+
+    rewind(file);
+
+    std::vector<uint8_t> data(static_cast<size_t>(size));
+    GGML_ASSERT(fread(data.data(), 1, data.size(), file) == data.size());
+
+    rewind(file);
+    return data;
+}
+
+struct callback_reader_data {
+    const uint8_t * data;
+    size_t size;
+};
+
+static size_t read_buffer_callback(void * userdata, void * output, uint64_t offset, size_t len) {
+    GGML_ASSERT(len > 0);
+
+    const callback_reader_data & reader = *static_cast<callback_reader_data *>(userdata);
+
+    if (offset > reader.size || len > reader.size - offset) {
+        return 0;
+    }
+
+    const size_t data_offset = static_cast<size_t>(offset);
+    const size_t nread = std::min(len, reader.size - data_offset);
+    memcpy(static_cast<uint8_t *>(output), reader.data + data_offset, nread);
+    return nread;
+}
+
 static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) {
     FILE * file = tmpfile();
 
@@ -1095,10 +1131,29 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
     return ok;
 }
 
-static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
+enum roundtrip_read_mode {
+    ROUNDTRIP_READ_MODE_FILE,
+    ROUNDTRIP_READ_MODE_BUFFER,
+    ROUNDTRIP_READ_MODE_CALLBACK,
+};
+
+static const char * roundtrip_read_mode_name(const roundtrip_read_mode mode) {
+    switch (mode) {
+        case ROUNDTRIP_READ_MODE_FILE:     return "file";
+        case ROUNDTRIP_READ_MODE_BUFFER:   return "buffer";
+        case ROUNDTRIP_READ_MODE_CALLBACK: return "callback";
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+static std::pair<int, int> test_roundtrip(
+        ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta,
+        const roundtrip_read_mode read_mode) {
     ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
-    printf("%s: device=%s, backend=%s, only_meta=%s\n",
-        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no");
+    printf("%s: device=%s, backend=%s, only_meta=%s, read_mode=%s\n",
+        __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend),
+        only_meta ? "yes" : "no", roundtrip_read_mode_name(read_mode));
 
     int npass = 0;
     int ntest = 0;
@@ -1133,7 +1188,22 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
         /*no_alloc =*/ false,
         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
     };
-    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    struct gguf_context * gguf_ctx_1 = nullptr;
+    const std::vector<uint8_t> data = read_mode == ROUNDTRIP_READ_MODE_FILE
+        ? std::vector<uint8_t>()
+        : read_file_to_buffer(file);
+
+    if (read_mode == ROUNDTRIP_READ_MODE_BUFFER) {
+        gguf_ctx_1 = gguf_init_from_buffer(data.data(), data.size(), gguf_params);
+    } else if (read_mode == ROUNDTRIP_READ_MODE_CALLBACK) {
+        callback_reader_data reader = {
+            /*.data = */ data.data(),
+            /*.size = */ data.size(),
+        };
+        gguf_ctx_1 = gguf_init_from_callback(read_buffer_callback, &reader, 4096, 4ull << 30 /* 4GB */, gguf_params);
+    } else {
+        gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
+    }
 
     printf("%s: same_version: ", __func__);
     if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {
@@ -1343,7 +1413,17 @@ int main(int argc, char ** argv) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
 
         for (bool only_meta : {true, false}) {
-            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta);
+            std::pair<int, int> result = test_roundtrip(dev, seed, only_meta, ROUNDTRIP_READ_MODE_FILE);
+            npass += result.first;
+            ntest += result.second;
+        }
+        {
+            std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_BUFFER);
+            npass += result.first;
+            ntest += result.second;
+        }
+        {
+            std::pair<int, int> result = test_roundtrip(dev, seed, /*only_meta=*/false, ROUNDTRIP_READ_MODE_CALLBACK);
             npass += result.first;
             ntest += result.second;
         }

From ae251b5ff2634108822e0f8bb20ca4cd5c2c5dcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 25 May 2026 11:37:25 +0200
Subject: [PATCH 15/21] TP: fix ggml context size calculation (#22616)

* TP: fix ggml context size calculation, memory leak

* move split state cache back into the context

* revert to constant ggml context size for cgraphs

* increase headroom for statically allocated tensors

* remove obsolete include
---
 ggml/src/ggml-backend-meta.cpp | 194 +++++++++++++++++++++++----------
 1 file changed, 137 insertions(+), 57 deletions(-)

diff --git a/ggml/src/ggml-backend-meta.cpp b/ggml/src/ggml-backend-meta.cpp
index 5f9ae9c1bc5..d0d64523b4a 100644
--- a/ggml/src/ggml-backend-meta.cpp
+++ b/ggml/src/ggml-backend-meta.cpp
@@ -13,6 +13,7 @@
 #include <cstring>
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -392,64 +393,100 @@ static ggml_backend_buffer_type_t ggml_backend_meta_device_get_host_buffer_type(
 // meta backend buffer
 //
 
+// Container to hold the tensor slices per simple ggml backend buffer.
+struct ggml_backend_meta_simple_tensor_container {
+    std::vector<ggml_context_ptr> ctxs;
+    std::map<const ggml_tensor *, std::vector<ggml_tensor *>> simple_tensors;
+
+    ggml_backend_meta_simple_tensor_container(const ggml_init_params & params, const int n_simple) {
+        ctxs.reserve(n_simple);
+        for (int i = 0; i < n_simple; i++) {
+            ctxs.emplace_back(ggml_init(params));
+        }
+    }
+    ggml_backend_meta_simple_tensor_container() {}
+};
+
 struct ggml_backend_meta_buffer_context {
+    // FIXME
+    // Most tensors can simply be stored statically in their own buffer.
+    // Externally created views however also need a mapping to simple tensors but they use the buffer of the view source.
+    // If external views are simply using that buffer they will slowly deplete its memory.
+    // Current solution: rotating set of 2 "compute" containers to hold external views, works correctly for llama.cpp.
+    // Long-term: tie the lifetime of external views to the meta backend executing the graph instead,
+    //     currently not possible due to graph-external operations in the backend scheduler.
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute[2];
+    int stc_compute_index      = 0;
+    int stc_compute_index_next = 0;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    // FIXME
+    // The size of the split state cache is unbounded and can theoretically grow infinitely large.
+    // However, it is also expensive to build and clearing it on every rebuild in ggml_backend_meta_graph_compute is too expensive.
     static constexpr size_t nbtc = GGML_TENSOR_SIZE - sizeof(ggml_tensor::padding);
-
     std::map<std::pair<const ggml_tensor *, bool>, std::pair<ggml_backend_meta_split_state, char[nbtc]>> split_state_cache;
-    std::map<          const ggml_tensor *,        std::vector<ggml_tensor *>>                           simple_tensors;
-
-    struct buffer_config {
-        ggml_context          * ctx;
-        ggml_backend_buffer_t   buf;
-
-        buffer_config(ggml_context * ctx, ggml_backend_buffer_t buf) : ctx(ctx), buf(buf) {}
-    };
-    std::vector<buffer_config> buf_configs;
 
     int debug;
 
-    ggml_backend_meta_buffer_context() {
+    ggml_backend_meta_buffer_context(
+            ggml_backend_meta_simple_tensor_container & stc_static,
+            ggml_backend_meta_simple_tensor_container & stc_compute_0,
+            ggml_backend_meta_simple_tensor_container & stc_compute_1,
+            const std::vector<ggml_backend_buffer_t> & bufs)
+            : stc_static(std::move(stc_static)), stc_compute{std::move(stc_compute_0), std::move(stc_compute_1)} {
+        this->bufs.reserve(bufs.size());
+        for (ggml_backend_buffer_t buf : bufs) {
+            this->bufs.emplace_back(buf);
+        }
         const char * GGML_META_DEBUG = getenv("GGML_META_DEBUG");
         debug = GGML_META_DEBUG ? atoi(GGML_META_DEBUG) : 0;
     }
+
+    ggml_backend_meta_simple_tensor_container & get_simple_tensor_container(const ggml_tensor * tensor) {
+        if (stc_static.simple_tensors.find(tensor) != stc_static.simple_tensors.end()) {
+            return stc_static;
+        }
+        return stc_compute[stc_compute_index];
+    }
 };
 
 static void ggml_backend_meta_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    for (auto & [ctx, buf] : buf_ctx->buf_configs) {
-        ggml_backend_buffer_free(buf);
-        ggml_free(ctx);
-    }
     delete buf_ctx;
 }
 
 static size_t ggml_backend_meta_buffer_n_bufs(ggml_backend_buffer_t meta_buf) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    return buf_ctx->buf_configs.size();
+    return buf_ctx->bufs.size();
 }
 
 static ggml_backend_buffer_t ggml_backend_meta_buffer_simple_buffer(ggml_backend_buffer_t meta_buf, size_t index) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(meta_buf));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) meta_buf->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
-    return buf_ctx->buf_configs[index].buf;
+    GGML_ASSERT(index < buf_ctx->bufs.size());
+    return buf_ctx->bufs[index].get();
 }
 
 static struct ggml_tensor * ggml_backend_meta_buffer_simple_tensor(const struct ggml_tensor * tensor, size_t index) {
     GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
-    GGML_ASSERT(index < buf_ctx->buf_configs.size());
+    GGML_ASSERT(index < buf_ctx->bufs.size());
 
-    auto it = buf_ctx->simple_tensors.find(tensor);
-    if (it == buf_ctx->simple_tensors.end()) {
+    ggml_backend_meta_simple_tensor_container & stc = buf_ctx->get_simple_tensor_container(tensor);
+    auto it = stc.simple_tensors.find(tensor);
+    if (it == stc.simple_tensors.end()) {
         return nullptr;
     }
     return it->second[index];
 }
 
-static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync);
+
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(
+        ggml_backend_meta_simple_tensor_container & stc, const struct ggml_tensor * tensor, bool assume_sync) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
     ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
 
@@ -785,7 +822,7 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
                 src_ss[i] = {GGML_BACKEND_SPLIT_AXIS_UNKNOWN, {0}, 1};
                 continue;
             }
-            src_ss[i] = ggml_backend_meta_get_split_state(tensor->src[i], /*assume_sync =*/ true);
+            src_ss[i] = ggml_backend_meta_get_split_state(stc, tensor->src[i], /*assume_sync =*/ true);
             GGML_ASSERT(src_ss[i].axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
         }
 
@@ -1079,17 +1116,23 @@ static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(co
     return ret;
 }
 
+static struct ggml_backend_meta_split_state ggml_backend_meta_get_split_state(const struct ggml_tensor * tensor, bool assume_sync) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    return ggml_backend_meta_get_split_state(buf_ctx->get_simple_tensor_container(tensor), tensor, assume_sync);
+}
+
 static void * ggml_backend_meta_buffer_get_base(ggml_backend_buffer_t buffer) {
     GGML_UNUSED(buffer);
     return (void *) 0x1000000000000000; // FIXME
 }
 
-static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
-    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
-    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
+static enum ggml_status ggml_backend_meta_buffer_init_tensor_impl(ggml_backend_meta_simple_tensor_container & stc, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(tensor->buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) tensor->buffer->context;
+    const size_t n_simple_bufs = ggml_backend_meta_buffer_n_bufs(tensor->buffer);
 
-    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(tensor, /*assume_sync =*/ true);
+    const ggml_backend_meta_split_state split_state = ggml_backend_meta_get_split_state(stc, tensor, /*assume_sync =*/ true);
     GGML_ASSERT(ggml_nelements(tensor) == 0 || split_state.axis != GGML_BACKEND_SPLIT_AXIS_UNKNOWN);
     GGML_ASSERT(split_state.n_segments <= 16);
 
@@ -1104,8 +1147,8 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
     std::vector<ggml_tensor *> simple_tensors;
     simple_tensors.reserve(n_simple_bufs);
     for (size_t j = 0; j < n_simple_bufs; j++) {
-        ggml_context          * simple_ctx = buf_ctx->buf_configs[j].ctx;
-        ggml_backend_buffer_t   simple_buf = buf_ctx->buf_configs[j].buf;
+        ggml_context          * simple_ctx = stc.ctxs[j].get();
+        ggml_backend_buffer_t   simple_buf = buf_ctx->bufs[j].get();
 
         if (split_dim >= 0 && split_dim < GGML_MAX_DIMS) {
             // TODO: the following assert fails for llama-parallel even though the results are correct:
@@ -1158,7 +1201,7 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
             t_ij->data = (char *) t_ij->view_src->data + t_ij->view_offs;
         } else if (simple_buf != nullptr) {
             t_ij->data = (char *) ggml_backend_buffer_get_base(simple_buf)
-                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(buffer));
+                + size_t(tensor->data) - size_t(ggml_backend_buffer_get_base(tensor->buffer));
         }
         t_ij->extra = tensor->extra;
         for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -1194,11 +1237,18 @@ static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer
         }
     }
 
-    buf_ctx->simple_tensors[tensor] = simple_tensors;
+    stc.simple_tensors[tensor] = simple_tensors;
 
     return GGML_STATUS_SUCCESS;
 }
 
+static enum ggml_status ggml_backend_meta_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    buf_ctx->stc_compute_index = buf_ctx->stc_compute_index_next;
+    return ggml_backend_meta_buffer_init_tensor_impl(buf_ctx->get_simple_tensor_container(tensor), tensor);
+}
+
 static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     const size_t n_bufs = ggml_backend_meta_buffer_n_bufs(buffer);
     GGML_ASSERT(ggml_is_contiguous(tensor));
@@ -1413,8 +1463,9 @@ static void ggml_backend_meta_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static void ggml_backend_meta_buffer_reset(ggml_backend_buffer_t buffer) {
-    const size_t n_buffers = ggml_backend_meta_buffer_n_bufs(buffer);
-    for (size_t i = 0; i < n_buffers; i++) {
+    GGML_ASSERT(ggml_backend_buffer_is_meta(buffer));
+    ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buffer->context;
+    for (size_t i = 0; i < buf_ctx->bufs.size(); i++) {
         ggml_backend_buffer_reset(ggml_backend_meta_buffer_simple_buffer(buffer, i));
     }
 }
@@ -1440,21 +1491,24 @@ bool ggml_backend_buffer_is_meta(ggml_backend_buffer_t buf) {
 static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    const ggml_init_params params = {
+        /*.mem_size   =*/ 1024*1024*ggml_tensor_overhead(), // FIXME
         /*.mem_buffer =*/ nullptr,
         /*.no_alloc   =*/ true,
     };
+    ggml_backend_meta_simple_tensor_container stc_static;
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params, n_simple_bufts);
 
-    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context();
     size_t max_size = 0;
-    buf_ctx->buf_configs.reserve(n_simple_bufts);
+    std::vector<ggml_backend_buffer_t> bufs;
+    bufs.reserve(n_simple_bufts);
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        ggml_backend_buffer_t simple_buf = ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size);
-        GGML_ASSERT(simple_buf != nullptr);
-        max_size = std::max(max_size, ggml_backend_buffer_get_size(simple_buf));
-        buf_ctx->buf_configs.emplace_back(ggml_init(params), simple_buf);
+        bufs.push_back(ggml_backend_buft_alloc_buffer(ggml_backend_meta_buft_simple_buft(buft, i), size));
+        GGML_ASSERT(bufs.back() != nullptr);
+        max_size = std::max(max_size, ggml_backend_buffer_get_size(bufs.back()));
     }
+    ggml_backend_meta_buffer_context * buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
 
     return ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, buf_ctx, max_size);
 }
@@ -1462,26 +1516,32 @@ static ggml_backend_buffer_t ggml_backend_meta_buffer_type_alloc_buffer(ggml_bac
 struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
     const size_t n_simple_bufts = ggml_backend_meta_buft_n_bufts(buft);
 
-    ggml_init_params params = {
-        /*.mem_size   =*/ 1024*1024*1024, // FIXME
+    constexpr size_t compute_headroom = 16; // Maximum number of views per statically allocated tensor that can be created between evals.
+    const ggml_init_params params_static = {
+        /*.mem_size   =*/ ggml_get_mem_size(ctx),
         /*.mem_buffer =*/ nullptr,
         /*.no_alloc   =*/ true,
     };
+    const ggml_init_params params_compute = {
+        /*.mem_size   =*/ compute_headroom*ggml_get_mem_size(ctx),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+    ggml_backend_meta_simple_tensor_container stc_static   (params_static,  n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_0(params_compute, n_simple_bufts);
+    ggml_backend_meta_simple_tensor_container stc_compute_1(params_compute, n_simple_bufts);
 
-    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context();
-    meta_buf_ctx->buf_configs.reserve(n_simple_bufts);
-    for (size_t i = 0; i < n_simple_bufts; i++) {
-        meta_buf_ctx->buf_configs.emplace_back(ggml_init(params), nullptr);
-    }
+    std::vector<ggml_backend_buffer_t> bufs(n_simple_bufts, nullptr);
+    ggml_backend_meta_buffer_context * meta_buf_ctx = new ggml_backend_meta_buffer_context(stc_static, stc_compute_0, stc_compute_1, bufs);
 
     ggml_backend_buffer_t meta_buf = ggml_backend_buffer_init(buft, ggml_backend_meta_buffer_iface, meta_buf_ctx, 0);
     for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
         t->buffer = meta_buf;
-        ggml_backend_meta_buffer_init_tensor(meta_buf, t);
+        ggml_backend_meta_buffer_init_tensor_impl(meta_buf_ctx->stc_static, t);
         t->data = (void *) 0x2000000000000000; // FIXME
     }
     for (size_t i = 0; i < n_simple_bufts; i++) {
-        ggml_context * ctx = meta_buf_ctx->buf_configs[i].ctx;
+        ggml_context * ctx = meta_buf_ctx->stc_static.ctxs[i].get();
         ggml_backend_buffer_type_t simple_buft = ggml_backend_meta_buft_simple_buft(buft, i);
 
         // If a ggml_context only has zero-sized tensors, ggml_backend_alloc_ctx_tensors_from_buft returns NULL.
@@ -1494,15 +1554,15 @@ struct ggml_backend_buffer * ggml_backend_meta_alloc_ctx_tensors_from_buft(struc
             }
         }
         if (any_nonzero_slice) {
-            meta_buf_ctx->buf_configs[i].buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft);
+            meta_buf_ctx->bufs[i].reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx, simple_buft));
         } else {
-            meta_buf_ctx->buf_configs[i].buf = ggml_backend_buft_alloc_buffer(simple_buft, 0);
+            meta_buf_ctx->bufs[i].reset(ggml_backend_buft_alloc_buffer(simple_buft, 0));
             for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-                t->buffer = meta_buf_ctx->buf_configs[i].buf;
+                t->buffer = meta_buf_ctx->bufs[i].get();
             }
         }
-        GGML_ASSERT(meta_buf_ctx->buf_configs[i].buf != nullptr);
-        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->buf_configs[i].buf));
+        GGML_ASSERT(meta_buf_ctx->bufs[i]);
+        meta_buf->size = std::max(meta_buf->size, ggml_backend_buffer_get_size(meta_buf_ctx->bufs[i].get()));
     }
     return meta_buf;
 }
@@ -1724,6 +1784,26 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
     }
 
     if (needs_rebuild) {
+        std::set<ggml_backend_buffer_t> used_buffers;
+        for (int i = 0; i < cgraph->n_leafs; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->leafs[i]->buffer)) {
+                used_buffers.emplace(cgraph->leafs[i]->buffer);
+            }
+        }
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (ggml_backend_buffer_is_meta(cgraph->nodes[i]->buffer)) {
+                used_buffers.emplace(cgraph->nodes[i]->buffer);
+            }
+        }
+        for (ggml_backend_buffer_t buf : used_buffers) {
+            ggml_backend_meta_buffer_context * buf_ctx = (ggml_backend_meta_buffer_context *) buf->context;
+            buf_ctx->stc_compute_index_next = buf_ctx->stc_compute_index ^ 1;
+            ggml_backend_meta_simple_tensor_container & stc = buf_ctx->stc_compute[buf_ctx->stc_compute_index_next];
+            for (ggml_context_ptr & ctx : stc.ctxs) {
+                ggml_reset(ctx.get());
+            }
+            stc.simple_tensors.clear();
+        }
         size_t n_subgraphs  = 0;
         size_t max_tmp_size = 0;
 
@@ -1909,7 +1989,7 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
             const size_t mem_per_device_graphs_main = backend_ctx->max_subgraphs*ggml_graph_overhead_custom(backend_ctx->max_nnodes, cgraph->grads);
             const size_t mem_per_device_graphs_aux = n_cgraphs_per_device*backend_ctx->max_subgraphs*ggml_graph_overhead_custom(1, cgraph->grads);
             const size_t mem_per_device_nodes_aux = n_nodes_per_device*backend_ctx->max_subgraphs*ggml_tensor_overhead();
-            ggml_init_params params = {
+            const ggml_init_params params = {
                 /*.mem_size   =*/ n_backends * (mem_per_device_graphs_main + mem_per_device_graphs_aux + mem_per_device_nodes_aux),
                 /*.mem_buffer =*/ nullptr,
                 /*.no_alloc   =*/ true,

From fa9704152416df37cd28b308a640378e1414ea8d Mon Sep 17 00:00:00 2001
From: Dev-X25874 <283057883+Dev-X25874@users.noreply.github.com>
Date: Thu, 21 May 2026 17:28:08 +0530
Subject: [PATCH 16/21] ggml-alloc: fix out-of-bounds read in
 ggml_dyn_tallocr_remove_block (ggml/1492)

---
 ggml/src/ggml-alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index a4b01ccf8a1..3bda9abbe03 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -150,7 +150,7 @@ static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t o
 
 static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
     // shift all elements after idx by 1 to the left, overwriting the element at idx
-    for (int i = idx; i < chunk->n_free_blocks; i++) {
+    for (int i = idx; i < chunk->n_free_blocks - 1; i++) {
         chunk->free_blocks[i] = chunk->free_blocks[i+1];
     }
     chunk->n_free_blocks--;

From b251f74f496950e88abc70db40530f85ed83f53f Mon Sep 17 00:00:00 2001
From: Ori Pekelman <ori@pekelman.com>
Date: Thu, 21 May 2026 12:00:16 +0000
Subject: [PATCH 17/21] ggml.h: correct ggml_silu_back arg docstring (a=dy,
 b=x) (ggml/1500)

---
 ggml/include/ggml.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef..f6725265504 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1189,8 +1189,8 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
-    // a - x
-    // b - dy
+    // a - dy
+    // b - x
     GGML_API struct ggml_tensor * ggml_silu_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

From ce5890b5f7d88fe3408398dfbbada00aec03d352 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 May 2026 12:13:21 +0300
Subject: [PATCH 18/21] ggml : bump version to 0.12.1 (ggml/1508)

---
 ggml/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 4aac5094d1c..03020888f97 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 12)
-set(GGML_VERSION_PATCH 0)
+set(GGML_VERSION_PATCH 1)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

From 22307b3e8b96179a195956056e37b872eb6df475 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 May 2026 12:33:22 +0300
Subject: [PATCH 19/21] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 0fa47782fd9..68a145ca643 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-0ce7ad348a3151e1da9f65d962044546bcaad421
+8fb5f5f931ce615d064aef2183edf618a677bff6

From 45158f460ee1eca215e2dd60c63bc6422d432bf1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 May 2026 12:40:17 +0300
Subject: [PATCH 20/21] ggml : bump version to 0.13.0 (ggml/1510)

---
 ggml/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 03020888f97..f542f18b6d4 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -4,8 +4,8 @@ project("ggml" C CXX ASM)
 
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 12)
-set(GGML_VERSION_PATCH 1)
+set(GGML_VERSION_MINOR 13)
+set(GGML_VERSION_PATCH 0)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

From d161ea707192f9510288561104a8caf8e1829edf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 25 May 2026 12:42:28 +0300
Subject: [PATCH 21/21] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 68a145ca643..a4f87b2b9ae 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-8fb5f5f931ce615d064aef2183edf618a677bff6
+e705c5fed490514458bdd2eaddc43bd098fcce9b