diff --git a/miles/utils/chat_template_utils/tito_tokenizer.py b/miles/utils/chat_template_utils/tito_tokenizer.py
index 837be090e3..544c499bea 100644
--- a/miles/utils/chat_template_utils/tito_tokenizer.py
+++ b/miles/utils/chat_template_utils/tito_tokenizer.py
@@ -340,25 +340,90 @@ def merge_tokens(
 
 
 # ---------------------------------------------------------------------------
-# K2V3 family implementation
+# K2V3 family — current (IFM) chat template
 # ---------------------------------------------------------------------------
 
 
 class K2V3TITOTokenizer(TITOTokenizer):
-    """K2V3 family.
+    r"""K2V3 family with the IFM-style chat template (introduced 2026-06-01).
 
-    The chat template emits ``<|im_end|>\\n`` after every message (jinja
-    block whitespace between ``{{- '<|im_end|>' }}`` and the next block
-    is preserved by default ``trim_blocks``), but the model
-    autoregressively stops at ``<|im_end|>`` without generating the
-    trailing ``\\n``. ``merge_tokens`` inserts the missing newline so the
-    pretokenized buffer matches the canonical template output.
+    The current K2V3 chat template (``bbq-0601`` / ``bbq-8b-mid3_v3`` and
+    later) namespaces ChatML tokens as ``<|ifm|im_start|>`` /
+    ``<|ifm|im_end|>`` and emits NO whitespace between
+    ``<|ifm|im_end|>`` and the next ``<|ifm|im_start|>``. The model
+    autoregressively stops at ``<|ifm|im_end|>`` with no trailing byte;
+    the rollout buffer already matches the canonical template render
+    exactly. ``merge_tokens`` therefore needs no boundary fix — it
+    inherits the base ``TITOTokenizer`` concat behavior.
+
+    Empirical sanity check::
+
+        apply_chat_template([user, asst, user], tokenize=False)
+        → '...A1<|ifm|im_end|><|ifm|im_start|>user\n...'
+                              ^^ no \n between messages
+
+    For legacy K2V3 checkpoints (``bbq-8b-mid3-final`` and earlier) whose
+    chat template uses ``<|im_end|>\n`` between messages, use
+    :class:`K2V3OldBackupTITOTokenizer` (``--tito-model k2v3_oldbackup``)
+    instead.
+    """
+
+    _default_assistant_start_str: str = "<|ifm|im_start|>assistant"
+
+    def __init__(
+        self,
+        tokenizer: Any,
+        chat_template_kwargs: dict[str, Any] | None = None,
+        assistant_start_str: str | None = None,
+        allowed_append_roles: list[str] | None = None,
+    ):
+        super().__init__(
+            tokenizer,
+            chat_template_kwargs,
+            assistant_start_str or self._default_assistant_start_str,
+            allowed_append_roles=allowed_append_roles,
+        )
+        # Fail fast on misconfiguration: raise ValueError when loaded on a
+        # legacy K2V3 checkpoint whose vocab does not have <|ifm|im_end|>.
+        ifm_end_id = tokenizer.convert_tokens_to_ids("<|ifm|im_end|>")
+        unk_id = getattr(tokenizer, "unk_token_id", None)
+        if ifm_end_id is None or ifm_end_id == unk_id:
+            raise ValueError(
+                "K2V3TITOTokenizer (current/IFM chat template) requires "
+                "<|ifm|im_end|> in the tokenizer vocab. The loaded "
+                "tokenizer does not have this token, suggesting you are "
+                "on a legacy K2V3 checkpoint. Use --tito-model "
+                "k2v3_oldbackup for those."
+            )
+        self._im_end_id: int = ifm_end_id
+        self.trailing_token_ids = frozenset({ifm_end_id})
+
+
+# ---------------------------------------------------------------------------
+# K2V3 family — legacy (<|im_end|>\n) chat template
+# ---------------------------------------------------------------------------
+
+
+class K2V3OldBackupTITOTokenizer(TITOTokenizer):
+    r"""K2V3 family with the LEGACY chat template (``<|im_end|>\n``).
+
+    Use this with legacy K2V3 checkpoints (``bbq-8b-mid3-final`` and
+    earlier) whose chat template emits ``<|im_end|>\n`` after every
+    message (the jinja whitespace between ``{{- '<|im_end|>' }}`` and
+    the next block survives the default ``trim_blocks`` setting), but where
+    the model autoregressively stops at ``<|im_end|>`` without producing
+    the trailing ``\n``. ``merge_tokens`` inserts the missing newline so
+    the pretokenized buffer matches the canonical template output.
 
     Empirical sanity check::
 
         apply_chat_template([user, assistant, user], tokenize=False)
-        → '...hello<|im_end|>\\n<|im_start|>user\\n...'
+        → '...hello<|im_end|>\n<|im_start|>user\n...'
                           ^^
+
+    For current K2V3 checkpoints (``bbq-8b-mid3_v3`` and later) whose
+    template uses ``<|ifm|im_end|>`` with no trailing ``\n``, use
+    :class:`K2V3TITOTokenizer` (``--tito-model k2v3``) instead.
     """
 
     _default_assistant_start_str: str = "<|im_start|>assistant"
@@ -376,10 +441,22 @@ def __init__(
             assistant_start_str or self._default_assistant_start_str,
             allowed_append_roles=allowed_append_roles,
         )
+        # Fail fast on misconfiguration: raise ValueError when loaded on a
+        # current K2V3 checkpoint whose vocab does not have <|im_end|>.
+        im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+        unk_id = getattr(tokenizer, "unk_token_id", None)
+        if im_end_id is None or im_end_id == unk_id:
+            raise ValueError(
+                "K2V3OldBackupTITOTokenizer (legacy chat template) "
+                "requires <|im_end|> in the tokenizer vocab. The loaded "
+                "tokenizer does not have this token, suggesting you are "
+                "on a current K2V3 checkpoint that uses the IFM template. "
+                "Use --tito-model k2v3 for those."
+            )
         nl_ids = tokenizer.encode("\n", add_special_tokens=False)
         assert len(nl_ids) == 1, f"Expected single newline token, got {nl_ids}"
         self._newline_id: int = nl_ids[0]
-        self._im_end_id: int = tokenizer.convert_tokens_to_ids("<|im_end|>")
+        self._im_end_id: int = im_end_id
         self.trailing_token_ids = frozenset({self._newline_id})
 
     def merge_tokens(
@@ -406,6 +483,7 @@ class TITOTokenizerType(str, Enum):
     QWEN3 = "qwen3"
     GLM47 = "glm47"
     K2V3 = "k2v3"
+    K2V3_OLDBACKUP = "k2v3_oldbackup"
 
 
 _TOKENIZER_REGISTRY: dict[TITOTokenizerType, type[TITOTokenizer]] = {
@@ -413,6 +491,7 @@ class TITOTokenizerType(str, Enum):
     TITOTokenizerType.QWEN3: Qwen3TITOTokenizer,
     TITOTokenizerType.GLM47: GLM47TITOTokenizer,
     TITOTokenizerType.K2V3: K2V3TITOTokenizer,
+    TITOTokenizerType.K2V3_OLDBACKUP: K2V3OldBackupTITOTokenizer,
 }
 
 
diff --git a/tests/fast/utils/chat_template_utils/test_tito_k2v3.py b/tests/fast/utils/chat_template_utils/test_tito_k2v3.py
index 35e5ece3c3..a6c6cd4f78 100644
--- a/tests/fast/utils/chat_template_utils/test_tito_k2v3.py
+++ b/tests/fast/utils/chat_template_utils/test_tito_k2v3.py
@@ -1,19 +1,36 @@
-"""TITO contract tests for the K2V3 family.
+r"""TITO contract tests for the K2V3 family — current IFM chat template.
+
+This file targets ``K2V3TITOTokenizer`` (``--tito-model k2v3``), used for
+current K2V3 checkpoints (``bbq-8b-mid3_v3`` and later) whose chat
+template is the IFM-style ``bbq-0601`` template:
+
+  - ChatML tokens are namespaced as ``<|ifm|im_start|>`` / ``<|ifm|im_end|>``
+  - NO whitespace is emitted between ``<|ifm|im_end|>`` and the next
+    ``<|ifm|im_start|>`` (the model autoregressively stops at
+    ``<|ifm|im_end|>`` and the rollout buffer already matches the
+    canonical template render)
+  - Assistant messages REQUIRE a thinking field
+    (``think`` / ``think_fast`` / ``think_faster`` / ``reasoning`` /
+    ``reasoning_content``) — the template raises otherwise
+
+For legacy K2V3 checkpoints (``bbq-8b-mid3-final`` and earlier) using the
+``<|im_end|>\n`` template, see ``test_tito_k2v3_oldbackup.py``.
 
 Coverage contract — this file protects these invariants:
 
-  (I1) K2V3 canonical chat template renders ``<|im_end|>\\n`` after every
-       message (the trailing ``\\n`` comes from jinja block whitespace).
-  (I2) Realistic rollout buffers can end at ``<|im_end|>`` WITHOUT the
-       trailing ``\\n`` — the model stops at ``<|im_end|>`` on
-       autoregressive emission.
-  (I3) ``K2V3TITOTokenizer.merge_tokens`` inserts the missing ``\\n``
-       when ``prefix[-1] == <|im_end|>``, so the merged buffer matches
-       canonical render.
+  (I1) Current K2V3 (IFM) chat template emits ``<|ifm|im_end|>`` with NO
+       trailing whitespace between messages.
+  (I2) Realistic rollout buffers end at ``<|ifm|im_end|>`` (matches
+       canonical render token-for-token — no fix required).
+  (I3) ``K2V3TITOTokenizer.merge_tokens`` does NOT insert any boundary
+       tokens (regression guard: prevents reintroducing the legacy
+       ``\n`` fix that would break bit-identity here).
   (I4) Appended env messages (tool / user / system) round-trip through
        ``merge_tokens`` and still match the canonical render — across
        both realistic single-turn buffers and multi-turn parser-driven
        session histories.
+  (I5) Init-time refusal: instantiating ``K2V3TITOTokenizer`` on a
+       legacy checkpoint (no ``<|ifm|im_end|>`` token) raises ``ValueError``.
 
 The file is split into three banner-marked sections:
 
@@ -37,16 +54,17 @@
             — runtime defense (``update_pretokenized_state``'s prefix
             check) is alive
       * ``test_k2v3_subclass_is_wired``
-            — registry returns the K2V3 subclass, not the base
-
-Why this file exists separately from ``test_tito_tokenizer_model_matrix.py``:
-that file builds ``pretokenized`` via ``apply_chat_template(..., add_generation_prompt=False)``,
-which already contains the trailing ``\\n``, so the boundary fix path
-never fires and the test passes whether the fix exists or not. This file
-routes through ``update_pretokenized_state`` instead, producing the
-realistic ``prefix[-1] == <|im_end|>`` state that the fix exists for.
+            — registry returns ``K2V3TITOTokenizer``, not the base or
+            the legacy class
+      * ``test_k2v3_init_rejects_legacy_checkpoint``
+            — I5: init raises ValueError when loaded on a tokenizer
+            whose vocab lacks ``<|ifm|im_end|>``
 
 Skips at module level if the K2V3 checkpoint is not on this host.
+
+NOTE: production training on this IFM checkpoint also requires
+IFM-compatible SGLang parsers (see LLM360/sglang#33) — those are
+orthogonal to TITO correctness but mandatory for the rollout path.
 """
 
 from __future__ import annotations
@@ -79,32 +97,26 @@
 
 K2V3_MODEL_PATH = os.environ.get(
     "TITO_TEST_MODEL_PATH_K2V3",
-    "/mnt/weka/shrd/k2m/suqi.sun/bbq_image/bbq-8b-mid3-final",
+    "/mnt/weka/shrd/k2m/suqi.sun/bbq_image/bbq-8b-mid3_v3/checkpoint_0005500",
 )
 _ALLOWED_APPEND_ROLES = ["tool", "user", "system"]
 
-# K2V3 chat template's generation prompt depends on reasoning_effort
-# (high → <think>, medium → <think_fast>, low → <think_faster>). Production
-# runs with high effort; pinning here so test is deterministic regardless
-# of any future template-default change. Override via env if needed.
+# K2V3 IFM template's generation prompt depends on reasoning_effort
+# (high → <ifm|think>, medium → <ifm|think_fast>, low → <ifm|think_faster>).
+# The IFM template REQUIRES a valid reasoning_effort value — raises on
+# anything outside {high, medium, low}. Production runs with high effort.
 _K2V3_REASONING_EFFORT = os.environ.get("TITO_TEST_REASONING_EFFORT_K2V3", "high")
 _K2V3_CHAT_TEMPLATE_KWARGS = {"reasoning_effort": _K2V3_REASONING_EFFORT}
 
-# Per-K2V3 SGLang parser names. Defaults match the K2V3 production
-# config:
-#   SGLANG_TOOL_PARSER=hermes
-#   SGLANG_REASONING_PARSER=deepseek-r1
-# Both rely on `<think>...</think>` (deepseek-r1) and the hermes
-# `<tool_call>\n{json}\n</tool_call>` shape that K2V3's chat template emits.
+# Per-K2V3 SGLang parser names for the IFM tokens (<ifm|tool_call>,
+# <ifm|think>, etc.). Defaults match the K2V3 IFM production config (see
+# LLM360/sglang#33 for the IFM-compatible parser implementations).
 #
-# Older SGLang builds may register `hermes` under a different name (e.g.
-# the qwen25 detector handles the same shape). Override via env in those
-# environments — e.g. ``TITO_TEST_TOOL_PARSER_K2V3=qwen25``. If the
-# configured parser is not registered in this SGLang build, the parser
-# round-trip test skips with an explicit reason rather than silently
-# turning green.
-_K2V3_TOOL_PARSER = os.environ.get("TITO_TEST_TOOL_PARSER_K2V3", "hermes")
-_K2V3_REASONING_PARSER = os.environ.get("TITO_TEST_REASONING_PARSER_K2V3", "deepseek-r1")
+# If the configured parser is not registered in this SGLang build, the
+# parser round-trip tests skip with an explicit reason rather than
+# silently turning green.
+_K2V3_TOOL_PARSER = os.environ.get("TITO_TEST_TOOL_PARSER_K2V3", "k2_v3")
+_K2V3_REASONING_PARSER = os.environ.get("TITO_TEST_REASONING_PARSER_K2V3", "k2_v3")
 
 
 @pytest.fixture(scope="module")
@@ -158,24 +170,31 @@ class _Synthesized:
     return _Synthesized
 
 
-# Native + synthetic-thinking-injected trajectories. Each entry exercises a
-# distinct rollout shape; the thinking variants additionally trigger the
-# K2V3 chat template's reasoning-block path (<|im_start|>assistant\n<think>\n
-# ... </think>\ncontent<|im_end|>).
+# All assistant messages in this file's trajectories carry a thinking
+# field: the IFM chat template raises if an assistant message lacks one
+# of {think, think_fast, think_faster, reasoning, reasoning_content}.
+# Trajectories that don't natively carry thinking are wrapped via
+# ``_with_synthetic_thinking`` to inject ``reasoning_content`` on each
+# assistant turn before rendering. Native thinking trajectories are used
+# as-is (they already carry per-message reasoning content).
 CONVERSATIONS: list[tuple[str, type]] = [
     # Single assistant turn — single tool call.
-    ("single_tool", SingleToolTrajectory),
+    ("single_tool", _with_synthetic_thinking(SingleToolTrajectory)),
     ("single_tool_thinking", SingleToolThinkingTrajectory),
     # Multiple assistant turns — single tool call per turn.
-    ("multi_turn", MultiTurnTrajectory),
+    ("multi_turn", _with_synthetic_thinking(MultiTurnTrajectory)),
     ("multi_turn_thinking", MultiTurnThinkingTrajectory),
     # Single assistant turn — multiple parallel tool calls.
-    ("multi_tool_single_turn", MultiToolSingleTurnTrajectory),
-    # No native thinking variant exists for parallel-tools-single-turn;
-    # synthesize by injecting reasoning_content into the assistant turn.
-    ("multi_tool_single_turn_thinking", _with_synthetic_thinking(MultiToolSingleTurnTrajectory)),
+    ("multi_tool_single_turn", _with_synthetic_thinking(MultiToolSingleTurnTrajectory)),
+    # Native thinking variant doesn't exist for parallel-tools-single-turn;
+    # synthesize a second distinct shape via the same wrapper with a
+    # different reasoning string.
+    (
+        "multi_tool_single_turn_thinking",
+        _with_synthetic_thinking(MultiToolSingleTurnTrajectory, reasoning="Planning the parallel tool calls."),
+    ),
     # Multiple assistant turns AND tool calls (chain shape).
-    ("multi_tool_multi_turn", LongChainTrajectory),
+    ("multi_tool_multi_turn", _with_synthetic_thinking(LongChainTrajectory)),
     ("multi_tool_multi_turn_thinking", LongChainThinkingTrajectory),
 ]
 
@@ -240,17 +259,17 @@ def _realistic_emit_ids(
     tools: list[dict] | None,
     tokenizer: AutoTokenizer,
 ) -> list[int]:
-    """Synthesize completion_token_ids that mirror SGLang's autoregressive emit.
+    r"""Synthesize completion_token_ids that mirror SGLang's autoregressive emit.
 
     The model emits starting from inside the assistant generation prompt
-    and stops at ``<|im_end|>`` (no trailing ``\\n``). We compute this by
+    and stops at ``<|ifm|im_end|>`` (no trailing ``\n``). We compute this by
     diffing two chat-template renders:
 
         full   = render(request + [assistant], add_generation_prompt=False)
         prompt = render(request,               add_generation_prompt=True)
         emit_text = full[len(prompt):]                # what model would emit
-        emit_text = emit_text.rstrip("\\n")          # strip jinja's trailing \\n
-        assert emit_text.endswith("<|im_end|>")
+        emit_text = emit_text.rstrip("\n")           # no-op for IFM; safety net only
+        assert emit_text.endswith("<|ifm|im_end|>")
         emit_ids = tokenizer.encode(emit_text)
     """
     full_text = _render_text(
@@ -271,11 +290,12 @@ def _realistic_emit_ids(
     )
     emit_text = full_text[len(prompt_text) :]
     # Strip the trailing newline(s) the jinja whitespace adds after
-    # `<|im_end|>`. The model autoregressively stops at the stop token
-    # without producing them.
+    # `<|ifm|im_end|>`. The model autoregressively stops at the stop token
+    # without producing trailing whitespace. (The IFM template emits no
+    # newline between messages; the rstrip is a no-op safety net.)
     emit_text_stop = emit_text.rstrip("\n")
-    assert emit_text_stop.endswith("<|im_end|>"), (
-        f"unexpected emit_text shape (does not end with <|im_end|>): " f"{emit_text_stop!r}"
+    assert emit_text_stop.endswith("<|ifm|im_end|>"), (
+        f"unexpected emit_text shape (does not end with <|ifm|im_end|>): " f"{emit_text_stop!r}"
     )
     return list(tokenizer.encode(emit_text_stop, add_special_tokens=False))
 
@@ -338,18 +358,20 @@ def _drive_session_through_trajectory(
     ids=lambda x: x if isinstance(x, str) else None,
 )
 def test_buffer_matches_canonical_under_realistic_rollout(name, trajectory_cls, tito_tok):
-    """Invariants I1+I2+I3: rollout buffer ending at ``<|im_end|>`` (no
-    trailing ``\\n``) merges back to canonical chat-template render.
+    r"""Invariants I1+I2+I3: rollout buffer ending at ``<|ifm|im_end|>``
+    matches canonical chat-template render under pure concat (no
+    boundary fix needed).
 
     Phase 1 compares the finalized session buffer to canonical. Phase 2
     appends a synthetic tool follow-up so ``merge_tokens`` runs against
-    a buffer whose last token is ``<|im_end|>`` even on single-turn
-    trajectories (defeats ``trim_trailing_ids`` shielding that would
-    otherwise hide a missing boundary fix).
-
-    ``ASSISTANT_TEXT`` mismatches are tolerated (BPE-merge noise,
-    non-severe by the comparator); ``SPECIAL_TOKEN_*`` and
-    ``NON_ASSISTANT_TEXT`` mismatches fail the test.
+    a buffer whose last token is ``<|ifm|im_end|>`` mid-sequence — a
+    regression guard against anyone reintroducing the legacy ``\n``
+    fix (which would inject a spurious byte here).
+
+    ``ASSISTANT_TEXT`` mismatches are tolerated (BPE-merge noise +
+    parser whitespace, non-severe by the comparator);
+    ``SPECIAL_TOKEN_*`` and ``NON_ASSISTANT_TEXT`` mismatches fail
+    the test.
     """
     messages = deepcopy(trajectory_cls.MESSAGES)
     tools = deepcopy(getattr(trajectory_cls, "TOOLS", None))
@@ -360,9 +382,9 @@ def test_buffer_matches_canonical_under_realistic_rollout(name, trajectory_cls,
     comparator = tito_tok.create_comparator()
 
     # Phase 1 — finalized buffer vs canonical (covers structural drift in the
-    # whole trajectory, but the comparator's ``trim_trailing_ids`` hides
-    # end-of-sequence ``<|im_end|>`` vs ``<|im_end|>\\n`` differences if the
-    # trajectory has only ONE assistant turn).
+    # whole trajectory). For the IFM template there's no trailing-newline
+    # difference between buffer end-state and canonical render, so this phase
+    # is a pure correctness check rather than relying on ``trim_trailing_ids``.
     expected_final = _render_ids(
         session.messages,
         tito_tok.tokenizer,
@@ -388,9 +410,9 @@ def test_buffer_matches_canonical_under_realistic_rollout(name, trajectory_cls,
     # trajectories: simulate a NEXT-turn env append by calling
     # ``prepare_pretokenized`` with one extra ``tool`` message. This triggers
     # ``tito_tok.merge_tokens(...)`` against a buffer whose last token is
-    # ``<|im_end|>`` (the model's autoregressive stop), which is the
+    # ``<|ifm|im_end|>`` (the model's autoregressive stop), which is the
     # production state the boundary fix exists for. The follow-up moves the
-    # ``<|im_end|>`` from end-of-sequence to mid-sequence, defeating
+    # ``<|ifm|im_end|>`` from end-of-sequence to mid-sequence, defeating
     # ``trim_trailing_ids`` and surfacing missing-fix bugs that phase 1
     # would hide.
     follow_up = {"role": "tool", "content": "[test] synthetic follow-up env"}
@@ -424,12 +446,11 @@ def test_buffer_matches_canonical_under_realistic_rollout(name, trajectory_cls,
 
 
 # ---------------------------------------------------------------------------
-# (Section A cont.) Append-case test — mirrors the breadth of
-# ``test_tito_tokenizer_model_matrix.py`` but routes through
-# ``update_pretokenized_state`` so the buffer used for ``merge_tokens`` has
-# the realistic ``<|im_end|>``-end shape (defeats the comparator's
-# ``trim_trailing_ids`` shielding that hides missing-fix bugs in the
-# model_matrix variant).
+# (Section A cont.) Append-case test — drives every (trajectory shape x env
+# append shape) combination through ``merge_tokens`` against a realistic
+# ``<|ifm|im_end|>``-terminated buffer. Catches both missing-bit-identity
+# bugs in merge_tokens itself and any spurious boundary tokens accidentally
+# reintroduced from the legacy implementation.
 # ---------------------------------------------------------------------------
 
 
@@ -502,7 +523,7 @@ class _EnvAppendShape:
 )
 def test_append_via_realistic_buffer(traj_name, traj_cls, env_shape, tito_tok):
     """Invariants I3+I4 (core): ``merge_tokens`` against a realistic
-    ``<|im_end|>``-terminated buffer matches canonical render, for the
+    ``<|ifm|im_end|>``-terminated buffer matches canonical render, for the
     cross-product of trajectory shape × env append shape.
 
     8 trajectories × 4 env shapes = 32 ``merge_tokens`` contexts —
@@ -523,7 +544,7 @@ def test_append_via_realistic_buffer(traj_name, traj_cls, env_shape, tito_tok):
     pretokenized_buffer = list(session.token_ids)
     assert pretokenized_buffer and pretokenized_buffer[-1] == tito_tok._im_end_id, (
         f"K2V3 [{traj_name} + {env_shape.name}] setup error: pretokenized "
-        f"buffer should end at <|im_end|> after drive, got last token "
+        f"buffer should end at <|ifm|im_end|> after drive, got last token "
         f"{pretokenized_buffer[-1] if pretokenized_buffer else 'EMPTY'}"
     )
 
@@ -664,11 +685,11 @@ def test_chat_template_round_trip_through_real_sglang_parsers(traj_name, traj_cl
     parser shape (plain / + tool_calls / + reasoning / + parallel
     tool_calls) gets exercised.
 
-    ``ASSISTANT_TEXT`` mismatches are tolerated — the ``deepseek-r1``
-    parser does not ``rstrip`` reasoning content, so re-render inserts
-    an extra ``\\n`` before ``</think>``. Production classifies this as
-    ``ASSISTANT_TEXT`` and the strict CI check excludes it; this test
-    matches that contract.
+    ``ASSISTANT_TEXT`` mismatches are tolerated as parser whitespace /
+    BPE noise (matches production CI's strict-assertion exemption).
+    The IFM-compatible parsers (LLM360/sglang#33) may differ from the
+    legacy parsers' rstrip behavior; this test enforces the structural
+    round-trip contract regardless.
 
     Skips if SGLang parsers are unavailable in this environment.
     """
@@ -705,13 +726,13 @@ def test_chat_template_round_trip_through_real_sglang_parsers(traj_name, traj_cl
         f"K2V3 [{traj_name}] chat template not append-only: prompt-only " f"render is not a prefix of full render."
     )
     raw_assistant_emit = full_text[len(prompt_text) :].rstrip("\n")
-    assert raw_assistant_emit.endswith("<|im_end|>"), (
+    assert raw_assistant_emit.endswith("<|ifm|im_end|>"), (
         f"K2V3 [{traj_name}] unexpected raw_assistant_emit shape: " f"{raw_assistant_emit!r}"
     )
 
     # 2) Run real ReasoningParser on the raw emit (only if the trajectory's
     #    truth_msg actually has reasoning_content — otherwise there's no
-    #    <think>...</think> to extract).
+    #    <ifm|think>...</ifm|think> to extract).
     text_after_reasoning = raw_assistant_emit
     parsed_reasoning = ""
     if _K2V3_REASONING_PARSER and has_reasoning:
@@ -759,7 +780,7 @@ def test_chat_template_round_trip_through_real_sglang_parsers(traj_name, traj_cl
         parsed_msg["reasoning_content"] = parsed_reasoning
 
     # 4) Drive session with parser-derived assistant_message.
-    # ``raw_assistant_emit`` already ends with ``<|im_end|>`` (the model's
+    # ``raw_assistant_emit`` already ends with ``<|ifm|im_end|>`` (the model's
     # autoregressive stop), so the tokenized form is the complete emit.
     # Do NOT append ``tokenizer.eos_token_id`` — for K2V3 that is
     # ``<|endoftext|>``, which the model never emits at turn boundary
@@ -960,7 +981,7 @@ def _drive_one_assistant_turn_through_real_parsers(
         "chat template not append-only between " "render(request_messages) and render(request_messages + [truth_msg])"
     )
     raw_emit = full_text[len(prompt_text) :].rstrip("\n")
-    assert raw_emit.endswith("<|im_end|>"), f"unexpected raw_emit shape: {raw_emit!r}"
+    assert raw_emit.endswith("<|ifm|im_end|>"), f"unexpected raw_emit shape: {raw_emit!r}"
 
     has_reasoning = bool(truth_assistant_msg.get("reasoning_content"))
     parsed_content, parsed_tool_calls, parsed_reasoning = _run_parsers_on_emit(
@@ -1133,7 +1154,8 @@ def test_production_prefix_check_raises_on_intentional_violation(tito_tok):
     """
     session = LinearTrajectory()
     user_q = {"role": "user", "content": "Test."}
-    asst1 = {"role": "assistant", "content": "ok"}
+    # IFM template requires assistant messages to carry a thinking field.
+    asst1 = {"role": "assistant", "content": "ok", "reasoning_content": "thinking"}
 
     # Seed: drive a single normal turn so the session has stored token_ids.
     prompt_ids = _render_ids(
@@ -1142,10 +1164,12 @@ def test_production_prefix_check_raises_on_intentional_violation(tito_tok):
         tools=None,
         add_generation_prompt=True,
     )
-    eos = getattr(tito_tok.tokenizer, "eos_token_id", None)
+    # The model autoregressively stops at <|ifm|im_end|> (not eos_token,
+    # which is <|ifm|endoftext|> in the IFM family — used for sequence
+    # separators in SFT data, not for message boundaries).
     completion_ids = list(tito_tok.tokenizer.encode("ok", add_special_tokens=False))
-    if eos is not None and (not completion_ids or completion_ids[-1] != int(eos)):
-        completion_ids.append(int(eos))
+    if not completion_ids or completion_ids[-1] != tito_tok._im_end_id:
+        completion_ids.append(tito_tok._im_end_id)
     session.update_pretokenized_state(
         request_messages=[user_q],
         assistant_message=asst1,
@@ -1157,7 +1181,7 @@ def test_production_prefix_check_raises_on_intentional_violation(tito_tok):
     # Now feed bogus prompt_ids — completely different from what's stored.
     bogus_prompt = [99999] * (len(session.token_ids) + 5)
     bogus_completion = [12345]
-    asst2 = {"role": "assistant", "content": "next"}
+    asst2 = {"role": "assistant", "content": "next", "reasoning_content": "thinking"}
     tool_msg = {"role": "tool", "content": "irrelevant"}
 
     with pytest.raises(TokenizationError, match=r"pretokenized prefix mismatch"):
@@ -1172,12 +1196,52 @@ def test_production_prefix_check_raises_on_intentional_violation(tito_tok):
 
 def test_k2v3_subclass_is_wired(tito_tok):
     """Sanity: ``get_tito_tokenizer(..., TITOTokenizerType.K2V3)`` returns
-    the K2V3 subclass — not silently falling back to the base
-    ``TITOTokenizer``. Catches a future regression where the registry entry
-    is removed or pointed elsewhere."""
-    from miles.utils.chat_template_utils.tito_tokenizer import K2V3TITOTokenizer
+    the current ``K2V3TITOTokenizer`` (IFM) — not silently falling back to
+    the base ``TITOTokenizer`` or accidentally to ``K2V3OldBackupTITOTokenizer``.
+    Catches a future regression where the registry entry is removed or
+    pointed elsewhere."""
+    from miles.utils.chat_template_utils.tito_tokenizer import K2V3OldBackupTITOTokenizer, K2V3TITOTokenizer
 
     assert isinstance(tito_tok, K2V3TITOTokenizer), (
         f"expected K2V3TITOTokenizer, got {type(tito_tok).__name__}. "
         f"_TOKENIZER_REGISTRY[TITOTokenizerType.K2V3] may be misregistered."
     )
+    assert not isinstance(tito_tok, K2V3OldBackupTITOTokenizer), (
+        "K2V3 is now the IFM tokenizer; TITOTokenizerType.K2V3 must not " "map to K2V3OldBackupTITOTokenizer."
+    )
+
+
+def test_k2v3_init_rejects_legacy_checkpoint(tokenizer):
+    """Invariant I5: instantiating ``K2V3TITOTokenizer`` on a tokenizer
+    whose vocab lacks ``<|ifm|im_end|>`` raises a ValueError at init,
+    pointing users at ``--tito-model k2v3_oldbackup`` for legacy
+    checkpoints.
+
+    Uses a stub tokenizer wrapper that pretends ``<|ifm|im_end|>`` is the
+    unk token — the same condition the production loader hits on a
+    legacy checkpoint that doesn't have the IFM token in vocab.
+    """
+    from miles.utils.chat_template_utils.tito_tokenizer import K2V3TITOTokenizer
+
+    class _LegacyVocabStub:
+        """Wraps ``tokenizer`` but maps <|ifm|im_end|> to unk_token_id, the
+        production-realistic shape of a legacy checkpoint."""
+
+        def __init__(self, real):
+            self._real = real
+            self.unk_token_id = getattr(real, "unk_token_id", 0) or 0
+
+        def __getattr__(self, name):
+            return getattr(self._real, name)
+
+        def convert_tokens_to_ids(self, token):
+            if token == "<|ifm|im_end|>":
+                return self.unk_token_id
+            return self._real.convert_tokens_to_ids(token)
+
+    with pytest.raises(ValueError, match=r"requires <\|ifm\|im_end\|>"):
+        K2V3TITOTokenizer(
+            _LegacyVocabStub(tokenizer),
+            chat_template_kwargs=_K2V3_CHAT_TEMPLATE_KWARGS,
+            allowed_append_roles=_ALLOWED_APPEND_ROLES,
+        )
diff --git a/tests/fast/utils/chat_template_utils/test_tito_k2v3_oldbackup.py b/tests/fast/utils/chat_template_utils/test_tito_k2v3_oldbackup.py
new file mode 100644
index 0000000000..5f286076f8
--- /dev/null
+++ b/tests/fast/utils/chat_template_utils/test_tito_k2v3_oldbackup.py
@@ -0,0 +1,1192 @@
+r"""TITO contract tests for the K2V3 family — LEGACY chat template.
+
+This file targets ``K2V3OldBackupTITOTokenizer`` (``--tito-model
+k2v3_oldbackup``), used for legacy K2V3 checkpoints (``bbq-8b-mid3-final``
+and earlier) whose chat template emits ``<|im_end|>\n`` between messages.
+
+For current K2V3 checkpoints (``bbq-8b-mid3_v3`` and later) using the
+IFM template, see ``test_tito_k2v3.py``.
+
+Coverage contract — this file protects these invariants:
+
+  (I1) Legacy K2V3 canonical chat template renders ``<|im_end|>\n`` after
+       every message (the trailing ``\n`` comes from jinja block whitespace).
+  (I2) Realistic rollout buffers can end at ``<|im_end|>`` WITHOUT the
+       trailing ``\n`` — the model stops at ``<|im_end|>`` on
+       autoregressive emission.
+  (I3) ``K2V3OldBackupTITOTokenizer.merge_tokens`` inserts the missing ``\n``
+       when ``prefix[-1] == <|im_end|>``, so the merged buffer matches
+       canonical render.
+  (I4) Appended env messages (tool / user / system) round-trip through
+       ``merge_tokens`` and still match the canonical render — across
+       both realistic single-turn buffers and multi-turn parser-driven
+       session histories.
+
+The file is split into three banner-marked sections:
+
+  SECTION A — CORE INVARIANT TESTS (I1-I4)
+      * ``test_buffer_matches_canonical_under_realistic_rollout``
+            — I1 + I2 + I3
+      * ``test_append_via_realistic_buffer``
+            — I3 + I4 (core; 8 trajectories × 4 env shapes = 32 cases)
+      * ``test_chat_template_round_trip_through_real_sglang_parsers``
+            — I4 with parser-derived ``parsed_msg`` substituted for raw
+            model emit (structural round-trip only)
+
+  SECTION B — INTEGRATION STRESS
+      * ``test_end_to_end_realistic_rollout_with_real_parsers``
+            — I3 + I4 on parser-tainted multi-turn session.messages;
+            failure here that doesn't reproduce in section A is a
+            parser-interaction regression specific to accumulated state
+
+  SECTION C — SANITY (orthogonal to I1-I4)
+      * ``test_production_prefix_check_raises_on_intentional_violation``
+            — runtime defense (``update_pretokenized_state``'s prefix
+            check) is alive
+      * ``test_k2v3_oldbackup_subclass_is_wired``
+            — registry returns ``K2V3OldBackupTITOTokenizer``, not the
+            base or current K2V3 class
+
+Why this file exists separately from ``test_tito_tokenizer_model_matrix.py``:
+that file builds ``pretokenized`` via ``apply_chat_template(..., add_generation_prompt=False)``,
+which already contains the trailing ``\n``, so the boundary fix path
+never fires and the test passes whether the fix exists or not. This file
+routes through ``update_pretokenized_state`` instead, producing the
+realistic ``prefix[-1] == <|im_end|>`` state that the fix exists for.
+
+Tests skip (via the module-scoped ``tokenizer`` fixture) if the K2V3 checkpoint is not on this host.
+"""
+
+from __future__ import annotations
+
+import os
+from copy import deepcopy
+from dataclasses import dataclass
+
+import pytest
+from transformers import AutoTokenizer
+
+from miles.rollout.session.linear_trajectory import LinearTrajectory
+from miles.rollout.session.session_errors import TokenizationError
+from miles.utils.chat_template_utils import MismatchType, apply_chat_template, try_get_fixed_chat_template
+from miles.utils.chat_template_utils.tito_tokenizer import TITOTokenizerType, get_tito_tokenizer
+from miles.utils.processing_utils import load_tokenizer
+from miles.utils.test_utils.mock_trajectories import (
+    LongChainThinkingTrajectory,
+    LongChainTrajectory,
+    MultiToolSingleTurnTrajectory,
+    MultiTurnThinkingTrajectory,
+    MultiTurnTrajectory,
+    SingleToolThinkingTrajectory,
+    SingleToolTrajectory,
+)
+
+# ---------------------------------------------------------------------------
+# Path + fixtures
+# ---------------------------------------------------------------------------
+
+K2V3_MODEL_PATH = os.getenv(
+    "TITO_TEST_MODEL_PATH_K2V3",
+    "/mnt/weka/shrd/k2m/suqi.sun/bbq_image/bbq-8b-mid3-final",
+)
+_ALLOWED_APPEND_ROLES = ["tool", "user", "system"]
+
+# The generation prompt rendered by the K2V3 chat template varies with
+# ``reasoning_effort`` (high → <think>, medium → <think_fast>, low →
+# <think_faster>). The value is pinned to "high" (the production setting) so
+# the tests do not depend on the template's default; override via env if needed.
+_K2V3_REASONING_EFFORT = os.getenv("TITO_TEST_REASONING_EFFORT_K2V3", "high")
+_K2V3_CHAT_TEMPLATE_KWARGS = {"reasoning_effort": _K2V3_REASONING_EFFORT}
+
+# SGLang parser names used for K2V3. The defaults follow the K2V3
+# production config:
+#   SGLANG_TOOL_PARSER=hermes
+#   SGLANG_REASONING_PARSER=deepseek-r1
+# ``deepseek-r1`` handles the `<think>...</think>` block and ``hermes`` the
+# `<tool_call>\n{json}\n</tool_call>` shape that K2V3's chat template emits.
+#
+# Older SGLang builds may register `hermes` under a different name (e.g.
+# the qwen25 detector handles the same shape). Override via env in those
+# environments — e.g. ``TITO_TEST_TOOL_PARSER_K2V3=qwen25``. If the
+# configured parser is not registered in this SGLang build, the parser
+# round-trip test skips with an explicit reason rather than silently
+# turning green.
+_K2V3_TOOL_PARSER = os.getenv("TITO_TEST_TOOL_PARSER_K2V3", "hermes")
+_K2V3_REASONING_PARSER = os.getenv("TITO_TEST_REASONING_PARSER_K2V3", "deepseek-r1")
+
+
+@pytest.fixture(scope="module")
+def tokenizer() -> AutoTokenizer:
+    """Load the tokenizer at ``K2V3_MODEL_PATH`` once per module; skip when that directory is absent."""
+    checkpoint_found = os.path.isdir(K2V3_MODEL_PATH)
+    if not checkpoint_found:
+        pytest.skip(f"K2V3 checkpoint not present on this host: {K2V3_MODEL_PATH}")
+    # Resolved only after the skip check, so a missing checkpoint never reaches the template lookup.
+    template_path = try_get_fixed_chat_template(K2V3_MODEL_PATH)
+    return load_tokenizer(K2V3_MODEL_PATH, chat_template_path=template_path, trust_remote_code=True)
+
+
+@pytest.fixture
+def tito_tok(tokenizer):
+    """Return the ``K2V3_OLDBACKUP`` TITO tokenizer built on the module-scoped ``tokenizer``."""
+    shared_kwargs = dict(
+        allowed_append_roles=_ALLOWED_APPEND_ROLES,
+        chat_template_kwargs=_K2V3_CHAT_TEMPLATE_KWARGS,
+    )
+    return get_tito_tokenizer(tokenizer, tokenizer_type=TITOTokenizerType.K2V3_OLDBACKUP, **shared_kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Trajectories — realistic conversation shapes from mock_trajectories
+# ---------------------------------------------------------------------------
+
+
+def _with_synthetic_thinking(
+    trajectory_cls: type,
+    reasoning: str = "Let me work through this step by step.",
+) -> type:
+    """Derive a "thinking" trajectory class from ``trajectory_cls``.
+
+    The returned class carries deep copies of ``MESSAGES`` and ``TOOLS``
+    (``None`` when the source has no ``TOOLS``); every assistant message in
+    the copy gets ``reasoning_content`` set to ``reasoning``. The source
+    class is left untouched. Used for rollout shapes that
+    ``mock_trajectories`` ships without a native thinking variant.
+    """
+    thinking_messages = deepcopy(trajectory_cls.MESSAGES)
+    assistant_turns = (msg for msg in thinking_messages if msg.get("role") == "assistant")
+    for turn in assistant_turns:
+        turn["reasoning_content"] = reasoning
+
+    class _Synthesized:
+        TOOLS = deepcopy(getattr(trajectory_cls, "TOOLS", None))
+        MESSAGES = thinking_messages
+
+    _Synthesized.__name__ = f"{trajectory_cls.__name__}_WithSyntheticThinking"
+    return _Synthesized
+
+
+# Native + synthetic-thinking-injected trajectories (8 entries). The first
+# tuple element doubles as the pytest id wherever this list is parametrized.
+# The thinking variants additionally trigger the K2V3 chat template's
+# reasoning-block path (<|im_start|>assistant\n<think>\n...</think>\ncontent<|im_end|>).
+CONVERSATIONS: list[tuple[str, type]] = [
+    # Single assistant turn — single tool call.
+    ("single_tool", SingleToolTrajectory),
+    ("single_tool_thinking", SingleToolThinkingTrajectory),
+    # Multiple assistant turns — single tool call per turn.
+    ("multi_turn", MultiTurnTrajectory),
+    ("multi_turn_thinking", MultiTurnThinkingTrajectory),
+    # Single assistant turn — multiple parallel tool calls.
+    ("multi_tool_single_turn", MultiToolSingleTurnTrajectory),
+    # No native thinking variant exists for parallel-tools-single-turn;
+    # synthesize by injecting reasoning_content into the assistant turn.
+    ("multi_tool_single_turn_thinking", _with_synthetic_thinking(MultiToolSingleTurnTrajectory)),
+    # Multiple assistant turns AND tool calls (chain shape).
+    ("multi_tool_multi_turn", LongChainTrajectory),
+    ("multi_tool_multi_turn_thinking", LongChainThinkingTrajectory),
+]
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _render_text(
+    messages: list[dict],
+    tokenizer: AutoTokenizer,
+    tools: list[dict] | None,
+    *,
+    add_generation_prompt: bool,
+) -> str:
+    """Render ``messages`` through ``apply_chat_template`` as text, with the pinned K2V3 template kwargs."""
+    render_options = {
+        "tokenizer": tokenizer,
+        "tools": tools,
+        "add_generation_prompt": add_generation_prompt,
+        "tokenize": False,
+        **_K2V3_CHAT_TEMPLATE_KWARGS,
+    }
+    return apply_chat_template(messages, **render_options)
+
+
+def _render_ids(
+    messages: list[dict],
+    tokenizer: AutoTokenizer,
+    tools: list[dict] | None,
+    *,
+    add_generation_prompt: bool,
+) -> list[int]:
+    """Render ``messages`` through ``apply_chat_template`` as token ids, with the pinned K2V3 template kwargs."""
+    rendered = apply_chat_template(
+        messages,
+        tokenizer=tokenizer,
+        tools=tools,
+        add_generation_prompt=add_generation_prompt,
+        tokenize=True,
+        **_K2V3_CHAT_TEMPLATE_KWARGS,
+    )
+    # Copy into a plain ``list`` regardless of the sequence type returned above.
+    return list(rendered)
+
+
+def _first_diff(a, b) -> str:
+    """Describe the first index at which ``a`` and ``b`` differ; otherwise report both lengths."""
+    # Lazy: only the first differing position is ever formatted.
+    mismatches = (f"position {i}: a[{i}]={x} b[{i}]={y}" for i, (x, y) in enumerate(zip(a, b)) if x != y)
+    return next(mismatches, f"length differs (len(a)={len(a)} len(b)={len(b)})")
+
+
+def _assistant_indices(messages: list[dict]) -> list[int]:  # positions of assistant turns, ascending
+    return [pos for pos, msg in enumerate(messages) if msg["role"] == "assistant"]
+
+
+def _realistic_emit_ids(
+    request_messages: list[dict],
+    assistant_message: dict,
+    tools: list[dict] | None,
+    tokenizer: AutoTokenizer,
+) -> list[int]:
+    r"""Synthesize the ``completion_token_ids`` of an autoregressive assistant emit.
+
+    ``request_messages`` is the history the model was prompted with and
+    ``assistant_message`` the reply being replayed; ``tools`` is forwarded
+    to both renders unchanged.
+
+    The emit text is what the full render adds on top of the prompt render::
+
+        full   = render(request + [assistant], add_generation_prompt=False)
+        prompt = render(request,               add_generation_prompt=True)
+        emit   = full[len(prompt):].rstrip("\n")
+
+    ``rstrip`` drops the ``\n`` the template writes after ``<|im_end|>``,
+    so the emit ends exactly at ``<|im_end|>``. It is then encoded with
+    ``add_special_tokens=False``.
+
+    Asserts that the prompt render is a prefix of the full render and that
+    the stripped emit ends with ``<|im_end|>``.
+    """
+    conversation = request_messages + [assistant_message]
+    full_text = _render_text(conversation, tokenizer, tools, add_generation_prompt=False)
+    prompt_text = _render_text(request_messages, tokenizer, tools, add_generation_prompt=True)
+
+    assert full_text.startswith(prompt_text), (
+        "chat template not append-only: prompt-only render is not a prefix "
+        "of full render. TITO's premise breaks here."
+    )
+
+    # Everything past the prompt is the assistant emit. The jinja whitespace
+    # after `<|im_end|>` is removed because a model that stops at the stop
+    # token never produces it.
+    stop_terminated = full_text[len(prompt_text) :].rstrip("\n")
+    assert stop_terminated.endswith("<|im_end|>"), (
+        f"unexpected emit_text shape (does not end with <|im_end|>): {stop_terminated!r}"
+    )
+
+    emit_ids = tokenizer.encode(stop_terminated, add_special_tokens=False)
+    return list(emit_ids)
+
+
+def _drive_session_through_trajectory(
+    session: LinearTrajectory,
+    tito_tok,
+    messages: list[dict],
+    tools: list[dict] | None,
+) -> None:
+    """Replay the trajectory into ``session``, one assistant turn at a time.
+
+    For every assistant message: the prompt ids come from
+    ``session.prepare_pretokenized`` (or from a canonical render with the
+    generation prompt when that returns ``None``), the completion ids
+    from ``_realistic_emit_ids``, and both are committed through
+    ``session.update_pretokenized_state``.
+    """
+    hf_tokenizer = tito_tok.tokenizer
+    assistant_positions = [pos for pos, msg in enumerate(messages) if msg["role"] == "assistant"]
+
+    for turn_pos in assistant_positions:
+        history = messages[:turn_pos]
+        reply = messages[turn_pos]
+
+        pretokenized = session.prepare_pretokenized(history, tools, tito_tokenizer=tito_tok)
+        if pretokenized is not None:
+            prompt_ids = list(pretokenized["input_ids"])
+        else:
+            # ``prepare_pretokenized`` returned None: fall back to a full canonical render.
+            prompt_ids = _render_ids(history, hf_tokenizer, tools, add_generation_prompt=True)
+
+        emit_ids = _realistic_emit_ids(history, reply, tools, hf_tokenizer)
+
+        session.update_pretokenized_state(
+            request_messages=history,
+            assistant_message=reply,
+            prompt_token_ids=prompt_ids,
+            completion_token_ids=emit_ids,
+            max_trim_tokens=tito_tok.max_trim_tokens,
+        )
+
+
+# ###########################################################################
+# ###########################################################################
+# ##                                                                       ##
+# ##  SECTION A — CORE INVARIANT TESTS                                     ##
+# ##                                                                       ##
+# ##  Each test below leads with the invariant(s) it protects (I1-I4 per   ##
+# ##  module docstring). These are the tests a reviewer should read first  ##
+# ##  to understand the contract this file enforces.                       ##
+# ##                                                                       ##
+# ###########################################################################
+# ###########################################################################
+
+
+@pytest.mark.parametrize(
+    "name, trajectory_cls",
+    CONVERSATIONS,
+    ids=lambda x: x if isinstance(x, str) else None,
+)
+def test_buffer_matches_canonical_under_realistic_rollout(name, trajectory_cls, tito_tok):
+    r"""Invariants I1+I2+I3: a rollout buffer that stops at ``<|im_end|>``
+    (no trailing ``\n``) still agrees with the canonical chat-template render.
+
+    Phase 1 checks the finalized session buffer against the canonical
+    render of ``session.messages``. Phase 2 appends a synthetic ``tool``
+    message and checks the ``prepare_pretokenized`` output against the
+    canonical next-turn prompt, so ``merge_tokens`` is exercised on a
+    buffer ending at ``<|im_end|>`` even for single-turn trajectories
+    (where ``trim_trailing_ids`` would otherwise hide a missing fix).
+
+    Only non-``ASSISTANT_TEXT`` mismatches fail the test; ``ASSISTANT_TEXT``
+    mismatches are tolerated.
+    """
+    messages = deepcopy(trajectory_cls.MESSAGES)
+    tools = deepcopy(getattr(trajectory_cls, "TOOLS", None))
+
+    session = LinearTrajectory()
+    _drive_session_through_trajectory(session, tito_tok, messages, tools)
+
+    hf_tokenizer = tito_tok.tokenizer
+    comparator = tito_tok.create_comparator()
+
+    def _fail_on_severe(expected_ids, actual_ids, headline):
+        """Fail the test if the comparator reports any non-``ASSISTANT_TEXT`` mismatch."""
+        found = comparator.compare_sequences(expected_ids, actual_ids)
+        severe = [mm for mm in found if mm.type != MismatchType.ASSISTANT_TEXT]
+        if not severe:
+            return
+        # At most the first five severe mismatches are listed in the failure message.
+        rendered = []
+        for mm in severe[:5]:
+            suffix = f" — {mm.detail}" if mm.detail else ""
+            rendered.append(
+                f"  {mm.type.value} at segment {mm.segment_index}: "
+                f"expected={mm.expected_text!r} actual={mm.actual_text!r}{suffix}"
+            )
+        details = "\n".join(rendered)
+        pytest.fail(f"{headline}  first_diff: {_first_diff(expected_ids, actual_ids)}\n{details}")
+
+    # Phase 1 — the finalized buffer against the canonical render. This
+    # catches structural drift anywhere in the trajectory; an end-of-sequence
+    # ``<|im_end|>`` vs ``<|im_end|>\n`` difference stays hidden by the
+    # comparator's ``trim_trailing_ids`` when there is only ONE assistant turn.
+    expected_final = _render_ids(
+        session.messages,
+        hf_tokenizer,
+        tools,
+        add_generation_prompt=False,
+    )
+    actual_final = list(session.token_ids)
+    _fail_on_severe(
+        expected_final,
+        actual_final,
+        f"K2V3 [{name}] phase-1 (finalized buffer) canonical mismatch.\n",
+    )
+
+    # Phase 2 — force the boundary-fix path even for single-assistant-turn
+    # trajectories. Asking ``prepare_pretokenized`` for a history with one
+    # extra ``tool`` message simulates the NEXT-turn env append and runs
+    # ``tito_tok.merge_tokens(...)`` on a buffer whose last token is
+    # ``<|im_end|>`` (the model's autoregressive stop). That moves the
+    # ``<|im_end|>`` from end-of-sequence to mid-sequence, which defeats
+    # ``trim_trailing_ids`` and exposes missing-fix bugs phase 1 would hide.
+    follow_up = {"role": "tool", "content": "[test] synthetic follow-up env"}
+    extended_messages = [*session.messages, follow_up]
+    pre = session.prepare_pretokenized(extended_messages, tools, tito_tokenizer=tito_tok)
+    assert pre is not None, (
+        f"K2V3 [{name}] phase-2 setup error: prepare_pretokenized returned "
+        f"None even though session has {len(session.messages)} stored messages"
+    )
+    merged = list(pre["input_ids"])
+    expected_next = _render_ids(
+        extended_messages,
+        hf_tokenizer,
+        tools,
+        add_generation_prompt=True,
+    )
+    _fail_on_severe(
+        expected_next,
+        merged,
+        f"K2V3 [{name}] phase-2 (next-turn merged input_ids) canonical "
+        f"mismatch — the per-model boundary fix is likely broken.\n",
+    )
+
+
+# ---------------------------------------------------------------------------
+# (Section A cont.) Append-case test — mirrors the breadth of
+# ``test_tito_tokenizer_model_matrix.py`` but routes through
+# ``update_pretokenized_state`` so the buffer used for ``merge_tokens`` has
+# the realistic ``<|im_end|>``-end shape (defeats the comparator's
+# ``trim_trailing_ids`` shielding that hides missing-fix bugs in the
+# model_matrix variant).
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class _EnvAppendShape:
+    """One env-append scenario: the messages appended after the session
+    has been driven through some trajectory, plus the markers to look for."""
+
+    name: str  # used as the pytest id of the parametrized case
+    appended_messages: list[dict]  # appended after ``session.messages``, in this order
+    required_contents: tuple[str, ...]  # markers that must appear, in order, in the incremental text
+
+
+# Generic append shapes. Each one is cross-producted with every trajectory in
+# CONVERSATIONS, so merge_tokens is exercised against many distinct buffer
+# end-states (single tool, parallel tools, multi-turn with thinking, etc.)
+# combined with each env shape (single tool / single user / single system /
+# alternating — note the alternating shape starts with a tool message).
+# ``required_contents`` holds unique markers so the in-order check pinpoints
+# exactly which env content the incremental tokens dropped if the test fails.
+_ENV_APPEND_SHAPES: list[_EnvAppendShape] = [
+    _EnvAppendShape(
+        name="env_tool",
+        appended_messages=[
+            {"role": "tool", "tool_call_id": "call_test_xyz", "content": "_marker_tool_xyz_42_"},
+        ],
+        required_contents=("_marker_tool_xyz_42_",),
+    ),
+    _EnvAppendShape(
+        name="env_user",
+        appended_messages=[
+            {"role": "user", "content": "_marker_user_abc_99_"},
+        ],
+        required_contents=("_marker_user_abc_99_",),
+    ),
+    _EnvAppendShape(
+        name="env_system",
+        appended_messages=[
+            {"role": "system", "content": "_marker_system_def_77_"},
+        ],
+        required_contents=("_marker_system_def_77_",),
+    ),
+    _EnvAppendShape(
+        name="env_alternating_user_tool",
+        appended_messages=[
+            {"role": "tool", "tool_call_id": "call_alt_1", "content": "_marker_alt_tool1_aaa_"},
+            {"role": "user", "content": "_marker_alt_user1_bbb_"},
+            {"role": "tool", "tool_call_id": "call_alt_2", "content": "_marker_alt_tool2_ccc_"},
+            {"role": "user", "content": "_marker_alt_user2_ddd_"},
+        ],
+        required_contents=(
+            "_marker_alt_tool1_aaa_",
+            "_marker_alt_user1_bbb_",
+            "_marker_alt_tool2_ccc_",
+            "_marker_alt_user2_ddd_",
+        ),
+    ),
+]
+
+
+@pytest.mark.parametrize(
+    "traj_name, traj_cls",
+    CONVERSATIONS,
+    ids=lambda x: x if isinstance(x, str) else None,
+)
+@pytest.mark.parametrize(
+    "env_shape",
+    _ENV_APPEND_SHAPES,
+    ids=lambda s: s.name,
+)
+def test_append_via_realistic_buffer(traj_name, traj_cls, env_shape, tito_tok):
+    """Invariants I3+I4 (core): after a realistic rollout, appending env
+    messages through ``prepare_pretokenized`` yields input ids that match
+    the canonical render.
+
+    Runs over the cross-product of ``CONVERSATIONS`` (8 trajectories) and
+    ``_ENV_APPEND_SHAPES`` (4 shapes), i.e. 32 cases. The stored buffer is
+    first asserted to end at ``tito_tok._im_end_id``.
+
+    Checks:
+      1. merged input_ids match canonical (modulo ``ASSISTANT_TEXT``).
+      2. Each ``required_contents`` marker appears IN ORDER in the text
+         decoded from the tokens added after the stored buffer.
+    """
+    messages = deepcopy(traj_cls.MESSAGES)
+    tools = deepcopy(getattr(traj_cls, "TOOLS", None))
+    case_label = f"K2V3 [{traj_name} + {env_shape.name}]"
+
+    session = LinearTrajectory()
+    _drive_session_through_trajectory(session, tito_tok, messages, tools)
+
+    pretokenized_buffer = list(session.token_ids)
+    last_token = pretokenized_buffer[-1] if pretokenized_buffer else "EMPTY"
+    assert pretokenized_buffer and pretokenized_buffer[-1] == tito_tok._im_end_id, (
+        f"{case_label} setup error: pretokenized "
+        f"buffer should end at <|im_end|> after drive, got last token "
+        f"{last_token}"
+    )
+
+    extended = [*session.messages, *env_shape.appended_messages]
+    pre = session.prepare_pretokenized(extended, tools, tito_tokenizer=tito_tok)
+    assert pre is not None, (
+        f"{case_label} setup error: "
+        f"prepare_pretokenized returned None despite stored token_ids of "
+        f"length {len(pretokenized_buffer)}"
+    )
+    merged = list(pre["input_ids"])
+    expected = _render_ids(extended, tito_tok.tokenizer, tools, add_generation_prompt=True)
+
+    # Check 1 — merged ids vs canonical render; ASSISTANT_TEXT mismatches are tolerated.
+    all_mismatches = tito_tok.create_comparator().compare_sequences(expected, merged)
+    severe = [mm for mm in all_mismatches if mm.type != MismatchType.ASSISTANT_TEXT]
+    if severe:
+        detail_lines = []
+        for mm in severe[:5]:
+            suffix = f" — {mm.detail}" if mm.detail else ""
+            detail_lines.append(
+                f"  {mm.type.value} at segment {mm.segment_index}: "
+                f"expected={mm.expected_text!r} actual={mm.actual_text!r}{suffix}"
+            )
+        details = "\n".join(detail_lines)
+        pytest.fail(
+            f"{case_label} merged-vs-canonical "
+            f"mismatch under realistic buffer.\n"
+            f"  first_diff: {_first_diff(expected, merged)}\n{details}"
+        )
+
+    # Check 2 — every marker must occur in the appended segment, in declaration order.
+    incremental_text = tito_tok.tokenizer.decode(merged[len(pretokenized_buffer) :], skip_special_tokens=False)
+    cursor = 0
+    for content in env_shape.required_contents:
+        found = incremental_text.find(content, cursor)
+        assert found >= 0, (
+            f"{case_label} required_content "
+            f"{content!r} missing from incremental tokens (or out of order). "
+            f"incremental_text={incremental_text!r}"
+        )
+        cursor = found + len(content)
+
+
+# ---------------------------------------------------------------------------
+# (Section A cont.) Real-SGLang-parser round-trip.
+#
+# Production server-side parsing flow:
+#   raw model text → ReasoningParser → FunctionCallParser
+#                  → structured assistant_message in session.messages
+#                  → next turn's chat_template re-renders it back to text
+#
+# If parser output drifts from what chat_template re-emits (whitespace
+# stripping, reasoning-block boundaries, tool_call argument formatting),
+# the structured message in history fails to round-trip — either causing
+# a buffer-vs-canonical mismatch on subsequent turns, or causing
+# chat_template to raise (e.g. K2V3's "tool_call.arguments must be dict").
+# ---------------------------------------------------------------------------
+
+
+# (Parser config is declared at the top of the file alongside K2V3_MODEL_PATH.)
+
+_TEST_TOOL_DICT = {  # one ``multiply`` function tool taking two required integers, ``a`` and ``b``
+    "type": "function",
+    "function": {
+        "name": "multiply",
+        "description": "Multiply two integers and return the product.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "a": {"type": "integer"},
+                "b": {"type": "integer"},
+            },
+            "required": ["a", "b"],
+        },
+    },
+}
+
+
+def _load_sglang_parsers():
+    """Import SGLang's parser classes, tolerating a missing or older SGLang.
+
+    Returns:
+        ``(FunctionCallParser, ReasoningParser)``. Either element is
+        ``None`` when its module cannot be imported; the caller decides
+        whether that warrants a skip.
+    """
+    try:
+        from sglang.srt.function_call.function_call_parser import FunctionCallParser as fcp_cls
+    except ImportError:
+        fcp_cls = None
+
+    # ReasoningParser: try the current module path first, then the older layout.
+    try:
+        from sglang.srt.parser.reasoning_parser import ReasoningParser as rp_cls
+    except ImportError:
+        try:
+            from sglang.srt.reasoning_parser import ReasoningParser as rp_cls  # older SGLang layout
+        except ImportError:
+            rp_cls = None
+
+    parser_classes = (fcp_cls, rp_cls)
+    return parser_classes
+
+
+def _try_json_decode_tool_args(tool_calls: list[dict]) -> list[dict]:
+    """Return copies of ``tool_calls`` with JSON-string ``function.arguments``
+    decoded (K2V3's chat template requires a dict; the Hermes parser returns
+    a JSON string). Non-string or undecodable arguments are kept unchanged."""
+    import json
+
+    decoded_calls = []
+    for call in tool_calls:
+        fn = call.get("function", {})
+        raw_args = fn.get("arguments")
+        if isinstance(raw_args, str):
+            try:
+                fn = {**fn, "arguments": json.loads(raw_args)}
+            except (ValueError, RecursionError):  # malformed or pathologically nested JSON: keep the raw string
+                pass
+        decoded_calls.append({**call, "function": fn})
+    return decoded_calls
+
+
+@pytest.mark.parametrize(
+    "traj_name, traj_cls",
+    CONVERSATIONS,
+    ids=lambda x: x if isinstance(x, str) else None,
+)
+def test_chat_template_round_trip_through_real_sglang_parsers(traj_name, traj_cls, tito_tok):
+    r"""Invariant I4 with parser substitution: raw assistant emit →
+    ReasoningParser + FunctionCallParser → ``parsed_msg`` → re-render via
+    chat_template still round-trips structurally to canonical.
+
+    Parametrized over every trajectory in ``CONVERSATIONS``, so each
+    parser shape (plain / + tool_calls / + reasoning / + parallel
+    tool_calls) gets exercised.
+
+    ``ASSISTANT_TEXT`` mismatches are tolerated — the ``deepseek-r1``
+    parser does not ``rstrip`` reasoning content, so re-render inserts
+    an extra ``\n`` before ``</think>``. Production classifies this as
+    ``ASSISTANT_TEXT`` and the strict CI check excludes it; this test
+    matches that contract.
+
+    Skips if SGLang parsers are unavailable in this environment.
+    """
+    FCP, RP = _load_sglang_parsers()
+    if FCP is None:
+        pytest.skip("sglang.srt.function_call.function_call_parser not importable")
+
+    tokenizer = tito_tok.tokenizer
+    messages = deepcopy(traj_cls.MESSAGES)
+    tools = deepcopy(getattr(traj_cls, "TOOLS", None))
+
+    # Pick the first assistant message — that's our parser-test ``truth_msg``.
+    # The messages preceding it (system + user typically) are kept as the
+    # request prefix so the chat template renders in correct context.
+    first_asst_idx = next(i for i, m in enumerate(messages) if m["role"] == "assistant")
+    request_messages = messages[:first_asst_idx]
+    truth_msg = messages[first_asst_idx]
+    has_reasoning = bool(truth_msg.get("reasoning_content"))
+
+    # 1) Render truth_msg via chat_template — that is the raw emit shape.
+    full_text = _render_text(
+        request_messages + [truth_msg],
+        tokenizer,
+        tools,
+        add_generation_prompt=False,
+    )
+    prompt_text = _render_text(
+        request_messages,
+        tokenizer,
+        tools,
+        add_generation_prompt=True,
+    )
+    assert full_text.startswith(prompt_text), (
+        f"K2V3 [{traj_name}] chat template not append-only: prompt-only " f"render is not a prefix of full render."
+    )
+    raw_assistant_emit = full_text[len(prompt_text) :].rstrip("\n")
+    assert raw_assistant_emit.endswith("<|im_end|>"), (
+        f"K2V3 [{traj_name}] unexpected raw_assistant_emit shape: " f"{raw_assistant_emit!r}"
+    )
+
+    # 2) Run real ReasoningParser on the raw emit (only if the trajectory's
+    #    truth_msg actually has reasoning_content — otherwise there's no
+    #    <think>...</think> to extract).
+    text_after_reasoning = raw_assistant_emit
+    parsed_reasoning = ""
+    if _K2V3_REASONING_PARSER and has_reasoning:
+        if RP is None:
+            pytest.skip("sglang reasoning parser not importable")
+        try:
+            rp = RP(model_type=_K2V3_REASONING_PARSER)
+        except Exception as e:
+            pytest.skip(f"reasoning parser {_K2V3_REASONING_PARSER!r} unsupported " f"by this SGLang build: {e}")
+        r_out, n_out = rp.parse_non_stream(raw_assistant_emit)
+        parsed_reasoning = r_out or ""
+        text_after_reasoning = n_out if n_out is not None else ""
+
+    # 3) Run real FunctionCallParser on the post-reasoning text.
+    try:
+        from sglang.srt.entrypoints.openai.protocol import Tool as SGLangTool
+    except ImportError as e:
+        pytest.skip(f"sglang.srt.entrypoints.openai.protocol.Tool not importable: {e}")
+    sglang_tools = [SGLangTool(**t) for t in (tools or [])]
+    try:
+        fcp = FCP(tools=sglang_tools, tool_call_parser=_K2V3_TOOL_PARSER)
+    except Exception as e:
+        pytest.skip(f"tool parser {_K2V3_TOOL_PARSER!r} unsupported by this SGLang " f"build: {e}")
+    normal_text, tool_call_items = fcp.parse_non_stream(text_after_reasoning)
+    parsed_content = normal_text if normal_text is not None else ""
+    parsed_tool_calls = [
+        {
+            "id": f"call_{i}",
+            "type": "function",
+            "function": {"name": item.name, "arguments": item.parameters},
+        }
+        for i, item in enumerate(tool_call_items)
+    ]
+    # Hermes returns arguments as a JSON string; K2V3 chat template requires
+    # a dict. Decoding here mirrors what a production agent loop does
+    # before storing the assistant message.
+    parsed_tool_calls = _try_json_decode_tool_args(parsed_tool_calls)
+
+    parsed_msg: dict = {
+        "role": "assistant",
+        "content": parsed_content,
+        "tool_calls": parsed_tool_calls,
+    }
+    if has_reasoning:
+        parsed_msg["reasoning_content"] = parsed_reasoning
+
+    # 4) Drive session with parser-derived assistant_message.
+    # ``raw_assistant_emit`` already ends with ``<|im_end|>`` (the model's
+    # autoregressive stop), so the tokenized form is the complete emit.
+    # Do NOT append ``tokenizer.eos_token_id`` — for K2V3 that is
+    # ``<|endoftext|>``, which the model never emits at turn boundary
+    # and would create a spurious extra special-token mismatch.
+    emit_ids = list(tokenizer.encode(raw_assistant_emit, add_special_tokens=False))
+    prompt_ids = _render_ids(
+        request_messages,
+        tokenizer,
+        tools,
+        add_generation_prompt=True,
+    )
+    session = LinearTrajectory()
+    session.update_pretokenized_state(
+        request_messages=list(request_messages),
+        assistant_message=parsed_msg,
+        prompt_token_ids=prompt_ids,
+        completion_token_ids=emit_ids,
+        max_trim_tokens=tito_tok.max_trim_tokens,
+    )
+
+    # 5) Compare ``session.token_ids`` (rollout buffer with raw emit tokens)
+    #    against ``apply_chat_template(session.messages)`` canonical (which
+    #    re-renders parsed_msg back to text). Severe types only.
+    expected = _render_ids(
+        session.messages,
+        tokenizer,
+        tools,
+        add_generation_prompt=False,
+    )
+    actual = list(session.token_ids)
+    comparator = tito_tok.create_comparator()
+    mismatches = comparator.compare_sequences(expected, actual)
+    severe = [m for m in mismatches if m.type != MismatchType.ASSISTANT_TEXT]
+    if severe:
+        details = "\n".join(
+            f"  {m.type.value} at segment {m.segment_index}: "
+            f"expected={m.expected_text!r} actual={m.actual_text!r}" + (f" — {m.detail}" if m.detail else "")
+            for m in severe[:8]
+        )
+        pytest.fail(
+            f"K2V3 [{traj_name}] chat-template ↔ SGLang parser structural "
+            f"round-trip mismatch (tool_parser={_K2V3_TOOL_PARSER!r}, "
+            f"reasoning_parser={_K2V3_REASONING_PARSER!r}). "
+            f"Severe types only — ASSISTANT_TEXT-only mismatches are "
+            f"tolerated (whitespace inside assistant content; production "
+            f"already classifies these as non-severe).\n"
+            f"{details}\n"
+            f"({len(severe)} severe mismatch(es) total; "
+            f"showing first {min(8, len(severe))}.)"
+        )
+
+
+# ###########################################################################
+# ###########################################################################
+# ##                                                                       ##
+# ##  SECTION B — INTEGRATION STRESS                                       ##
+# ##                                                                       ##
+# ##  Chains real parsers across every assistant turn so parser-derived    ##
+# ##  ``parsed_msg`` accumulates in ``session.messages``, then runs        ##
+# ##  ``prepare_pretokenized → merge_tokens`` against that parser-tainted  ##
+# ##  history with a complex env follow-up.                                ##
+# ##                                                                       ##
+# ##  Section A covers each invariant in isolation. A failure here that    ##
+# ##  does NOT reproduce in section A indicates a parser-interaction       ##
+# ##  regression specific to accumulated multi-turn state.                 ##
+# ##                                                                       ##
+# ###########################################################################
+# ###########################################################################
+
+
+@dataclass(frozen=True)
+class _BossFlow:
+    """One parametrized scenario for the Section B integration-stress test.
+
+    Each flow pairs a source trajectory with the env messages that are
+    appended once the trajectory's assistant turns have been replayed.
+    """
+
+    # Human-readable label; used as the pytest parametrize id and quoted in
+    # assertion / failure messages.
+    name: str
+    # Trajectory class that provides ``MESSAGES`` and, optionally, ``TOOLS``.
+    trajectory_cls: type
+    # Messages appended to the stored session history for the final
+    # ``prepare_pretokenized`` call. Each non-empty ``content`` also serves
+    # as a marker that must appear, in order, in the incremental tokens.
+    final_env: list[dict]
+
+
+# Synthesized thinking variant of ``MultiToolSingleTurnTrajectory`` (the
+# parallel-tools trajectory). Built once at module load so that
+# ``_BOSS_FLOWS`` can reference it as a stable, module-level type.
+_MultiToolSingleTurnThinking = _with_synthetic_thinking(MultiToolSingleTurnTrajectory)
+
+
+# Section B scenarios, declared as (name, trajectory class, final env chain)
+# triples. The final env chains cover: a single tool result, alternating
+# tool/user messages, a lone system injection, and a mixed
+# tool/user/system chain.
+_BOSS_FLOWS: list[_BossFlow] = [
+    _BossFlow(name=flow_name, trajectory_cls=flow_cls, final_env=flow_env)
+    for flow_name, flow_cls, flow_env in (
+        (
+            "multi_turn_thinking + tool_followup",
+            MultiTurnThinkingTrajectory,
+            [
+                {"role": "tool", "tool_call_id": "boss_call_1", "content": "_boss_tool_followup_xyz_42_"},
+            ],
+        ),
+        (
+            "multi_tool_multi_turn_thinking + alternating_user_tool_followup",
+            LongChainThinkingTrajectory,
+            [
+                {"role": "tool", "tool_call_id": "boss_call_2a", "content": "_boss_alt_tool1_aaa_"},
+                {"role": "user", "content": "_boss_alt_user1_bbb_"},
+                {"role": "tool", "tool_call_id": "boss_call_2b", "content": "_boss_alt_tool2_ccc_"},
+                {"role": "user", "content": "_boss_alt_user2_ddd_"},
+            ],
+        ),
+        (
+            "multi_tool_single_turn_thinking + system_inject",
+            _MultiToolSingleTurnThinking,
+            [
+                {"role": "system", "content": "_boss_system_inject_def_77_"},
+            ],
+        ),
+        (
+            "multi_tool_multi_turn_thinking + complex_env_chain",
+            LongChainThinkingTrajectory,
+            [
+                {"role": "tool", "tool_call_id": "boss_call_4a", "content": "_boss_chain_tool1_AAA_"},
+                {"role": "user", "content": "_boss_chain_user1_BBB_"},
+                {"role": "tool", "tool_call_id": "boss_call_4b", "content": "_boss_chain_tool2_CCC_"},
+                {"role": "system", "content": "_boss_chain_system_DDD_"},
+                {"role": "tool", "tool_call_id": "boss_call_4c", "content": "_boss_chain_tool3_EEE_"},
+            ],
+        ),
+    )
+]
+
+
+def _run_parsers_on_emit(
+    raw_emit: str,
+    tools: list[dict] | None,
+    *,
+    fcp_cls,
+    rp_cls,
+    has_reasoning: bool,
+) -> tuple[str, list[dict], str]:
+    """Parse a raw assistant emit with the real SGLang parsers.
+
+    The reasoning parser runs first (only when ``has_reasoning`` is set and
+    a reasoning parser is configured); the function-call parser then runs
+    on whatever text remains.
+
+    Returns ``(parsed_content, parsed_tool_calls, parsed_reasoning)``.
+    Calls ``pytest.skip`` when a required SGLang component cannot be
+    imported or constructed.
+    """
+    # Stage 1 (optional): split the reasoning segment off the emit.
+    reasoning_text = ""
+    remaining_text = raw_emit
+    if has_reasoning and _K2V3_REASONING_PARSER:
+        if rp_cls is None:
+            pytest.skip("sglang reasoning parser not importable")
+        try:
+            reasoning_parser = rp_cls(model_type=_K2V3_REASONING_PARSER)
+        except Exception as exc:
+            pytest.skip(f"reasoning parser {_K2V3_REASONING_PARSER!r} unsupported by this SGLang build: {exc}")
+        reasoning_out, normal_out = reasoning_parser.parse_non_stream(raw_emit)
+        reasoning_text = reasoning_out or ""
+        remaining_text = "" if normal_out is None else normal_out
+
+    # Stage 2: extract tool calls from the post-reasoning text.
+    try:
+        from sglang.srt.entrypoints.openai.protocol import Tool as SGLangTool
+    except ImportError as exc:
+        pytest.skip(f"sglang.srt.entrypoints.openai.protocol.Tool not importable: {exc}")
+    sglang_tools = [SGLangTool(**tool_spec) for tool_spec in (tools or [])]
+    try:
+        tool_parser = fcp_cls(tools=sglang_tools, tool_call_parser=_K2V3_TOOL_PARSER)
+    except Exception as exc:
+        pytest.skip(f"tool parser {_K2V3_TOOL_PARSER!r} unsupported by this SGLang build: {exc}")
+    normal_text, call_items = tool_parser.parse_non_stream(remaining_text)
+    content_text = "" if normal_text is None else normal_text
+
+    # Re-shape parser items into OpenAI-style tool_call dicts with
+    # positional ids (``call_0``, ``call_1``, ...).
+    tool_calls: list[dict] = []
+    for position, item in enumerate(call_items):
+        tool_calls.append(
+            {
+                "id": f"call_{position}",
+                "type": "function",
+                "function": {"name": item.name, "arguments": item.parameters},
+            }
+        )
+    tool_calls = _try_json_decode_tool_args(tool_calls)
+    return content_text, tool_calls, reasoning_text
+
+
+def _drive_one_assistant_turn_through_real_parsers(
+    session: LinearTrajectory,
+    tito_tok,
+    *,
+    fcp_cls,
+    rp_cls,
+    request_messages: list[dict],
+    truth_assistant_msg: dict,
+    tools: list[dict] | None,
+) -> dict:
+    """Replay one assistant turn through the real SGLang parsers.
+
+    ``truth_assistant_msg`` is rendered to the raw emit text, that text is
+    parsed back into an assistant message, and the session is advanced
+    with the *parsed* message (NOT ``truth_assistant_msg`` — production
+    stores parser output in the messages history).
+
+    Returns the parser-derived assistant message.
+    """
+    tokenizer = tito_tok.tokenizer
+
+    # The raw emit is whatever the template adds after the generation
+    # prompt once the ground-truth assistant message is appended.
+    rendered_full = _render_text(
+        request_messages + [truth_assistant_msg],
+        tokenizer,
+        tools,
+        add_generation_prompt=False,
+    )
+    rendered_prompt = _render_text(request_messages, tokenizer, tools, add_generation_prompt=True)
+    assert rendered_full.startswith(rendered_prompt), (
+        "chat template not append-only between render(request_messages) and render(request_messages + [truth_msg])"
+    )
+    raw_emit = rendered_full[len(rendered_prompt) :].rstrip("\n")
+    assert raw_emit.endswith("<|im_end|>"), f"unexpected raw_emit shape: {raw_emit!r}"
+
+    has_reasoning = bool(truth_assistant_msg.get("reasoning_content"))
+    content, tool_calls, reasoning = _run_parsers_on_emit(
+        raw_emit,
+        tools,
+        fcp_cls=fcp_cls,
+        rp_cls=rp_cls,
+        has_reasoning=has_reasoning,
+    )
+
+    parsed_msg: dict = {"role": "assistant", "content": content, "tool_calls": tool_calls}
+    if has_reasoning:
+        parsed_msg["reasoning_content"] = reasoning
+
+    # Use the session's pretokenized prompt when it provides one; otherwise
+    # fall back to a fresh canonical render.
+    pretokenized = session.prepare_pretokenized(request_messages, tools, tito_tokenizer=tito_tok)
+    prompt_ids = (
+        list(pretokenized["input_ids"])
+        if pretokenized is not None
+        else _render_ids(request_messages, tokenizer, tools, add_generation_prompt=True)
+    )
+    completion_ids = list(tokenizer.encode(raw_emit, add_special_tokens=False))
+
+    session.update_pretokenized_state(
+        request_messages=list(request_messages),
+        assistant_message=parsed_msg,
+        prompt_token_ids=prompt_ids,
+        completion_token_ids=completion_ids,
+        max_trim_tokens=tito_tok.max_trim_tokens,
+    )
+    return parsed_msg
+
+
+@pytest.mark.parametrize("flow", _BOSS_FLOWS, ids=lambda f: f.name)
+def test_end_to_end_realistic_rollout_with_real_parsers(flow: _BossFlow, tito_tok):
+    """Invariants I3+I4 under integration stress.
+
+    Every assistant turn of a multi-turn trajectory is replayed through
+    the real parsers, so ``session.messages`` accumulates parser-derived
+    messages across turns. The flow's final env chain is then appended and
+    ``merge_tokens`` over that parser-tainted history must still match the
+    canonical render.
+
+    A failure here that does not reproduce in the simpler per-shape tests
+    points at a parser-interaction regression specific to accumulated
+    session state.
+
+    Skips if SGLang parsers are unavailable.
+    """
+    FCP, RP = _load_sglang_parsers()
+    if FCP is None:
+        pytest.skip("sglang.srt.function_call.function_call_parser not importable")
+
+    messages = deepcopy(flow.trajectory_cls.MESSAGES)
+    tools = deepcopy(getattr(flow.trajectory_cls, "TOOLS", None))
+    asst_indices = _assistant_indices(messages)
+    assert asst_indices, f"boss flow {flow.name} has no assistant turns"
+
+    session = LinearTrajectory()
+
+    # ``history`` is the request prefix carried between turns: every prior
+    # ground-truth assistant message is replaced by its parser-derived
+    # counterpart. ``prev_asst_idx`` starts at -1 so the first slice covers
+    # everything before the first assistant turn (typically [system, user]).
+    history: list[dict] = []
+    prev_asst_idx = -1
+    for asst_idx in asst_indices:
+        # Env messages (tool results, user follow-ups, ...) that sit between
+        # the previous assistant turn and this one.
+        env_between = list(messages[prev_asst_idx + 1 : asst_idx])
+        request_messages = history + env_between
+
+        parsed_msg = _drive_one_assistant_turn_through_real_parsers(
+            session,
+            tito_tok,
+            fcp_cls=FCP,
+            rp_cls=RP,
+            request_messages=request_messages,
+            truth_assistant_msg=messages[asst_idx],
+            tools=tools,
+        )
+        history = request_messages + [parsed_msg]
+        prev_asst_idx = asst_idx
+
+    # Final env follow-up — triggers prepare_pretokenized → merge_tokens
+    # over a session.messages fully populated by parser-derived messages.
+    extended = [*session.messages, *flow.final_env]
+    pre = session.prepare_pretokenized(extended, tools, tito_tokenizer=tito_tok)
+    assert pre is not None, (
+        f"K2V3 [boss/{flow.name}] setup error: prepare_pretokenized "
+        f"returned None even though session has "
+        f"{len(session.messages)} stored messages"
+    )
+    merged = list(pre["input_ids"])
+
+    expected = _render_ids(extended, tito_tok.tokenizer, tools, add_generation_prompt=True)
+
+    def _describe(mismatch) -> str:
+        # One report line per mismatch; the detail suffix is optional.
+        suffix = f" — {mismatch.detail}" if mismatch.detail else ""
+        return (
+            f"  {mismatch.type.value} at segment {mismatch.segment_index}: "
+            f"expected={mismatch.expected_text!r} actual={mismatch.actual_text!r}"
+        ) + suffix
+
+    all_mismatches = tito_tok.create_comparator().compare_sequences(expected, merged)
+    # ASSISTANT_TEXT-only mismatches are tolerated; everything else is severe.
+    severe = [m for m in all_mismatches if m.type != MismatchType.ASSISTANT_TEXT]
+    if severe:
+        details = "\n".join(map(_describe, severe[:8]))
+        pytest.fail(
+            f"K2V3 [boss/{flow.name}] integration mismatch: "
+            f"merged input_ids vs canonical render diverge after multi-turn "
+            f"parser-driven flow.\n"
+            f"  first_diff: {_first_diff(expected, merged)}\n{details}\n"
+            f"({len(severe)} severe mismatch(es) total; "
+            f"showing first {min(8, len(severe))}.)"
+        )
+
+    # Required-content marker check on the incremental segment: every
+    # non-empty final-env content must appear in the newly added tokens,
+    # in the same order as the env chain.
+    stored_len = len(list(session.token_ids))
+    incremental_text = tito_tok.tokenizer.decode(merged[stored_len:], skip_special_tokens=False)
+    env_markers = [env_msg.get("content", "") for env_msg in flow.final_env]
+    search_from = 0
+    for marker in filter(None, env_markers):
+        hit = incremental_text.find(marker, search_from)
+        assert hit >= 0, (
+            f"K2V3 [boss/{flow.name}] env marker {marker!r} missing "
+            f"from incremental tokens (or out of order). "
+            f"incremental_text={incremental_text!r}"
+        )
+        search_from = hit + len(marker)
+
+
+# ###########################################################################
+# ###########################################################################
+# ##                                                                       ##
+# ##  SECTION C — SANITY (orthogonal to I1-I4)                             ##
+# ##                                                                       ##
+# ##  Guards on adjacent runtime defenses and registry wiring — these do   ##
+# ##  not test the boundary-fix invariants themselves but catch nearby     ##
+# ##  regressions that would silently disable the protection above.        ##
+# ##                                                                       ##
+# ###########################################################################
+# ###########################################################################
+
+
+def test_production_prefix_check_raises_on_intentional_violation(tito_tok):
+    """Guard the runtime prefix check inside ``update_pretokenized_state``.
+
+    Once a normal turn has been stored, a second update whose
+    ``prompt_token_ids`` do not extend the stored prefix must raise
+    ``TokenizationError``. If a refactor disables that check this test
+    fails — protecting the runtime defense that catches the same class of
+    bugs in real rollouts.
+    """
+    tokenizer = tito_tok.tokenizer
+    session = LinearTrajectory()
+    first_user = {"role": "user", "content": "Test."}
+    first_reply = {"role": "assistant", "content": "ok"}
+
+    # Seed: drive a single normal turn so the session has stored token_ids.
+    seed_prompt_ids = _render_ids([first_user], tokenizer, tools=None, add_generation_prompt=True)
+    seed_completion_ids = list(tokenizer.encode("ok", add_special_tokens=False))
+    eos = getattr(tokenizer, "eos_token_id", None)
+    if eos is not None:
+        eos_id = int(eos)
+        # Terminate the completion with EOS unless it already ends with it.
+        if seed_completion_ids[-1:] != [eos_id]:
+            seed_completion_ids.append(eos_id)
+    session.update_pretokenized_state(
+        request_messages=[first_user],
+        assistant_message=first_reply,
+        prompt_token_ids=seed_prompt_ids,
+        completion_token_ids=seed_completion_ids,
+        max_trim_tokens=tito_tok.max_trim_tokens,
+    )
+
+    # Now feed a bogus prompt — longer than, and completely different from,
+    # what the session has stored.
+    stored_len = len(session.token_ids)
+    violating_update = {
+        "request_messages": [first_user, first_reply, {"role": "tool", "content": "irrelevant"}],
+        "assistant_message": {"role": "assistant", "content": "next"},
+        "prompt_token_ids": [99999] * (stored_len + 5),
+        "completion_token_ids": [12345],
+        "max_trim_tokens": 0,
+    }
+
+    with pytest.raises(TokenizationError, match=r"pretokenized prefix mismatch"):
+        session.update_pretokenized_state(**violating_update)
+
+
+def test_k2v3_oldbackup_subclass_is_wired(tito_tok):
+    """Sanity: the ``tito_tok`` fixture is a ``K2V3OldBackupTITOTokenizer``.
+
+    ``get_tito_tokenizer(..., TITOTokenizerType.K2V3_OLDBACKUP)`` must return
+    the K2V3OldBackup subclass — not silently fall back to the base
+    ``TITOTokenizer`` or accidentally to the current K2V3 class. Catches a
+    future regression where the registry entry is removed or pointed
+    elsewhere.
+    """
+    from miles.utils.chat_template_utils.tito_tokenizer import K2V3OldBackupTITOTokenizer
+
+    actual_cls_name = type(tito_tok).__name__
+    is_oldbackup = isinstance(tito_tok, K2V3OldBackupTITOTokenizer)
+    assert is_oldbackup, (
+        f"expected K2V3OldBackupTITOTokenizer, got {actual_cls_name}. "
+        f"_TOKENIZER_REGISTRY[TITOTokenizerType.K2V3_OLDBACKUP] may be misregistered."
+    )