mudler · Anai-Guo · Jul 1, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/core/backend/llm.go b/core/backend/llm.go
@@ -47,6 +47,28 @@ func needsThinkingProbe(c *config.ModelConfig) bool {
 			c.ReasoningConfig.DisableReasoningTagPrefill == nil)
 }
 
+// persistProbedReasoning stores probe results from probed in the config the
+// loader holds for modelName. A reasoning slot is copied only when its guard
+// flag is true; the media marker is copied whenever probed has a non-empty
+// one. Callers must derive persistDisableReasoning/persistDisableTagPrefill
+// from probed's reasoning slots *before* the probe runs: a slot that was
+// already set at that point came from request-time ApplyReasoningEffort
+// rather than backend detection, and storing it would pose as an operator's
+// explicit reasoning.disable (see #10622).
+func persistProbedReasoning(cl *config.ModelConfigLoader, modelName string, probed *config.ModelConfig, persistDisableReasoning, persistDisableTagPrefill bool) {
+	cl.UpdateModelConfig(modelName, func(stored *config.ModelConfig) {
+		if marker := probed.MediaMarker; marker != "" {
+			stored.MediaMarker = marker
+		}
+		if persistDisableTagPrefill {
+			stored.ReasoningConfig.DisableReasoningTagPrefill = probed.ReasoningConfig.DisableReasoningTagPrefill
+		}
+		if persistDisableReasoning {
+			stored.ReasoningConfig.DisableReasoning = probed.ReasoningConfig.DisableReasoning
+		}
+	})
+}
+
 // HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
 // Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
 func (t TokenUsage) HasChatDeltaContent() bool {
@@ -127,15 +149,19 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	needsMarkerProbe := c.MediaMarker == ""
 	if shouldProbeThinking || needsMarkerProbe {
 		modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
+		// DetectThinkingSupportFromBackend only fills reasoning slots that are
+		// still nil, so a slot that already carries a value here was populated by
+		// request-time ApplyReasoningEffort (e.g. a `reasoning_effort: none`
+		// default), not by backend detection. Persisting such a request-scoped
+		// value would masquerade as an operator's explicit reasoning.disable and
+		// permanently defeat future per-request reasoning_effort overrides
+		// (see #10622). Only persist the slots the probe is actually allowed to
+		// fill.
+		persistDisableReasoning := c.ReasoningConfig.DisableReasoning == nil
+		persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil
 		config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
 		// Update the config in the loader so it persists for future requests
-		cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
-			cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
-			cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
-			if c.MediaMarker != "" {
-				cfg.MediaMarker = c.MediaMarker
-			}
-		})
+		persistProbedReasoning(cl, c.Name, c, persistDisableReasoning, persistDisableTagPrefill)
 	}
 
 	var protoMessages []*proto.Message

diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go
@@ -1,6 +1,8 @@
 package backend
 
 import (
+	"os"
+
 	"github.com/mudler/LocalAI/core/config"
 
 	"github.com/gpustack/gguf-parser-go/util/ptr"
@@ -27,3 +29,90 @@ var _ = Describe("thinking probe gating", func() {
 		Expect(needsThinkingProbe(cfg)).To(BeFalse())
 	})
 })
+
+var _ = Describe("persistProbedReasoning", func() {
+	const modelName = "probe-test"
+
+	// seedLoader writes body to a throwaway YAML file, reads it into a fresh
+	// ModelConfigLoader, and returns that loader; the file is removed on return.
+	seedLoader := func(body string) *config.ModelConfigLoader {
+		f, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml")
+		Expect(err).NotTo(HaveOccurred())
+		defer func() { _ = os.Remove(f.Name()) }()
+
+		_, err = f.WriteString(body)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(f.Close()).To(Succeed())
+
+		loader := config.NewModelConfigLoader("")
+		Expect(loader.ReadModelConfig(f.Name())).To(Succeed())
+		return loader
+	}
+
+	It("persists a reasoning slot the probe was allowed to fill (was nil beforehand)", func() {
+		loader := seedLoader("name: probe-test\nbackend: llama-cpp\n")
+
+		src := &config.ModelConfig{}
+		src.Name = modelName
+		src.ReasoningConfig.DisableReasoning = ptr.To(false) // as if the backend reported thinking support
+		src.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
+
+		persistProbedReasoning(loader, modelName, src, true, true)
+
+		got, found := loader.GetModelConfig(modelName)
+		Expect(found).To(BeTrue())
+		Expect(got.ReasoningConfig.DisableReasoning).NotTo(BeNil())
+		Expect(*got.ReasoningConfig.DisableReasoning).To(BeFalse())
+		Expect(got.ReasoningConfig.DisableReasoningTagPrefill).NotTo(BeNil())
+		Expect(*got.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
+	})
+
+	It("does not persist a slot that already carried a request-scoped value before the probe ran", func() {
+		loader := seedLoader("name: probe-test\nbackend: llama-cpp\n")
+
+		src := &config.ModelConfig{}
+		src.Name = modelName
+		// Stands in for ApplyReasoningEffort("none") having written this slot on
+		// the request-scoped copy before the probe ran; it is not a backend
+		// detection result, so it must not reach the persisted config (#10622).
+		src.ReasoningConfig.DisableReasoning = ptr.To(true)
+
+		persistProbedReasoning(loader, modelName, src, false, false)
+
+		got, found := loader.GetModelConfig(modelName)
+		Expect(found).To(BeTrue())
+		Expect(got.ReasoningConfig.DisableReasoning).To(BeNil())
+		Expect(got.ReasoningConfig.DisableReasoningTagPrefill).To(BeNil())
+	})
+
+	It("preserves an operator's explicit persisted disable when the guard is false", func() {
+		loader := seedLoader("name: probe-test\nbackend: llama-cpp\nreasoning:\n  disable: true\n")
+
+		src := &config.ModelConfig{}
+		src.Name = modelName
+		// The request-scoped copy disagrees with the stored value, but with
+		// persistDisableReasoning=false the operator's own setting must survive.
+		src.ReasoningConfig.DisableReasoning = ptr.To(false)
+
+		persistProbedReasoning(loader, modelName, src, false, false)
+
+		got, found := loader.GetModelConfig(modelName)
+		Expect(found).To(BeTrue())
+		Expect(got.ReasoningConfig.DisableReasoning).NotTo(BeNil())
+		Expect(*got.ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("persists the media marker regardless of the reasoning guards", func() {
+		loader := seedLoader("name: probe-test\nbackend: llama-cpp\n")
+
+		src := &config.ModelConfig{}
+		src.Name = modelName
+		src.MediaMarker = "<__media__>"
+
+		persistProbedReasoning(loader, modelName, src, false, false)
+
+		got, found := loader.GetModelConfig(modelName)
+		Expect(found).To(BeTrue())
+		Expect(got.MediaMarker).To(Equal("<__media__>"))
+	})
+})