diff --git a/core/backend/llm.go b/core/backend/llm.go
index 4f6b4d216b5a..409e02e5d26e 100644
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -47,6 +47,32 @@ func needsThinkingProbe(c *config.ModelConfig) bool {
 		c.ReasoningConfig.DisableReasoningTagPrefill == nil)
 }
 
+// persistProbedReasoning writes the post-probe reasoning slots (and media
+// marker) from probed back into the loader's persisted config for modelName,
+// skipping any reasoning slot the probe was not actually allowed to fill.
+// persistDisableReasoning/persistDisableTagPrefill must be snapshotted from
+// probed's reasoning slots *before* the probe ran: a slot that already
+// carried a value at that point was populated by request-time
+// ApplyReasoningEffort, not by backend detection, and persisting it would
+// masquerade as an operator's explicit reasoning.disable (see #10622).
+//
+// A slot that is still nil after the probe carries no detection result and is
+// never written back: storing nil records nothing new and could only erase a
+// value that is already persisted (for example by a concurrent request).
+func persistProbedReasoning(cl *config.ModelConfigLoader, modelName string, probed *config.ModelConfig, persistDisableReasoning, persistDisableTagPrefill bool) {
+	cl.UpdateModelConfig(modelName, func(cfg *config.ModelConfig) {
+		if persistDisableReasoning && probed.ReasoningConfig.DisableReasoning != nil {
+			cfg.ReasoningConfig.DisableReasoning = probed.ReasoningConfig.DisableReasoning
+		}
+		if persistDisableTagPrefill && probed.ReasoningConfig.DisableReasoningTagPrefill != nil {
+			cfg.ReasoningConfig.DisableReasoningTagPrefill = probed.ReasoningConfig.DisableReasoningTagPrefill
+		}
+		if probed.MediaMarker != "" {
+			cfg.MediaMarker = probed.MediaMarker
+		}
+	})
+}
+
 // HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
 // Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
 func (t TokenUsage) HasChatDeltaContent() bool {
@@ -127,15 +153,19 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	needsMarkerProbe := c.MediaMarker == ""
 	if shouldProbeThinking || needsMarkerProbe {
 		modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
+		// DetectThinkingSupportFromBackend only fills reasoning slots that are
+		// still nil, so a slot that already carries a value here was populated by
+		// request-time ApplyReasoningEffort (e.g. a `reasoning_effort: none`
+		// default), not by backend detection. Persisting such a request-scoped
+		// value would masquerade as an operator's explicit reasoning.disable and
+		// permanently defeat future per-request reasoning_effort overrides
+		// (see #10622). Only persist the slots the probe is actually allowed to
+		// fill.
+		persistDisableReasoning := c.ReasoningConfig.DisableReasoning == nil
+		persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil
 		config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
 		// Update the config in the loader so it persists for future requests
-		cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
-			cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
-			cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
-			if c.MediaMarker != "" {
-				cfg.MediaMarker = c.MediaMarker
-			}
-		})
+		persistProbedReasoning(cl, c.Name, c, persistDisableReasoning, persistDisableTagPrefill)
 	}
 
 	var protoMessages []*proto.Message
diff --git a/core/backend/llm_probe_test.go b/core/backend/llm_probe_test.go
index 73ed9f967b85..29b68f5f52a0 100644
--- a/core/backend/llm_probe_test.go
+++ b/core/backend/llm_probe_test.go
@@ -1,6 +1,8 @@
 package backend
 
 import (
+	"os"
+
 	"github.com/mudler/LocalAI/core/config"
 
 	"github.com/gpustack/gguf-parser-go/util/ptr"
@@ -27,3 +29,106 @@ var _ = Describe("thinking probe gating", func() {
 		Expect(needsThinkingProbe(cfg)).To(BeFalse())
 	})
 })
+
+var _ = Describe("persistProbedReasoning", func() {
+	const modelName = "probe-test"
+
+	// newLoaderWithConfig seeds a ModelConfigLoader with a single model config
+	// parsed from yamlBody, mirroring how the loader is populated from disk.
+	newLoaderWithConfig := func(yamlBody string) *config.ModelConfigLoader {
+		tmp, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml")
+		Expect(err).ToNot(HaveOccurred())
+		defer func() { _ = os.Remove(tmp.Name()) }()
+
+		_, err = tmp.WriteString(yamlBody)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(tmp.Close()).To(Succeed())
+
+		cl := config.NewModelConfigLoader("")
+		Expect(cl.ReadModelConfig(tmp.Name())).To(Succeed())
+		return cl
+	}
+
+	It("persists a reasoning slot the probe was allowed to fill (was nil beforehand)", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		probed.ReasoningConfig.DisableReasoning = ptr.To(false) // backend detected: supports thinking
+		probed.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)
+
+		persistProbedReasoning(cl, modelName, probed, true, true)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeFalse())
+		Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
+	})
+
+	It("does not persist a slot that already carried a request-scoped value before the probe ran", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		// Simulates ApplyReasoningEffort("none") having set this on the
+		// request-scoped copy before the probe ran - not a genuine backend
+		// detection, so it must never reach the persisted config (#10622).
+		probed.ReasoningConfig.DisableReasoning = ptr.To(true)
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).To(BeNil())
+		Expect(cfg.ReasoningConfig.DisableReasoningTagPrefill).To(BeNil())
+	})
+
+	It("preserves an operator's explicit persisted disable when the guard is false", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\nreasoning:\n disable: true\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		// Even if the request-scoped copy ends up holding a different value,
+		// persistDisableReasoning=false must keep the operator's own setting.
+		probed.ReasoningConfig.DisableReasoning = ptr.To(false)
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("persists the media marker regardless of the reasoning guards", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		probed.MediaMarker = "<__media__>"
+
+		persistProbedReasoning(cl, modelName, probed, false, false)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.MediaMarker).To(Equal("<__media__>"))
+	})
+
+	It("does not clobber a persisted slot when the probe left it nil", func() {
+		cl := newLoaderWithConfig("name: probe-test\nbackend: llama-cpp\nreasoning:\n disable: true\n")
+
+		probed := &config.ModelConfig{}
+		probed.Name = modelName
+		// The guards allow writing, but the probe produced no value for either
+		// slot; a nil must not erase what is already persisted.
+
+		persistProbedReasoning(cl, modelName, probed, true, true)
+
+		cfg, ok := cl.GetModelConfig(modelName)
+		Expect(ok).To(BeTrue())
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+})