Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 33 additions & 7 deletions core/backend/llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,28 @@ func needsThinkingProbe(c *config.ModelConfig) bool {
c.ReasoningConfig.DisableReasoningTagPrefill == nil)
}

// persistProbedReasoning copies the outcome of a thinking/media-marker probe
// from probed into the config the loader keeps for modelName.
//
// The media marker is copied whenever probed carries a non-empty one. Each
// reasoning slot is copied only when its guard flag is true:
//
//   - persistDisableReasoning gates ReasoningConfig.DisableReasoning
//   - persistDisableTagPrefill gates ReasoningConfig.DisableReasoningTagPrefill
//
// Callers must compute both flags from probed's reasoning slots *before*
// running the probe. A slot that was already non-nil at that point was set by
// request-time ApplyReasoningEffort rather than by backend detection, and
// writing it back would masquerade as an operator's explicit
// reasoning.disable (see #10622).
func persistProbedReasoning(cl *config.ModelConfigLoader, modelName string, probed *config.ModelConfig, persistDisableReasoning, persistDisableTagPrefill bool) {
	// The three assignments are independent of each other, so their order
	// inside the update callback does not matter.
	apply := func(stored *config.ModelConfig) {
		if marker := probed.MediaMarker; marker != "" {
			stored.MediaMarker = marker
		}
		if persistDisableTagPrefill {
			stored.ReasoningConfig.DisableReasoningTagPrefill = probed.ReasoningConfig.DisableReasoningTagPrefill
		}
		if persistDisableReasoning {
			stored.ReasoningConfig.DisableReasoning = probed.ReasoningConfig.DisableReasoning
		}
	}
	cl.UpdateModelConfig(modelName, apply)
}

// HasChatDeltaContent returns true if any chat delta carries content or reasoning text.
// Used to decide whether to prefer C++ autoparser deltas over Go-side tag extraction.
func (t TokenUsage) HasChatDeltaContent() bool {
Expand Down Expand Up @@ -127,15 +149,19 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
needsMarkerProbe := c.MediaMarker == ""
if shouldProbeThinking || needsMarkerProbe {
modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath)
// DetectThinkingSupportFromBackend only fills reasoning slots that are
// still nil, so a slot that already carries a value here was populated by
// request-time ApplyReasoningEffort (e.g. a `reasoning_effort: none`
// default), not by backend detection. Persisting such a request-scoped
// value would masquerade as an operator's explicit reasoning.disable and
// permanently defeat future per-request reasoning_effort overrides
// (see #10622). Only persist the slots the probe is actually allowed to
// fill.
persistDisableReasoning := c.ReasoningConfig.DisableReasoning == nil
persistDisableTagPrefill := c.ReasoningConfig.DisableReasoningTagPrefill == nil
config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts)
// Update the config in the loader so it persists for future requests
cl.UpdateModelConfig(c.Name, func(cfg *config.ModelConfig) {
cfg.ReasoningConfig.DisableReasoning = c.ReasoningConfig.DisableReasoning
cfg.ReasoningConfig.DisableReasoningTagPrefill = c.ReasoningConfig.DisableReasoningTagPrefill
if c.MediaMarker != "" {
cfg.MediaMarker = c.MediaMarker
}
})
persistProbedReasoning(cl, c.Name, c, persistDisableReasoning, persistDisableTagPrefill)
}

var protoMessages []*proto.Message
Expand Down
89 changes: 89 additions & 0 deletions core/backend/llm_probe_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package backend

import (
"os"

"github.com/mudler/LocalAI/core/config"

"github.com/gpustack/gguf-parser-go/util/ptr"
Expand All @@ -27,3 +29,90 @@ var _ = Describe("thinking probe gating", func() {
Expect(needsThinkingProbe(cfg)).To(BeFalse())
})
})

var _ = Describe("persistProbedReasoning", func() {
	const (
		modelName = "probe-test"
		// baseYAML is the minimal persisted config shared by the specs: it
		// carries no reasoning section and no media marker.
		baseYAML = "name: probe-test\nbackend: llama-cpp\n"
	)

	// loaderFromYAML writes body to a throwaway YAML file and reads it into a
	// fresh ModelConfigLoader, mirroring how the loader is populated from
	// disk. The temp file is removed once the loader has consumed it.
	loaderFromYAML := func(body string) *config.ModelConfigLoader {
		f, err := os.CreateTemp("", "persist-probed-reasoning-*.yaml")
		Expect(err).ToNot(HaveOccurred())
		path := f.Name()
		// Best-effort cleanup; a failed removal must not fail the spec.
		defer func() { _ = os.Remove(path) }()

		_, err = f.WriteString(body)
		Expect(err).ToNot(HaveOccurred())
		Expect(f.Close()).To(Succeed())

		loader := config.NewModelConfigLoader("")
		Expect(loader.ReadModelConfig(path)).To(Succeed())
		return loader
	}

	// newProbed returns an otherwise-empty request-scoped config named after
	// the model under test; each spec fills in the slots it cares about.
	newProbed := func() *config.ModelConfig {
		m := &config.ModelConfig{}
		m.Name = modelName
		return m
	}

	It("persists a reasoning slot the probe was allowed to fill (was nil beforehand)", func() {
		loader := loaderFromYAML(baseYAML)

		probed := newProbed()
		// Backend detection result: the model supports thinking.
		probed.ReasoningConfig.DisableReasoning = ptr.To(false)
		probed.ReasoningConfig.DisableReasoningTagPrefill = ptr.To(true)

		persistProbedReasoning(loader, modelName, probed, true, true)

		stored, found := loader.GetModelConfig(modelName)
		Expect(found).To(BeTrue())
		Expect(stored.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*stored.ReasoningConfig.DisableReasoning).To(BeFalse())
		Expect(stored.ReasoningConfig.DisableReasoningTagPrefill).ToNot(BeNil())
		Expect(*stored.ReasoningConfig.DisableReasoningTagPrefill).To(BeTrue())
	})

	It("does not persist a slot that already carried a request-scoped value before the probe ran", func() {
		loader := loaderFromYAML(baseYAML)

		probed := newProbed()
		// Stands in for ApplyReasoningEffort("none") having written to the
		// request-scoped copy ahead of the probe. That is not a backend
		// detection result, so it must never land in the persisted config
		// (#10622).
		probed.ReasoningConfig.DisableReasoning = ptr.To(true)

		persistProbedReasoning(loader, modelName, probed, false, false)

		stored, found := loader.GetModelConfig(modelName)
		Expect(found).To(BeTrue())
		Expect(stored.ReasoningConfig.DisableReasoning).To(BeNil())
		Expect(stored.ReasoningConfig.DisableReasoningTagPrefill).To(BeNil())
	})

	It("preserves an operator's explicit persisted disable when the guard is false", func() {
		loader := loaderFromYAML(baseYAML + "reasoning:\n disable: true\n")

		probed := newProbed()
		// The request-scoped copy disagrees with the operator's setting; with
		// persistDisableReasoning=false the operator's value has to survive.
		probed.ReasoningConfig.DisableReasoning = ptr.To(false)

		persistProbedReasoning(loader, modelName, probed, false, false)

		stored, found := loader.GetModelConfig(modelName)
		Expect(found).To(BeTrue())
		Expect(stored.ReasoningConfig.DisableReasoning).ToNot(BeNil())
		Expect(*stored.ReasoningConfig.DisableReasoning).To(BeTrue())
	})

	It("persists the media marker regardless of the reasoning guards", func() {
		loader := loaderFromYAML(baseYAML)

		probed := newProbed()
		probed.MediaMarker = "<__media__>"

		persistProbedReasoning(loader, modelName, probed, false, false)

		stored, found := loader.GetModelConfig(modelName)
		Expect(found).To(BeTrue())
		Expect(stored.MediaMarker).To(Equal("<__media__>"))
	})
})
Loading