HandleBrokerRevocationWebhookAsync(
@@ -486,4 +814,85 @@ private static byte[] Base64UrlDecode(string value)
}
return Convert.FromBase64String(padded);
}
+
+ /// <summary>
+ /// Render the user-facing success page returned in the OAuth-callback
+ /// response. Issue #513 phase 1 asked for a "callback success → please pick
+ /// a model" prompt. The full version is a card update pushed back into
+ /// Lark, which requires capturing the /init card's adapter-owned message
+ /// id and passing it through the OAuth state token — substantial new
+ /// design surface left as a follow-up. This page is the browser-side
+ /// substitute the user sees immediately after the OAuth redirect, and it
+ /// names the next-step commands (/model, /whoami) explicitly
+ /// so the user is not left guessing what to type back in Lark.
+ /// </summary>
+ /// <remarks>
+ /// Display name comes from the id_token "name" / sub claim; HTML-encoded
+ /// before interpolation so a malicious id_token cannot inject markup.
+ /// Other error paths in the callback intentionally keep returning JSON for
+ /// ops/programmatic consumers.
+ /// </remarks>
+ internal static IResult RenderBoundSuccessHtml(string? displayName, bool alreadyBound) =>
+ RenderBoundSuccess(displayName, alreadyBound, format: null);
+
+ /// <summary>
+ /// Render the post-binding success response. Default is the HTML browser page that
+ /// users land on after clicking the OAuth approve button. Programmatic consumers
+ /// (CLI, SDK, integration tests) opt into a JSON envelope by passing
+ /// ?format=json on the callback URL — the same shape the endpoint returned
+ /// before the HTML render landed (PR #570 review #24).
+ /// </summary>
+ internal static IResult RenderBoundSuccess(string? displayName, bool alreadyBound, string? format)
+ {
+ if (string.Equals(format, "json", StringComparison.OrdinalIgnoreCase))
+ {
+ return Results.Json(new
+ {
+ status = "bound",
+ already_bound = alreadyBound,
+ display_name = string.IsNullOrWhiteSpace(displayName) ? null : displayName,
+ });
+ }
+
+ return RenderBoundSuccessHtmlInternal(displayName, alreadyBound);
+ }
+
+ internal static IResult RenderBoundSuccessHtmlInternal(string? displayName, bool alreadyBound)
+ {
+ var badge = alreadyBound ? "已绑定" : "绑定成功";
+ var heading = alreadyBound ? "NyxID 账号已绑定" : "已绑定 NyxID 账号";
+ var displayLine = string.IsNullOrWhiteSpace(displayName)
+ ? string.Empty
+ : $"账号:{System.Net.WebUtility.HtmlEncode(displayName)}
";
+ var body = alreadyBound
+ ? "当前账号已经完成绑定,无需重复操作。可以关闭此页,回到 Lark 继续对话。
"
+ : "可以关闭此页,回到 Lark 继续对话。
";
+
+ var html = $@"
+
+
+
+
+NyxID 绑定 — {badge}
+
+
+
+{badge}
+{heading}
+{displayLine}
+{body}
+
+下一步
+回到 Lark 后,发送 /model 选择想用的模型,或 /whoami 查看当前绑定状态。
+
+
+";
+ return Results.Content(html, "text/html; charset=utf-8");
+ }
}
diff --git a/agents/Aevatar.GAgents.Channel.Identity/ExternalIdentityBindingGAgent.cs b/agents/Aevatar.GAgents.Channel.Identity/ExternalIdentityBindingGAgent.cs
index 2fbbe4a21..cbfea5d09 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/ExternalIdentityBindingGAgent.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/ExternalIdentityBindingGAgent.cs
@@ -83,8 +83,8 @@ public async Task HandleCommitBinding(CommitBindingCommand cmd)
// was never activated (issue #549 follow-up: the binding scope
// missed an EnsureProjectionForActorAsync wiring while every
// other GAgent had one) leaves the readmodel empty, the OAuth
- // callback's readiness wait times out, and the next inbound
- // message's binding gate keeps re-sending the user back to /init.
+ // callback's readiness wait times out, and binding-required
+ // commands keep re-sending the user back to /init.
// Apply is identity, so the binding facts are not mutated by
// this event.
await PersistDomainEventAsync(new ExternalIdentityBindingProjectionRebuildRequestedEvent
@@ -118,11 +118,13 @@ await PersistDomainEventAsync(new ExternalIdentityBoundEvent
}
///
- /// Revokes the active binding. NO-OP when state has no active binding
- /// (e.g. concurrent /unbind, or revoke-after-revoke from invalid_grant
- /// retry). Caller must have already invoked the NyxID-side revoke
- /// (or observed invalid_grant) — this command only transitions
- /// local state.
+ /// Revokes the active binding. When state has no active binding (for
+ /// example concurrent /unbind, revoke-after-revoke from
+ /// invalid_grant, or remote-side self-heal after projection drift),
+ /// emits a no-op rebuild event so the readmodel is overwritten from the
+ /// actor's authoritative empty state. Caller must have already invoked
+ /// the NyxID-side revoke (or observed invalid_grant) — this command
+ /// only transitions local state.
///
[EventHandler]
public async Task HandleRevokeBinding(RevokeBindingCommand cmd)
@@ -138,26 +140,37 @@ public async Task HandleRevokeBinding(RevokeBindingCommand cmd)
if (!IsCommandSubjectMatchingActor(cmd.ExternalSubject))
return;
+ // Use the explicit "unspecified" sentinel so the persisted audit
+ // trail distinguishes "caller did not supply a reason" from a
+ // missing/empty value. The event Reason field is non-nullable in
+ // proto3 (defaults to ""), so the sentinel substitution lives at
+ // the boundary here rather than relying on per-call interpretation
+ // (kimi-k2p6 L109 / L124 5/5 consensus).
+ var reason = string.IsNullOrWhiteSpace(cmd.Reason) ? "unspecified" : cmd.Reason;
+
if (string.IsNullOrEmpty(State.BindingId))
{
+ // Remote revocation self-heal can land here when the actor state
+ // is already empty but the readmodel still contains an old active
+ // binding. Persisting an identity event republishes the committed
+ // state root, allowing the projector to overwrite that stale
+ // document without inventing query-time repair logic.
+ await PersistDomainEventAsync(new ExternalIdentityBindingProjectionRebuildRequestedEvent
+ {
+ Reason = $"revoke_without_active_binding:{reason}",
+ RequestedAt = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
+ });
Logger.LogInformation(
- "RevokeBinding skipped: no active binding for {Platform}:{Tenant}:{User}",
+ "RevokeBinding found no active binding for {Platform}:{Tenant}:{User}; rebuild requested so the projector materializes the authoritative empty state (reason={Reason})",
cmd.ExternalSubject.Platform,
cmd.ExternalSubject.Tenant,
- cmd.ExternalSubject.ExternalUserId);
+ cmd.ExternalSubject.ExternalUserId,
+ reason);
return;
}
var revokedBindingId = State.BindingId;
- // Use the explicit "unspecified" sentinel so the persisted audit
- // trail distinguishes "caller did not supply a reason" from a
- // missing/empty value. The event Reason field is non-nullable in
- // proto3 (defaults to ""), so the sentinel substitution lives at
- // the boundary here rather than relying on per-call interpretation
- // (kimi-k2p6 L109 / L124 5/5 consensus).
- var reason = string.IsNullOrWhiteSpace(cmd.Reason) ? "unspecified" : cmd.Reason;
-
await PersistDomainEventAsync(new ExternalIdentityBindingRevokedEvent
{
ExternalSubject = cmd.ExternalSubject.Clone(),
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionPort.cs b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionPort.cs
index 789cf2653..f530659a4 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionPort.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionPort.cs
@@ -17,13 +17,14 @@ namespace Aevatar.GAgents.Channel.Identity;
/// Pre-this-port, the binding scope was never activated for any actor and
/// every legacy cluster's binding readmodel was empty even when the
/// actor's State held an active binding — the OAuth callback's readiness
-/// wait would time out, and the next inbound message's binding gate would
-/// keep sending the user back to /init forever (issue #549 follow-up
+/// wait would time out, and binding-required commands would keep sending
+/// the user back to /init forever (issue #549 follow-up
/// observed 2026-05-01: CommitBinding discarded: already bound
/// without a corresponding readmodel materialization).
///
public sealed class ExternalIdentityBindingProjectionPort
- : MaterializationProjectionPortBase
+ : MaterializationProjectionPortBase,
+ IExternalIdentityBindingProjectionPort
{
public const string ProjectionKind = "external-identity-binding";
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionQueryPort.cs b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionQueryPort.cs
index 17c099775..dc737a605 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionQueryPort.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionQueryPort.cs
@@ -8,8 +8,9 @@ namespace Aevatar.GAgents.Channel.Identity;
/// Reads through the projection
/// document reader (Elasticsearch / in-memory provider). No event-store replay,
/// no actor state mirror, no query-time priming — see ADR-0018 §Projection
-/// Readiness. A miss returns null; callers MUST drive the sender to
-/// /init rather than fall back to bot-owner credentials.
+/// Readiness. A miss returns null; binding-required command handlers can
+/// prompt /init, while normal LLM turns may fall back to bot-owner
+/// credentials.
///
public sealed class ExternalIdentityBindingProjectionQueryPort
: IExternalIdentityBindingQueryPort
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionReadinessPort.cs b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionReadinessPort.cs
index 7ad020a53..102103012 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionReadinessPort.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjectionReadinessPort.cs
@@ -57,6 +57,8 @@ expectedBindingId is null
private static bool Matches(ExternalIdentityBindingDocument? document, string? expectedBindingId)
{
+ if (expectedBindingId is null && document is null)
+ return true;
if (document is null)
return false;
if (expectedBindingId is null)
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjector.cs b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjector.cs
index bb3b77112..3247aa73e 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjector.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Projection/ExternalIdentityBindingProjector.cs
@@ -3,6 +3,8 @@
using Aevatar.CQRS.Projection.Runtime.Abstractions;
using Aevatar.CQRS.Projection.Stores.Abstractions;
using Aevatar.Foundation.Abstractions;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
namespace Aevatar.GAgents.Channel.Identity;
@@ -14,18 +16,31 @@ namespace Aevatar.GAgents.Channel.Identity;
/// the write dispatcher. Read side (`IExternalIdentityBindingQueryPort`)
/// reads the same documents — see ADR-0018 §Projection Readiness.
///
+///
+/// READMODEL CONTRACT: when state.BindingId is empty (revoked / never bound),
+/// the projector DELETES the document rather than upserting an inactive record. This
+/// is a deliberate semantic change from earlier builds that left an inactive document
+/// behind: IExternalIdentityBindingQueryPort.ResolveAsync returns null
+/// for revoked bindings now, which lets ExternalIdentityBindingProjectionReadinessPort.Matches
+/// match the (null, null) tuple cleanly. Downstream consumers that want the
+/// audit history (e.g. admin dashboards) must consume the committed-event log directly
+/// — they cannot rely on a tombstone in the readmodel.
+///
public sealed class ExternalIdentityBindingProjector
: ICurrentStateProjectionMaterializer
{
private readonly IProjectionWriteDispatcher _writeDispatcher;
private readonly IProjectionClock _clock;
+ private readonly ILogger _logger;
public ExternalIdentityBindingProjector(
IProjectionWriteDispatcher writeDispatcher,
- IProjectionClock clock)
+ IProjectionClock clock,
+ ILogger? logger = null)
{
_writeDispatcher = writeDispatcher ?? throw new ArgumentNullException(nameof(writeDispatcher));
_clock = clock ?? throw new ArgumentNullException(nameof(clock));
+ _logger = logger ?? NullLogger.Instance;
}
public async ValueTask ProjectAsync(
@@ -56,6 +71,17 @@ public async ValueTask ProjectAsync(
UpdatedAt = CommittedStateEventEnvelope.ResolveTimestamp(envelope, _clock.UtcNow),
};
+ if (string.IsNullOrEmpty(document.BindingId))
+ {
+ _logger.LogWarning(
+ "Deleting external identity binding document {DocumentId} because projected BindingId is empty. event={EventId}, version={Version}",
+ document.Id,
+ document.LastEventId,
+ document.StateVersion);
+ await _writeDispatcher.DeleteAsync(document.Id, ct);
+ return;
+ }
+
await _writeDispatcher.UpsertAsync(document, ct);
}
}
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Projection/IExternalIdentityBindingProjectionPort.cs b/agents/Aevatar.GAgents.Channel.Identity/Projection/IExternalIdentityBindingProjectionPort.cs
new file mode 100644
index 000000000..ddcf8bb7a
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Identity/Projection/IExternalIdentityBindingProjectionPort.cs
@@ -0,0 +1,19 @@
+using Aevatar.CQRS.Projection.Core.Orchestration;
+
+namespace Aevatar.GAgents.Channel.Identity;
+
+/// <summary>
+/// Abstraction for activating the projection materialization scope for a per-(platform,
+/// tenant, external_user_id) <see cref="ExternalIdentityBindingGAgent"/>. Consumers
+/// (OAuth endpoints, identity slash-command self-heal) must depend on this interface
+/// per CLAUDE.md "依赖反转" rather than the concrete
+/// <see cref="ExternalIdentityBindingProjectionPort"/> — that gives the host a seam to
+/// swap implementations (e.g. fire-and-forget self-heal in tests vs. a real activation
+/// service in production).
+/// </summary>
+public interface IExternalIdentityBindingProjectionPort
+{
+ Task EnsureProjectionForActorAsync(
+ string actorId,
+ CancellationToken ct = default);
+}
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthAdminOptions.cs b/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthAdminOptions.cs
new file mode 100644
index 000000000..1b4d4b8fb
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthAdminOptions.cs
@@ -0,0 +1,35 @@
+namespace Aevatar.GAgents.Channel.Identity;
+
+/// <summary>
+/// Operator credentials for the cluster-singleton OAuth client admin
+/// surface. Currently only protects the rebuild endpoint
+/// (POST /api/oauth/aevatar-client/rebuild) — see issue #549 for the
+/// production wedge that motivated it.
+/// </summary>
+/// <remarks>
+/// Bound from configuration section ChannelIdentity:Admin. When
+/// <see cref="RebuildToken"/> is empty the rebuild endpoint refuses to
+/// run (503), so a misconfigured cluster is fail-secure rather than
+/// fail-open. Production deploys set the token via env var
+/// ChannelIdentity__Admin__RebuildToken; tests/dev clusters may
+/// leave it unset and the endpoint stays disabled.
+/// </remarks>
+public sealed class AevatarOAuthAdminOptions
+{
+ /// <summary>
+ /// Configuration section name under <see cref="Microsoft.Extensions.Configuration.IConfiguration"/>.
+ /// </summary>
+ public const string SectionName = "ChannelIdentity:Admin";
+
+ /// <summary>
+ /// Header callers send the rebuild token in. Constant-time compared to
+ /// <see cref="RebuildToken"/>; mismatch returns 401.
+ /// </summary>
+ public const string RebuildTokenHeader = "X-Aevatar-Admin-Token";
+
+ /// <summary>
+ /// Shared secret required on the rebuild endpoint. Empty disables the
+ /// endpoint entirely (fail-secure default).
+ /// </summary>
+ public string RebuildToken { get; set; } = string.Empty;
+}
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthClientGAgent.cs b/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthClientGAgent.cs
index a4c538a84..85309562b 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthClientGAgent.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Provisioning/AevatarOAuthClientGAgent.cs
@@ -289,9 +289,19 @@ private Task AbsorbPeerHmacSeedAsync(EventStoreOptimisticConcurrencyExcept
/// production bootstrap path uses
/// instead so the actor (not the
/// caller) mediates the DCR call. Idempotent: re-issuing the same
- /// triple is a no-op. Always seeds a fresh HMAC key when the state has
- /// none — bootstrap and provisioning are single-step.
+ /// snapshot (client_id + authority + redirect_uri + oauth_scope) is a
+ /// no-op. Always seeds a fresh HMAC key when the state has none —
+ /// bootstrap and provisioning are single-step.
///
+ ///
+ /// The same-snapshot check covers redirect_uri + oauth_scope on top of
+ /// client_id + authority because the operator-rebuild path
+ /// (POST /api/oauth/aevatar-client/rebuild, issue #549) must be
+ /// able to heal a wedged actor whose state has the right client_id but
+ /// stale or empty redirect_uri / oauth_scope — leaving those drifted
+ /// would let the next bootstrap re-DCR and replace the operator's
+ /// freshly-pinned client_id with a new (orphan-creating) one.
+ ///
[EventHandler]
public async Task HandleProvision(ProvisionAevatarOAuthClientCommand cmd)
{
@@ -307,9 +317,21 @@ public async Task HandleProvision(ProvisionAevatarOAuthClientCommand cmd)
return;
}
- var sameClient = string.Equals(State.ClientId, cmd.ClientId, StringComparison.Ordinal)
- && string.Equals(State.NyxidAuthority, cmd.NyxidAuthority, StringComparison.Ordinal);
- if (!sameClient)
+ // Empty cmd field = "field not supplied by this caller", NOT "set
+ // to empty". Otherwise a legacy / pre-redirect_uri caller (e.g.
+ // ProvisionAevatarOAuthClientCommand v1 wire-compatibility, manual
+ // operator scripts that only know client_id + authority) would
+ // overwrite previously-persisted redirect_uri / oauth_scope with
+ // "" — and the next bootstrap pass would observe the cleared
+ // value, detect drift, re-DCR the freshly-pinned client, and
+ // rotate it away. Codex P1 on PR #570.
+ var redirectUri = string.IsNullOrEmpty(cmd.RedirectUri) ? State.RedirectUri : cmd.RedirectUri;
+ var oauthScope = string.IsNullOrEmpty(cmd.OauthScope) ? State.OauthScope : cmd.OauthScope;
+ var sameSnapshot = string.Equals(State.ClientId, cmd.ClientId, StringComparison.Ordinal)
+ && string.Equals(State.NyxidAuthority, cmd.NyxidAuthority, StringComparison.Ordinal)
+ && string.Equals(State.RedirectUri, redirectUri, StringComparison.Ordinal)
+ && string.Equals(State.OauthScope, oauthScope, StringComparison.Ordinal);
+ if (!sameSnapshot)
{
await PersistDomainEventAsync(new AevatarOAuthClientProvisionedEvent
{
@@ -317,12 +339,14 @@ await PersistDomainEventAsync(new AevatarOAuthClientProvisionedEvent
ClientIdIssuedAtUnix = cmd.ClientIdIssuedAtUnix,
NyxidAuthority = cmd.NyxidAuthority,
PersistedAt = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
- OauthScope = cmd.OauthScope ?? string.Empty,
+ OauthScope = oauthScope,
+ RedirectUri = redirectUri,
});
Logger.LogInformation(
- "Provisioned aevatar OAuth client: client_id={ClientId}, authority={Authority}",
+ "Provisioned aevatar OAuth client: client_id={ClientId}, authority={Authority}, redirect_uri={RedirectUri}",
cmd.ClientId,
- cmd.NyxidAuthority);
+ cmd.NyxidAuthority,
+ string.IsNullOrEmpty(redirectUri) ? "" : redirectUri);
}
if (State.HmacKey.Length == 0)
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Slash/UnbindChannelSlashCommandHandler.cs b/agents/Aevatar.GAgents.Channel.Identity/Slash/UnbindChannelSlashCommandHandler.cs
index bbbeac6e3..4a529aff0 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Slash/UnbindChannelSlashCommandHandler.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Slash/UnbindChannelSlashCommandHandler.cs
@@ -16,16 +16,16 @@ namespace Aevatar.GAgents.Channel.Identity.Slash;
public sealed class UnbindChannelSlashCommandHandler : IChannelSlashCommandHandler
{
private readonly INyxIdCapabilityBroker _broker;
- private readonly IActorRuntime _actorRuntime;
+ private readonly IActorDispatchPort _actorDispatchPort;
private readonly ILogger _logger;
public UnbindChannelSlashCommandHandler(
INyxIdCapabilityBroker broker,
- IActorRuntime actorRuntime,
+ IActorDispatchPort actorDispatchPort,
ILogger logger)
{
_broker = broker ?? throw new ArgumentNullException(nameof(broker));
- _actorRuntime = actorRuntime ?? throw new ArgumentNullException(nameof(actorRuntime));
+ _actorDispatchPort = actorDispatchPort ?? throw new ArgumentNullException(nameof(actorDispatchPort));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
@@ -70,9 +70,6 @@ public UnbindChannelSlashCommandHandler(
{
try
{
- var actor = await _actorRuntime
- .CreateAsync(actorId, ct)
- .ConfigureAwait(false);
var envelope = new EventEnvelope
{
Id = Guid.NewGuid().ToString("N"),
@@ -82,12 +79,9 @@ public UnbindChannelSlashCommandHandler(
ExternalSubject = context.Subject.Clone(),
Reason = "user_unbind",
}),
- Route = new EnvelopeRoute
- {
- Direct = new DirectRoute { TargetActorId = actorId },
- },
+ Route = EnvelopeRouteSemantics.CreateDirect("channel.identity.unbind", actorId),
};
- await actor.HandleEventAsync(envelope, ct).ConfigureAwait(false);
+ await _actorDispatchPort.DispatchAsync(actorId, envelope, ct).ConfigureAwait(false);
localDispatchError = null;
break;
}
diff --git a/agents/Aevatar.GAgents.Channel.Identity/Slash/WhoamiChannelSlashCommandHandler.cs b/agents/Aevatar.GAgents.Channel.Identity/Slash/WhoamiChannelSlashCommandHandler.cs
index 6d22caa1a..1d8302fda 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/Slash/WhoamiChannelSlashCommandHandler.cs
+++ b/agents/Aevatar.GAgents.Channel.Identity/Slash/WhoamiChannelSlashCommandHandler.cs
@@ -4,15 +4,17 @@
namespace Aevatar.GAgents.Channel.Identity.Slash;
///
-/// /whoami — show the inbound sender their current binding state. Always
-/// requires a binding; the runner short-circuits unbound senders to the
-/// /init prompt before invoking the handler.
+/// /whoami — show the inbound sender their current binding state. Issue #513
+/// Phase 6 specifies /init, /unbind, and /whoami do NOT
+/// require a binding so an unbound sender can introspect their own state
+/// without being bounced through the binding gate. Bound senders see masked
+/// binding info; unbound senders see "未绑定" with a /init hint.
///
public sealed class WhoamiChannelSlashCommandHandler : IChannelSlashCommandHandler
{
public string Name => "whoami";
- public bool RequiresBinding => true;
+ public bool RequiresBinding => false;
public ChannelSlashCommandUsage Usage => new(
Name,
@@ -28,13 +30,21 @@ public sealed class WhoamiChannelSlashCommandHandler : IChannelSlashCommandHandl
? context.SenderId
: context.SenderName;
- var lines = new[]
- {
- $"已绑定 NyxID 账号。",
- $"- 平台账号:{senderName}",
- $"- Binding ID:{Mask(bindingId)}",
- $"- 平台:{context.Subject.Platform}",
- };
+ var lines = string.IsNullOrEmpty(bindingId)
+ ? new[]
+ {
+ "未绑定 NyxID 账号。",
+ $"- 平台账号:{senderName}",
+ $"- 平台:{context.Subject.Platform}",
+ "发送 /init 完成绑定。",
+ }
+ : new[]
+ {
+ "已绑定 NyxID 账号。",
+ $"- 平台账号:{senderName}",
+ $"- Binding ID:{Mask(bindingId)}",
+ $"- 平台:{context.Subject.Platform}",
+ };
var reply = new MessageContent
{
diff --git a/agents/Aevatar.GAgents.Channel.Identity/protos/aevatar_oauth_client.proto b/agents/Aevatar.GAgents.Channel.Identity/protos/aevatar_oauth_client.proto
index ff6ae2d68..7eb286667 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/protos/aevatar_oauth_client.proto
+++ b/agents/Aevatar.GAgents.Channel.Identity/protos/aevatar_oauth_client.proto
@@ -74,16 +74,25 @@ message EnsureAevatarOAuthClientProvisionedCommand {
}
// Issued by tests / manual operator scripts that already hold a client_id
-// (e.g. seeded fixture, post-rotation retag). Bootstrap NEVER uses this —
-// it always sends EnsureAevatarOAuthClientProvisionedCommand instead so the
-// actor mediates the DCR call.
+// (e.g. seeded fixture, post-rotation retag, post-incident rebuild). Bootstrap
+// NEVER uses this — it always sends EnsureAevatarOAuthClientProvisionedCommand
+// instead so the actor mediates the DCR call.
message ProvisionAevatarOAuthClientCommand {
string client_id = 1;
int64 client_id_issued_at_unix = 2;
string nyxid_authority = 3;
// Optional diagnostic scope for manually provisioned clients. Bootstrap
- // never uses this command path; an empty value means unknown.
+ // never uses this command path; an empty value means unknown. The
+ // operator-rebuild path must set this to AevatarOAuthClientScopes
+ // .AuthorizationScope so the next bootstrap does not detect drift and
+ // re-DCR the freshly-pinned client.
string oauth_scope = 4;
+ // Optional redirect URI. The operator-rebuild path (POST /api/oauth/
+ // aevatar-client/rebuild) must set this to the resolver output so the next
+ // bootstrap does not detect redirect drift and re-DCR the freshly-pinned
+ // client. Tests / fixture seeds may leave it empty when they don't care
+ // about drift detection on a subsequent bootstrap pass.
+ string redirect_uri = 5;
}
// Issued by ops to force a fresh HMAC key rotation. Old tokens signed with
diff --git a/agents/Aevatar.GAgents.Channel.Identity/protos/external_identity_binding.proto b/agents/Aevatar.GAgents.Channel.Identity/protos/external_identity_binding.proto
index 0dcae6467..bc0586187 100644
--- a/agents/Aevatar.GAgents.Channel.Identity/protos/external_identity_binding.proto
+++ b/agents/Aevatar.GAgents.Channel.Identity/protos/external_identity_binding.proto
@@ -35,8 +35,9 @@ message CommitBindingCommand {
}
// Issued by the /unbind handler after a successful NyxID DELETE call, or by
-// the turn path on `invalid_grant` from token-exchange. NO-OP at the actor
-// when state has no active binding.
+// the turn path on `invalid_grant` from token-exchange. When state has no
+// active binding, the actor leaves binding facts unchanged but republishes
+// its authoritative state root so stale readmodels can be overwritten.
message RevokeBindingCommand {
aevatar.gagents.channel.abstractions.ExternalSubjectRef external_subject = 1;
// Free-form reason for audit (e.g. "user_unbind", "nyx_invalid_grant",
@@ -59,13 +60,12 @@ message ExternalIdentityBindingRevokedEvent {
}
// Persisted when an inbound CommitBindingCommand is discarded because the
-// actor already holds an active binding_id, OR when a deploy needs to re-
-// publish the authoritative state root for a legacy binding actor whose
-// projection scope was never activated. Apply is identity — the binding
-// facts are not mutated. The projector still sees a state-root publication
-// and materializes the existing binding into the readmodel, fixing the
-// 2026-05-01 production regression where the binding scope was missing
-// (issue #549 follow-up).
+// actor already holds an active binding_id, when RevokeBindingCommand observes
+// already-empty actor state, OR when a deploy needs to re-publish the
+// authoritative state root for a legacy binding actor whose projection scope
+// was never activated. Apply is identity — the binding facts are not mutated.
+// The projector still sees a state-root publication and materializes the
+// authoritative state into the readmodel.
message ExternalIdentityBindingProjectionRebuildRequestedEvent {
string reason = 1;
google.protobuf.Timestamp requested_at = 2;
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/ChannelMetadataKeys.cs b/agents/Aevatar.GAgents.Channel.Runtime/ChannelMetadataKeys.cs
index bb788f49a..168d25a47 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/ChannelMetadataKeys.cs
+++ b/agents/Aevatar.GAgents.Channel.Runtime/ChannelMetadataKeys.cs
@@ -39,8 +39,8 @@ public static class ChannelMetadataKeys
///
/// Authoritative outbound Lark receive_id for the current workflow run, captured at
/// agent-create time. Propagated via WorkflowChatRunRequest.Metadata so workflow
- /// modules (e.g. TwitterPublishModule) can surface their result back into the same
- /// chat without having to look up the catalog at execution time.
+ /// modules can surface their result back into the same chat without having to look up the
+ /// catalog at execution time.
///
public const string LarkReceiveId = "channel.lark.receive_id";
/// Companion to — its receive_id_type.
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.LarkCardStreaming.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.LarkCardStreaming.cs
new file mode 100644
index 000000000..a6b134361
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.LarkCardStreaming.cs
@@ -0,0 +1,512 @@
+using Aevatar.GAgents.Channel.Abstractions;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+
+namespace Aevatar.GAgents.Channel.Runtime;
+
+public sealed partial class ConversationGAgent
+{
+ private readonly Dictionary<string, LarkCardStreamingState> _larkCardStreamingStates = new(StringComparer.Ordinal);
+
+ /// <summary>
+ /// Per-turn phase of the Lark CardKit streaming pipeline. Distinct from
+ /// <see cref="NyxRelayStreamingPhase"/> (which models channel-relay edit-message
+ /// streaming): card streaming has its own lifecycle (allocate card entity, bind to
+ /// chat, stream element content, close streaming mode) and goes through the API-key
+ /// proxy directly rather than channel-relay's /reply{,/update} surface.
+ /// </summary>
+ /// <remarks>
+ /// Fallback semantics: when card creation fails (<see cref="CreationFailed"/>), the
+ /// dispatcher routes the turn to the legacy text-edit sink (NyxRelayStreamingPhase
+ /// machine). Once <see cref="Streaming"/> is reached, the card path owns the turn —
+ /// mid-stream rate-limit / table-limit failures terminate the turn at
+ /// <see cref="Terminated"/> with the last flushed text persisted as partial.
+ /// </remarks>
+ private enum LarkCardStreamingPhase
+ {
+ Idle,
+ Creating,
+ Streaming,
+ Completed,
+ Aborted,
+ Terminated,
+ CreationFailed,
+ }
+
+ private enum LarkCardStreamingGuardSource
+ {
+ AcceptInterimChunk,
+ Finalize,
+ }
+
+ ///
+ /// Actor-scoped, in-memory streaming state for one CardKit-driven turn. Keyed by
+ /// correlation_id, same lifecycle as <see cref="NyxRelayStreamingState"/>.
+ ///
+ /// Lifecycle phase; gates interim updates and finalization.
+ ///
+ /// CardKit card entity id returned by cardkit/v1/cards. Null until
+ /// <see cref="LarkCardStreamingPhase.Streaming"/>; required for every element-content
+ /// and settings update afterwards.
+ ///
+ ///
+ /// Lark IM message id returned by the im/v1/messages send that bound the card
+ /// to a chat. Used by the unavailable-guard to detect upstream message recall.
+ ///
+ ///
+ /// Preserved card id for terminal full-card update if mid-stream we fall back to text
+ /// patch (table-limit class errors). Currently always equal to <see cref="CardId"/>;
+ /// reserved for the mid-stream-fallback follow-up (#589 Scope D).
+ ///
+ ///
+ /// Last text successfully streamed into the card element. Persisted as the user-visible
+ /// terminal state when finalization fails after streaming started.
+ ///
+ ///
+ /// Monotonic counter passed to every CardKit write. Pre-incremented before each call;
+ /// Lark rejects stale writes deterministically.
+ ///
+ ///
+ /// Element id within the card to stream into. Defaults to streaming_main;
+ /// must match the card template's element naming.
+ ///
+ /// Diagnostic reason captured on entry to terminal phases.
+ private sealed record LarkCardStreamingState(
+ LarkCardStreamingPhase Phase,
+ string? CardId,
+ string? CardMessageId,
+ string? OriginalCardId,
+ string LastFlushedText,
+ long Sequence,
+ string StreamingElementId,
+ string? TerminalReason)
+ {
+ public const string DefaultStreamingElementId = "streaming_main";
+
+ public static LarkCardStreamingState Initial { get; } = new(
+ LarkCardStreamingPhase.Idle,
+ CardId: null,
+ CardMessageId: null,
+ OriginalCardId: null,
+ LastFlushedText: string.Empty,
+ Sequence: 0,
+ StreamingElementId: DefaultStreamingElementId,
+ TerminalReason: null);
+
+ /// Phase permits accepting a new chunk (initial or interim).
+ public bool AllowsInterimEdit =>
+ Phase is LarkCardStreamingPhase.Idle
+ or LarkCardStreamingPhase.Streaming;
+
+ ///
+ /// Card creation already failed — dispatcher should route subsequent chunks to the
+ /// text-edit sink for the rest of this turn.
+ ///
+ public bool AllowsTextEditFallback =>
+ Phase is LarkCardStreamingPhase.Idle
+ or LarkCardStreamingPhase.CreationFailed;
+
+ /// Phase permits attempting a finalize (close streaming + optional final update).
+ public bool AllowsFinalize =>
+ Phase is LarkCardStreamingPhase.Streaming;
+ }
+
+ private static bool IsTerminalLarkCardStreamingPhase(LarkCardStreamingPhase phase) =>
+ phase is LarkCardStreamingPhase.Completed
+ or LarkCardStreamingPhase.Aborted
+ or LarkCardStreamingPhase.Terminated
+ or LarkCardStreamingPhase.CreationFailed;
+
+ private static bool IsLegalLarkCardStreamingTransition(LarkCardStreamingPhase from, LarkCardStreamingPhase to) =>
+ (from, to) switch
+ {
+ (LarkCardStreamingPhase.Idle, LarkCardStreamingPhase.Creating) => true,
+
+ (LarkCardStreamingPhase.Creating, LarkCardStreamingPhase.Streaming) => true,
+ (LarkCardStreamingPhase.Creating, LarkCardStreamingPhase.CreationFailed) => true,
+ (LarkCardStreamingPhase.Creating, LarkCardStreamingPhase.Terminated) => true,
+
+ (LarkCardStreamingPhase.Streaming, LarkCardStreamingPhase.Streaming) => true,
+ (LarkCardStreamingPhase.Streaming, LarkCardStreamingPhase.Completed) => true,
+ (LarkCardStreamingPhase.Streaming, LarkCardStreamingPhase.Aborted) => true,
+ (LarkCardStreamingPhase.Streaming, LarkCardStreamingPhase.Terminated) => true,
+
+ _ => false,
+ };
+
+ private LarkCardStreamingState GetOrInitLarkCardStreamingState(string correlationId) =>
+ _larkCardStreamingStates.GetValueOrDefault(correlationId) ?? LarkCardStreamingState.Initial;
+
+ private static bool ShouldSkipLarkCardStreamingForUnavailable(
+ LarkCardStreamingState state,
+ LarkCardStreamingGuardSource source) =>
+ source switch
+ {
+ LarkCardStreamingGuardSource.AcceptInterimChunk => !state.AllowsInterimEdit,
+ LarkCardStreamingGuardSource.Finalize => !state.AllowsFinalize,
+ _ => false,
+ };
+
+ private LarkCardStreamingState TransitionLarkCardStreamingPhase(
+ string correlationId,
+ LarkCardStreamingState current,
+ LarkCardStreamingPhase next,
+ string? terminalReason = null,
+ Func<LarkCardStreamingState, LarkCardStreamingState>? fieldUpdate = null)
+ {
+ if (!IsLegalLarkCardStreamingTransition(current.Phase, next))
+ {
+ Logger.LogWarning(
+ "Illegal Lark card streaming phase transition {From}->{To} for correlation={CorrelationId}; keeping current state",
+ current.Phase, next, correlationId);
+ return current;
+ }
+
+ var carried = fieldUpdate?.Invoke(current) ?? current;
+ var updated = carried with
+ {
+ Phase = next,
+ TerminalReason = IsTerminalLarkCardStreamingPhase(next)
+ ? (terminalReason ?? carried.TerminalReason)
+ : carried.TerminalReason,
+ };
+ _larkCardStreamingStates[correlationId] = updated;
+ return updated;
+ }
+
+ private IConversationCardTurnRunner ResolveCardRunner() =>
+ Services.GetService<IConversationCardTurnRunner>() ?? new NullConversationCardTurnRunner();
+
+ ///
+ /// Drives one CardKit-mode streaming chunk. Returns true when the card handler owns the
+ /// outcome (Idle->Creating[->Streaming], Streaming->Streaming, terminal-drop) and false
+ /// only when the caller should fall through to the legacy text-edit path —
+ /// CreationFailed phase signals "card path is dead for this turn, route the rest of the
+ /// chunks through edit-message streaming."
+ ///
+ private async Task<bool> HandleLarkCardStreamingChunkCoreAsync(
+ LlmReplyCardStreamChunkEvent evt,
+ string correlationId)
+ {
+ var state = GetOrInitLarkCardStreamingState(correlationId);
+
+ // Already-decided text-edit fallback: let the caller continue down the text-edit path.
+ if (state.Phase is LarkCardStreamingPhase.CreationFailed)
+ return false;
+
+ if (ShouldSkipLarkCardStreamingForUnavailable(state, LarkCardStreamingGuardSource.AcceptInterimChunk))
+ return true;
+
+ var runtimeContext = BuildNyxRelayRuntimeContext(evt.CorrelationId, evt.Activity);
+ var runner = ResolveCardRunner();
+
+ if (state.Phase is LarkCardStreamingPhase.Idle)
+ {
+ TransitionLarkCardStreamingPhase(correlationId, state, LarkCardStreamingPhase.Creating);
+ var creating = GetOrInitLarkCardStreamingState(correlationId);
+ ConversationCardCreateResult createResult;
+ try
+ {
+ createResult = await runner.RunCardCreateAsync(
+ evt,
+ creating.StreamingElementId,
+ runtimeContext,
+ CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ Logger.LogWarning(ex, "Card create threw; falling back to text-edit. correlation={CorrelationId}", evt.CorrelationId);
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ creating,
+ LarkCardStreamingPhase.CreationFailed,
+ terminalReason: $"create_threw:{ex.GetType().Name}");
+ return false;
+ }
+
+ if (!createResult.Success)
+ {
+ if (createResult.IsPostSendFailure)
+ {
+ // Card was already sent to the chat — falling back to text-edit would
+ // produce a duplicate visible reply. Terminate the turn at Terminated and
+ // persist a partial-card record using the orphan card_message_id so the
+ // event store has a terminal entry. The runner has already attempted a
+ // best-effort streaming-mode close on the orphan card.
+ Logger.LogWarning(
+ "Card post-send failure (create+send succeeded, first stream failed); terminating turn without text-edit fallback. correlation={CorrelationId}, code={ErrorCode}, cardId={CardId}",
+ evt.CorrelationId,
+ createResult.ErrorCode,
+ createResult.CardId);
+ var terminated = TransitionLarkCardStreamingPhase(
+ correlationId,
+ creating,
+ LarkCardStreamingPhase.Terminated,
+ terminalReason: $"create_post_send_failed:{createResult.ErrorCode}",
+ fieldUpdate: s => s with
+ {
+ CardId = createResult.CardId,
+ CardMessageId = createResult.CardMessageId,
+ OriginalCardId = createResult.CardId,
+ });
+ await PersistCardStreamedCompletionAsync(
+ correlationId,
+ BuildLlmReplyCommandId(evt.CorrelationId),
+ evt.Activity,
+ evt.Activity,
+ terminated.CardMessageId ?? string.Empty,
+ terminated.LastFlushedText);
+ return true;
+ }
+
+ Logger.LogInformation(
+ "Card create failed; falling back to text-edit for the rest of this turn. correlation={CorrelationId}, code={ErrorCode}, rateLimited={RateLimited}, tableLimit={TableLimit}, cardUnavailable={CardUnavailable}",
+ evt.CorrelationId,
+ createResult.ErrorCode,
+ createResult.IsRateLimited,
+ createResult.IsTableLimitExceeded,
+ createResult.IsCardUnavailable);
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ creating,
+ LarkCardStreamingPhase.CreationFailed,
+ terminalReason: $"create_failed:{createResult.ErrorCode}");
+ return false;
+ }
+
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ creating,
+ LarkCardStreamingPhase.Streaming,
+ fieldUpdate: s => s with
+ {
+ CardId = createResult.CardId,
+ CardMessageId = createResult.CardMessageId,
+ OriginalCardId = createResult.CardId,
+ LastFlushedText = evt.AccumulatedText,
+ Sequence = 1,
+ });
+ return true;
+ }
+
+ // Streaming: interim element-content update. Sequence pre-incremented; on success
+ // record the new sequence + last-flushed text so finalize knows whether to write.
+ var nextSequence = state.Sequence + 1;
+ ConversationCardStreamResult streamResult;
+ try
+ {
+ streamResult = await runner.RunCardStreamAsync(
+ evt,
+ state.CardId ?? string.Empty,
+ state.StreamingElementId,
+ nextSequence,
+ runtimeContext,
+ CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ Logger.LogWarning(ex, "Card stream threw; dropping frame. correlation={CorrelationId}, seq={Sequence}", evt.CorrelationId, nextSequence);
+ return true;
+ }
+
+ if (!streamResult.Success)
+ {
+ if (streamResult.IsRateLimited)
+ {
+ // Recoverable: skip the frame, keep sequence unchanged so the next chunk
+ // re-uses this slot.
+ Logger.LogDebug(
+ "Card stream rate-limited; dropping frame. correlation={CorrelationId}, seq={Sequence}",
+ evt.CorrelationId, nextSequence);
+ return true;
+ }
+ if (streamResult.IsTableLimitExceeded || streamResult.IsCardUnavailable)
+ {
+ Logger.LogWarning(
+ "Card stream terminal failure; ending turn. correlation={CorrelationId}, code={ErrorCode}",
+ evt.CorrelationId, streamResult.ErrorCode);
+ var terminated = TransitionLarkCardStreamingPhase(
+ correlationId,
+ state,
+ LarkCardStreamingPhase.Terminated,
+ terminalReason: $"stream_failed:{streamResult.ErrorCode}");
+ // Persist the partial-card terminal record so the event store records the
+ // turn even though LlmReplyReady has not arrived yet. Without this the
+ // ProcessedCommandIds guard in HandleLlmReplyReadyAsync would still see no
+ // matching entry, fall through to the legacy reply path, and post a
+ // duplicate text reply on top of the visible card.
+ await PersistCardStreamedCompletionAsync(
+ correlationId,
+ BuildLlmReplyCommandId(evt.CorrelationId),
+ evt.Activity,
+ evt.Activity,
+ terminated.CardMessageId ?? string.Empty,
+ terminated.LastFlushedText);
+ return true;
+ }
+ Logger.LogInformation(
+ "Card stream non-terminal failure; continuing. correlation={CorrelationId}, code={ErrorCode}",
+ evt.CorrelationId, streamResult.ErrorCode);
+ return true;
+ }
+
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ state,
+ LarkCardStreamingPhase.Streaming,
+ fieldUpdate: s => s with
+ {
+ LastFlushedText = evt.AccumulatedText,
+ Sequence = nextSequence,
+ });
+ return true;
+ }
+
+ /// <summary>
+ /// Drives the card-mode finalize when <see cref="HandleLlmReplyReadyAsync"/> sees a
+ /// live Streaming phase. Persists a ConversationTurnCompletedEvent with
+ /// SentActivityId="lark-card-stream:{cardMessageId}" so observers can distinguish
+ /// the card path from the legacy nyx-relay-stream: path.
+ /// </summary>
+ private async Task<bool> TryCompleteCardStreamedReplyAsync(
+ LlmReplyReadyEvent evt,
+ string correlationId,
+ string commandId,
+ ChatActivity? referenceActivity)
+ {
+ var state = GetOrInitLarkCardStreamingState(correlationId);
+ // Idle: card path was never started for this turn (or already cleaned up); let the
+ // legacy edit-message finalize path handle it. CreationFailed: card create rejected
+ // pre-send, which already routed the chunks to the text-edit sink, so the text-edit
+ // finalize must run too. Both → return false to fall through.
+ if (state.Phase is LarkCardStreamingPhase.Idle
+ or LarkCardStreamingPhase.CreationFailed)
+ return false;
+
+ // Already-terminal card phase (post-send-failure, mid-stream rate/unavailable, or
+ // a previous finalize): persistence already happened at the transition site, so
+ // simply consume the ready event without running text-edit finalize. The
+ // ProcessedCommandIds guard in HandleLlmReplyReadyAsync also short-circuits late
+ // ready events, but returning true here keeps the contract explicit.
+ if (state.Phase is LarkCardStreamingPhase.Completed
+ or LarkCardStreamingPhase.Aborted
+ or LarkCardStreamingPhase.Terminated)
+ return true;
+
+ // Phase is Streaming or Creating. Creating during finalize is unexpected (card.create
+ // is synchronous within a single chunk's handler); treat it as Streaming with no
+ // prior interim text. Anything else falls through to text-edit, but the explicit
+ // guards above mean we only reach this point with phase=Streaming/Creating.
+ var finalText = evt.Outbound?.Text ?? string.Empty;
+ var finalDiffers = !string.IsNullOrWhiteSpace(finalText)
+ && !string.Equals(finalText, state.LastFlushedText, StringComparison.Ordinal);
+
+ var runtimeContext = BuildNyxRelayRuntimeContext(evt.CorrelationId, evt.Activity);
+ var runner = ResolveCardRunner();
+ var nextSequence = state.Sequence + 1;
+ var activityForToken = referenceActivity ?? evt.Activity ?? new ChatActivity();
+
+ ConversationCardFinalizeResult finalizeResult;
+ try
+ {
+ finalizeResult = await runner.RunCardFinalizeAsync(
+ activityForToken,
+ state.CardId ?? string.Empty,
+ state.StreamingElementId,
+ finalText,
+ finalDiffers,
+ nextSequence,
+ runtimeContext,
+ CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ Logger.LogWarning(ex, "Card finalize threw; persisting last flushed partial. correlation={CorrelationId}", evt.CorrelationId);
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ state,
+ LarkCardStreamingPhase.Terminated,
+ terminalReason: $"finalize_threw:{ex.GetType().Name}");
+ await PersistCardStreamedCompletionAsync(
+ correlationId,
+ commandId,
+ evt.Activity,
+ referenceActivity,
+ state.CardMessageId ?? string.Empty,
+ state.LastFlushedText);
+ return true;
+ }
+
+ // visibleText must match what the user actually sees on the card. Two failure modes:
+ // * Final stream write failed → card shows LastFlushedText
+ // * Final stream succeeded but close-streaming failed → card shows finalText, just
+ // with a still-blinking cursor. Persist finalText so the durable record agrees
+ // with the visible state.
+ var visibleText = finalizeResult.FinalTextWritten ? finalText : state.LastFlushedText;
+ if (finalizeResult.Success)
+ {
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ state,
+ LarkCardStreamingPhase.Completed,
+ terminalReason: "completed");
+ }
+ else
+ {
+ Logger.LogWarning(
+ "Card finalize failed; persisting partial. correlation={CorrelationId}, code={ErrorCode}",
+ evt.CorrelationId, finalizeResult.ErrorCode);
+ TransitionLarkCardStreamingPhase(
+ correlationId,
+ state,
+ LarkCardStreamingPhase.Terminated,
+ terminalReason: $"finalize_failed:{finalizeResult.ErrorCode}");
+ }
+
+ await PersistCardStreamedCompletionAsync(
+ correlationId,
+ commandId,
+ evt.Activity,
+ referenceActivity,
+ state.CardMessageId ?? string.Empty,
+ visibleText);
+ return true;
+ }
+
+ ///
+ /// Persists the terminal ConversationTurnCompletedEvent for a card-streamed turn.
+ /// Decoupled from the inbound event type so both the LlmReplyReady finalize path and the
+ /// mid-stream Terminated path (post-send-failure / table-limit / unavailable, observed
+ /// while still processing chunks) can share one writer.
+ ///
+ private async Task PersistCardStreamedCompletionAsync(
+ string correlationId,
+ string commandId,
+ ChatActivity? eventActivity,
+ ChatActivity? referenceActivity,
+ string cardMessageId,
+ string outboundText)
+ {
+ var nowMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
+ var completed = new ConversationTurnCompletedEvent
+ {
+ ProcessedActivityId = string.Empty,
+ CausationCommandId = commandId,
+ SentActivityId = $"lark-card-stream:{cardMessageId}",
+ AuthPrincipal = "bot",
+ Conversation = eventActivity?.Conversation?.Clone()
+ ?? State.Conversation?.Clone()
+ ?? new ConversationReference(),
+ Outbound = new MessageContent { Text = outboundText },
+ CompletedAtUnixMs = nowMs,
+ OutboundDelivery = ToOutboundDeliveryReceipt(eventActivity?.OutboundDelivery),
+ };
+ await PersistDomainEventAsync(completed);
+ RemoveNyxRelayReplyToken(correlationId, referenceActivity);
+ Logger.LogInformation(
+ "Completed card-streamed LLM reply: correlation={CorrelationId} cardMessageId={CardMessageId} conversation={Key}",
+ correlationId,
+ cardMessageId,
+ completed.Conversation?.CanonicalKey);
+ }
+}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.NyxRelayStreaming.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.NyxRelayStreaming.cs
new file mode 100644
index 000000000..3ba1bf86b
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.NyxRelayStreaming.cs
@@ -0,0 +1,153 @@
+using Microsoft.Extensions.Logging;
+
+namespace Aevatar.GAgents.Channel.Runtime;
+
+public sealed partial class ConversationGAgent
+{
+ ///
+ /// Per-turn phase of the NyxID-relay edit-message streaming pipeline.
+ ///
+ ///
+ /// The reply token is consumed on the first successful send. After that, only
+ /// /reply/update is valid; falling back to /reply would reuse a dead JTI
+ /// and surface as 401. The two boolean flags this enum replaces (Disabled +
+ /// SuppressInterim) failed to express that asymmetry directly, so callers had
+ /// to derive it from PlatformMessageId emptiness. The phase enum makes the
+ /// asymmetry the primary state.
+ ///
+ private enum NyxRelayStreamingPhase
+ {
+ Idle,
+ PlaceholderSent,
+ Streaming,
+ SuppressingInterim,
+ DisabledPreSend,
+ TerminalSucceeded,
+ TerminalPartial,
+ }
+
+ ///
+ /// Identifies which streaming entry point is asking the unavailable guard to decide
+ /// whether to short-circuit. Different sources have different "should I bail?" semantics.
+ ///
+ private enum NyxRelayStreamingGuardSource
+ {
+ AcceptInterimChunk,
+ Finalize,
+ }
+
+ ///
+ /// Actor-scoped, in-memory streaming state for one conversation turn. Never persisted.
+ /// Keyed by correlation_id, same lifecycle as <see cref="_nyxRelayReplyTokens"/>.
+ ///
+ private sealed record NyxRelayStreamingState(
+ NyxRelayStreamingPhase Phase,
+ string? PlatformMessageId,
+ string LastFlushedText,
+ int EditCount,
+ string? TerminalReason)
+ {
+ public static NyxRelayStreamingState Initial { get; } =
+ new(NyxRelayStreamingPhase.Idle, null, string.Empty, 0, null);
+
+ public bool AllowsInterimEdit =>
+ Phase is NyxRelayStreamingPhase.Idle
+ or NyxRelayStreamingPhase.PlaceholderSent
+ or NyxRelayStreamingPhase.Streaming;
+
+ public bool AllowsFinalEdit =>
+ Phase is NyxRelayStreamingPhase.PlaceholderSent
+ or NyxRelayStreamingPhase.Streaming
+ or NyxRelayStreamingPhase.SuppressingInterim;
+
+ public bool AllowsReplyFallback =>
+ Phase is NyxRelayStreamingPhase.Idle
+ or NyxRelayStreamingPhase.DisabledPreSend;
+ }
+
+ private static bool IsTerminalNyxRelayStreamingPhase(NyxRelayStreamingPhase phase) =>
+ phase is NyxRelayStreamingPhase.DisabledPreSend
+ or NyxRelayStreamingPhase.TerminalSucceeded
+ or NyxRelayStreamingPhase.TerminalPartial;
+
+ private static bool IsLegalNyxRelayStreamingTransition(NyxRelayStreamingPhase from, NyxRelayStreamingPhase to) =>
+ (from, to) switch
+ {
+ (NyxRelayStreamingPhase.Idle, NyxRelayStreamingPhase.PlaceholderSent) => true,
+ (NyxRelayStreamingPhase.Idle, NyxRelayStreamingPhase.DisabledPreSend) => true,
+
+ (NyxRelayStreamingPhase.PlaceholderSent, NyxRelayStreamingPhase.Streaming) => true,
+ (NyxRelayStreamingPhase.PlaceholderSent, NyxRelayStreamingPhase.SuppressingInterim) => true,
+ (NyxRelayStreamingPhase.PlaceholderSent, NyxRelayStreamingPhase.TerminalSucceeded) => true,
+ (NyxRelayStreamingPhase.PlaceholderSent, NyxRelayStreamingPhase.TerminalPartial) => true,
+
+ (NyxRelayStreamingPhase.Streaming, NyxRelayStreamingPhase.Streaming) => true,
+ (NyxRelayStreamingPhase.Streaming, NyxRelayStreamingPhase.SuppressingInterim) => true,
+ (NyxRelayStreamingPhase.Streaming, NyxRelayStreamingPhase.TerminalSucceeded) => true,
+ (NyxRelayStreamingPhase.Streaming, NyxRelayStreamingPhase.TerminalPartial) => true,
+
+ (NyxRelayStreamingPhase.SuppressingInterim, NyxRelayStreamingPhase.TerminalSucceeded) => true,
+ (NyxRelayStreamingPhase.SuppressingInterim, NyxRelayStreamingPhase.TerminalPartial) => true,
+
+ _ => false,
+ };
+
+ private NyxRelayStreamingState GetOrInitNyxRelayStreamingState(string correlationId) =>
+ _nyxRelayStreamingStates.GetValueOrDefault(correlationId) ?? NyxRelayStreamingState.Initial;
+
+ ///
+ /// Single guard that owns the "should this streaming callback short-circuit?" decision.
+ /// Every public handler that touches the streaming path defers to this helper at the
+ /// top instead of repeating ad-hoc checks. Returns true when the caller should bail.
+ ///
+ ///
+ /// The Finalize branch also short-circuits when <see cref="NyxRelayStreamingState.PlatformMessageId"/>
+ /// is empty: a turn whose first send did not surface a platform message id (Nyx returned
+ /// an empty PlatformMessageId on initial /reply) cannot be finalized via
+ /// /reply/update — we have no upstream message to address — so the legacy
+ /// RunLlmReplyAsync fallback owns the terminal user-visible state. This preserves
+ /// the explicit empty-PlatformMessageId check that lived in the pre-refactor path.
+ ///
+ private static bool ShouldSkipNyxRelayStreamingForUnavailable(
+ NyxRelayStreamingState state,
+ NyxRelayStreamingGuardSource source) =>
+ source switch
+ {
+ NyxRelayStreamingGuardSource.AcceptInterimChunk => !state.AllowsInterimEdit,
+ NyxRelayStreamingGuardSource.Finalize =>
+ state.AllowsReplyFallback || string.IsNullOrEmpty(state.PlatformMessageId),
+ _ => false,
+ };
+
+ ///
+ /// Validates the transition, applies <paramref name="fieldUpdate"/> if any, writes the
+ /// updated state, and returns it. Illegal transitions are logged at warn level and
+ /// return the unchanged current state — actor turns must keep making progress.
+ ///
+ private NyxRelayStreamingState TransitionNyxRelayStreamingPhase(
+ string correlationId,
+ NyxRelayStreamingState current,
+ NyxRelayStreamingPhase next,
+ string? terminalReason = null,
+ Func<NyxRelayStreamingState, NyxRelayStreamingState>? fieldUpdate = null)
+ {
+ if (!IsLegalNyxRelayStreamingTransition(current.Phase, next))
+ {
+ Logger.LogWarning(
+ "Illegal Nyx relay streaming phase transition {From}->{To} for correlation={CorrelationId}; keeping current state",
+ current.Phase, next, correlationId);
+ return current;
+ }
+
+ var carried = fieldUpdate?.Invoke(current) ?? current;
+ var updated = carried with
+ {
+ Phase = next,
+ TerminalReason = IsTerminalNyxRelayStreamingPhase(next)
+ ? (terminalReason ?? carried.TerminalReason)
+ : carried.TerminalReason,
+ };
+ _nyxRelayStreamingStates[correlationId] = updated;
+ return updated;
+ }
+}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.cs
index c1df93f33..33f037889 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.cs
+++ b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/ConversationGAgent.cs
@@ -30,14 +30,15 @@ public sealed partial class ConversationGAgent : GAgentBase _nyxRelayReplyTokens = new(StringComparer.Ordinal);
private readonly Dictionary<string, NyxRelayStreamingState> _nyxRelayStreamingStates = new(StringComparer.Ordinal);
- ///
- /// Actor-scoped, in-memory streaming state for one conversation turn. Never persisted: tracks
- /// the upstream platform message id of the placeholder send and the two distinct failure
- /// modes that can disable parts of the streaming path. Keyed by correlation_id, same
- /// lifecycle as .
- ///
- ///
- /// The two failure flags carry different semantics with respect to the NyxID reply token:
- ///
- /// - Disabled means streaming was aborted before any successful send, so
- /// the reply token is still available and the actor may safely fall back to a single-shot
- /// /reply via .
- /// - SuppressInterim means the first chunk already consumed the reply token (the
- /// placeholder or first delta landed) but a later interim edit failed. The final edit must
- /// still be attempted via /reply/update; falling back to /reply would reuse a
- /// dead token and turn the partial into the user-visible terminal state.
- ///
- ///
- private sealed record NyxRelayStreamingState(
- string? PlatformMessageId,
- string LastFlushedText,
- int EditCount,
- bool Disabled,
- bool SuppressInterim)
- {
- public static NyxRelayStreamingState Initial { get; } = new(null, string.Empty, 0, false, false);
-
- ///
- /// True once the first successful send has landed: the NyxID reply token has been
- /// consumed and any further outbound must go through /reply/update. Used as the
- /// "token is dead, don't fall back to /reply" guard.
- ///
- public bool ReplyTokenConsumed => !string.IsNullOrEmpty(PlatformMessageId);
- }
-
///
/// Sliding window cap on retained processed ids. Keeps state size bounded while still
/// catching typical redelivery windows (seconds to minutes).
@@ -156,16 +122,16 @@ private async Task HandleInboundActivityCoreAsync(
var nowMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
if (result.LlmReplyRequest is not null)
{
- // The transient inbox copy keeps reply_token + expiry so the LLM worker can
+ // The transient run command copy keeps reply_token + expiry so the run actor can
// echo them back inside LlmReplyReadyEvent; the persisted state copy must
// not carry the credential into the event store / projection / read model.
- var inboxCopy = result.LlmReplyRequest.Clone();
- inboxCopy.TargetActorId = Id;
- var persistedCopy = inboxCopy.Clone();
+ var runCopy = result.LlmReplyRequest.Clone();
+ runCopy.TargetActorId = Id;
+ var persistedCopy = runCopy.Clone();
persistedCopy.ReplyToken = string.Empty;
persistedCopy.ReplyTokenExpiresAtUnixMs = 0;
await PersistDomainEventAsync(persistedCopy);
- await DispatchPendingLlmReplyAsync(inboxCopy, CancellationToken.None);
+ await DispatchPendingLlmReplyAsync(runCopy, CancellationToken.None);
Logger.LogInformation(
"Accepted inbound activity for deferred LLM reply: activity={ActivityId} conversation={Key}",
activity.Id,
@@ -332,7 +298,7 @@ public async Task HandleDeferredLlmReplyDroppedAsync(DeferredLlmReplyDroppedEven
CausationId = string.Empty,
Kind = FailureKind.PermanentAdapterError,
ErrorCode = reason,
- ErrorSummary = "Deferred LLM reply request was dropped by the inbox pre-LLM gate.",
+ ErrorSummary = "Deferred LLM reply request was dropped by the run actor pre-LLM gate.",
NotRetryable = new Google.Protobuf.WellKnownTypes.Empty(),
FailedAtUnixMs = evt.DroppedAtUnixMs > 0
? evt.DroppedAtUnixMs
@@ -342,7 +308,7 @@ public async Task HandleDeferredLlmReplyDroppedAsync(DeferredLlmReplyDroppedEven
RemoveNyxRelayReplyToken(evt.CorrelationId, pending.Activity);
Logger.LogInformation(
- "Retired pending LLM reply after inbox drop: correlation={CorrelationId} reason={Reason}",
+ "Retired pending LLM reply after run drop: correlation={CorrelationId} reason={Reason}",
evt.CorrelationId,
reason);
}
@@ -378,11 +344,11 @@ public async Task HandleDeferredInboundTurnRetryRequestedAsync(DeferredInboundTu
private async Task DispatchPendingLlmReplyAsync(NeedsLlmReplyEvent request, CancellationToken ct)
{
- var inbox = Services.GetService();
- if (inbox is null)
+ var dispatcher = Services.GetService();
+ if (dispatcher is null)
{
Logger.LogWarning(
- "Channel LLM reply inbox not registered; scheduling durable retry: correlation={CorrelationId}",
+ "Channel LLM reply run dispatcher not registered; scheduling durable retry: correlation={CorrelationId}",
request.CorrelationId);
await ScheduleDeferredLlmReplyDispatchAsync(request, DeferredLlmDispatchRetryDelay, ct);
return;
@@ -391,24 +357,24 @@ private async Task DispatchPendingLlmReplyAsync(NeedsLlmReplyEvent request, Canc
// Retry and rehydration paths read `request` from State.PendingLlmReplyRequests,
// which always carries an empty ReplyToken (the inbound handler strips it before
// persist). If the actor is still alive and the in-memory dict still has the
- // token for this correlation, re-enrich the inbox copy so the subscriber's relay
- // credential gate does not mistake a legitimate retry for a dead request.
+ // token for this correlation, re-enrich the run command copy so AgentRunGAgent's
+ // relay credential gate does not mistake a legitimate retry for a dead request.
var enriched = EnrichWithRuntimeReplyTokenIfNeeded(request);
try
{
- await inbox.EnqueueAsync(enriched.Clone(), ct);
+ await dispatcher.DispatchAsync(enriched.Clone(), ct);
Logger.LogInformation(
- "Enqueued LLM reply request to inbox: correlation={CorrelationId} conversation={Key} replyTokenSource={Source}",
+ "Dispatched LLM reply run request: correlation={CorrelationId} conversation={Key} replyTokenSource={Source}",
enriched.CorrelationId,
enriched.Activity?.Conversation?.CanonicalKey,
- DescribeEnqueuedReplyTokenSource(request, enriched));
+ DescribeDispatchedReplyTokenSource(request, enriched));
}
catch (Exception ex)
{
Logger.LogError(
ex,
- "Failed to enqueue LLM reply request; scheduling durable retry: correlation={CorrelationId}",
+ "Failed to dispatch LLM reply run request; scheduling durable retry: correlation={CorrelationId}",
request.CorrelationId);
await ScheduleDeferredLlmReplyDispatchAsync(request, DeferredLlmDispatchRetryDelay, ct);
}
@@ -439,7 +405,7 @@ private NeedsLlmReplyEvent EnrichWithRuntimeReplyTokenIfNeeded(NeedsLlmReplyEven
return enriched;
}
- private static string DescribeEnqueuedReplyTokenSource(
+ private static string DescribeDispatchedReplyTokenSource(
NeedsLlmReplyEvent original,
NeedsLlmReplyEvent enriched)
{
@@ -561,7 +527,21 @@ await ScheduleDeferredLlmReplyDispatchAsync(
/// boundary and the edit ordering is enforced by actor serialization.
///
[EventHandler]
- public async Task HandleLlmReplyStreamChunkAsync(LlmReplyStreamChunkEvent evt)
+ public Task HandleLlmReplyStreamChunkAsync(LlmReplyStreamChunkEvent evt)
+ {
+ ArgumentNullException.ThrowIfNull(evt);
+ return HandleNyxRelayStreamingChunkCoreAsync(evt);
+ }
+
+ ///
+ /// CardKit-streaming chunks travel on a structurally distinct proto type so a misbehaving
+ /// persistence layer cannot silently re-route a replayed event back to the card sink. The
+ /// card handler owns Idle / Creating / Streaming / terminal transitions; on
+ /// CreationFailed it returns false and we drop into the legacy text-edit core
+ /// helper so the user still sees a reply for the rest of the turn.
+ ///
+ [EventHandler]
+ public async Task HandleLlmReplyCardStreamChunkAsync(LlmReplyCardStreamChunkEvent evt)
{
ArgumentNullException.ThrowIfNull(evt);
@@ -569,40 +549,85 @@ public async Task HandleLlmReplyStreamChunkAsync(LlmReplyStreamChunkEvent evt)
if (correlationId is null || evt.Activity is null || string.IsNullOrWhiteSpace(evt.AccumulatedText))
{
Logger.LogDebug(
- "Dropping malformed streaming chunk: correlation={CorrelationId}",
+ "Dropping malformed card streaming chunk: correlation={CorrelationId}",
evt.CorrelationId);
return;
}
- var state = _nyxRelayStreamingStates.GetValueOrDefault(correlationId) ?? NyxRelayStreamingState.Initial;
- if (state.Disabled || state.SuppressInterim)
+ if (State.ProcessedCommandIds.Contains(BuildLlmReplyCommandId(evt.CorrelationId)))
+ {
+ // Turn already finalized; drop any late chunk that sneaks in via the actor inbox.
+ return;
+ }
+
+ // Plain `await`: actor turns run on a single-threaded scheduler and the continuation
+ // must observe that context for subsequent state mutations on
+ // `_larkCardStreamingStates` / `_nyxRelayStreamingStates`.
+ if (await HandleLarkCardStreamingChunkCoreAsync(evt, correlationId))
return;
+ // CardCreation failed (pre-flight or first chunk). Route the rest of the turn through
+ // the legacy text-edit core so the user still gets a reply. Synthesize the equivalent
+ // edit-message chunk from the card-event payload — both proto types carry the same
+ // fields so the projection is loss-less.
+ await HandleNyxRelayStreamingChunkCoreAsync(new LlmReplyStreamChunkEvent
+ {
+ CorrelationId = evt.CorrelationId,
+ RegistrationId = evt.RegistrationId,
+ Activity = evt.Activity?.Clone() ?? new ChatActivity(),
+ AccumulatedText = evt.AccumulatedText,
+ ChunkAtUnixMs = evt.ChunkAtUnixMs,
+ });
+ }
+
+ private async Task HandleNyxRelayStreamingChunkCoreAsync(LlmReplyStreamChunkEvent evt)
+ {
+ var correlationId = NormalizeOptional(evt.CorrelationId);
+ if (correlationId is null || evt.Activity is null || string.IsNullOrWhiteSpace(evt.AccumulatedText))
+ {
+ Logger.LogDebug(
+ "Dropping malformed streaming chunk: correlation={CorrelationId}",
+ evt.CorrelationId);
+ return;
+ }
+
if (State.ProcessedCommandIds.Contains(BuildLlmReplyCommandId(evt.CorrelationId)))
{
// Turn already finalized; drop any late chunk that sneaks in via the actor inbox.
return;
}
+ var state = GetOrInitNyxRelayStreamingState(correlationId);
+ if (ShouldSkipNyxRelayStreamingForUnavailable(state, NyxRelayStreamingGuardSource.AcceptInterimChunk))
+ return;
+
var runtimeContext = BuildNyxRelayRuntimeContext(evt.CorrelationId, evt.Activity);
if (runtimeContext.NyxRelayReplyToken is null)
{
Logger.LogInformation(
"Streaming chunk received but relay reply token is unavailable; disabling streaming for turn. correlation={CorrelationId}",
evt.CorrelationId);
- _nyxRelayStreamingStates[correlationId] = state with { Disabled = true };
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.DisabledPreSend,
+ terminalReason: "no_reply_token");
return;
}
var runner = ResolveRunner();
+ // Bound the upstream edit so a stuck relay/network can't pin the actor turn forever
+ // (PR #562 review). 10s matches the failure-path timeout below; the edit is best-effort,
+ // so timing out cleanly into the !result.Success branch preserves correctness.
+ using var streamChunkCts = new CancellationTokenSource(StreamingFailureUpdateTimeout);
var result = await runner.RunStreamChunkAsync(
evt,
state.PlatformMessageId,
runtimeContext,
- CancellationToken.None);
+ streamChunkCts.Token);
if (!result.Success)
{
- if (state.ReplyTokenConsumed)
+ if (state.AllowsFinalEdit)
{
// First chunk already consumed the reply token. Skip further interim edits but
// preserve PlatformMessageId so the final edit on LlmReplyReady can still try
@@ -613,7 +638,11 @@ public async Task HandleLlmReplyStreamChunkAsync(LlmReplyStreamChunkEvent evt)
evt.CorrelationId,
result.ErrorCode,
result.EditUnsupported);
- _nyxRelayStreamingStates[correlationId] = state with { SuppressInterim = true };
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.SuppressingInterim,
+ terminalReason: $"interim_edit_failed:{result.ErrorCode}");
}
else
{
@@ -624,21 +653,29 @@ public async Task HandleLlmReplyStreamChunkAsync(LlmReplyStreamChunkEvent evt)
evt.CorrelationId,
result.ErrorCode,
result.EditUnsupported);
- _nyxRelayStreamingStates[correlationId] = state with { Disabled = true };
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.DisabledPreSend,
+ terminalReason: $"first_send_failed:{result.ErrorCode}");
}
return;
}
- var isFirstChunk = string.IsNullOrEmpty(state.PlatformMessageId);
+ var isFirstChunk = state.Phase == NyxRelayStreamingPhase.Idle;
var newPlatformMessageId = string.IsNullOrWhiteSpace(result.PlatformMessageId)
? state.PlatformMessageId
: result.PlatformMessageId;
- _nyxRelayStreamingStates[correlationId] = state with
- {
- PlatformMessageId = newPlatformMessageId,
- LastFlushedText = evt.AccumulatedText,
- EditCount = isFirstChunk ? 0 : state.EditCount + 1,
- };
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ isFirstChunk ? NyxRelayStreamingPhase.PlaceholderSent : NyxRelayStreamingPhase.Streaming,
+ fieldUpdate: s => s with
+ {
+ PlatformMessageId = newPlatformMessageId,
+ LastFlushedText = evt.AccumulatedText,
+ EditCount = isFirstChunk ? 0 : s.EditCount + 1,
+ });
}
private async Task TryCompleteStreamedReplyAsync(
@@ -647,22 +684,86 @@ private async Task TryCompleteStreamedReplyAsync(
ChatActivity? referenceActivity,
ConversationTurnRuntimeContext runtimeContext)
{
- if (evt.TerminalState != LlmReplyTerminalState.Completed)
- return false;
-
var correlationId = NormalizeOptional(evt.CorrelationId);
if (correlationId is null)
return false;
- if (!_nyxRelayStreamingStates.TryGetValue(correlationId, out var state))
- return false;
- // Disabled means the initial send never landed, so the reply token is still usable
- // and the caller may fall back to a single-shot /reply. A missing PlatformMessageId
- // with SuppressInterim would be inconsistent, but treat it the same for safety.
- if (state.Disabled || string.IsNullOrEmpty(state.PlatformMessageId))
+ // Card path takes precedence when active; falls through to text-edit when card never
+ // started (Idle), card creation failed (CreationFailed → text-edit fallback), or card
+ // finished as a terminal phase. Plain `await` so the continuation stays on the
+ // actor's single-threaded scheduler (no ConfigureAwait(false) — it would let the
+ // post-await `_nyxRelayStreamingStates` reads run off the actor turn).
+ if (await TryCompleteCardStreamedReplyAsync(evt, correlationId, commandId, referenceActivity))
+ return true;
+
+ var state = GetOrInitNyxRelayStreamingState(correlationId);
+ if (ShouldSkipNyxRelayStreamingForUnavailable(state, NyxRelayStreamingGuardSource.Finalize))
return false;
var platformMessageId = state.PlatformMessageId!;
+
+ // Streaming-start already consumed the reply token. On Failed, falling through to
+ // RunLlmReplyAsync would issue a fresh /reply against the dead token and surface
+ // as `401 Reply token already used` to NyxID — leaving the user staring at the
+ // streaming partial (often just "...") forever with no error explanation. Self-heal
+ // by editing the existing placeholder in place with the classified failure text;
+ // turn is then terminal (no retry, no second /reply).
+ if (evt.TerminalState == LlmReplyTerminalState.Failed)
+ {
+ var failureText = NormalizeOptional(evt.Outbound?.Text)
+ ?? NormalizeOptional(evt.ErrorSummary)
+ ?? "Sorry, the reply failed. Please try again.";
+ var runner = ResolveRunner();
+ var failureChunk = new LlmReplyStreamChunkEvent
+ {
+ CorrelationId = evt.CorrelationId,
+ RegistrationId = evt.RegistrationId,
+ Activity = referenceActivity?.Clone() ?? evt.Activity?.Clone() ?? new ChatActivity(),
+ AccumulatedText = failureText,
+ ChunkAtUnixMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
+ };
+ using var failureUpdateCts = new CancellationTokenSource(StreamingFailureUpdateTimeout);
+ var failureResult = await runner.RunStreamChunkAsync(
+ failureChunk,
+ platformMessageId,
+ runtimeContext,
+ failureUpdateCts.Token);
+ if (failureResult.Success)
+ {
+ Logger.LogWarning(
+ "LLM reply failed after streaming-start; updated placeholder with failure text. correlation={CorrelationId}, errorCode={ErrorCode}, platformMessageId={PlatformMessageId}",
+ evt.CorrelationId,
+ evt.ErrorCode,
+ platformMessageId);
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.TerminalSucceeded,
+ terminalReason: $"failed_self_heal:{evt.ErrorCode}");
+ await PersistStreamedCompletionAsync(evt, commandId, referenceActivity, platformMessageId, failureText, state.EditCount + 1);
+ return true;
+ }
+
+ // Edit failed too (rare — Lark may reject a message edit for unrelated reasons).
+ // Falling back to /reply would still hit the dead token, so persist the last
+ // flushed partial as terminal. The user sees the partial (potentially empty)
+ // but we don't spin on a guaranteed 401.
+ Logger.LogWarning(
+ "Streaming LLM failure-update could not edit placeholder; persisting last flushed partial as terminal. correlation={CorrelationId}, code={Code}, platformMessageId={PlatformMessageId}",
+ evt.CorrelationId,
+ failureResult.ErrorCode,
+ platformMessageId);
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.TerminalPartial,
+ terminalReason: $"failed_self_heal_edit_failed:{failureResult.ErrorCode}");
+ await PersistStreamedCompletionAsync(evt, commandId, referenceActivity, platformMessageId, state.LastFlushedText, state.EditCount);
+ return true;
+ }
+
+ if (evt.TerminalState != LlmReplyTerminalState.Completed)
+ return false;
var finalText = evt.Outbound?.Text ?? string.Empty;
if (string.IsNullOrWhiteSpace(finalText))
{
@@ -674,6 +775,11 @@ private async Task TryCompleteStreamedReplyAsync(
"Streaming LLM reply final text was empty; persisting last flushed partial as terminal. correlation={CorrelationId} platformMessageId={PlatformMessageId}",
evt.CorrelationId,
platformMessageId);
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.TerminalPartial,
+ terminalReason: "empty_final_text");
await PersistStreamedCompletionAsync(evt, commandId, referenceActivity, platformMessageId, state.LastFlushedText, state.EditCount);
return true;
}
@@ -690,11 +796,12 @@ private async Task TryCompleteStreamedReplyAsync(
AccumulatedText = finalText,
ChunkAtUnixMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
};
+ using var finalChunkCts = new CancellationTokenSource(StreamingFailureUpdateTimeout);
var finalResult = await runner.RunStreamChunkAsync(
finalChunk,
platformMessageId,
runtimeContext,
- CancellationToken.None);
+ finalChunkCts.Token);
if (!finalResult.Success)
{
// The reply token was already consumed by the first chunk, so falling back to
@@ -707,12 +814,22 @@ private async Task TryCompleteStreamedReplyAsync(
evt.CorrelationId,
finalResult.ErrorCode,
platformMessageId);
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.TerminalPartial,
+ terminalReason: $"final_edit_failed:{finalResult.ErrorCode}");
await PersistStreamedCompletionAsync(evt, commandId, referenceActivity, platformMessageId, state.LastFlushedText, state.EditCount);
return true;
}
edits += 1;
}
+ TransitionNyxRelayStreamingPhase(
+ correlationId,
+ state,
+ NyxRelayStreamingPhase.TerminalSucceeded,
+ terminalReason: "completed");
await PersistStreamedCompletionAsync(evt, commandId, referenceActivity, platformMessageId, finalText, edits);
return true;
}
@@ -1053,9 +1170,9 @@ private ConversationTurnRuntimeContext BuildNyxRelayRuntimeContextForReply(
{
var activity = pendingActivity ?? evt.Activity;
- // Inbox-echoed credential is the authoritative source — it survives actor
+ // Run-echoed credential is the authoritative source: it survives actor
// deactivation between inbound capture and LLM reply ready, which the in-memory
- // dict cannot. Fall back to the dict only when the inbox didn't carry a token
+ // dict cannot. Fall back to the dict only when the run event didn't carry a token
// (legacy in-flight messages from before this change deployed).
var inlineToken = NormalizeOptional(evt.ReplyToken);
if (inlineToken is not null)
@@ -1082,7 +1199,7 @@ private string DescribeReplyTokenSource(LlmReplyReadyEvent evt, ConversationTurn
if (runtimeContext.NyxRelayReplyToken is null)
return "none";
if (!string.IsNullOrWhiteSpace(evt.ReplyToken))
- return "inbox-echo";
+ return "run-echo";
return "actor-runtime-dict";
}
@@ -1107,6 +1224,7 @@ private void RemoveNyxRelayReplyToken(string? correlationId, ChatActivity? activ
{
_nyxRelayReplyTokens.Remove(normalizedCorrelationId);
_nyxRelayStreamingStates.Remove(normalizedCorrelationId);
+ _larkCardStreamingStates.Remove(normalizedCorrelationId);
}
}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyInbox.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyInbox.cs
deleted file mode 100644
index f3d10ce82..000000000
--- a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyInbox.cs
+++ /dev/null
@@ -1,6 +0,0 @@
-namespace Aevatar.GAgents.Channel.Runtime;
-
-public interface IChannelLlmReplyInbox
-{
- Task EnqueueAsync(NeedsLlmReplyEvent request, CancellationToken ct);
-}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyRunDispatcher.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyRunDispatcher.cs
new file mode 100644
index 000000000..df2889a1f
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IChannelLlmReplyRunDispatcher.cs
@@ -0,0 +1,10 @@
+namespace Aevatar.GAgents.Channel.Runtime;
+
+///
+/// Stateless port used by the conversation actor to hand one deferred
+/// LLM reply run to its run-scoped continuation owner.
+///
+public interface IChannelLlmReplyRunDispatcher
+{
+ Task DispatchAsync(NeedsLlmReplyEvent request, CancellationToken ct);
+}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IConversationCardTurnRunner.cs b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IConversationCardTurnRunner.cs
new file mode 100644
index 000000000..7459bd1a4
--- /dev/null
+++ b/agents/Aevatar.GAgents.Channel.Runtime/Conversation/IConversationCardTurnRunner.cs
@@ -0,0 +1,216 @@
+using Aevatar.GAgents.Channel.Abstractions;
+
+namespace Aevatar.GAgents.Channel.Runtime;
+
+///
+/// Runs the CardKit-streaming variant of a bot turn inside the conversation actor.
+/// Parallel to the text-edit turn runner, but with three distinct operations
+/// (create-and-send, interim element stream, finalize) to match Lark CardKit's lifecycle.
+/// The grain owns the per-turn LarkCardStreamingState; this seam only does the
+/// outbound call and translates the response into a runner-shaped result.
+///
+///
+/// All three operations are invoked under the actor's turn-serial invariant, so the runner
+/// implementation must be safe under that single-threaded contract. The
+/// sequence parameter is owned by the grain (pre-incremented before each call) and
+/// passed verbatim into the CardKit API.
+///
+public interface IConversationCardTurnRunner
+{
+ ///
+ /// Allocates a new CardKit card entity (POST /open-apis/cardkit/v1/cards), binds it
+ /// to the chat via an interactive im/v1/messages send referencing the new
+/// card_id, and writes the initial accumulated text into the streaming
+/// element. Implicit sequence = 1.
+ ///
+ Task RunCardCreateAsync(
+ LlmReplyCardStreamChunkEvent chunk,
+ string streamingElementId,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct);
+
+ ///
+ /// Streams the latest accumulated text into the existing card element. Sequence is
+ /// pre-incremented by the grain. Lark rejects stale sequences deterministically.
+ ///
+ Task RunCardStreamAsync(
+ LlmReplyCardStreamChunkEvent chunk,
+ string cardId,
+ string elementId,
+ long sequence,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct);
+
+ ///
+ /// Closes the card's streaming mode (cursor disappears) and, if the final text differs
+ /// from the last interim flush, writes one more element-content update so the persisted
+ /// card matches the LLM's final output.
+ ///
+ ///
+ /// Carries TransportExtras.NyxUserAccessToken for the proxy call. Stream chunk
+ /// methods read it from the chunk's own activity; finalize is invoked from the
+ /// LlmReplyReadyEvent path so the actor passes the event's reference activity
+ /// here instead of a chunk.
+ ///
+ Task RunCardFinalizeAsync(
+ ChatActivity referenceActivity,
+ string cardId,
+ string elementId,
+ string finalText,
+ bool finalTextDiffersFromLastFlushed,
+ long sequence,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct);
+}
+
+///
+/// Outcome of <see cref="IConversationCardTurnRunner.RunCardCreateAsync"/>. The classification
+/// flags drive the grain's fallback decision:
+///
+/// - Pre-send failures (create call rejected before any chat-visible side effect): the
+/// actor transitions to CreationFailed and falls back to the legacy text-edit sink
+/// so the user still sees a reply. /
+/// imply this path.
+/// - Post-send failures (create + send succeeded but the first stream-content write
+/// failed — see <see cref="IsPostSendFailure"/>): an empty card is already visible
+/// chat. Falling back to text-edit would produce a duplicate reply. The actor terminates
+/// the turn at Terminated using the surfaced <see cref="ErrorCode"/> /
+/// <see cref="ErrorSummary"/> and persists the partial-card terminal record. The runner
+/// makes a best-effort settings patch to close streaming mode on the orphan card before
+/// returning so the cursor does not blink forever.
+/// - on its own terminates the turn (no fallback).
+///
+///
+public sealed record ConversationCardCreateResult(
+ bool Success,
+ string? CardId,
+ string? CardMessageId,
+ bool IsRateLimited,
+ bool IsTableLimitExceeded,
+ bool IsCardUnavailable,
+ bool IsPostSendFailure,
+ string ErrorCode,
+ string ErrorSummary)
+{
+ public static ConversationCardCreateResult Succeeded(string cardId, string cardMessageId) =>
+ new(true, cardId, cardMessageId, false, false, false, false, string.Empty, string.Empty);
+
+ public static ConversationCardCreateResult Failed(
+ string errorCode,
+ string errorSummary,
+ bool isRateLimited = false,
+ bool isTableLimitExceeded = false,
+ bool isCardUnavailable = false) =>
+ new(false, null, null, isRateLimited, isTableLimitExceeded, isCardUnavailable, false, errorCode, errorSummary);
+
+ ///
+ /// Failure factory for the "card was already sent to the chat but the first
+ /// element-content write failed" case. The actor must NOT fall back to text-edit
+ /// (the orphan card is already visible) — it transitions the turn to Terminated
+/// and uses <see cref="ErrorCode"/> / <see cref="ErrorSummary"/> for the
+ /// persisted partial-card record.
+ ///
+ public static ConversationCardCreateResult PostSendFailed(
+ string cardId,
+ string cardMessageId,
+ string errorCode,
+ string errorSummary,
+ bool isRateLimited = false,
+ bool isTableLimitExceeded = false,
+ bool isCardUnavailable = false) =>
+ new(false, cardId, cardMessageId, isRateLimited, isTableLimitExceeded, isCardUnavailable, true, errorCode, errorSummary);
+}
+
+///
+/// Outcome of <see cref="IConversationCardTurnRunner.RunCardStreamAsync"/>. Mid-stream
+/// rate-limit (Lark 230020) is recoverable — the grain skips the frame and continues.
+/// Table-limit (230099/11310) and unavailability terminate the turn.
+///
+public sealed record ConversationCardStreamResult(
+ bool Success,
+ bool IsRateLimited,
+ bool IsTableLimitExceeded,
+ bool IsCardUnavailable,
+ string ErrorCode,
+ string ErrorSummary)
+{
+ public static ConversationCardStreamResult Succeeded() =>
+ new(true, false, false, false, string.Empty, string.Empty);
+
+ public static ConversationCardStreamResult Failed(
+ string errorCode,
+ string errorSummary,
+ bool isRateLimited = false,
+ bool isTableLimitExceeded = false,
+ bool isCardUnavailable = false) =>
+ new(false, isRateLimited, isTableLimitExceeded, isCardUnavailable, errorCode, errorSummary);
+}
+
+/// <param name="Success">True only when both the optional final stream write AND the
+/// streaming-mode close succeeded.</param>
+/// <param name="FinalTextWritten">
+/// True when the trailing element-content write either succeeded OR was skipped
+/// (final text equals last flushed). False only when the runner attempted the trailing
+/// write and it failed; lets the actor persist the visible-state text correctly when
+/// success is false but the final text actually did land before the close-streaming-mode
+/// failure.
+/// </param>
+public sealed record ConversationCardFinalizeResult(
+ bool Success,
+ bool FinalTextWritten,
+ string ErrorCode,
+ string ErrorSummary)
+{
+ public static ConversationCardFinalizeResult Succeeded() =>
+ new(true, true, string.Empty, string.Empty);
+
+ ///
+ /// Failure factory. distinguishes between "trailing
+ /// write failed; user sees stale interim" (false) and "trailing write succeeded but
+ /// streaming-mode close failed; user sees the final text with a still-blinking cursor"
+ /// (true).
+ ///
+ public static ConversationCardFinalizeResult Failed(string errorCode, string errorSummary, bool finalTextWritten = false) =>
+ new(false, finalTextWritten, errorCode, errorSummary);
+}
+
+///
+/// No-op default. Every CardKit operation reports a transient failure that disables the
+/// card path so the grain can fall back to the legacy text-edit sink. Production DI registers
+/// a real implementation when CardKit is enabled.
+///
+public sealed class NullConversationCardTurnRunner : IConversationCardTurnRunner
+{
+ public Task RunCardCreateAsync(
+ LlmReplyCardStreamChunkEvent chunk,
+ string streamingElementId,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct) =>
+ Task.FromResult(ConversationCardCreateResult.Failed(
+ "no_card_runner",
+ "no IConversationCardTurnRunner registered"));
+
+ public Task RunCardStreamAsync(
+ LlmReplyCardStreamChunkEvent chunk,
+ string cardId,
+ string elementId,
+ long sequence,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct) =>
+ Task.FromResult(ConversationCardStreamResult.Failed(
+ "no_card_runner",
+ "no IConversationCardTurnRunner registered"));
+
+ public Task RunCardFinalizeAsync(
+ ChatActivity referenceActivity,
+ string cardId,
+ string elementId,
+ string finalText,
+ bool finalTextDiffersFromLastFlushed,
+ long sequence,
+ ConversationTurnRuntimeContext runtimeContext,
+ CancellationToken ct) =>
+ Task.FromResult(ConversationCardFinalizeResult.Failed(
+ "no_card_runner",
+ "no IConversationCardTurnRunner registered"));
+}
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/DependencyInjection/ChannelRuntimeServiceCollectionExtensions.cs b/agents/Aevatar.GAgents.Channel.Runtime/DependencyInjection/ChannelRuntimeServiceCollectionExtensions.cs
index aa2c48d55..5de88c691 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/DependencyInjection/ChannelRuntimeServiceCollectionExtensions.cs
+++ b/agents/Aevatar.GAgents.Channel.Runtime/DependencyInjection/ChannelRuntimeServiceCollectionExtensions.cs
@@ -48,6 +48,7 @@ public static IServiceCollection AddChannelRuntime(
services.TryAddSingleton();
services.TryAddSingleton();
services.TryAddSingleton();
+ services.TryAddSingleton();
// ─── Tombstone compaction options + diagnostics + ES watermark ───
services.AddOptions();
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/IStreamingReplySink.cs b/agents/Aevatar.GAgents.Channel.Runtime/IStreamingReplySink.cs
index 1769c0a4a..64b09f271 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/IStreamingReplySink.cs
+++ b/agents/Aevatar.GAgents.Channel.Runtime/IStreamingReplySink.cs
@@ -2,13 +2,13 @@ namespace Aevatar.GAgents.Channel.Runtime;
///
/// Receives per-delta streaming updates from so the reply
-/// inbox can fan the accumulated text to the conversation actor as it is being generated. The
+/// run actor can fan the accumulated text to the conversation actor as it is being generated. The
/// actor is the sole holder of the relay reply token, so only it is allowed to drive the relay
/// placeholder send and subsequent edit calls; this sink therefore fans out signals (chunk events)
/// and never touches the outbound port directly.
///
///
-/// Implementations are per-turn and owned by the inbox runtime. A null sink signals that streaming
+/// Implementations are per-turn and owned by the run actor. A null sink signals that streaming
/// is disabled for the turn (for example, the feature flag is off, the activity is not a relay
/// turn, or an earlier failure invalidated the turn); generators must tolerate a null sink by
/// simply accumulating the final text without calling any sink method.
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/TurnStreamingReplySink.cs b/agents/Aevatar.GAgents.Channel.Runtime/TurnStreamingReplySink.cs
index 8d846797b..c6ddcebd3 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/TurnStreamingReplySink.cs
+++ b/agents/Aevatar.GAgents.Channel.Runtime/TurnStreamingReplySink.cs
@@ -1,6 +1,7 @@
using Aevatar.Foundation.Abstractions;
using Aevatar.GAgents.Channel.Abstractions;
using Aevatar.GAgents.Channel.Runtime;
+using Google.Protobuf;
using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging;
@@ -31,7 +32,7 @@ namespace Aevatar.GAgents.Channel.Runtime;
/// - bypasses the throttle so the actor sees the complete text
/// once the stream ends; if a dispatch is in flight, the final text reflushes after it and
/// awaits the dispatch loop's drain signal before returning so the
-/// caller (the inbox runtime) does not race the ready event past the final chunk.
+/// caller (the run actor) does not race the ready event past the final chunk.
///
///
///
@@ -52,6 +53,8 @@ public sealed class TurnStreamingReplySink : IStreamingReplySink, IDisposable
private readonly string _registrationId;
private readonly ChatActivity _activityTemplate;
private readonly TimeSpan _throttle;
+ private readonly int _maxInterimChunks;
+ private readonly bool _cardMode;
private readonly TimeProvider _timeProvider;
private readonly ILogger? _logger;
@@ -65,7 +68,7 @@ public sealed class TurnStreamingReplySink : IStreamingReplySink, IDisposable
private bool _dispatchInProgress;
private bool _disposed;
// Signaled by the dispatch loop when it fully drains. FinalizeAsync awaits this when a
- // dispatch is already in flight so the caller does not race the inbox runtime's
+ // dispatch is already in flight so the caller does not race AgentRunGAgent's
// LlmReplyReadyEvent past the final chunk dispatch (the ConversationGAgent
// processed-command guard would otherwise drop the late chunk).
private TaskCompletionSource? _drainTcs;
@@ -78,7 +81,9 @@ public TurnStreamingReplySink(
ChatActivity activityTemplate,
TimeSpan throttle,
TimeProvider timeProvider,
- ILogger? logger = null)
+ ILogger? logger = null,
+ int maxInterimChunks = int.MaxValue,
+ bool cardMode = false)
{
_actorDispatchPort = actorDispatchPort ?? throw new ArgumentNullException(nameof(actorDispatchPort));
if (string.IsNullOrWhiteSpace(targetActorId))
@@ -90,6 +95,8 @@ public TurnStreamingReplySink(
_registrationId = registrationId ?? string.Empty;
_activityTemplate = activityTemplate ?? throw new ArgumentNullException(nameof(activityTemplate));
_throttle = throttle < TimeSpan.Zero ? TimeSpan.Zero : throttle;
+ _maxInterimChunks = maxInterimChunks < 0 ? 0 : maxInterimChunks;
+ _cardMode = cardMode;
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_logger = logger;
}
@@ -109,7 +116,7 @@ public Task OnDeltaAsync(string accumulatedText, CancellationToken ct) =>
/// Applies the final accumulated text, bypassing the throttle so the actor can drive the final
/// edit once the stream ends. If a dispatch is already in flight, the final text is stashed and
/// this call awaits the dispatch loop's drain signal so the final chunk is on the wire before
- /// the caller proceeds (the inbox runtime sends LlmReplyReadyEvent immediately after).
+ /// the caller proceeds (AgentRunGAgent sends LlmReplyReadyEvent immediately after).
///
public Task FinalizeAsync(string finalText, CancellationToken ct) =>
FlushAsync(finalText, isFinal: true, ct);
@@ -158,6 +165,19 @@ private async Task FlushAsync(string text, bool isFinal, CancellationToken ct)
return;
}
+ // Lark/Feishu refuses message edits past a per-message cap (~20 in mainnet, code
+ // 230072). Once that cap is reached the platform rejects every subsequent edit
+ // including the final flush, leaving the user with a truncated reply. Cap interim
+ // dispatches here so the final always has headroom; we still stash the latest text
+ // so FinalizeAsync can dispatch the complete content when the stream ends.
+ if (!isFinal && _chunksEmitted >= _maxInterimChunks)
+ {
+ _pendingText = text;
+ _hasPending = true;
+ CancelTimerLocked();
+ return;
+ }
+
if (_dispatchInProgress)
{
// A dispatch is in flight. Stash the latest text; the dispatch loop's reflush
@@ -168,7 +188,7 @@ private async Task FlushAsync(string text, bool isFinal, CancellationToken ct)
if (isFinal)
{
// Block FinalizeAsync until the dispatch loop drains the stashed final text.
- // Without this wait, ChannelLlmReplyInboxRuntime sends LlmReplyReadyEvent
+ // Without this wait, AgentRunGAgent sends LlmReplyReadyEvent
// first and ConversationGAgent's processed-command guard drops the late
// final chunk.
_drainTcs ??= new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously);
@@ -265,7 +285,7 @@ private async Task DispatchLoopAsync(string firstText, CancellationToken ct)
{
await DispatchOneAsync(current, ct).ConfigureAwait(false);
- string? next;
+ string? next = null;
lock (_lock)
{
if (_disposed || !_hasPending)
@@ -286,6 +306,61 @@ private async Task DispatchLoopAsync(string firstText, CancellationToken ct)
break;
}
+ var nextIsFinal = _drainTcs is not null;
+
+                    // Stop dispatching interim chunks once the cap is reached. Clear the
+                    // pending stash too — keeping it would be pointless, since a follow-up
+                    // OnDeltaAsync overwrites it with newer accumulated text anyway, and
+                    // an explicit drain here matches the invariant the reviewer asked for
+ // (PR #562 review #14): pending text is never left behind when we
+ // release _dispatchInProgress=false. FinalizeAsync, when it arrives
+ // later, uses its `text` parameter (not _pendingText), so this clear
+ // doesn't affect the final flush.
+ if (!nextIsFinal && _chunksEmitted >= _maxInterimChunks)
+ {
+ _pendingText = string.Empty;
+ _hasPending = false;
+ _dispatchInProgress = false;
+ drainSignal = _drainTcs;
+ _drainTcs = null;
+ break;
+ }
+
+ // Throttle gate between dispatches. Without this, the loop drains stashed
+ // text at network round-trip pace (~50ms) and exhausts the platform-side
+ // per-message edit cap (Lark code 230072). When the throttle window has
+ // not elapsed, arm the deferred timer atomically with releasing
+ // _dispatchInProgress so a concurrent OnDeltaAsync (PR #562 review #17)
+ // cannot squeeze in between the release and the arm and observe a stale
+ // (no-timer + not-dispatching) state. Final dispatches bypass the
+ // throttle so the user sees the complete text immediately when the
+ // stream ends.
+ //
+ // Invariant: if we reach this branch, nextIsFinal == false, so _drainTcs
+ // must be null. FinalizeAsync sets _drainTcs only when it arrives during
+ // an in-flight dispatch, and that path re-evaluates nextIsFinal inside
+ // this same lock acquisition. We do NOT signal drainSignal here: the
+ // timer-driven loop is the one that eventually drains _pendingText and
+ // signals whatever _drainTcs gets attached.
+ if (!nextIsFinal && _throttle > TimeSpan.Zero)
+ {
+ var elapsed = _timeProvider.GetUtcNow() - _lastEmitAt;
+ if (elapsed < _throttle)
+ {
+ var delay = _throttle - elapsed;
+ _dispatchInProgress = false;
+ if (!_disposed && _hasPending && _flushTimer is null)
+ {
+ _flushTimer = _timeProvider.CreateTimer(
+ OnFlushTimerFired,
+ state: null,
+ dueTime: delay,
+ period: Timeout.InfiniteTimeSpan);
+ }
+ break;
+ }
+ }
+
next = _pendingText;
_pendingText = string.Empty;
_hasPending = false;
@@ -312,14 +387,26 @@ private async Task DispatchLoopAsync(string firstText, CancellationToken ct)
private async Task DispatchOneAsync(string text, CancellationToken ct)
{
- var chunk = new LlmReplyStreamChunkEvent
- {
- CorrelationId = _correlationId,
- RegistrationId = _registrationId,
- Activity = _activityTemplate.Clone(),
- AccumulatedText = text,
- ChunkAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
- };
+ // Card mode dispatches a structurally distinct message type so persistence layers
+ // cannot silently re-route a replayed event back to the card sink. The two proto
+ // types carry identical payloads; the type identity itself signals routing.
+ IMessage chunk = _cardMode
+ ? new LlmReplyCardStreamChunkEvent
+ {
+ CorrelationId = _correlationId,
+ RegistrationId = _registrationId,
+ Activity = _activityTemplate.Clone(),
+ AccumulatedText = text,
+ ChunkAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ }
+ : new LlmReplyStreamChunkEvent
+ {
+ CorrelationId = _correlationId,
+ RegistrationId = _registrationId,
+ Activity = _activityTemplate.Clone(),
+ AccumulatedText = text,
+ ChunkAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ };
var envelope = new EventEnvelope
{
Id = Guid.NewGuid().ToString("N"),
diff --git a/agents/Aevatar.GAgents.Channel.Runtime/protos/conversation_events.proto b/agents/Aevatar.GAgents.Channel.Runtime/protos/conversation_events.proto
index ea0cc98e6..801f0b0c7 100644
--- a/agents/Aevatar.GAgents.Channel.Runtime/protos/conversation_events.proto
+++ b/agents/Aevatar.GAgents.Channel.Runtime/protos/conversation_events.proto
@@ -30,10 +30,10 @@ message NeedsLlmReplyEvent {
aevatar.gagents.channel.abstractions.ChatActivity activity = 4;
map metadata = 5;
int64 requested_at_unix_ms = 6;
- // Transient inbox-only credential. The actor MUST clear `reply_token` and
+ // Transient run-command-only credential. The actor MUST clear `reply_token` and
// `reply_token_expires_at_unix_ms` (set them to the empty default) on the
- // copy passed to PersistDomainEventAsync; only the inbox-bound copy may
- // carry them so the LLM worker can echo the credential back without the
+ // copy passed to PersistDomainEventAsync; only the run-bound copy may
+ // carry them so AgentRunGAgent can echo the credential back without the
// actor's in-memory dict surviving deactivation. Never persist to event
// store, projection, or read model.
string reply_token = 7;
@@ -70,16 +70,16 @@ message LlmReplyReadyEvent {
string error_code = 7;
string error_summary = 8;
int64 ready_at_unix_ms = 9;
- // Transient inbox-echoed credential carried back from the LLM worker so the
+ // Transient run-echoed credential carried back from AgentRunGAgent so the
// actor's outbound relay reply does not depend on its in-memory token dict
// surviving deactivation. The actor consumes these fields directly and never
- // persists them. The inbox subscriber copies the values from the inbound
+ // persists them. AgentRunGAgent copies the values from the inbound
// NeedsLlmReplyEvent verbatim.
string reply_token = 10;
int64 reply_token_expires_at_unix_ms = 11;
}
-// Per-delta streaming signal dispatched from the LLM inbox runtime to the conversation actor while
+// Per-delta streaming signal dispatched from AgentRunGAgent to the conversation actor while
// the reply is still being generated. The actor owns the outbound reply credential and the
// placeholder message identifier for the turn, so it must be the one issuing the relay placeholder
// send and subsequent edit calls. This message carries only the cumulative accumulated text for
@@ -87,6 +87,14 @@ message LlmReplyReadyEvent {
// in-memory keyed by `correlation_id`. This event must never be persisted — it is a runtime-only
// signal.
message LlmReplyStreamChunkEvent {
+ // Field 6 (`card_mode`) was a runtime-only routing flag that has been promoted to its own
+ // message type (`LlmReplyCardStreamChunkEvent`) so the structural contract of this domain-
+ // event-shaped envelope no longer carries any "should I re-route to a different sink?"
+ // signal. Reserved here so accidental reuse of the field number, or a stale serializer
+ // built before the split, fails loudly instead of silently flipping back to card mode.
+ reserved 6;
+ reserved "card_mode";
+
string correlation_id = 1;
string registration_id = 2;
// Clone of the inbound activity so the actor/turn runner can resolve the platform, conversation,
@@ -97,6 +105,24 @@ message LlmReplyStreamChunkEvent {
int64 chunk_at_unix_ms = 5;
}
+// Per-delta streaming signal for the Lark CardKit (card-mode) outbound path. Identical
+// payload to LlmReplyStreamChunkEvent, but a separate proto type so the routing decision is
+// structural: there is no boolean a misbehaving persistence layer can flip — the actor's
+// HandleLlmReplyCardStreamChunkAsync handler is reachable only via this type. Like its
+// edit-message sibling, this event is a runtime-only signal and must never be persisted to
+// the event store, projection, or any durable state.
+message LlmReplyCardStreamChunkEvent {
+ // Correlation id of the originating turn; the producing sink copies it from the inbound
+ // NeedsLlmReplyEvent so the consuming actor can match this chunk to its in-flight turn.
+ string correlation_id = 1;
+ // Registration id echoed verbatim from the inbound NeedsLlmReplyEvent.
+ string registration_id = 2;
+ // Clone of the inbound activity so the actor/runner can resolve the platform, conversation,
+ // delivery context, and TransportExtras (NyxUserAccessToken, NyxLarkChatId, NyxLarkUnionId)
+ // without re-reading from durable state.
+ aevatar.gagents.channel.abstractions.ChatActivity activity = 3;
+ // Current accumulated reply text (not a delta slice). Each chunk supersedes the previous one.
+ string accumulated_text = 4;
+ // Producer-side emission timestamp in unix milliseconds.
+ int64 chunk_at_unix_ms = 5;
+}
+
message DeferredLlmReplyDispatchRequestedEvent {
string correlation_id = 1;
int64 requested_at_unix_ms = 2;
@@ -128,7 +154,7 @@ message NyxRelayReplyTokenCleanupRequestedEvent {
int64 requested_at_unix_ms = 2;
}
-// Sent by ChannelLlmReplyInboxRuntime when its pre-LLM gates (stale age,
+// Sent by AgentRunGAgent when its pre-LLM gates (stale age,
// missing relay credential, malformed payload) refuse to process a deferred
// LLM reply. The actor consumes this to retire the matching pending entry
// from State.PendingLlmReplyRequests via a NotRetryable
diff --git a/agents/Aevatar.GAgents.NyxidChat/Aevatar.GAgents.NyxidChat.csproj b/agents/Aevatar.GAgents.NyxidChat/Aevatar.GAgents.NyxidChat.csproj
index 85b7ac4fb..4d1f9ab4d 100644
--- a/agents/Aevatar.GAgents.NyxidChat/Aevatar.GAgents.NyxidChat.csproj
+++ b/agents/Aevatar.GAgents.NyxidChat/Aevatar.GAgents.NyxidChat.csproj
@@ -24,6 +24,7 @@
+
@@ -35,10 +36,21 @@
+
+
+ all
+ runtime; build; native; contentfiles; analyzers; buildtransitive
+
+
+
+
diff --git a/agents/Aevatar.GAgents.NyxidChat/AgentRunDispatcher.cs b/agents/Aevatar.GAgents.NyxidChat/AgentRunDispatcher.cs
new file mode 100644
index 000000000..fadee6582
--- /dev/null
+++ b/agents/Aevatar.GAgents.NyxidChat/AgentRunDispatcher.cs
@@ -0,0 +1,65 @@
+using Aevatar.Foundation.Abstractions;
+using Aevatar.GAgents.Channel.Runtime;
+using Google.Protobuf.WellKnownTypes;
+using Microsoft.Extensions.Logging;
+
+namespace Aevatar.GAgents.NyxidChat;
+
+///
+/// Thin Channel.Runtime port implementation that creates the run actor and
+/// dispatches the start command. It holds no run state.
+///
+public sealed class AgentRunDispatcher : IChannelLlmReplyRunDispatcher
+{
+ private readonly IActorRuntime _actorRuntime;
+ private readonly IStreamProvider _streamProvider;
+ private readonly TimeProvider _timeProvider;
+ private readonly ILogger _logger;
+
+ // All dependencies except the clock are required; TimeProvider defaults to the
+ // system clock so production callers don't have to pass it while tests can inject
+ // a fake.
+ public AgentRunDispatcher(
+ IActorRuntime actorRuntime,
+ IStreamProvider streamProvider,
+ ILogger logger,
+ TimeProvider? timeProvider = null)
+ {
+ _actorRuntime = actorRuntime ?? throw new ArgumentNullException(nameof(actorRuntime));
+ _streamProvider = streamProvider ?? throw new ArgumentNullException(nameof(streamProvider));
+ _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+ _timeProvider = timeProvider ?? TimeProvider.System;
+ }
+
+ // Creates (or reuses) the run actor keyed by the request's correlation id and
+ // produces an AgentRunStartRequested envelope onto that actor's stream.
+ // Throws InvalidOperationException when correlation_id is missing, since the
+ // actor id and run id are both derived from it.
+ public async Task DispatchAsync(NeedsLlmReplyEvent request, CancellationToken ct)
+ {
+ ArgumentNullException.ThrowIfNull(request);
+ if (string.IsNullOrWhiteSpace(request.CorrelationId))
+ throw new InvalidOperationException("Deferred LLM reply request requires correlation_id for AgentRunGAgent dispatch.");
+
+ var runId = request.CorrelationId.Trim();
+ var actorId = AgentRunGAgent.BuildActorId(runId);
+ // Get-or-create. NOTE(review): two concurrent dispatches for the same runId could
+ // both observe GetAsync == null and race into CreateAsync — confirm IActorRuntime
+ // makes CreateAsync idempotent for an existing id.
+ var actor = await _actorRuntime.GetAsync(actorId)
+ ?? await _actorRuntime.CreateAsync(actorId, ct);
+
+ // Clone the request so later mutation by the caller cannot leak into the command.
+ var command = new AgentRunStartRequested
+ {
+ Request = request.Clone(),
+ };
+ var envelope = new EventEnvelope
+ {
+ Id = Guid.NewGuid().ToString("N"),
+ Timestamp = Timestamp.FromDateTimeOffset(_timeProvider.GetUtcNow()),
+ Payload = Any.Pack(command),
+ // Direct (point-to-point) route to the run actor; the correlation id rides in
+ // Propagation so downstream logging/tracing can join on the run.
+ Route = EnvelopeRouteSemantics.CreateDirect("channel-llm-reply-run-dispatcher", actor.Id),
+ Propagation = new EnvelopePropagation
+ {
+ CorrelationId = runId,
+ },
+ };
+
+ await _streamProvider.GetStream(actor.Id).ProduceAsync(envelope, ct);
+ _logger.LogInformation(
+ "Accepted deferred LLM reply run for actor inbox: runId={RunId} actorId={ActorId} target={TargetActorId}",
+ runId,
+ actor.Id,
+ request.TargetActorId);
+ }
+}
diff --git a/agents/Aevatar.GAgents.NyxidChat/AgentRunGAgent.cs b/agents/Aevatar.GAgents.NyxidChat/AgentRunGAgent.cs
new file mode 100644
index 000000000..f22375817
--- /dev/null
+++ b/agents/Aevatar.GAgents.NyxidChat/AgentRunGAgent.cs
@@ -0,0 +1,808 @@
+using Aevatar.AI.Abstractions.LLMProviders;
+using Aevatar.Foundation.Abstractions;
+using Aevatar.Foundation.Abstractions.Attributes;
+using Aevatar.Foundation.Abstractions.Runtime.Callbacks;
+using Aevatar.Foundation.Core;
+using Aevatar.Foundation.Core.EventSourcing;
+using Aevatar.GAgents.Channel.Abstractions;
+using Aevatar.GAgents.Channel.NyxIdRelay;
+using Aevatar.GAgents.Channel.Runtime;
+using Aevatar.Studio.Application.Studio.Abstractions;
+using Google.Protobuf;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+
+namespace Aevatar.GAgents.NyxidChat;
+
+///
+/// Run-scoped continuation owner for one deferred channel LLM reply.
+/// One actor instance exists per run (actor id = "channel-agent-run:" + correlation id);
+/// it gates the request, runs the reply generator, dispatches the ready/drop event back
+/// to the conversation actor, persists a terminal domain event, and schedules its own
+/// cleanup. NOTE(review): generic type arguments appear stripped in this extracted view
+/// (e.g. GAgentBase&lt;...&gt;, IReadOnlyDictionary&lt;...&gt;) — confirm against the repository.
+///
+public sealed class AgentRunGAgent : GAgentBase
+{
+ public const string ActorIdPrefix = "channel-agent-run:";
+
+ // Requests older than this are dropped before the LLM call (see stale gate in ProcessAsync).
+ internal const long MaxRunRequestAgeMs = 5 * 60 * 1000;
+
+ ///
+ /// Hard upper bound on a single LLM reply turn. Mirrors
+ /// NyxIdRelayOptions.ResponseTimeoutSeconds (default 300s).
+ /// A configured value of 0 or negative is treated as "disable the cap".
+ ///
+ internal const int FallbackTimeoutSecondsDefault = 300;
+
+ ///
+ /// Standalone budget for metadata enrichment (scope resolve + UserConfig lookup).
+ ///
+ internal static readonly TimeSpan MetadataBuildBudget = TimeSpan.FromSeconds(15);
+
+ // How long a terminal run lingers before destroying itself, and the retry delay for
+ // output dispatch failures; the callback-id prefixes keep the two durable timers distinct.
+ internal static readonly TimeSpan TerminalCleanupDelay = TimeSpan.FromMinutes(5);
+ private const string TerminalCleanupCallbackPrefix = "agent-run-terminal-cleanup";
+ internal static readonly TimeSpan OutputDispatchRetryDelay = TimeSpan.FromSeconds(5);
+ private const string OutputDispatchRetryCallbackPrefix = "agent-run-output-dispatch-retry";
+
+ private readonly IActorRuntime _actorRuntime;
+ private readonly IActorDispatchPort _actorDispatchPort;
+ private readonly IConversationReplyGenerator _replyGenerator;
+ private readonly IInteractiveReplyCollector? _interactiveReplyCollector;
+ private readonly Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? _relayOptions;
+ private readonly INyxIdRelayScopeResolver? _scopeResolver;
+ private readonly IUserConfigQueryPort? _userConfigQueryPort;
+ private readonly TimeProvider _timeProvider;
+ private readonly ILogger _logger;
+
+ // Core collaborators are required; collector, relay options, scope resolver, and
+ // user-config port are optional features the gates below null-check before use.
+ public AgentRunGAgent(
+ IActorRuntime actorRuntime,
+ IActorDispatchPort actorDispatchPort,
+ IConversationReplyGenerator replyGenerator,
+ IInteractiveReplyCollector? interactiveReplyCollector,
+ Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? relayOptions,
+ ILogger logger,
+ INyxIdRelayScopeResolver? scopeResolver = null,
+ IUserConfigQueryPort? userConfigQueryPort = null,
+ TimeProvider? timeProvider = null)
+ {
+ _actorRuntime = actorRuntime ?? throw new ArgumentNullException(nameof(actorRuntime));
+ _actorDispatchPort = actorDispatchPort ?? throw new ArgumentNullException(nameof(actorDispatchPort));
+ _replyGenerator = replyGenerator ?? throw new ArgumentNullException(nameof(replyGenerator));
+ _interactiveReplyCollector = interactiveReplyCollector;
+ _relayOptions = relayOptions;
+ _scopeResolver = scopeResolver;
+ _userConfigQueryPort = userConfigQueryPort;
+ _timeProvider = timeProvider ?? TimeProvider.System;
+ _logger = logger ?? throw new ArgumentNullException(nameof(logger));
+ }
+
+ // Deterministic actor id for a run: prefix + trimmed correlation id. Throws on
+ // null/whitespace input so a blank id can never alias all runs onto one actor.
+ public static string BuildActorId(string correlationId)
+ {
+ ArgumentException.ThrowIfNullOrWhiteSpace(correlationId);
+ return ActorIdPrefix + correlationId.Trim();
+ }
+
+ // Pure event-sourced fold: each persisted domain event is routed to its Apply*
+ // reducer; events with no matching reducer leave state unchanged (OrCurrent).
+ protected override AgentRunGAgentState TransitionState(AgentRunGAgentState current, IMessage evt) =>
+ StateTransitionMatcher
+ .Match(current, evt)
+ .On(ApplyStarted)
+ .On(ApplyReplyProduced)
+ .On(ApplyDropped)
+ .On(ApplyFailed)
+ .OrCurrent();
+
+ // Entry point for a run. Idempotent against redelivery: a terminal run only
+ // re-arms its cleanup timer; a mid-flight run (RunId already set) skips the
+ // AgentRunStartedEvent persist but re-executes ProcessAsync. Output dispatch
+ // failures are converted into a scheduled retry of this same command; any other
+ // exception is recorded as a Failed terminal state.
+ [EventHandler]
+ public async Task HandleStartAsync(AgentRunStartRequested command)
+ {
+ ArgumentNullException.ThrowIfNull(command);
+ if (command.Request is null)
+ {
+ _logger.LogWarning("Dropping malformed agent run start command without request: runActor={RunActorId}", Id);
+ return;
+ }
+
+ var request = command.Request.Clone();
+ var runId = NormalizeOptional(request.CorrelationId) ?? Id;
+ var startedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds();
+
+ if (State.Status is AgentRunStatus.ReplyProduced or AgentRunStatus.Dropped or AgentRunStatus.Failed)
+ {
+ _logger.LogInformation(
+ "Ignoring duplicate terminal agent run start: runId={RunId} status={Status}",
+ runId,
+ State.Status);
+ // Re-arm cleanup so a redelivered duplicate still gets the actor destroyed.
+ await ScheduleTerminalCleanupAsync(NormalizeOptional(State.RunId) ?? runId);
+ return;
+ }
+
+ if (string.IsNullOrWhiteSpace(State.RunId))
+ {
+ await PersistDomainEventAsync(new AgentRunStartedEvent
+ {
+ RunId = runId,
+ CorrelationId = request.CorrelationId,
+ TargetActorId = request.TargetActorId,
+ StartedAtUnixMs = startedAtUnixMs,
+ });
+ }
+
+ try
+ {
+ await ProcessAsync(request, runId);
+ }
+ catch (AgentRunOutputDispatchException ex)
+ {
+ // Output couldn't be delivered: prefer a scheduled in-actor retry; if that
+ // can't be armed, rethrow so the runtime's own retry takes over.
+ if (!await TryHandleOutputDispatchFailureAsync(request, runId, ex))
+ throw;
+ }
+ catch (Exception ex)
+ {
+ await FailAfterUnexpectedExceptionAsync(request, runId, ex);
+ }
+ }
+
+ // Durable-timer callback: self-destroys the actor once the run is terminal.
+ // The RunId guard ignores a stale cleanup timer that belongs to a different run
+ // that happened to reuse this actor id.
+ [EventHandler]
+ public async Task HandleCleanupAsync(AgentRunCleanupRequested command)
+ {
+ ArgumentNullException.ThrowIfNull(command);
+ if (State.Status is not (AgentRunStatus.ReplyProduced or AgentRunStatus.Dropped or AgentRunStatus.Failed))
+ return;
+ if (!string.IsNullOrWhiteSpace(command.RunId) &&
+ !string.IsNullOrWhiteSpace(State.RunId) &&
+ !string.Equals(command.RunId, State.RunId, StringComparison.Ordinal))
+ {
+ return;
+ }
+
+ await _actorRuntime.DestroyAsync(Id, CancellationToken.None);
+ }
+
+ // The run pipeline: malformed/stale/credential gates → metadata enrichment
+ // (bounded by MetadataBuildBudget) → LLM call (bounded by the fallback timeout)
+ // → ready-event dispatch → terminal persist. Every terminal path also schedules
+ // cleanup via the Persist*/Drop helpers.
+ private async Task ProcessAsync(NeedsLlmReplyEvent request, string runId)
+ {
+ _logger.LogInformation(
+ "Processing agent run LLM reply request: runId={RunId} correlation={CorrelationId} target={TargetActorId}",
+ runId,
+ request.CorrelationId,
+ request.TargetActorId);
+
+ if (request.Activity is null || string.IsNullOrWhiteSpace(request.TargetActorId))
+ {
+ _logger.LogWarning(
+ "Dropping malformed deferred LLM reply request: runId={RunId}, correlation={CorrelationId}, target={TargetActorId}",
+ runId,
+ request.CorrelationId,
+ request.TargetActorId);
+ await DropAsync(request, runId, "malformed_deferred_llm_reply_request");
+ return;
+ }
+
+ // Stale gate: NyxID relay reply tokens have a ~30 min TTL and the user access
+ // token used for the LLM call expires inside ~15 min. A request that has been
+ // delayed past the run window cannot lead to a successful reply.
+ var nowMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds();
+ if (request.RequestedAtUnixMs > 0 && nowMs - request.RequestedAtUnixMs > MaxRunRequestAgeMs)
+ {
+ _logger.LogInformation(
+ "Dropping stale LLM reply request: runId={RunId} correlation={CorrelationId} ageMs={AgeMs}",
+ runId,
+ request.CorrelationId,
+ nowMs - request.RequestedAtUnixMs);
+ await DropAsync(request, runId, "stale_agent_run_request_dropped");
+ return;
+ }
+
+ // Relay credential gate: relay turns require a fresh reply_token to send the
+ // outbound. A relay request with no command-carried token cannot be delivered,
+ // so skip the LLM call entirely.
+ if (IsRelayRequest(request) && string.IsNullOrWhiteSpace(request.ReplyToken))
+ {
+ _logger.LogWarning(
+ "Dropping relay LLM reply request without command-carried reply_token: runId={RunId} correlation={CorrelationId}",
+ runId,
+ request.CorrelationId);
+ await DropAsync(request, runId, "missing_relay_reply_token");
+ return;
+ }
+
+ await EnsureTargetActorAsync(request.TargetActorId);
+
+ string replyText;
+ MessageContent? outboundIntent = null;
+ var terminalState = LlmReplyTerminalState.Completed;
+ var errorCode = string.Empty;
+ var errorSummary = string.Empty;
+ using TurnStreamingReplySink? streamingSink = TryBuildStreamingSink(request, request.TargetActorId);
+
+ IReadOnlyDictionary effectiveMetadata;
+ using (var metadataCts = new CancellationTokenSource(MetadataBuildBudget))
+ {
+ try
+ {
+ effectiveMetadata = await BuildEffectiveMetadataAsync(request, metadataCts.Token);
+ }
+ catch (OperationCanceledException ex) when (metadataCts.IsCancellationRequested)
+ {
+ _logger.LogWarning(
+ ex,
+ "Deferred LLM reply metadata build timed out after {TimeoutSeconds}s: runId={RunId} correlation={CorrelationId}",
+ (int)MetadataBuildBudget.TotalSeconds,
+ runId,
+ request.CorrelationId);
+ replyText = "Sorry, I couldn't load your model preferences in time. Please try again.";
+ terminalState = LlmReplyTerminalState.Failed;
+ errorCode = "llm_reply_metadata_timeout";
+ errorSummary = $"Metadata enrichment exceeded {(int)MetadataBuildBudget.TotalSeconds}s budget.";
+ await FailAndDispatchReadyAsync(request, runId, replyText, outboundIntent, terminalState, errorCode, errorSummary);
+ return;
+ }
+ }
+
+ var fallbackTimeout = ResolveFallbackTimeout();
+ // A zero/negative configured timeout disables the cap: the CTS is created
+ // without a due time and only cancels if something external cancels it.
+ using var timeoutCts = fallbackTimeout > TimeSpan.Zero
+ ? new CancellationTokenSource(fallbackTimeout)
+ : new CancellationTokenSource();
+
+ try
+ {
+ IDisposable? interactiveReplyScope = null;
+ try
+ {
+ if (ShouldCaptureInteractiveReply(request.Activity))
+ interactiveReplyScope = _interactiveReplyCollector?.BeginScope();
+
+ replyText = await _replyGenerator.GenerateReplyAsync(
+ request.Activity,
+ effectiveMetadata,
+ streamingSink,
+ timeoutCts.Token) ?? string.Empty;
+ outboundIntent = _interactiveReplyCollector?.TryTake();
+ }
+ finally
+ {
+ interactiveReplyScope?.Dispose();
+ }
+
+ // Final flush is skipped when an interactive intent was captured: the
+ // interactive outbound supersedes the streamed plain-text reply.
+ if (streamingSink is not null &&
+ outboundIntent is null &&
+ !string.IsNullOrWhiteSpace(replyText))
+ {
+ await streamingSink.FinalizeAsync(replyText, CancellationToken.None);
+ }
+
+ if (outboundIntent is null && string.IsNullOrWhiteSpace(replyText))
+ {
+ terminalState = LlmReplyTerminalState.Failed;
+ errorCode = "empty_reply";
+ errorSummary = "Reply generator returned an empty response.";
+ replyText = "Sorry, I wasn't able to generate a response. Please try again.";
+ }
+ }
+ catch (OperationCanceledException ex) when (timeoutCts.IsCancellationRequested)
+ {
+ terminalState = LlmReplyTerminalState.Failed;
+ errorCode = "llm_reply_timeout";
+ errorSummary = $"LLM reply generation exceeded {(int)fallbackTimeout.TotalSeconds}s budget.";
+ replyText = "Sorry, this took too long to process - the model or one of its tools didn't " +
+ "respond in time. Please try again, or rephrase the request.";
+ _logger.LogWarning(
+ ex,
+ "Deferred LLM reply timed out after {TimeoutSeconds}s: runId={RunId} correlation={CorrelationId}",
+ (int)fallbackTimeout.TotalSeconds,
+ runId,
+ request.CorrelationId);
+ }
+ catch (Exception ex)
+ {
+ terminalState = LlmReplyTerminalState.Failed;
+ errorCode = "llm_reply_failed";
+ errorSummary = ex.Message;
+ // Classifier maps provider error text to a user-presentable message.
+ replyText = NyxIdRelayErrorClassifier.Classify(ex.Message);
+ _logger.LogWarning(
+ ex,
+ "Deferred LLM reply generation failed: runId={RunId} correlation={CorrelationId}",
+ runId,
+ request.CorrelationId);
+ }
+
+ if (terminalState == LlmReplyTerminalState.Failed)
+ {
+ await FailAndDispatchReadyAsync(
+ request,
+ runId,
+ replyText,
+ outboundIntent,
+ terminalState,
+ errorCode,
+ errorSummary);
+ return;
+ }
+
+ await DispatchReadyEventAsync(request, replyText, outboundIntent, terminalState, errorCode, errorSummary);
+ await PersistReplyProducedAsync(request, runId, terminalState, errorCode, errorSummary);
+ }
+
+ // Failure terminal path: notify the conversation actor first, then persist the
+ // Failed event. Dispatch happens before persist so a dispatch failure leaves the
+ // run retryable instead of terminally failed without notification.
+ private async Task FailAndDispatchReadyAsync(
+ NeedsLlmReplyEvent request,
+ string runId,
+ string replyText,
+ MessageContent? outboundIntent,
+ LlmReplyTerminalState terminalState,
+ string errorCode,
+ string errorSummary)
+ {
+ await DispatchReadyEventAsync(request, replyText, outboundIntent, terminalState, errorCode, errorSummary);
+ await PersistFailedAsync(request, runId, errorCode, errorSummary);
+ }
+
+ // Drop terminal path: best-effort notify (only when target/correlation exist),
+ // persist the Dropped event with the machine-readable reason, then arm cleanup.
+ private async Task DropAsync(NeedsLlmReplyEvent request, string runId, string reason)
+ {
+ if (CanNotifyDrop(request))
+ await DispatchDropNotificationAsync(request, reason);
+
+ await PersistDomainEventAsync(new AgentRunDroppedEvent
+ {
+ RunId = runId,
+ CorrelationId = request.CorrelationId,
+ TargetActorId = request.TargetActorId,
+ Reason = reason,
+ DroppedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ });
+
+ await ScheduleTerminalCleanupAsync(runId);
+ }
+
+ // Success terminal path: persist ReplyProduced and arm cleanup.
+ private async Task PersistReplyProducedAsync(
+ NeedsLlmReplyEvent request,
+ string runId,
+ LlmReplyTerminalState terminalState,
+ string errorCode,
+ string errorSummary)
+ {
+ await PersistDomainEventAsync(new AgentRunReplyProducedEvent
+ {
+ RunId = runId,
+ CorrelationId = request.CorrelationId,
+ TargetActorId = request.TargetActorId,
+ TerminalState = terminalState,
+ ErrorCode = errorCode,
+ ErrorSummary = errorSummary,
+ ProducedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ });
+
+ await ScheduleTerminalCleanupAsync(runId);
+ }
+
+ // Failed terminal path: persist Failed and arm cleanup.
+ private async Task PersistFailedAsync(
+ NeedsLlmReplyEvent request,
+ string runId,
+ string errorCode,
+ string errorSummary)
+ {
+ await PersistDomainEventAsync(new AgentRunFailedEvent
+ {
+ RunId = runId,
+ CorrelationId = request.CorrelationId,
+ TargetActorId = request.TargetActorId,
+ ErrorCode = errorCode,
+ ErrorSummary = errorSummary,
+ FailedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ });
+
+ await ScheduleTerminalCleanupAsync(runId);
+ }
+
+ // Last-resort handler for exceptions ProcessAsync didn't classify. Tries to tell
+ // the conversation actor (when the request is well-formed enough to address it),
+ // falls into the dispatch-retry path if that notification itself fails, and
+ // otherwise persists a Failed terminal event.
+ private async Task FailAfterUnexpectedExceptionAsync(NeedsLlmReplyEvent request, string runId, Exception ex)
+ {
+ const string errorCode = "agent_run_unhandled_exception";
+ var errorSummary = ex.Message;
+ _logger.LogError(
+ ex,
+ "Agent run failed with unhandled exception: runId={RunId} correlation={CorrelationId}",
+ runId,
+ request.CorrelationId);
+
+ if (request.Activity is not null && !string.IsNullOrWhiteSpace(request.TargetActorId))
+ {
+ try
+ {
+ await DispatchReadyEventAsync(
+ request,
+ "Sorry, I couldn't complete this reply. Please try again.",
+ null,
+ LlmReplyTerminalState.Failed,
+ errorCode,
+ errorSummary);
+ }
+ catch (AgentRunOutputDispatchException dispatchEx)
+ {
+ // Retry scheduled → return without persisting Failed, so the rerun is
+ // not suppressed by a terminal state. Unschedulable → rethrow.
+ if (!await TryHandleOutputDispatchFailureAsync(request, runId, dispatchEx))
+ throw;
+ return;
+ }
+ }
+
+ await PersistFailedAsync(request, runId, errorCode, errorSummary);
+ }
+
+ // Builds and sends the LlmReplyReadyEvent to the conversation actor. Send errors
+ // are wrapped in AgentRunOutputDispatchException so callers can route them into
+ // the scheduled-retry path instead of failing the run outright.
+ private async Task DispatchReadyEventAsync(
+ NeedsLlmReplyEvent request,
+ string replyText,
+ MessageContent? outboundIntent,
+ LlmReplyTerminalState terminalState,
+ string errorCode,
+ string errorSummary)
+ {
+ if (string.IsNullOrWhiteSpace(request.TargetActorId))
+ return;
+
+ var ready = new LlmReplyReadyEvent
+ {
+ CorrelationId = request.CorrelationId,
+ RegistrationId = request.RegistrationId,
+ SourceActorId = Id,
+ Activity = request.Activity!.Clone(),
+ // Interactive intent (when captured) wins over the plain-text reply.
+ Outbound = outboundIntent?.Clone() ?? new MessageContent { Text = replyText },
+ TerminalState = terminalState,
+ ErrorCode = errorCode,
+ ErrorSummary = errorSummary,
+ ReadyAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ // Echo the command-only relay credential straight back so ConversationGAgent's
+ // outbound reply does not depend on its in-memory token dict still having the
+ // entry. The actor consumes these fields and never persists them.
+ ReplyToken = request.ReplyToken ?? string.Empty,
+ ReplyTokenExpiresAtUnixMs = request.ReplyTokenExpiresAtUnixMs,
+ };
+ try
+ {
+ await SendToAsync(request.TargetActorId, ready, CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ throw new AgentRunOutputDispatchException(
+ $"Failed to send LLM reply ready event to conversation actor '{request.TargetActorId}'.",
+ ex);
+ }
+ }
+
+ // Returns a streaming sink only when streaming is enabled AND the activity carries
+ // a complete relay delivery context; otherwise null (non-streaming turn). Card
+ // mode swaps the flush interval and lifts the interim-chunk cap to int.MaxValue,
+ // since CardKit streaming is not subject to the edit-count limit.
+ private TurnStreamingReplySink? TryBuildStreamingSink(NeedsLlmReplyEvent request, string targetActorId)
+ {
+ if (_relayOptions is not { StreamingRepliesEnabled: true })
+ return null;
+ if (request.Activity?.OutboundDelivery is not
+ {
+ ReplyMessageId.Length: > 0,
+ CorrelationId.Length: > 0,
+ })
+ {
+ return null;
+ }
+ if (string.IsNullOrWhiteSpace(request.CorrelationId))
+ return null;
+
+ var cardMode = _relayOptions.StreamingCardKitEnabled;
+ var throttle = TimeSpan.FromMilliseconds(Math.Max(0, cardMode
+ ? _relayOptions.StreamingCardKitFlushIntervalMs
+ : _relayOptions.StreamingFlushIntervalMs));
+ var maxInterimChunks = cardMode
+ ? int.MaxValue
+ : Math.Max(0, _relayOptions.StreamingMaxInterimChunks);
+ return new TurnStreamingReplySink(
+ _actorDispatchPort,
+ targetActorId,
+ request.CorrelationId,
+ request.RegistrationId,
+ request.Activity.Clone(),
+ throttle,
+ _timeProvider,
+ _logger,
+ maxInterimChunks,
+ cardMode);
+ }
+
+ // Copies the request metadata, layers bot-owner LLM config on top, then overlays
+ // the NyxID user access token (same token for both access and org keys).
+ private async Task> BuildEffectiveMetadataAsync(
+ NeedsLlmReplyEvent request,
+ CancellationToken ct)
+ {
+ var metadata = new Dictionary(request.Metadata, StringComparer.Ordinal);
+
+ await ApplyBotOwnerLlmConfigAsync(request, metadata, ct);
+
+ var userAccessToken = request.Activity?.TransportExtras?.NyxUserAccessToken?.Trim();
+ if (!string.IsNullOrWhiteSpace(userAccessToken))
+ {
+ metadata[LLMRequestMetadataKeys.NyxIdAccessToken] = userAccessToken;
+ metadata[LLMRequestMetadataKeys.NyxIdOrgToken] = userAccessToken;
+ }
+
+ return metadata;
+ }
+
+ // Best-effort enrichment: resolves the bot owner's scope from the bot api key,
+ // loads that scope's UserConfig, and writes model/route/tool-round overrides into
+ // the metadata. Every failure is logged-and-swallowed on purpose — missing config
+ // must never fail the reply turn.
+ private async Task ApplyBotOwnerLlmConfigAsync(
+ NeedsLlmReplyEvent request,
+ IDictionary metadata,
+ CancellationToken ct)
+ {
+ if (_scopeResolver is null || _userConfigQueryPort is null)
+ return;
+
+ var apiKeyId = request.Activity?.Bot?.Value?.Trim();
+ if (string.IsNullOrWhiteSpace(apiKeyId))
+ return;
+
+ string? scopeId;
+ try
+ {
+ scopeId = await _scopeResolver.ResolveScopeIdByApiKeyAsync(apiKeyId, ct);
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Failed to resolve bot owner scope id for LLM config: runId={RunId} correlation={CorrelationId} apiKeyId={ApiKeyId}",
+ Id,
+ request.CorrelationId,
+ apiKeyId);
+ return;
+ }
+
+ if (string.IsNullOrWhiteSpace(scopeId))
+ {
+ _logger.LogDebug(
+ "No bot owner scope id resolved for LLM config: runId={RunId} correlation={CorrelationId} apiKeyId={ApiKeyId}",
+ Id,
+ request.CorrelationId,
+ apiKeyId);
+ return;
+ }
+
+ try
+ {
+ var config = await _userConfigQueryPort.GetAsync(scopeId, ct);
+ if (!string.IsNullOrWhiteSpace(config.DefaultModel))
+ metadata[LLMRequestMetadataKeys.ModelOverride] = config.DefaultModel.Trim();
+ if (!string.IsNullOrWhiteSpace(config.PreferredLlmRoute))
+ metadata[LLMRequestMetadataKeys.NyxIdRoutePreference] = config.PreferredLlmRoute.Trim();
+ if (config.MaxToolRounds > 0)
+ metadata[LLMRequestMetadataKeys.MaxToolRoundsOverride] =
+ config.MaxToolRounds.ToString(System.Globalization.CultureInfo.InvariantCulture);
+
+ _logger.LogInformation(
+ "Applied bot owner LLM config: runId={RunId} correlation={CorrelationId} scopeId={ScopeId} model={Model} route={Route}",
+ Id,
+ request.CorrelationId,
+ scopeId,
+ string.IsNullOrWhiteSpace(config.DefaultModel) ? "" : config.DefaultModel,
+ string.IsNullOrWhiteSpace(config.PreferredLlmRoute) ? "" : config.PreferredLlmRoute);
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Failed to load bot owner LLM config: runId={RunId} correlation={CorrelationId} scopeId={ScopeId}",
+ Id,
+ request.CorrelationId,
+ scopeId);
+ }
+ }
+
+ // Timeout resolution: no options → compiled default (300s); configured <= 0 →
+ // TimeSpan.Zero, which ProcessAsync interprets as "no cap".
+ private TimeSpan ResolveFallbackTimeout()
+ {
+ if (_relayOptions is null)
+ return TimeSpan.FromSeconds(FallbackTimeoutSecondsDefault);
+ var configured = _relayOptions.ResponseTimeoutSeconds;
+ if (configured <= 0)
+ return TimeSpan.Zero;
+ return TimeSpan.FromSeconds(configured);
+ }
+
+ // A relay turn is identified by a complete outbound delivery context
+ // (reply message id + correlation id) on the activity.
+ private static bool IsRelayRequest(NeedsLlmReplyEvent request) =>
+ request.Activity?.OutboundDelivery is
+ {
+ ReplyMessageId.Length: > 0,
+ CorrelationId.Length: > 0,
+ };
+
+ // A drop notification needs both an addressable target and a correlation id.
+ private static bool CanNotifyDrop(NeedsLlmReplyEvent request) =>
+ !string.IsNullOrWhiteSpace(request.TargetActorId) &&
+ !string.IsNullOrWhiteSpace(request.CorrelationId);
+
+ // Sends the drop event to the conversation actor; like the ready dispatch, send
+ // errors are wrapped in AgentRunOutputDispatchException for the retry path.
+ private async Task DispatchDropNotificationAsync(NeedsLlmReplyEvent request, string reason)
+ {
+ var dropped = new DeferredLlmReplyDroppedEvent
+ {
+ CorrelationId = request.CorrelationId,
+ Reason = reason,
+ DroppedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ };
+
+ try
+ {
+ await SendToAsync(request.TargetActorId, dropped, CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ throw new AgentRunOutputDispatchException(
+ $"Failed to send deferred LLM reply drop event to conversation actor '{request.TargetActorId}' (reason '{reason}').",
+ ex);
+ }
+ }
+
+ // Returns true when a durable retry of the start command was armed (caller
+ // swallows the dispatch failure); false means the caller should rethrow and let
+ // the runtime retry the whole handler.
+ private async Task TryHandleOutputDispatchFailureAsync(
+ NeedsLlmReplyEvent request,
+ string runId,
+ AgentRunOutputDispatchException ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Agent run output notification was not accepted; run remains retryable: runId={RunId} correlation={CorrelationId}",
+ runId,
+ request.CorrelationId);
+
+ if (await TryScheduleStartRetryAsync(request, runId))
+ return true;
+
+ _logger.LogWarning(
+ ex,
+ "Agent run output retry could not be scheduled; propagating to runtime retry: runId={RunId} correlation={CorrelationId}",
+ runId,
+ request.CorrelationId);
+ return false;
+ }
+
+ // Arms a durable self-timeout that re-delivers AgentRunStartRequested after
+ // OutputDispatchRetryDelay. The Services.GetService probe checks the durable
+ // timer facility is available before attempting to schedule.
+ private async Task TryScheduleStartRetryAsync(NeedsLlmReplyEvent request, string runId)
+ {
+ if (Services.GetService() is null)
+ return false;
+
+ try
+ {
+ await ScheduleSelfDurableTimeoutAsync(
+ BuildOutputDispatchRetryCallbackId(runId),
+ OutputDispatchRetryDelay,
+ new AgentRunStartRequested
+ {
+ Request = request.Clone(),
+ },
+ ct: CancellationToken.None);
+ return true;
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Failed to schedule agent run output retry: runId={RunId} actorId={ActorId}",
+ runId,
+ Id);
+ return false;
+ }
+ }
+
+ // Best-effort: arms the terminal self-destruct timer. A scheduling failure is
+ // logged and swallowed — worst case the actor lingers until external reclamation.
+ private async Task ScheduleTerminalCleanupAsync(string runId)
+ {
+ if (Services.GetService() is null)
+ return;
+
+ try
+ {
+ await ScheduleSelfDurableTimeoutAsync(
+ BuildCleanupCallbackId(runId),
+ TerminalCleanupDelay,
+ new AgentRunCleanupRequested
+ {
+ RunId = runId,
+ RequestedAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
+ },
+ ct: CancellationToken.None);
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Failed to schedule terminal agent run cleanup: runId={RunId} actorId={ActorId}",
+ runId,
+ Id);
+ }
+ }
+
+ // Sanitizes the run id into a callback-safe token: non [A-Za-z0-9_-] chars become
+ // '_', truncated to 96 chars. NOTE(review): this and BuildOutputDispatchRetryCallbackId
+ // duplicate the sanitization — consider extracting a shared helper. Truncation can
+ // collide two very long run ids onto one callback id; confirm run ids stay short.
+ private static string BuildCleanupCallbackId(string runId)
+ {
+ var normalized = NormalizeOptional(runId) ?? "unknown";
+ var chars = normalized
+ .Select(static ch => char.IsLetterOrDigit(ch) || ch is '-' or '_' ? ch : '_')
+ .Take(96)
+ .ToArray();
+ return $"{TerminalCleanupCallbackPrefix}:{new string(chars)}";
+ }
+
+ // Same sanitization as BuildCleanupCallbackId, with the retry prefix.
+ private static string BuildOutputDispatchRetryCallbackId(string runId)
+ {
+ var normalized = NormalizeOptional(runId) ?? "unknown";
+ var chars = normalized
+ .Select(static ch => char.IsLetterOrDigit(ch) || ch is '-' or '_' ? ch : '_')
+ .Take(96)
+ .ToArray();
+ return $"{OutputDispatchRetryCallbackPrefix}:{new string(chars)}";
+ }
+
+ // Ensures the conversation actor exists before events are sent to it.
+ // NOTE(review): get-then-create is racy under concurrent runs targeting the same
+ // actor — confirm IActorRuntime.CreateAsync tolerates an already-existing id.
+ private async Task EnsureTargetActorAsync(string targetActorId)
+ {
+ if (string.IsNullOrWhiteSpace(targetActorId))
+ return;
+
+ var actor = await _actorRuntime.GetAsync(targetActorId);
+ if (actor is null)
+ await _actorRuntime.CreateAsync(targetActorId, CancellationToken.None);
+ }
+
+ // Interactive-reply capture requires: a collector registered, the feature not
+ // explicitly disabled in options, and a relay-shaped delivery context.
+ private bool ShouldCaptureInteractiveReply(ChatActivity? activity)
+ {
+ if (_interactiveReplyCollector is null)
+ return false;
+
+ if (_relayOptions is { InteractiveRepliesEnabled: false })
+ return false;
+
+ return activity?.OutboundDelivery is
+ {
+ ReplyMessageId.Length: > 0,
+ CorrelationId.Length: > 0,
+ };
+ }
+
+ // Reducer: first (and only) Started event stamps identity + Started status.
+ private static AgentRunGAgentState ApplyStarted(AgentRunGAgentState current, AgentRunStartedEvent evt)
+ {
+ var next = current.Clone();
+ next.RunId = evt.RunId;
+ next.CorrelationId = evt.CorrelationId;
+ next.TargetActorId = evt.TargetActorId;
+ next.Status = AgentRunStatus.Started;
+ next.StartedAtUnixMs = evt.StartedAtUnixMs;
+ return next;
+ }
+
+ // Reducer: terminal ReplyProduced. Identity fields only backfill when empty so a
+ // replayed terminal event can't overwrite what Started already recorded.
+ private static AgentRunGAgentState ApplyReplyProduced(
+ AgentRunGAgentState current,
+ AgentRunReplyProducedEvent evt)
+ {
+ var next = current.Clone();
+ next.RunId = string.IsNullOrWhiteSpace(next.RunId) ? evt.RunId : next.RunId;
+ next.CorrelationId = string.IsNullOrWhiteSpace(next.CorrelationId) ? evt.CorrelationId : next.CorrelationId;
+ next.TargetActorId = string.IsNullOrWhiteSpace(next.TargetActorId) ? evt.TargetActorId : next.TargetActorId;
+ next.Status = AgentRunStatus.ReplyProduced;
+ next.CompletedAtUnixMs = evt.ProducedAtUnixMs;
+ next.ErrorCode = evt.ErrorCode;
+ next.ErrorSummary = evt.ErrorSummary;
+ return next;
+ }
+
+ // Reducer: terminal Dropped. The drop reason is stored in ErrorCode.
+ private static AgentRunGAgentState ApplyDropped(AgentRunGAgentState current, AgentRunDroppedEvent evt)
+ {
+ var next = current.Clone();
+ next.RunId = string.IsNullOrWhiteSpace(next.RunId) ? evt.RunId : next.RunId;
+ next.CorrelationId = string.IsNullOrWhiteSpace(next.CorrelationId) ? evt.CorrelationId : next.CorrelationId;
+ next.TargetActorId = string.IsNullOrWhiteSpace(next.TargetActorId) ? evt.TargetActorId : next.TargetActorId;
+ next.Status = AgentRunStatus.Dropped;
+ next.CompletedAtUnixMs = evt.DroppedAtUnixMs;
+ next.ErrorCode = evt.Reason;
+ next.ErrorSummary = string.Empty;
+ return next;
+ }
+
+ // Reducer: terminal Failed.
+ private static AgentRunGAgentState ApplyFailed(AgentRunGAgentState current, AgentRunFailedEvent evt)
+ {
+ var next = current.Clone();
+ next.RunId = string.IsNullOrWhiteSpace(next.RunId) ? evt.RunId : next.RunId;
+ next.CorrelationId = string.IsNullOrWhiteSpace(next.CorrelationId) ? evt.CorrelationId : next.CorrelationId;
+ next.TargetActorId = string.IsNullOrWhiteSpace(next.TargetActorId) ? evt.TargetActorId : next.TargetActorId;
+ next.Status = AgentRunStatus.Failed;
+ next.CompletedAtUnixMs = evt.FailedAtUnixMs;
+ next.ErrorCode = evt.ErrorCode;
+ next.ErrorSummary = evt.ErrorSummary;
+ return next;
+ }
+
+ // Trim-to-null normalization for optional string fields.
+ private static string? NormalizeOptional(string? value)
+ {
+ var trimmed = value?.Trim();
+ return string.IsNullOrWhiteSpace(trimmed) ? null : trimmed;
+ }
+
+ // Marker exception: distinguishes "the output event could not be delivered"
+ // (retryable) from the run's own processing failures.
+ private sealed class AgentRunOutputDispatchException(string message, Exception innerException)
+ : Exception(message, innerException);
+}
diff --git a/agents/Aevatar.GAgents.NyxidChat/ChannelCardConversationTurnRunner.cs b/agents/Aevatar.GAgents.NyxidChat/ChannelCardConversationTurnRunner.cs
new file mode 100644
index 000000000..3e6d38c98
--- /dev/null
+++ b/agents/Aevatar.GAgents.NyxidChat/ChannelCardConversationTurnRunner.cs
@@ -0,0 +1,398 @@
+using System.Text.Json;
+using Aevatar.AI.ToolProviders.Lark;
+using Aevatar.GAgents.Channel.Abstractions;
+using Aevatar.GAgents.Channel.Runtime;
+using Aevatar.GAgents.Platform.Lark;
+using Microsoft.Extensions.Logging;
+
+namespace Aevatar.GAgents.NyxidChat;
+
+///
+/// Production for the Lark CardKit streaming
+/// path. Composes (cardkit/v1/* endpoints) with
+/// (im/v1/messages with msg_type=interactive)
+/// to drive the create → send → stream → finalize lifecycle. Auth: bot owner's NyxID
+/// access token from activity.TransportExtras.NyxUserAccessToken; receive target:
+/// nyx_lark_chat_id for groups, falling back to nyx_lark_union_id for p2p
+/// DMs (cross-app safe per the proto's documented invariants).
+///
/// <summary>
/// Production <see cref="IConversationCardTurnRunner"/> for the Lark CardKit streaming
/// path. Composes <see cref="ILarkCardKitClient"/> (cardkit/v1/* endpoints) with
/// <see cref="ILarkNyxClient"/> (im/v1/messages with msg_type=interactive)
/// to drive the create → send → stream → finalize lifecycle. Auth: bot owner's NyxID
/// access token from activity.TransportExtras.NyxUserAccessToken; receive target:
/// <c>nyx_lark_chat_id</c> for groups, falling back to <c>nyx_lark_union_id</c> for p2p
/// DMs (cross-app safe per the proto's documented invariants).
/// </summary>
public sealed class ChannelCardConversationTurnRunner : IConversationCardTurnRunner
{
    private static readonly JsonSerializerOptions JsonOptions = new();

    private readonly ILarkCardKitClient _cardKit;
    private readonly ILarkNyxClient _larkClient;
    private readonly ILogger<ChannelCardConversationTurnRunner> _logger;

    public ChannelCardConversationTurnRunner(
        ILarkCardKitClient cardKit,
        ILarkNyxClient larkClient,
        ILogger<ChannelCardConversationTurnRunner> logger)
    {
        _cardKit = cardKit ?? throw new ArgumentNullException(nameof(cardKit));
        _larkClient = larkClient ?? throw new ArgumentNullException(nameof(larkClient));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Create a CardKit card, bind it to the chat via an interactive message, and
    /// write the first chunk's text into the streaming element. Failures before
    /// the send are plain <c>Failed</c> results; failures after the send return
    /// <c>PostSendFailed</c> because an (empty) card is already visible in chat.
    /// </summary>
    public async Task<ConversationCardCreateResult> RunCardCreateAsync(
        LlmReplyCardStreamChunkEvent chunk,
        string streamingElementId,
        ConversationTurnRuntimeContext runtimeContext,
        CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(chunk);
        if (chunk.Activity is null)
            return ConversationCardCreateResult.Failed("activity_required", "Stream chunk event is missing the source activity.");

        var token = ResolveToken(chunk.Activity);
        if (token is null)
            return ConversationCardCreateResult.Failed("token_missing", "NyxID user access token is missing on the activity's TransportExtras.");

        var receiveTarget = ResolveReceiveTarget(chunk.Activity);
        if (receiveTarget is null)
            return ConversationCardCreateResult.Failed("receive_target_missing", "Lark chat_id and union_id are both missing on TransportExtras.");

        // 1. Allocate a CardKit entity holding an empty streaming element. The first chunk's
        //    text lands via StreamElementContentAsync (step 3) so the card_json schema and
        //    the streaming wire format stay decoupled.
        var initialCardJson = LarkStreamingCardShell.BuildInitialCardJson(streamingElementId);
        string createResponse;
        try
        {
            createResponse = await _cardKit.CreateCardAsync(
                token,
                new LarkCardKitCreateRequest("card_json", initialCardJson),
                ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "CardKit card.create threw for correlation={CorrelationId}", chunk.CorrelationId);
            return ConversationCardCreateResult.Failed("card_create_threw", ex.Message);
        }

        if (LarkProxyResponseParser.TryParseError(createResponse, out var createError))
            return ClassifyCreateFailure("card_create_failed", createError);

        var cardId = ExtractCardId(createResponse);
        if (string.IsNullOrWhiteSpace(cardId))
            return ConversationCardCreateResult.Failed("card_id_missing", "card.create response did not include data.card_id.");

        // 2. Bind the card to the chat by sending an interactive message that references it.
        var contentJson = JsonSerializer.Serialize(
            new { type = "card", data = new { card_id = cardId } },
            JsonOptions);
        string sendResponse;
        try
        {
            sendResponse = await _larkClient.SendMessageAsync(
                token,
                new LarkSendMessageRequest(
                    TargetType: receiveTarget.Value.ReceiveIdType,
                    TargetId: receiveTarget.Value.ReceiveId,
                    MessageType: "interactive",
                    ContentJson: contentJson,
                    IdempotencyKey: chunk.CorrelationId),
                ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Card send-to-chat threw for correlation={CorrelationId}, card_id={CardId}", chunk.CorrelationId, cardId);
            return ConversationCardCreateResult.Failed("card_send_threw", ex.Message);
        }

        if (LarkProxyResponseParser.TryParseError(sendResponse, out var sendError))
            return ClassifyCreateFailure("card_send_failed", sendError);

        var cardMessageId = LarkProxyResponseParser.ParseSendSuccess(sendResponse).MessageId
            ?? string.Empty;

        // 3. Write the first chunk's text into the streaming element. Sequence = 1 (the
        //    grain pre-allocates this value; subsequent chunks pass sequence+1 each call).
        //    The card has already been bound to the chat (step 2), so any failure from here
        //    on is a *post-send* failure: an empty card is visible in the chat. We must
        //    return PostSendFailed (not Failed) so the actor terminates the turn instead
        //    of falling back to text-edit and producing a duplicate reply.
        string firstStreamResponse;
        try
        {
            firstStreamResponse = await _cardKit.StreamElementContentAsync(
                token,
                new LarkCardKitStreamElementContentRequest(
                    CardId: cardId,
                    ElementId: streamingElementId,
                    Content: chunk.AccumulatedText,
                    Sequence: 1,
                    IdempotencyKey: $"{chunk.CorrelationId}-1"),
                ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "CardKit first stream threw for correlation={CorrelationId}, card_id={CardId}", chunk.CorrelationId, cardId);
            await TryBestEffortCloseStreamingAsync(token, cardId, sequence: 2, ct).ConfigureAwait(false);
            return ConversationCardCreateResult.PostSendFailed(
                cardId,
                cardMessageId,
                "card_first_stream_threw",
                ex.Message);
        }

        if (LarkProxyResponseParser.TryParseError(firstStreamResponse, out var firstStreamError))
        {
            await TryBestEffortCloseStreamingAsync(token, cardId, sequence: 2, ct).ConfigureAwait(false);
            return ClassifyPostSendFailure(cardId, cardMessageId, "card_first_stream_failed", firstStreamError);
        }

        return ConversationCardCreateResult.Succeeded(cardId, cardMessageId);
    }

    /// <summary>
    /// Best-effort settings patch to close <c>streaming_mode</c> on a card whose first
    /// content write failed. Stops the typewriter cursor on the orphan empty card so the
    /// chat does not show a perpetually-loading bubble. Failures are logged and swallowed —
    /// the parent operation has already failed; this is a UX cleanup, not a correctness gate.
    /// </summary>
    private async Task TryBestEffortCloseStreamingAsync(string token, string cardId, long sequence, CancellationToken ct)
    {
        try
        {
            await _cardKit.SetCardSettingsAsync(
                token,
                new LarkCardKitSettingsRequest(
                    CardId: cardId,
                    SettingsJson: """{"streaming_mode": false}""",
                    Sequence: sequence,
                    IdempotencyKey: $"orphan-close-{cardId}"),
                ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Best-effort close of orphan streaming card failed; cursor may remain visible. card_id={CardId}", cardId);
        }
    }

    /// <summary>
    /// Push an interim accumulated-text update into the card's streaming element at the
    /// given sequence. The idempotency key is derived from correlation-id + sequence so
    /// retries of the same chunk are deduplicated upstream.
    /// </summary>
    public async Task<ConversationCardStreamResult> RunCardStreamAsync(
        LlmReplyCardStreamChunkEvent chunk,
        string cardId,
        string elementId,
        long sequence,
        ConversationTurnRuntimeContext runtimeContext,
        CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(chunk);
        if (chunk.Activity is null)
            return ConversationCardStreamResult.Failed("activity_required", "Stream chunk event is missing the source activity.");

        var token = ResolveToken(chunk.Activity);
        if (token is null)
            return ConversationCardStreamResult.Failed("token_missing", "NyxID user access token is missing on the activity's TransportExtras.");

        string response;
        try
        {
            response = await _cardKit.StreamElementContentAsync(
                token,
                new LarkCardKitStreamElementContentRequest(
                    CardId: cardId,
                    ElementId: elementId,
                    Content: chunk.AccumulatedText,
                    Sequence: sequence,
                    IdempotencyKey: $"{chunk.CorrelationId}-{sequence}"),
                ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "CardKit interim stream threw for correlation={CorrelationId}, card_id={CardId}, seq={Sequence}", chunk.CorrelationId, cardId, sequence);
            return ConversationCardStreamResult.Failed("card_stream_threw", ex.Message);
        }

        if (LarkProxyResponseParser.TryParseError(response, out var error))
            return ClassifyStreamFailure(error);

        return ConversationCardStreamResult.Succeeded();
    }

    /// <summary>
    /// Finalize the card: flush the final text if it drifted from the last interim
    /// flush, then close streaming mode. The write-before-close order matters —
    /// closing first would freeze the cursor on stale text. Partial-failure results
    /// carry <c>finalTextWritten</c> so the actor knows which text the user sees.
    /// </summary>
    public async Task<ConversationCardFinalizeResult> RunCardFinalizeAsync(
        ChatActivity referenceActivity,
        string cardId,
        string elementId,
        string finalText,
        bool finalTextDiffersFromLastFlushed,
        long sequence,
        ConversationTurnRuntimeContext runtimeContext,
        CancellationToken ct)
    {
        ArgumentNullException.ThrowIfNull(referenceActivity);

        var token = ResolveToken(referenceActivity);
        if (token is null)
            return ConversationCardFinalizeResult.Failed("token_missing", "NyxID user access token is missing on the reference activity's TransportExtras.");

        // 1. If final text drifted from the last flushed interim, write it before closing
        //    streaming mode. Order matters: closing streaming first would freeze the cursor
        //    on the stale text. Track whether the trailing write actually landed so the
        //    actor can pick the right user-visible text on a partial-failure terminal.
        long workingSequence = sequence;
        var finalTextWritten = !finalTextDiffersFromLastFlushed || string.IsNullOrWhiteSpace(finalText);
        if (finalTextDiffersFromLastFlushed && !string.IsNullOrWhiteSpace(finalText))
        {
            try
            {
                var streamFinalResponse = await _cardKit.StreamElementContentAsync(
                    token,
                    new LarkCardKitStreamElementContentRequest(
                        CardId: cardId,
                        ElementId: elementId,
                        Content: finalText,
                        Sequence: workingSequence,
                        IdempotencyKey: $"final-{cardId}-{workingSequence}"),
                    ct);
                if (LarkProxyResponseParser.TryParseError(streamFinalResponse, out var streamFinalError))
                    return ConversationCardFinalizeResult.Failed("card_final_stream_failed", streamFinalError, finalTextWritten: false);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "CardKit final stream threw for card_id={CardId}, seq={Sequence}", cardId, workingSequence);
                return ConversationCardFinalizeResult.Failed("card_final_stream_threw", ex.Message, finalTextWritten: false);
            }
            finalTextWritten = true;
            workingSequence++;
        }

        // 2. Close the card's streaming mode so the typewriter cursor disappears.
        try
        {
            var settingsResponse = await _cardKit.SetCardSettingsAsync(
                token,
                new LarkCardKitSettingsRequest(
                    CardId: cardId,
                    SettingsJson: """{"streaming_mode": false}""",
                    Sequence: workingSequence,
                    IdempotencyKey: $"close-{cardId}-{workingSequence}"),
                ct);
            if (LarkProxyResponseParser.TryParseError(settingsResponse, out var settingsError))
                return ConversationCardFinalizeResult.Failed("card_close_streaming_failed", settingsError, finalTextWritten: finalTextWritten);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "CardKit close-streaming threw for card_id={CardId}, seq={Sequence}", cardId, workingSequence);
            return ConversationCardFinalizeResult.Failed("card_close_streaming_threw", ex.Message, finalTextWritten: finalTextWritten);
        }

        return ConversationCardFinalizeResult.Succeeded();
    }

    /// <summary>
    /// Pull the bot owner's NyxID user access token off the activity's TransportExtras;
    /// null when absent or whitespace.
    /// </summary>
    private static string? ResolveToken(ChatActivity activity)
    {
        var token = activity.TransportExtras?.NyxUserAccessToken?.Trim();
        return string.IsNullOrWhiteSpace(token) ? null : token;
    }

    /// <summary>
    /// Choose the im/v1 receive target: <c>chat_id</c> for group-like scopes,
    /// <c>union_id</c> (preferred) then <c>chat_id</c> for DMs; null when neither id
    /// is available.
    /// </summary>
    private static (string ReceiveIdType, string ReceiveId)? ResolveReceiveTarget(ChatActivity activity)
    {
        // Group / channel / thread: the relay-side chat_id is cross-app safe within the tenant.
        var chatId = activity.TransportExtras?.NyxLarkChatId?.Trim();
        var conversationScope = activity.Conversation?.Scope ?? ConversationScope.Unspecified;
        var isGroupLike = conversationScope is ConversationScope.Group
            or ConversationScope.Channel
            or ConversationScope.Thread;
        if (isGroupLike && !string.IsNullOrWhiteSpace(chatId))
            return ("chat_id", chatId);

        // Direct message: the chat_id is bot-specific and not cross-app safe; prefer union_id.
        var unionId = activity.TransportExtras?.NyxLarkUnionId?.Trim();
        if (!string.IsNullOrWhiteSpace(unionId))
            return ("union_id", unionId);

        // Fall back to chat_id for DMs only when union_id is unavailable. The relay populates
        // union_id whenever it can resolve it, so this branch generally does not fire.
        if (!string.IsNullOrWhiteSpace(chatId))
            return ("chat_id", chatId);

        return null;
    }

    /// <summary>
    /// Best-effort extract of <c>data.card_id</c> from the cardkit/v1/cards response.
    /// Returns null when the field is missing or malformed; the caller treats null as a
    /// terminal create failure.
    /// </summary>
    private static string? ExtractCardId(string response)
    {
        try
        {
            using var document = JsonDocument.Parse(response);
            if (document.RootElement.TryGetProperty("data", out var data) &&
                data.TryGetProperty("card_id", out var cardIdProp) &&
                cardIdProp.ValueKind == JsonValueKind.String)
            {
                return cardIdProp.GetString();
            }
        }
        catch (JsonException)
        {
            return null;
        }
        return null;
    }

    /// <summary>
    /// Map a Lark error string from a pre-send call into a <c>Failed</c> result,
    /// flagging rate-limit / table-limit / card-unavailable codes for retry policy.
    /// </summary>
    private static ConversationCardCreateResult ClassifyCreateFailure(string contextErrorCode, string larkError) =>
        ConversationCardCreateResult.Failed(
            errorCode: contextErrorCode,
            errorSummary: larkError,
            isRateLimited: ContainsLarkCode(larkError, 230020),
            isTableLimitExceeded: ContainsLarkCode(larkError, 11310),
            isCardUnavailable: ContainsLarkCode(larkError, 230099) || ContainsLarkCode(larkError, 230100));

    /// <summary>
    /// Same classification as <see cref="ClassifyCreateFailure"/> but threads the
    /// already-allocated <paramref name="cardId"/> / <paramref name="cardMessageId"/> through
    /// the result so the actor can persist the partial-card terminal record. Used for any
    /// failure that occurs after im/v1/messages has bound the card to the chat.
    /// </summary>
    private static ConversationCardCreateResult ClassifyPostSendFailure(
        string cardId,
        string cardMessageId,
        string contextErrorCode,
        string larkError) =>
        ConversationCardCreateResult.PostSendFailed(
            cardId: cardId,
            cardMessageId: cardMessageId,
            errorCode: contextErrorCode,
            errorSummary: larkError,
            isRateLimited: ContainsLarkCode(larkError, 230020),
            isTableLimitExceeded: ContainsLarkCode(larkError, 11310),
            isCardUnavailable: ContainsLarkCode(larkError, 230099) || ContainsLarkCode(larkError, 230100));

    /// <summary>
    /// Map a Lark error string from an interim stream call into a <c>Failed</c> result
    /// with the same rate-limit / table-limit / card-unavailable flags as the create path.
    /// </summary>
    private static ConversationCardStreamResult ClassifyStreamFailure(string larkError) =>
        ConversationCardStreamResult.Failed(
            errorCode: "card_stream_failed",
            errorSummary: larkError,
            isRateLimited: ContainsLarkCode(larkError, 230020),
            isTableLimitExceeded: ContainsLarkCode(larkError, 11310),
            isCardUnavailable: ContainsLarkCode(larkError, 230099) || ContainsLarkCode(larkError, 230100));

    /// <summary>
    /// Boundary-aware match against <see cref="LarkProxyResponseParser"/>'s
    /// output shape (<c>lark_code={n} ...</c>). The needle's trailing position must be
    /// the end of the string OR a non-digit; without the boundary check, looking for
    /// <c>lark_code=23002</c> would falsely match a string containing <c>lark_code=230020</c>.
    /// </summary>
    private static bool ContainsLarkCode(string error, int code)
    {
        if (string.IsNullOrEmpty(error))
            return false;
        var needle = $"lark_code={code}";
        var index = 0;
        while (index <= error.Length - needle.Length)
        {
            var found = error.IndexOf(needle, index, StringComparison.Ordinal);
            if (found < 0)
                return false;
            var endIndex = found + needle.Length;
            if (endIndex == error.Length || !char.IsDigit(error[endIndex]))
                return true;
            index = endIndex;
        }
        return false;
    }
}
diff --git a/agents/Aevatar.GAgents.NyxidChat/ChannelConversationTurnRunner.cs b/agents/Aevatar.GAgents.NyxidChat/ChannelConversationTurnRunner.cs
index 96ed5aa87..6491ead5d 100644
--- a/agents/Aevatar.GAgents.NyxidChat/ChannelConversationTurnRunner.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/ChannelConversationTurnRunner.cs
@@ -1,3 +1,4 @@
+using System.Net.Http;
using System.Text.Json;
using Aevatar.AI.Abstractions.LLMProviders;
using Aevatar.AI.Abstractions.ToolProviders;
@@ -7,6 +8,7 @@
using Aevatar.GAgents.Authoring.Lark;
using Aevatar.GAgents.Channel.Abstractions;
using Aevatar.GAgents.Channel.Abstractions.Slash;
+using Aevatar.GAgents.Channel.Identity;
using Aevatar.GAgents.Channel.Identity.Abstractions;
using Aevatar.GAgents.Channel.Identity.Slash;
using Aevatar.GAgents.Channel.NyxIdRelay;
@@ -24,6 +26,8 @@ namespace Aevatar.GAgents.NyxidChat;
public sealed class ChannelConversationTurnRunner : IConversationTurnRunner
{
    // Sender's resolved NyxID binding: the projection's binding-id plus the external
    // subject it was resolved from, kept together so downstream steps (e.g. issuing a
    // short-lived sender token) do not need a second projection lookup.
    private sealed record ResolvedSenderBinding(string BindingId, ExternalSubjectRef Subject);
+
private readonly IServiceProvider _toolServiceProvider;
private readonly IChannelBotRegistrationQueryPort _registrationQueryPort;
private readonly IChannelBotRegistrationQueryByNyxIdentityPort? _registrationQueryByNyxIdentityPort;
@@ -95,10 +99,10 @@ public async Task RunInboundAsync(
return ConversationTurnResult.PermanentFailure("registration_not_found", "Channel registration not found.");
// Capture the typing-reaction Task instead of `_ =`-discarding it. The direct-reply
- // AgentBuilder path can complete fast enough that the swap fires before Lark has
- // persisted the typing reaction; the swap GET would then find nothing to delete and
- // leave both Typing + DONE on the message. Threading the task to the swap site lets
- // the swap await-with-timeout the typing POST first. The deferred-LLM and streaming
+ // AgentBuilder path can complete fast enough that the clear fires before Lark has
+ // persisted the typing reaction; the clear GET would then find nothing to delete and
+ // leave Typing on the message. Threading the task to the clear site lets the clear
+ // await-with-timeout the typing POST first. The deferred-LLM and streaming
// paths don't get this task (different invocation), but their natural latency is
// orders of magnitude greater than the typing POST so the race cannot fire.
var typingReactionTask = TrySendImmediateLarkReactionAsync(activity, registration, ct);
@@ -113,19 +117,13 @@ public async Task RunInboundAsync(
if (await TryHandleSlashCommandAsync(activity, inbound, registration, runtimeContext, ct) is { } slashResult)
return slashResult;
- // Pre-LLM binding gate: when broker mode is wired, an unbound sender
- // MUST be prompted to bind NyxID rather than served by the bot owner's
- // credentials (codex L65 security: ADR-0018 §Decision "未绑定 sender
- // 一律强制绑定,不回落到 bot owner"). Falls through transparently
- // when identity ports are not registered (legacy bot-owner-shared
- // deployments). The gate also returns the resolved binding-id so the
- // LLM dispatch can apply the sender prefs override chain (issue #513
- // phase 3) without paying for a second projection lookup.
- var (bindingGateResult, senderBindingId) = await TryEnforceBindingGateAsync(activity, inbound, registration, runtimeContext, ct).ConfigureAwait(false);
- if (bindingGateResult is not null)
- return bindingGateResult;
-
- if (await TryHandleLlmSelectionCardActionAsync(activity, inbound, registration, runtimeContext, senderBindingId, ct).ConfigureAwait(false) is { } llmSelectionResult)
+ // Normal LLM messages do not force /init. If the sender is bound we
+ // carry that binding forward so the reply generator can try the
+ // sender's own NyxID LLM prefs first; otherwise the run actor/generator
+ // will use the bot owner's ambient LLM config.
+ var senderBinding = await TryResolveSenderBindingAsync(inbound, registration, ct).ConfigureAwait(false);
+
+ if (await TryHandleLlmSelectionCardActionAsync(activity, inbound, registration, runtimeContext, senderBinding?.BindingId, ct).ConfigureAwait(false) is { } llmSelectionResult)
return llmSelectionResult;
var inboundEvent = ToInboundEvent(activity, registration, inbound, ResolveUserAccessToken(activity));
@@ -157,7 +155,7 @@ public async Task RunInboundAsync(
}
return ConversationTurnResult.LlmReplyRequested(
- await BuildLlmReplyRequestAsync(activity, registration, inboundEvent, runtimeContext, senderBindingId, ct).ConfigureAwait(false));
+ await BuildLlmReplyRequestAsync(activity, registration, inboundEvent, runtimeContext, senderBinding, ct).ConfigureAwait(false));
}
public Task RunInboundAsync(ChatActivity activity, CancellationToken ct) =>
@@ -165,16 +163,16 @@ public Task RunInboundAsync(ChatActivity activity, Cance
// ─── Slash command dispatch ───
//
- // ADR-0018 §Decision: when per-user binding is enabled, slash commands
- // (/init, /unbind, /whoami, /model, ...) are routed before the LLM so the
- // bot owner's bot-shared mode is bypassed for unbound senders. Handlers
+ // Slash commands (/init, /unbind, /whoami, /model, ...) are routed before
+ // the LLM so binding/configuration commands can own their per-user
+ // semantics without being swallowed by the chat model. Handlers
// are discovered as IEnumerable from DI;
// identity ports are constructor-injected as optional capabilities so
// deployments that have not enabled binding fall through to the legacy
// flow. Phase 6 (issue #513):
// each handler declares RequiresBinding so unbound senders trying to use
- // a binding-only command (e.g. /model use) get the same hint as the LLM-
- // turn binding gate instead of a stack trace.
+ // a binding-only command (e.g. /model use) get a binding hint instead of
+ // a stack trace; normal LLM turns still have owner fallback.
private async Task TryHandleSlashCommandAsync(
ChatActivity activity,
InboundMessage inbound,
@@ -435,59 +433,73 @@ private static bool TryResolveExternalSubject(
return true;
}
- // Pre-LLM binding gate: when identity is wired, refuse to serve unbound
- // senders with the bot owner's credentials (ADR-0018 §Decision). Returns
- // (null, null) when binding is not enabled (legacy mode); returns
- // (prompt, null) for unbound senders so the caller short-circuits with
- // a binding prompt/card; returns (null, bindingId) for bound senders so the LLM
- // dispatch can carry the binding-id forward into metadata for the issue
- // #513 phase 3 prefs override chain.
- private async Task<(ConversationTurnResult? Blocking, string? SenderBindingId)> TryEnforceBindingGateAsync(
- ChatActivity activity,
+ // Normal LLM messages are allowed to use the bot owner's LLM config when
+ // the sender has no NyxID binding. Binding is only required by commands
+ // that configure or inspect per-user state (/models, /model use, ...).
+ private async Task TryResolveSenderBindingAsync(
InboundMessage inbound,
ChannelBotRegistrationEntry registration,
- ConversationTurnRuntimeContext runtimeContext,
CancellationToken ct)
{
var queryPort = _identityBindingQueryPort;
if (queryPort is null)
- return (null, null);
-
- if (string.IsNullOrWhiteSpace(inbound.SenderId) || string.IsNullOrWhiteSpace(inbound.Platform))
- return (null, null);
-
- var tenant = ResolveTenant(inbound, registration);
- if (tenant is null)
- return (null, null);
+ return null;
- var subject = new ExternalSubjectRef
- {
- Platform = inbound.Platform.Trim().ToLowerInvariant(),
- Tenant = tenant,
- ExternalUserId = inbound.SenderId.Trim(),
- };
+ if (!TryResolveExternalSubject(inbound, registration, out var subject))
+ return null;
BindingId? existing;
try
{
existing = await queryPort.ResolveAsync(subject, ct);
}
+ catch (OperationCanceledException)
+ {
+ throw;
+ }
+ catch (Exception ex) when (IsTransientBindingLookupFailure(ex))
+ {
+ // Transient infra failures (DB blip, transient HTTP, JSON shape mismatch from
+ // upstream): degrade to owner credentials and keep the conversation alive.
+ _logger.LogWarning(
+ ex,
+ "Transient sender NyxID binding lookup failure; falling back to bot owner LLM config. subject={Platform}:{Tenant}:{User}",
+ subject.Platform,
+ subject.Tenant,
+ subject.ExternalUserId);
+ return null;
+ }
catch (Exception ex)
{
- // Resolve failure should fail closed (refuse to serve with
- // bot-owner credentials) rather than fail open. Log and treat as
- // unbound.
- _logger.LogError(ex, "Binding gate resolve failed for sender {Sender}; treating as unbound", inbound.SenderId);
- existing = null;
+ // Non-transient (programmer error, unexpected NRE, serialization break): surface
+ // at Error level so ops can distinguish from "sender just isn't bound" — but still
+ // fall through to owner credentials so the user gets a reply rather than nothing.
+ _logger.LogError(
+ ex,
+ "Sender NyxID binding lookup raised non-transient exception; falling back to bot owner LLM config. subject={Platform}:{Tenant}:{User}",
+ subject.Platform,
+ subject.Tenant,
+ subject.ExternalUserId);
+ return null;
}
if (existing is not null)
- return (null, existing.Value); // bound — continue with sender binding-id
+ return new ResolvedSenderBinding(existing.Value, subject.Clone());
- var prompt = await SendBindingPromptAsync(activity, inbound, registration, runtimeContext, ct).ConfigureAwait(false);
- return (prompt, null);
+ return null;
}
+ ///
+ /// Distinguish infra-shaped binding lookup failures (worth a Warning + owner fallback)
+ /// from logic/programmer errors (worth an Error log so ops sees them).
+ ///
+ private static bool IsTransientBindingLookupFailure(Exception ex) =>
+ ex is HttpRequestException
+ or TimeoutException
+ or TaskCanceledException
+ or System.Text.Json.JsonException
+ or System.IO.IOException;
+
// Lark-aware private-chat detection. Other platforms map their direct-
// message chat-type strings here as the runner gains support for them.
private static bool IsPrivateChat(InboundMessage inbound)
@@ -610,8 +622,8 @@ private async Task ExecuteLlmSelectionCardActionAsync(
await selectionService.SetByServiceAsync(selectionContext, value.Trim(), modelOverride: null, ct)
.ConfigureAwait(false);
var updated = await optionsService.GetOptionsAsync(query, ct).ConfigureAwait(false);
- var picked = updated.Available.FirstOrDefault(option =>
- string.Equals(option.ServiceId, value.Trim(), StringComparison.OrdinalIgnoreCase)) ?? updated.Current;
+ var picked = updated.Current ?? updated.Available.FirstOrDefault(option =>
+ string.Equals(option.ServiceId, value.Trim(), StringComparison.OrdinalIgnoreCase));
return picked is null
? new MessageContent { Text = "已切换 LLM service。下一条消息会用新的设置回复。" }
: renderer.RenderSelectionConfirm(picked, picked.DefaultModel);
@@ -730,10 +742,10 @@ public async Task RunLlmReplyAsync(
var inbound = ToInboundMessage(reply.Activity);
// Direct path requires registration to actually send the reply; relay path only wants it
- // for the post-reply reaction swap (relay sends use the reply token, not registration).
+ // for the post-reply reaction clear (relay sends use the reply token, not registration).
// So lookup is mandatory on the direct path and best-effort on the relay path — a
// transient registration-store error on the relay path must not drop an otherwise valid
- // reply, only degrade the swap to a no-op for that turn.
+ // reply, only degrade the clear to a no-op for that turn.
ChannelBotRegistrationEntry? registration;
if (HasRelayDelivery(inbound))
{
@@ -749,7 +761,7 @@ public async Task RunLlmReplyAsync(
{
_logger.LogWarning(
ex,
- "Registration lookup failed on relay reply path; reply will proceed but post-reply reaction swap will be skipped. correlation={CorrelationId}",
+ "Registration lookup failed on relay reply path; reply will proceed but post-reply reaction clear will be skipped. correlation={CorrelationId}",
reply.CorrelationId);
registration = null;
}
@@ -777,7 +789,7 @@ public async Task RunLlmReplyAsync(
runtimeContext,
ct);
if (result.Success)
- _ = TrySwapTypingReactionToDoneAsync(inbound, registration, ct);
+ _ = TryClearTypingReactionAsync(inbound, registration, ct);
return result;
}
@@ -829,9 +841,9 @@ public async Task RunContinueAsync(
public async Task OnReplyDeliveredAsync(ChatActivity activity, CancellationToken ct)
{
// Streaming-completion path in ConversationGAgent calls this hook because it finalizes
- // the reply without going through RunLlmReplyAsync (which is where the non-streaming swap
- // lives). For non-Lark platforms or activities missing the platform message id, the swap
- // helper short-circuits in ShouldSwapTypingReaction.
+ // the reply without going through RunLlmReplyAsync (which is where the non-streaming clear
+ // lives). For non-Lark platforms or activities missing the platform message id, the clear
+ // helper short-circuits in ShouldClearTypingReaction.
if (activity is null)
return;
@@ -840,7 +852,7 @@ public async Task OnReplyDeliveredAsync(ChatActivity activity, CancellationToken
return;
var inbound = ToInboundMessage(activity);
- await TrySwapTypingReactionToDoneAsync(inbound, registration, ct);
+ await TryClearTypingReactionAsync(inbound, registration, ct);
}
public async Task RunStreamChunkAsync(
@@ -978,7 +990,7 @@ public async Task RunStreamChunkAsync(
runtimeContext,
ct);
if (result.Success)
- _ = AwaitTypingReactionThenSwapAsync(typingReactionTask, inbound, registration, ct);
+ _ = AwaitTypingReactionThenClearAsync(typingReactionTask, inbound, registration, ct);
return result.Success
? ConversationTurnResult.Sent(
sentActivityId: $"direct-reply:{activity.Id}",
@@ -1485,7 +1497,7 @@ private async Task BuildLlmReplyRequestAsync(
ChannelBotRegistrationEntry registration,
ChannelInboundEvent inboundEvent,
ConversationTurnRuntimeContext runtimeContext,
- string? senderBindingId,
+ ResolvedSenderBinding? senderBinding,
CancellationToken ct)
{
var request = new NeedsLlmReplyEvent
@@ -1497,9 +1509,9 @@ private async Task BuildLlmReplyRequestAsync(
RequestedAtUnixMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
};
- // Carry the relay reply credential through the inbox as transient inbox-only
+ // Carry the relay reply credential through the run command as transient command-only
// fields. ConversationGAgent strips these before persisting NeedsLlmReplyEvent;
- // ChannelLlmReplyInboxRuntime echoes them into the LlmReplyReadyEvent so the
+ // AgentRunGAgent echoes them into the LlmReplyReadyEvent so the
// outbound reply does not depend on the actor's in-memory token dict surviving
// deactivation.
if (runtimeContext.NyxRelayReplyToken is { } token &&
@@ -1512,15 +1524,57 @@ private async Task BuildLlmReplyRequestAsync(
foreach (var pair in await BuildReplyMetadataAsync(inboundEvent, activity, ct))
request.Metadata[pair.Key] = pair.Value;
- // Issue #513 phase 3: tag the request with the sender's binding-id so
- // the downstream reply generator can apply the prefs override chain
- // (sender → bot owner → provider default).
- if (!string.IsNullOrWhiteSpace(senderBindingId))
- request.Metadata[LLMRequestMetadataKeys.SenderBindingId] = senderBindingId;
+ // Tag the request with the sender's binding-id and a short-lived token
+ // so the downstream reply generator can try the sender's own LLM
+ // route first. Missing token/binding is not an error: the generator
+ // falls back to the bot owner's upstream-pinned LLM config.
+ if (senderBinding is not null)
+ {
+ request.Metadata[LLMRequestMetadataKeys.SenderBindingId] = senderBinding.BindingId;
+ var senderAccessToken = await TryIssueSenderLlmAccessTokenAsync(senderBinding.Subject, ct).ConfigureAwait(false);
+ if (!string.IsNullOrWhiteSpace(senderAccessToken))
+ request.Metadata[LLMRequestMetadataKeys.SenderNyxIdAccessToken] = senderAccessToken;
+ }
return request;
}
+ private async Task TryIssueSenderLlmAccessTokenAsync(
+ ExternalSubjectRef subject,
+ CancellationToken ct)
+ {
+ var broker = _capabilityBroker;
+ if (broker is null)
+ return null;
+
+ try
+ {
+ var handle = await broker
+ .IssueShortLivedAsync(
+ subject,
+ new CapabilityScope { Value = AevatarOAuthClientScopes.Proxy },
+ ct)
+ .ConfigureAwait(false);
+ return string.IsNullOrWhiteSpace(handle.AccessToken)
+ ? null
+ : handle.AccessToken.Trim();
+ }
+ catch (OperationCanceledException)
+ {
+ throw;
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(
+ ex,
+ "Failed to issue sender NyxID LLM token; falling back to bot owner LLM config. subject={Platform}:{Tenant}:{User}",
+ subject.Platform,
+ subject.Tenant,
+ subject.ExternalUserId);
+ return null;
+ }
+ }
+
private static string ResolveRoutingConversationId(ConversationReference? conversation)
{
if (conversation is null)
@@ -1629,10 +1683,10 @@ activity.OutboundDelivery is
string.Equals(NormalizeOptional(activity.Bot?.Value), nyxAgentApiKeyId, StringComparison.Ordinal);
// Lark reaction emoji_type for "hands typing on keyboard" — added immediately on inbound
- // so the user sees the bot is working before the LLM reply lands. Swapped to DoneReactionEmojiType
- // after the reply succeeds so the same message ends up with a single completion reaction.
+ // so the user sees the bot is working before the LLM reply lands. After a reply succeeds,
+ // the reaction is cleared instead of replaced with DONE because DONE reads as task completion,
+ // while a chat reply can be an intermediate progress update.
private const string TypingReactionEmojiType = "Typing";
- private const string DoneReactionEmojiType = "DONE";
private async Task TrySendImmediateLarkReactionAsync(
ChatActivity activity,
@@ -1698,14 +1752,12 @@ private async Task TrySendImmediateLarkReactionAsync(
}
// Direct-reply paths (TryHandleAgentBuilderAsync) can complete a slash-command reply faster
- // than the typing POST takes to land in Lark, leaving the swap GET to find no Typing reaction
- // to delete and the orphaned typing reaction to materialize after DONE was already added —
- // both reactions on the same message. Awaiting (with a short cap) the typing task before the
- // GET closes that race. The cap protects against a hung POST stalling the swap forever; if it
- // expires the swap still proceeds — Lark will at worst end up with both reactions, same as
- // before this guard. The deferred-LLM and streaming paths skip this guard because their reply
- // latency dwarfs the typing POST and so cannot race.
- private async Task AwaitTypingReactionThenSwapAsync(
+ // than the typing POST takes to land in Lark, leaving the clear GET to find no Typing reaction
+ // to delete and the orphaned typing reaction to materialize after the clear already ran.
+ // Awaiting (with a short cap) the typing task before the GET closes that race. The cap protects
+ // against a hung POST stalling the clear forever. The deferred-LLM and streaming paths skip this
+ // guard because their reply latency dwarfs the typing POST and so cannot race.
+ private async Task AwaitTypingReactionThenClearAsync(
Task typingReactionTask,
InboundMessage inbound,
ChannelBotRegistrationEntry registration,
@@ -1722,24 +1774,23 @@ private async Task AwaitTypingReactionThenSwapAsync(
catch (TimeoutException)
{
_logger.LogDebug(
- "Lark typing reaction task did not complete within timeout before swap; proceeding anyway");
+ "Lark typing reaction task did not complete within timeout before clear; proceeding anyway");
}
catch (Exception)
{
- // The typing task already logged its own exception — proceed with the swap so the
- // user-visible message still ends up with a DONE reaction whenever possible.
+ // The typing task already logged its own exception — proceed with the clear so any
+ // already-visible Typing reaction is still removed whenever possible.
}
- await TrySwapTypingReactionToDoneAsync(inbound, registration, ct);
+ await TryClearTypingReactionAsync(inbound, registration, ct);
}
- // After a successful reply, replace the bot's "Typing" reaction with a "DONE" reaction so the
- // same message ends with a single completion marker. Uses list-based discovery (filter by
+ // After a successful reply, remove the bot's "Typing" reaction. Uses list-based discovery (filter by
// emoji_type=Typing AND operator_type=app) instead of caching the immediate reaction's
// reaction_id locally — the runner is a singleton and cross-turn state on it would violate the
// "中间层进程内缓存作为事实源" rule. Filtering on operator_type=app avoids deleting any user
// who happened to add the same Typing reaction.
- private async Task TrySwapTypingReactionToDoneAsync(
+ private async Task TryClearTypingReactionAsync(
InboundMessage inbound,
ChannelBotRegistrationEntry? registration,
CancellationToken ct)
@@ -1747,7 +1798,7 @@ private async Task TrySwapTypingReactionToDoneAsync(
if (registration is null)
return;
- if (!ShouldSwapTypingReaction(inbound, registration, out var accessToken, out var providerSlug, out var platformMessageId))
+ if (!ShouldClearTypingReaction(inbound, registration, out var accessToken, out var providerSlug, out var platformMessageId))
return;
try
@@ -1755,7 +1806,7 @@ private async Task TrySwapTypingReactionToDoneAsync(
var reactionIds = new List();
string? pageToken = null;
// Bound the iteration so a misbehaving Lark response (e.g. always-true `has_more`)
- // can't loop the swap forever. 10 pages × 50 per page = 500 Typing reactions on a
+ // can't loop the clear forever. 10 pages × 50 per page = 500 Typing reactions on a
// single message — orders of magnitude more than realistic, since this list is
// already scoped to one emoji_type and the bot only adds Typing once per inbound.
const int MaxListPages = 10;
@@ -1777,7 +1828,7 @@ private async Task TrySwapTypingReactionToDoneAsync(
if (LarkProxyResponse.TryGetError(listResponse, out var listCode, out var listDetail))
{
_logger.LogDebug(
- "Lark typing reaction list failed; skipping swap: provider={ProviderSlug}, message={MessageId}, page={Page}, larkCode={LarkCode}, detail={Detail}",
+ "Lark typing reaction list failed; skipping clear: provider={ProviderSlug}, message={MessageId}, page={Page}, larkCode={LarkCode}, detail={Detail}",
providerSlug,
platformMessageId,
page,
@@ -1835,35 +1886,6 @@ private async Task TrySwapTypingReactionToDoneAsync(
}
}
- var addResponse = await _nyxClient.ProxyRequestAsync(
- accessToken!,
- providerSlug!,
- $"/open-apis/im/v1/messages/{Uri.EscapeDataString(platformMessageId!)}/reactions",
- "POST",
- $$$"""{"reaction_type":{"emoji_type":"{{{DoneReactionEmojiType}}}"}}""",
- null,
- ct);
-
- if (LarkProxyResponse.TryGetError(addResponse, out var addCode, out var addDetail))
- {
- if (addCode == LarkBotErrorCodes.NoPermissionToReact)
- {
- _logger.LogDebug(
- "Lark done reaction skipped (missing reaction scope): provider={ProviderSlug}, message={MessageId}, detail={Detail}",
- providerSlug,
- platformMessageId,
- addDetail);
- }
- else
- {
- _logger.LogWarning(
- "Lark done reaction failed: provider={ProviderSlug}, message={MessageId}, larkCode={LarkCode}, detail={Detail}",
- providerSlug,
- platformMessageId,
- addCode,
- addDetail);
- }
- }
}
catch (OperationCanceledException) when (ct.IsCancellationRequested)
{
@@ -1873,7 +1895,7 @@ private async Task TrySwapTypingReactionToDoneAsync(
{
_logger.LogWarning(
ex,
- "Lark typing→done reaction swap threw: provider={ProviderSlug}, message={MessageId}",
+ "Lark typing reaction clear threw: provider={ProviderSlug}, message={MessageId}",
providerSlug,
platformMessageId);
}
@@ -1930,7 +1952,7 @@ private static (List AppReactionIds, string? NextPageToken) ExtractAppRe
continue;
// Only delete reactions added by the bot itself (operator_type=app); leave any
- // user-added Typing reactions alone so the swap doesn't accidentally erase them.
+ // user-added Typing reactions alone so the clear doesn't accidentally erase them.
if (!item.TryGetProperty("operator", out var operatorProp) ||
operatorProp.ValueKind != JsonValueKind.Object)
{
@@ -1958,7 +1980,7 @@ private static (List AppReactionIds, string? NextPageToken) ExtractAppRe
return (ids, nextPageToken);
}
- private static bool ShouldSwapTypingReaction(
+ private static bool ShouldClearTypingReaction(
InboundMessage inbound,
ChannelBotRegistrationEntry registration,
out string? accessToken,
diff --git a/agents/Aevatar.GAgents.NyxidChat/ChannelLlmReplyInboxRuntime.cs b/agents/Aevatar.GAgents.NyxidChat/ChannelLlmReplyInboxRuntime.cs
deleted file mode 100644
index 493161e01..000000000
--- a/agents/Aevatar.GAgents.NyxidChat/ChannelLlmReplyInboxRuntime.cs
+++ /dev/null
@@ -1,443 +0,0 @@
-using Aevatar.Foundation.Abstractions;
-using Aevatar.Foundation.Abstractions.Streaming;
-using Aevatar.AI.Abstractions.LLMProviders;
-using Aevatar.GAgents.Channel.Abstractions;
-using Aevatar.GAgents.Channel.Runtime;
-using Aevatar.GAgents.Channel.NyxIdRelay;
-using Aevatar.GAgents.NyxidChat;
-using Aevatar.Studio.Application.Studio.Abstractions;
-using Google.Protobuf.WellKnownTypes;
-using Microsoft.Extensions.Hosting;
-using Microsoft.Extensions.Logging;
-
-namespace Aevatar.GAgents.NyxidChat;
-
-public sealed class ChannelLlmReplyInboxRuntime :
- IHostedService,
- IAsyncDisposable,
- IChannelLlmReplyInbox
-{
- internal const string InboxStreamId = "channel-runtime:llm-reply:inbox";
-
- private readonly IStreamProvider _streamProvider;
- private readonly IActorRuntime _actorRuntime;
- private readonly IActorDispatchPort _actorDispatchPort;
- private readonly IConversationReplyGenerator _replyGenerator;
- private readonly IInteractiveReplyCollector? _interactiveReplyCollector;
- private readonly Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? _relayOptions;
- private readonly INyxIdRelayScopeResolver? _scopeResolver;
- private readonly IUserConfigQueryPort? _userConfigQueryPort;
- private readonly TimeProvider _timeProvider;
- private readonly ILogger _logger;
- private IAsyncDisposable? _subscription;
-
- public ChannelLlmReplyInboxRuntime(
- IStreamProvider streamProvider,
- IActorRuntime actorRuntime,
- IConversationReplyGenerator replyGenerator,
- IInteractiveReplyCollector? interactiveReplyCollector,
- Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? relayOptions,
- ILogger logger,
- INyxIdRelayScopeResolver? scopeResolver = null,
- IUserConfigQueryPort? userConfigQueryPort = null,
- TimeProvider? timeProvider = null,
- IActorDispatchPort? actorDispatchPort = null)
- {
- _streamProvider = streamProvider ?? throw new ArgumentNullException(nameof(streamProvider));
- _actorRuntime = actorRuntime ?? throw new ArgumentNullException(nameof(actorRuntime));
- _actorDispatchPort = actorDispatchPort
- ?? actorRuntime as IActorDispatchPort
- ?? throw new ArgumentNullException(nameof(actorDispatchPort));
- _replyGenerator = replyGenerator ?? throw new ArgumentNullException(nameof(replyGenerator));
- _interactiveReplyCollector = interactiveReplyCollector;
- _relayOptions = relayOptions;
- _scopeResolver = scopeResolver;
- _userConfigQueryPort = userConfigQueryPort;
- _timeProvider = timeProvider ?? TimeProvider.System;
- _logger = logger ?? throw new ArgumentNullException(nameof(logger));
- }
-
- public async Task StartAsync(CancellationToken ct)
- {
- if (_subscription is not null)
- return;
-
- _subscription = await _streamProvider
- .GetStream(InboxStreamId)
- .SubscribeAsync(ProcessAsync, ct);
-
- _logger.LogInformation("Started channel LLM reply inbox on {StreamId}", InboxStreamId);
- }
-
- public async Task StopAsync(CancellationToken ct)
- {
- if (_subscription is null)
- return;
-
- await _subscription.DisposeAsync();
- _subscription = null;
- _logger.LogInformation("Stopped channel LLM reply inbox on {StreamId}", InboxStreamId);
- }
-
- public Task EnqueueAsync(NeedsLlmReplyEvent request, CancellationToken ct)
- {
- ArgumentNullException.ThrowIfNull(request);
- return _streamProvider.GetStream(InboxStreamId).ProduceAsync(request, ct);
- }
-
- public async ValueTask DisposeAsync()
- {
- await StopAsync(CancellationToken.None);
- }
-
- internal const long MaxInboxRequestAgeMs = 5 * 60 * 1000;
-
- internal async Task ProcessAsync(NeedsLlmReplyEvent request)
- {
- ArgumentNullException.ThrowIfNull(request);
-
- _logger.LogInformation(
- "Processing LLM reply request: correlation={CorrelationId} target={TargetActorId}",
- request.CorrelationId,
- request.TargetActorId);
-
- if (request.Activity is null || string.IsNullOrWhiteSpace(request.TargetActorId))
- {
- _logger.LogWarning(
- "Dropping malformed deferred LLM reply request: correlation={CorrelationId}, target={TargetActorId}",
- request.CorrelationId,
- request.TargetActorId);
- await NotifyActorOfDropAsync(request, "malformed_deferred_llm_reply_request");
- return;
- }
-
- // Stale gate: NyxID relay reply tokens have a ~30 min TTL and the user access
- // token used for the LLM call expires inside ~15 min. A request that has been
- // sitting in the stream for hours can't lead to a successful reply, so drop it
- // here instead of spending an LLM round just to fail at the outbound stage.
- var nowMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds();
- if (request.RequestedAtUnixMs > 0 && nowMs - request.RequestedAtUnixMs > MaxInboxRequestAgeMs)
- {
- _logger.LogInformation(
- "Dropping stale LLM reply request: correlation={CorrelationId} ageMs={AgeMs}",
- request.CorrelationId,
- nowMs - request.RequestedAtUnixMs);
- await NotifyActorOfDropAsync(request, "stale_inbox_request_dropped");
- return;
- }
-
- // Relay credential gate: relay turns require a fresh reply_token to send the
- // outbound. A relay request with no inbox-carried token (e.g., rehydrated from
- // persisted state after a pod restart that lost the original capture) cannot
- // be delivered, so skip the LLM call entirely.
- if (IsRelayRequest(request) && string.IsNullOrWhiteSpace(request.ReplyToken))
- {
- _logger.LogWarning(
- "Dropping relay LLM reply request without inbox-carried reply_token: correlation={CorrelationId}",
- request.CorrelationId);
- await NotifyActorOfDropAsync(request, "missing_relay_reply_token");
- return;
- }
-
- var actor = await _actorRuntime.GetAsync(request.TargetActorId)
- ?? await _actorRuntime.CreateAsync(request.TargetActorId, CancellationToken.None);
-
- string replyText;
- MessageContent? outboundIntent = null;
- var terminalState = LlmReplyTerminalState.Completed;
- var errorCode = string.Empty;
- var errorSummary = string.Empty;
- using TurnStreamingReplySink? streamingSink = TryBuildStreamingSink(request, request.TargetActorId);
-
- try
- {
- var effectiveMetadata = await BuildEffectiveMetadataAsync(request, CancellationToken.None);
- IDisposable? interactiveReplyScope = null;
- try
- {
- if (ShouldCaptureInteractiveReply(request.Activity))
- interactiveReplyScope = _interactiveReplyCollector?.BeginScope();
-
- replyText = await _replyGenerator.GenerateReplyAsync(
- request.Activity,
- effectiveMetadata,
- streamingSink,
- CancellationToken.None) ?? string.Empty;
- outboundIntent = _interactiveReplyCollector?.TryTake();
- }
- finally
- {
- interactiveReplyScope?.Dispose();
- }
-
- if (streamingSink is not null &&
- outboundIntent is null &&
- !string.IsNullOrWhiteSpace(replyText))
- {
- await streamingSink.FinalizeAsync(replyText, CancellationToken.None);
- }
-
- if (outboundIntent is null && string.IsNullOrWhiteSpace(replyText))
- {
- terminalState = LlmReplyTerminalState.Failed;
- errorCode = "empty_reply";
- errorSummary = "Reply generator returned an empty response.";
- replyText = "Sorry, I wasn't able to generate a response. Please try again.";
- }
- }
- catch (Exception ex)
- {
- terminalState = LlmReplyTerminalState.Failed;
- errorCode = "llm_reply_failed";
- errorSummary = ex.Message;
- replyText = NyxIdRelayErrorClassifier.Classify(ex.Message);
- _logger.LogWarning(
- ex,
- "Deferred LLM reply generation failed: correlation={CorrelationId}",
- request.CorrelationId);
- }
-
- var ready = new LlmReplyReadyEvent
- {
- CorrelationId = request.CorrelationId,
- RegistrationId = request.RegistrationId,
- SourceActorId = InboxStreamId,
- Activity = request.Activity.Clone(),
- Outbound = outboundIntent?.Clone() ?? new MessageContent { Text = replyText },
- TerminalState = terminalState,
- ErrorCode = errorCode,
- ErrorSummary = errorSummary,
- ReadyAtUnixMs = _timeProvider.GetUtcNow().ToUnixTimeMilliseconds(),
- // Echo the inbox-only relay credential straight back so ConversationGAgent's
- // outbound reply does not depend on its in-memory token dict still having the
- // entry. The actor consumes these fields and never persists them.
- ReplyToken = request.ReplyToken ?? string.Empty,
- ReplyTokenExpiresAtUnixMs = request.ReplyTokenExpiresAtUnixMs,
- };
- var envelope = new EventEnvelope
- {
- Id = Guid.NewGuid().ToString("N"),
- Timestamp = Timestamp.FromDateTimeOffset(_timeProvider.GetUtcNow()),
- Payload = Any.Pack(ready),
- Route = EnvelopeRouteSemantics.CreateDirect(InboxStreamId, request.TargetActorId),
- };
-
- await _actorDispatchPort.DispatchAsync(request.TargetActorId, envelope, CancellationToken.None);
- }
-
- private TurnStreamingReplySink? TryBuildStreamingSink(NeedsLlmReplyEvent request, string targetActorId)
- {
- if (_relayOptions is not { StreamingRepliesEnabled: true })
- return null;
- if (request.Activity?.OutboundDelivery is not
- {
- ReplyMessageId.Length: > 0,
- CorrelationId.Length: > 0,
- })
- {
- return null;
- }
- if (string.IsNullOrWhiteSpace(request.CorrelationId))
- return null;
-
- var throttle = TimeSpan.FromMilliseconds(Math.Max(0, _relayOptions.StreamingFlushIntervalMs));
- return new TurnStreamingReplySink(
- _actorDispatchPort,
- targetActorId,
- request.CorrelationId,
- request.RegistrationId,
- request.Activity.Clone(),
- throttle,
- _timeProvider,
- _logger);
- }
-
- private async Task> BuildEffectiveMetadataAsync(
- NeedsLlmReplyEvent request,
- CancellationToken ct)
- {
- var metadata = new Dictionary(request.Metadata, StringComparer.Ordinal);
-
- // Apply the bot owner's pre-configured LLM route + model. The relay callback
- // identifies the bot by api_key_id (in activity.Bot.Value); we resolve that to
- // the owner's Aevatar scope id and load the same UserConfig the owner uses
- // when chatting through nyxid-chat themselves, then pin ModelOverride /
- // NyxIdRoutePreference / MaxToolRoundsOverride from that configuration.
- await ApplyBotOwnerLlmConfigAsync(request, metadata, ct);
-
- // The inbound callback's X-NyxID-User-Token is the bot owner's NyxID session
- // JWT (freshly issued by NyxID for each callback). It is the bot owner's own
- // credential for LLM calls — the same thing that would authorize them in
- // nyxid-chat. The short TTL (~15 min) is mitigated by the direct-enqueue
- // dispatch (#380), the inbox-echoed token flow (#383), and the stale pending
- // request GC, so the token is still valid when the LLM call actually fires
- // for any non-stale request. If the downstream provider rejects it, the
- // classifier surfaces a real user-facing error via NyxIdRelayErrorClassifier.
- var userAccessToken = request.Activity?.TransportExtras?.NyxUserAccessToken?.Trim();
- if (!string.IsNullOrWhiteSpace(userAccessToken))
- {
- metadata[LLMRequestMetadataKeys.NyxIdAccessToken] = userAccessToken;
- metadata[LLMRequestMetadataKeys.NyxIdOrgToken] = userAccessToken;
- }
-
- return metadata;
- }
-
- private async Task ApplyBotOwnerLlmConfigAsync(
- NeedsLlmReplyEvent request,
- IDictionary metadata,
- CancellationToken ct)
- {
- if (_scopeResolver is null || _userConfigQueryPort is null)
- return;
-
- var apiKeyId = request.Activity?.Bot?.Value?.Trim();
- if (string.IsNullOrWhiteSpace(apiKeyId))
- return;
-
- string? scopeId;
- try
- {
- scopeId = await _scopeResolver.ResolveScopeIdByApiKeyAsync(apiKeyId, ct);
- }
- catch (Exception ex)
- {
- _logger.LogWarning(
- ex,
- "Failed to resolve bot owner scope id for LLM config: correlation={CorrelationId} apiKeyId={ApiKeyId}",
- request.CorrelationId,
- apiKeyId);
- return;
- }
-
- if (string.IsNullOrWhiteSpace(scopeId))
- {
- _logger.LogDebug(
- "No bot owner scope id resolved for LLM config: correlation={CorrelationId} apiKeyId={ApiKeyId}",
- request.CorrelationId,
- apiKeyId);
- return;
- }
-
- try
- {
- var config = await _userConfigQueryPort.GetAsync(scopeId, ct);
- if (!string.IsNullOrWhiteSpace(config.DefaultModel))
- metadata[LLMRequestMetadataKeys.ModelOverride] = config.DefaultModel.Trim();
- if (!string.IsNullOrWhiteSpace(config.PreferredLlmRoute))
- metadata[LLMRequestMetadataKeys.NyxIdRoutePreference] = config.PreferredLlmRoute.Trim();
- if (config.MaxToolRounds > 0)
- metadata[LLMRequestMetadataKeys.MaxToolRoundsOverride] =
- config.MaxToolRounds.ToString(System.Globalization.CultureInfo.InvariantCulture);
-
- _logger.LogInformation(
- "Applied bot owner LLM config: correlation={CorrelationId} scopeId={ScopeId} model={Model} route={Route}",
- request.CorrelationId,
- scopeId,
- string.IsNullOrWhiteSpace(config.DefaultModel) ? "" : config.DefaultModel,
- string.IsNullOrWhiteSpace(config.PreferredLlmRoute) ? "" : config.PreferredLlmRoute);
- }
- catch (Exception ex)
- {
- _logger.LogWarning(
- ex,
- "Failed to load bot owner LLM config: correlation={CorrelationId} scopeId={ScopeId}",
- request.CorrelationId,
- scopeId);
- }
- }
-
- private static bool IsRelayRequest(NeedsLlmReplyEvent request) =>
- request.Activity?.OutboundDelivery is
- {
- ReplyMessageId.Length: > 0,
- CorrelationId.Length: > 0,
- };
-
- private async Task NotifyActorOfDropAsync(NeedsLlmReplyEvent request, string reason)
- {
- if (string.IsNullOrWhiteSpace(request.TargetActorId) ||
- string.IsNullOrWhiteSpace(request.CorrelationId))
- {
- return;
- }
-
- IActor? actor;
- try
- {
- actor = await _actorRuntime.GetAsync(request.TargetActorId);
- }
- catch (Exception ex)
- {
- _logger.LogWarning(
- ex,
- "Failed to resolve actor for inbox drop notification: correlation={CorrelationId} target={TargetActorId}",
- request.CorrelationId,
- request.TargetActorId);
- return;
- }
-
- if (actor is null)
- {
- // No active actor means there is nothing pending to clean up; the request
- // either was never persisted or the actor's state was already retired.
- return;
- }
-
- var dropped = new DeferredLlmReplyDroppedEvent
- {
- CorrelationId = request.CorrelationId,
- Reason = reason,
- DroppedAtUnixMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(),
- };
- var envelope = new EventEnvelope
- {
- Id = Guid.NewGuid().ToString("N"),
- Timestamp = Timestamp.FromDateTimeOffset(DateTimeOffset.UtcNow),
- Payload = Any.Pack(dropped),
- Route = EnvelopeRouteSemantics.CreateDirect(InboxStreamId, request.TargetActorId),
- };
-
- try
- {
- await _actorDispatchPort.DispatchAsync(request.TargetActorId, envelope, CancellationToken.None);
- }
- catch (Exception ex)
- {
- _logger.LogWarning(
- ex,
- "Failed to deliver inbox drop notification: correlation={CorrelationId} reason={Reason}",
- request.CorrelationId,
- reason);
- }
- }
-
- private bool ShouldCaptureInteractiveReply(ChatActivity? activity)
- {
- if (_interactiveReplyCollector is null)
- return false;
-
- if (_relayOptions is { InteractiveRepliesEnabled: false })
- return false;
-
- return activity?.OutboundDelivery is
- {
- ReplyMessageId.Length: > 0,
- CorrelationId.Length: > 0,
- };
- }
-}
-
-public sealed class ChannelLlmReplyInboxHostedService : IHostedService
-{
- private readonly ChannelLlmReplyInboxRuntime _runtime;
-
- public ChannelLlmReplyInboxHostedService(ChannelLlmReplyInboxRuntime runtime)
- {
- _runtime = runtime ?? throw new ArgumentNullException(nameof(runtime));
- }
-
- public Task StartAsync(CancellationToken ct) => _runtime.StartAsync(ct);
-
- public Task StopAsync(CancellationToken ct) => _runtime.StopAsync(ct);
-}
diff --git a/agents/Aevatar.GAgents.NyxidChat/ConversationReplyGenerator.cs b/agents/Aevatar.GAgents.NyxidChat/ConversationReplyGenerator.cs
index ae0039261..c3c732327 100644
--- a/agents/Aevatar.GAgents.NyxidChat/ConversationReplyGenerator.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/ConversationReplyGenerator.cs
@@ -1,3 +1,4 @@
+using System.Net.Http;
using System.Text;
using Aevatar.AI.Abstractions;
using Aevatar.AI.Abstractions.LLMProviders;
@@ -8,6 +9,8 @@
using Aevatar.AI.ToolProviders.Skills;
using Aevatar.GAgents.Channel.Abstractions;
using Aevatar.GAgents.Channel.Runtime;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
namespace Aevatar.GAgents.NyxidChat;
@@ -23,9 +26,17 @@ public sealed class NyxIdConversationReplyGenerator : IConversationReplyGenerato
private readonly IReadOnlyList _toolMiddlewares;
private readonly IReadOnlyList _llmMiddlewares;
private readonly SkillRegistry? _skillRegistry;
+ private readonly IRemoteSkillFetcher? _remoteSkillFetcher;
private readonly global::Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? _relayOptions;
private readonly INyxIdUserLlmPreferencesStore? _preferencesStore;
private readonly IUserMemoryStore? _userMemoryStore;
+ private readonly ILogger _logger;
+
+ private sealed record EffectiveMetadataPlan(
+ IReadOnlyDictionary Primary,
+ IReadOnlyDictionary? OwnerFallback);
+
+ private sealed record SenderPreferenceApplication(bool AnyApplied, bool RouteApplied);
public NyxIdConversationReplyGenerator(
ILLMProviderFactory llmProviderFactory,
@@ -34,9 +45,11 @@ public NyxIdConversationReplyGenerator(
IEnumerable? toolMiddlewares = null,
IEnumerable? llmMiddlewares = null,
SkillRegistry? skillRegistry = null,
+ IRemoteSkillFetcher? remoteSkillFetcher = null,
global::Aevatar.GAgents.Channel.NyxIdRelay.NyxIdRelayOptions? relayOptions = null,
INyxIdUserLlmPreferencesStore? preferencesStore = null,
- IUserMemoryStore? userMemoryStore = null)
+ IUserMemoryStore? userMemoryStore = null,
+ ILogger? logger = null)
{
_llmProviderFactory = llmProviderFactory ?? throw new ArgumentNullException(nameof(llmProviderFactory));
_toolSources = (toolSources ?? []).ToArray();
@@ -44,9 +57,26 @@ public NyxIdConversationReplyGenerator(
_toolMiddlewares = (toolMiddlewares ?? []).ToArray();
_llmMiddlewares = (llmMiddlewares ?? []).ToArray();
_skillRegistry = skillRegistry;
+ _remoteSkillFetcher = remoteSkillFetcher;
_relayOptions = relayOptions;
_preferencesStore = preferencesStore;
_userMemoryStore = userMemoryStore;
+ _logger = logger ?? NullLogger.Instance;
+
+ // Surface a half-wired skills configuration at startup. When the registry is
+ // present but the remote fetcher is not, use_skill is still advertised to the
+ // LLM (BuildTurnToolsAsync registers it from the registry alone) yet any call
+ // that would have to pull a remote skill silently falls back to "skill not
+ // found". Logging at construction time gives ops a single line they can grep
+ // for instead of debugging a flaky use_skill in production.
+ // (PR #562 review on ConversationReplyGenerator.cs:120, 4-of-5 reviewers.)
+ if (_skillRegistry is not null && _remoteSkillFetcher is null)
+ {
+ _logger.LogWarning(
+ "NyxIdConversationReplyGenerator wired with SkillRegistry but no IRemoteSkillFetcher: " +
+ "use_skill will be advertised to the LLM but cannot pull remote skills. " +
+ "Register an IRemoteSkillFetcher (e.g. AddOrnnSkills) or drop the SkillRegistry to silence this.");
+ }
}
public async Task GenerateReplyAsync(
@@ -58,15 +88,96 @@ public NyxIdConversationReplyGenerator(
ArgumentNullException.ThrowIfNull(activity);
ArgumentNullException.ThrowIfNull(metadata);
- var effectiveMetadata = await BuildEffectiveMetadataAsync(metadata, ct);
- var history = new global::Aevatar.AI.Core.Chat.ChatHistory
+ // Emit a placeholder immediately so the user sees a message within the outbound RTT,
+ // regardless of LLM cold-start, router selection, or tool-call latency before the
+ // first real delta. The first real delta overwrites this placeholder via edit-in-place;
+ // if no delta ever arrives (tool-only or empty turn), the caller's FinalizeAsync edits
+ // the placeholder to the final text. Disabled by setting the option to empty/whitespace.
+ if (streamingSink is not null)
{
- MaxMessages = MaxHistoryMessages,
- };
+ var placeholder = _relayOptions?.StreamingPlaceholderText;
+ if (!string.IsNullOrWhiteSpace(placeholder))
+ await streamingSink.OnDeltaAsync(placeholder, ct);
+ }
+
+ var metadataPlan = await BuildEffectiveMetadataPlanAsync(metadata, ct);
+ var primaryTools = await BuildTurnToolsAsync(ct);
+
+ try
+ {
+ return await GenerateWithMetadataAsync(
+ activity,
+ metadataPlan.Primary,
+ primaryTools,
+ streamingSink,
+ ct)
+ .ConfigureAwait(false);
+ }
+ catch (OperationCanceledException)
+ {
+ throw;
+ }
+ catch (Exception ex) when (metadataPlan.OwnerFallback is not null && IsRetryableSenderRouteFailure(ex))
+ {
+ _logger.LogWarning(
+ ex,
+ "Sender LLM route failed; retrying with bot owner LLM config. activity={ActivityId}",
+ activity.Id);
+
+ var fallbackTools = await BuildTurnToolsAsync(ct);
+ return await GenerateWithMetadataAsync(
+ activity,
+ metadataPlan.OwnerFallback,
+ fallbackTools,
+ streamingSink,
+ ct)
+ .ConfigureAwait(false);
+ }
+ }
+
+ ///
+ /// Decide whether falling back from sender credentials to owner credentials is worth
+ /// the retry. Programmer errors (Argument*, NullReference, InvalidCast) are not transient
+ /// and would only fail the same way with the owner token while burying the original cause
+ /// behind a second failure. We retry only on infra-shaped failures: network, timeout, JSON
+ /// parsing of upstream errors, and the InvalidOperationException NyxID emits when an
+ /// access token is rejected.
+ ///
+ private static bool IsRetryableSenderRouteFailure(Exception ex) =>
+ ex is HttpRequestException
+ or TimeoutException
+ or System.Text.Json.JsonException
+ or InvalidOperationException
+ or TaskCanceledException
+ or System.IO.IOException;
+
+ private async Task BuildTurnToolsAsync(CancellationToken ct)
+ {
var tools = new ToolManager();
foreach (var tool in await DiscoverToolsAsync(ct))
tools.Register(tool);
+ // SkillsAgentToolSource (when AddSkills is wired) advertises the same use_skill
+ // through DiscoverToolsAsync, so this defensive registration only matters for
+ // minimal hosts that registered AddOrnnSkills (IRemoteSkillFetcher) without
+ // AddSkills. ToolManager.Register is last-write-wins so the duplicate is harmless.
+ if (_skillRegistry is not null || _remoteSkillFetcher is not null)
+ tools.Register(new UseSkillTool(_skillRegistry ?? new SkillRegistry(), _remoteSkillFetcher));
+
+ return tools;
+ }
+
+ private async Task GenerateWithMetadataAsync(
+ ChatActivity activity,
+ IReadOnlyDictionary effectiveMetadata,
+ ToolManager tools,
+ IStreamingReplySink? streamingSink,
+ CancellationToken ct)
+ {
+ var history = new global::Aevatar.AI.Core.Chat.ChatHistory
+ {
+ MaxMessages = MaxHistoryMessages,
+ };
var runtime = new ChatRuntime(
providerFactory: ResolveProvider,
history: history,
@@ -91,18 +202,6 @@ public NyxIdConversationReplyGenerator(
agentName: "NyxIdConversationReply",
streamBufferCapacity: StreamBufferCapacity);
- // Emit a placeholder immediately so the user sees a message within the outbound RTT,
- // regardless of LLM cold-start, router selection, or tool-call latency before the
- // first real delta. The first real delta overwrites this placeholder via edit-in-place;
- // if no delta ever arrives (tool-only or empty turn), the caller's FinalizeAsync edits
- // the placeholder to the final text. Disabled by setting the option to empty/whitespace.
- if (streamingSink is not null)
- {
- var placeholder = _relayOptions?.StreamingPlaceholderText;
- if (!string.IsNullOrWhiteSpace(placeholder))
- await streamingSink.OnDeltaAsync(placeholder, ct);
- }
-
var output = new StringBuilder();
await foreach (var chunk in runtime.ChatStreamAsync(
activity.Content.Text,
@@ -122,11 +221,13 @@ public NyxIdConversationReplyGenerator(
return output.ToString();
}
- private async Task> BuildEffectiveMetadataAsync(
+ private async Task BuildEffectiveMetadataPlanAsync(
IReadOnlyDictionary metadata,
CancellationToken ct)
{
var effective = new Dictionary(metadata, StringComparer.Ordinal);
+ effective.Remove(LLMRequestMetadataKeys.SenderNyxIdAccessToken);
+ Dictionary? ownerFallback = null;
// Issue #513 phase 3: prefs override chain is sender → bot-owner →
// provider default. The bot owner's prefs are already pinned upstream
@@ -135,12 +236,33 @@ private async Task> BuildEffectiveMetadataAs
// so this generator only has to layer sender overrides on top when
// the inbound carries a binding-id. SetIfFilled is field-level, so a
// sender who set DefaultModel but not PreferredRoute still inherits
- // the bot owner's route from the upstream-pinned metadata.
+ // the bot owner's route from the upstream-pinned metadata. If a
+ // sender-owned attempt fails, we retry once with this owner snapshot.
if (_preferencesStore is not null &&
metadata.TryGetValue(LLMRequestMetadataKeys.SenderBindingId, out var senderBindingId) &&
!string.IsNullOrWhiteSpace(senderBindingId))
{
- await ApplyPreferencesAsync(senderBindingId, effective, ct);
+ var ownerSnapshot = CreateOwnerFallbackSnapshot(effective);
+ var applied = await ApplyPreferencesAsync(senderBindingId, effective, ct);
+ if (applied.RouteApplied)
+ {
+ if (metadata.TryGetValue(LLMRequestMetadataKeys.SenderNyxIdAccessToken, out var senderAccessToken) &&
+ !string.IsNullOrWhiteSpace(senderAccessToken))
+ {
+ var trimmedToken = senderAccessToken.Trim();
+ effective[LLMRequestMetadataKeys.NyxIdAccessToken] = trimmedToken;
+ effective[LLMRequestMetadataKeys.NyxIdOrgToken] = trimmedToken;
+ ownerFallback = ownerSnapshot;
+ }
+ else
+ {
+ effective = ownerSnapshot;
+ }
+ }
+ else if (applied.AnyApplied)
+ {
+ ownerFallback = ownerSnapshot;
+ }
}
if (_userMemoryStore is not null)
@@ -149,7 +271,11 @@ private async Task> BuildEffectiveMetadataAs
{
var promptSection = await _userMemoryStore.BuildPromptSectionAsync(2000, ct);
if (!string.IsNullOrWhiteSpace(promptSection))
+ {
effective[LLMRequestMetadataKeys.UserMemoryPrompt] = promptSection;
+ if (ownerFallback is not null)
+ ownerFallback[LLMRequestMetadataKeys.UserMemoryPrompt] = promptSection;
+ }
}
catch (OperationCanceledException)
{
@@ -161,7 +287,7 @@ private async Task> BuildEffectiveMetadataAs
}
}
- return effective;
+ return new EffectiveMetadataPlan(effective, ownerFallback);
}
///
@@ -170,13 +296,13 @@ private async Task> BuildEffectiveMetadataAs
/// the bot owner's value stays intact. User-config failures degrade to
/// "no sender override" rather than failing the LLM turn.
///
- private async Task ApplyPreferencesAsync(
+ private async Task ApplyPreferencesAsync(
string senderBindingId,
Dictionary effective,
CancellationToken ct)
{
if (_preferencesStore is null)
- return;
+ return new SenderPreferenceApplication(false, false);
NyxIdUserLlmPreferences preferences;
try
@@ -189,22 +315,32 @@ private async Task ApplyPreferencesAsync(
}
catch
{
- return;
+ return new SenderPreferenceApplication(false, false);
}
- SetIfFilled(effective, LLMRequestMetadataKeys.ModelOverride, preferences.DefaultModel?.Trim());
- SetIfFilled(effective, LLMRequestMetadataKeys.NyxIdRoutePreference, preferences.PreferredRoute?.Trim());
- SetIfFilled(
+ var modelApplied = SetIfFilled(effective, LLMRequestMetadataKeys.ModelOverride, preferences.DefaultModel?.Trim());
+ var routeApplied = SetIfFilled(effective, LLMRequestMetadataKeys.NyxIdRoutePreference, preferences.PreferredRoute?.Trim());
+ var roundsApplied = SetIfFilled(
effective,
LLMRequestMetadataKeys.MaxToolRoundsOverride,
preferences.MaxToolRounds > 0 ? preferences.MaxToolRounds.ToString() : null);
+ return new SenderPreferenceApplication(modelApplied || routeApplied || roundsApplied, routeApplied);
+ }
+
+ private static Dictionary CreateOwnerFallbackSnapshot(Dictionary effective)
+ {
+ var snapshot = new Dictionary(effective, StringComparer.Ordinal);
+ snapshot.Remove(LLMRequestMetadataKeys.SenderBindingId);
+ snapshot.Remove(LLMRequestMetadataKeys.SenderNyxIdAccessToken);
+ return snapshot;
}
- private static void SetIfFilled(Dictionary map, string key, string? value)
+ private static bool SetIfFilled(Dictionary map, string key, string? value)
{
if (string.IsNullOrWhiteSpace(value))
- return;
+ return false;
map[key] = value;
+ return true;
}
private async Task> DiscoverToolsAsync(CancellationToken ct)
@@ -248,7 +384,7 @@ private string BuildSystemPrompt()
var prompt = LoadBaseSystemPrompt();
prompt += NyxIdRelayPromptConfiguration.BuildChannelRuntimeConfigurationSection(_relayOptions);
- if (_skillRegistry != null && _skillRegistry.Count > 0)
+ if (_skillRegistry is not null && _skillRegistry.Count > 0)
{
var skillSection = _skillRegistry.BuildSystemPromptSection();
if (!string.IsNullOrEmpty(skillSection))
diff --git a/agents/Aevatar.GAgents.NyxidChat/LlmSelection/DefaultUserLlmSelectionService.cs b/agents/Aevatar.GAgents.NyxidChat/LlmSelection/DefaultUserLlmSelectionService.cs
index d74b1c233..072281262 100644
--- a/agents/Aevatar.GAgents.NyxidChat/LlmSelection/DefaultUserLlmSelectionService.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/LlmSelection/DefaultUserLlmSelectionService.cs
@@ -41,8 +41,7 @@ public async Task SetByServiceAsync(
ArgumentException.ThrowIfNullOrWhiteSpace(serviceId);
var view = await _optionsService.GetOptionsAsync(ToQuery(context), ct).ConfigureAwait(false);
- var option = view.Available.FirstOrDefault(candidate =>
- string.Equals(candidate.ServiceId, serviceId.Trim(), StringComparison.OrdinalIgnoreCase));
+ var option = FindSelectionOption(serviceId.Trim(), view.Available);
if (option is null)
throw new InvalidOperationException($"LLM service '{serviceId}' is not available for this user.");
EnsureSelectable(option);
@@ -127,6 +126,32 @@ private static void EnsureSelectable(UserLlmOption option)
throw new InvalidOperationException($"LLM service '{option.DisplayName}' is not ready: {option.Status}.");
}
+ private static UserLlmOption? FindSelectionOption(string requested, IReadOnlyList available)
+ {
+ var directMatches = available
+ .Where(option => string.Equals(option.ServiceId, requested, StringComparison.OrdinalIgnoreCase))
+ .ToArray();
+ var directSelectable = directMatches.Where(IsSelectable).Take(2).ToArray();
+ if (directSelectable.Length == 1)
+ return directSelectable[0];
+
+ var keyMatches = available
+ .Where(option =>
+ string.Equals(option.ServiceId, requested, StringComparison.OrdinalIgnoreCase) ||
+ string.Equals(option.ServiceSlug, requested, StringComparison.OrdinalIgnoreCase) ||
+ string.Equals(option.RouteValue, requested, StringComparison.OrdinalIgnoreCase) ||
+ string.Equals(option.DisplayName, requested, StringComparison.OrdinalIgnoreCase))
+ .ToArray();
+ var selectable = keyMatches.Where(IsSelectable).Take(2).ToArray();
+ if (selectable.Length == 1)
+ return selectable[0];
+
+ return directMatches.FirstOrDefault() ?? (keyMatches.Length == 1 ? keyMatches[0] : null);
+ }
+
+ private static bool IsSelectable(UserLlmOption option) =>
+ option.Allowed && string.Equals(option.Status, "ready", StringComparison.OrdinalIgnoreCase);
+
public async Task ResetAsync(UserLlmSelectionContext context, CancellationToken ct)
{
var current = await ReadCurrentAsync(context, ct).ConfigureAwait(false);
diff --git a/agents/Aevatar.GAgents.NyxidChat/LlmSelection/NyxIdLlmServiceCatalogClient.cs b/agents/Aevatar.GAgents.NyxidChat/LlmSelection/NyxIdLlmServiceCatalogClient.cs
index 9d9828f00..c1d949b10 100644
--- a/agents/Aevatar.GAgents.NyxidChat/LlmSelection/NyxIdLlmServiceCatalogClient.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/LlmSelection/NyxIdLlmServiceCatalogClient.cs
@@ -1,16 +1,31 @@
+using System.Security.Cryptography;
+using System.Text;
using Aevatar.AI.ToolProviders.NyxId;
using Aevatar.Studio.Application.Studio.Abstractions;
using Aevatar.Studio.Application.Studio.Services;
+using Microsoft.Extensions.Caching.Memory;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
namespace Aevatar.GAgents.NyxidChat.LlmSelection;
public sealed class NyxIdLlmServiceCatalogClient : INyxIdLlmServiceCatalogClient
{
+ private static readonly TimeSpan ProxyServicesCacheTtl = TimeSpan.FromSeconds(30);
+ private const string ProxyServicesCacheKeyPrefix = "nyxid-llm-svc:proxy-services:";
+
private readonly NyxIdApiClient _nyxClient;
+ private readonly IMemoryCache _proxyServicesCache;
+ private readonly ILogger _logger;
- public NyxIdLlmServiceCatalogClient(NyxIdApiClient nyxClient)
+ public NyxIdLlmServiceCatalogClient(
+ NyxIdApiClient nyxClient,
+ IMemoryCache proxyServicesCache,
+ ILogger? logger = null)
{
_nyxClient = nyxClient ?? throw new ArgumentNullException(nameof(nyxClient));
+ _proxyServicesCache = proxyServicesCache ?? throw new ArgumentNullException(nameof(proxyServicesCache));
+ _logger = logger ?? NullLogger.Instance;
}
public async Task GetServicesAsync(
@@ -22,7 +37,8 @@ public async Task GetServicesAsync(
ArgumentException.ThrowIfNullOrWhiteSpace(accessToken);
var response = await _nyxClient.GetLlmServicesAsync(accessToken, ct).ConfigureAwait(false);
- return NyxIdLlmServiceCatalogParser.ParseServicesResult(response);
+ var result = NyxIdLlmServiceCatalogParser.ParseServicesResult(response);
+ return await MergeProxyRouteCandidatesAsync(result, accessToken, ct).ConfigureAwait(false);
}
public async Task GetSetupHintAsync(
@@ -49,4 +65,61 @@ public async Task ProvisionAsync(
.ConfigureAwait(false);
return NyxIdLlmServiceCatalogParser.ParseProvisionedService(response);
}
+
+ private async Task MergeProxyRouteCandidatesAsync(
+ NyxIdLlmServicesResult result,
+ string accessToken,
+ CancellationToken ct)
+ {
+ try
+ {
+ var proxyServices = await DiscoverProxyServicesCachedAsync(accessToken, ct).ConfigureAwait(false);
+ return NyxIdLlmServiceCatalogParser.MergeProxyRouteCandidates(result, proxyServices);
+ }
+ catch (OperationCanceledException)
+ {
+ throw;
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(ex, "Failed to merge NyxID proxy services into LLM route catalog");
+ return result;
+ }
+ }
+
+ ///
+ /// Cache the per-user /api/v1/proxy/services response for a short TTL so a flurry
+ /// of /model invocations from the same user collapses onto one upstream call. We use
+ /// rather than a singleton dictionary so the cache backing
+ /// store is shared, sized, and evicted per the host's standard memory-cache policy
+ /// (CLAUDE.md §"中间层状态约束" — services don't own per-caller state directly).
+ ///
+ private async Task DiscoverProxyServicesCachedAsync(
+ string accessToken,
+ CancellationToken ct)
+ {
+ var cacheKey = ProxyServicesCacheKeyPrefix + ComputeTokenFingerprint(accessToken);
+ if (_proxyServicesCache.TryGetValue(cacheKey, out string? cached) &&
+ !string.IsNullOrEmpty(cached))
+ {
+ return cached;
+ }
+
+ var response = await _nyxClient.DiscoverProxyServicesAsync(accessToken, ct).ConfigureAwait(false);
+        // Size is not set on the entry — IMemoryCache only enforces Size when the host
+        // configured a SizeLimit on MemoryCacheOptions. The cache backing store is owned
+        // by the host (we register IMemoryCache via AddMemoryCache, no per-entry size
+        // policy from us); this entry expires via the absolute TTL set just below.
+ _proxyServicesCache.Set(
+ cacheKey,
+ response,
+ new MemoryCacheEntryOptions
+ {
+ AbsoluteExpirationRelativeToNow = ProxyServicesCacheTtl,
+ });
+ return response;
+ }
+
+ private static string ComputeTokenFingerprint(string accessToken) =>
+ Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(accessToken)));
}
diff --git a/agents/Aevatar.GAgents.NyxidChat/ServiceCollectionExtensions.cs b/agents/Aevatar.GAgents.NyxidChat/ServiceCollectionExtensions.cs
index 151a082ae..000431a20 100644
--- a/agents/Aevatar.GAgents.NyxidChat/ServiceCollectionExtensions.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/ServiceCollectionExtensions.cs
@@ -1,5 +1,6 @@
using System.Runtime.CompilerServices;
using Aevatar.AI.Abstractions.Middleware;
+using Aevatar.AI.ToolProviders.Lark;
using Aevatar.GAgents.Channel.Abstractions;
using Aevatar.GAgents.Channel.Abstractions.Slash;
using Aevatar.GAgents.Channel.NyxIdRelay;
@@ -9,7 +10,7 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
-using Microsoft.Extensions.Hosting;
+using Microsoft.Extensions.Logging;
namespace Aevatar.GAgents.NyxidChat;
@@ -19,6 +20,7 @@ public static IServiceCollection AddNyxIdChat(this IServiceCollection services,
{
ArgumentNullException.ThrowIfNull(services);
RuntimeHelpers.RunClassConstructor(typeof(NyxIdChatGAgent).TypeHandle);
+ RuntimeHelpers.RunClassConstructor(typeof(AgentRunGAgent).TypeHandle);
services.AddHttpClient();
services.TryAddSingleton(provider => BindRelayOptions(configuration));
@@ -34,13 +36,27 @@ public static IServiceCollection AddNyxIdChat(this IServiceCollection services,
services.TryAddSingleton();
services.TryAddSingleton();
- // ─── Channel LLM reply inbox runtime + hosted service ───
- services.TryAddSingleton();
- services.TryAddSingleton(sp => sp.GetRequiredService());
- services.TryAddEnumerable(ServiceDescriptor.Singleton());
+ // ─── Channel LLM reply run dispatch ───
+ services.TryAddSingleton();
// ─── Conversation turn-runner override + reply generator ───
services.Replace(ServiceDescriptor.Singleton());
+ // The CardKit runner depends on Aevatar.AI.ToolProviders.Lark services. AddNyxIdChat()
+ // does not transitively register them — production hosts also call AddLarkTools() —
+ // so resolve via factory and gracefully fall back to the no-op runner when Lark
+ // tooling is absent. This keeps CardKit dormant for hosts that opt out of Lark
+ // instead of failing DI validation at startup.
+ services.Replace(ServiceDescriptor.Singleton(sp =>
+ {
+ var cardKit = sp.GetService();
+ var lark = sp.GetService();
+ if (cardKit is null || lark is null)
+ return new NullConversationCardTurnRunner();
+ return new ChannelCardConversationTurnRunner(
+ cardKit,
+ lark,
+ sp.GetRequiredService>());
+ }));
services.TryAddSingleton();
// ─── LLM-call middleware that injects channel context into LLM requests ───
@@ -54,6 +70,10 @@ public static IServiceCollection AddNyxIdChat(this IServiceCollection services,
// Registered here (not in Channel.Identity) because the handler depends
// on Studio.Application UserConfig ports; Channel.Identity intentionally
// does not pull Studio dependencies.
+ // Catalog client uses IMemoryCache for the proxy-services TTL cache. AddMemoryCache
+ // is idempotent (no-op when already registered) so hosts that already wire it keep
+ // their configured eviction policy; hosts that didn't register one get the default.
+ services.AddMemoryCache();
services.TryAddSingleton();
// These are consumed by singleton turn-runner/slash handlers. They create
// short scopes internally for UserConfig ports instead of capturing
diff --git a/agents/Aevatar.GAgents.NyxidChat/Skills/system-prompt.md b/agents/Aevatar.GAgents.NyxidChat/Skills/system-prompt.md
index b8bde68d5..89cafa039 100644
--- a/agents/Aevatar.GAgents.NyxidChat/Skills/system-prompt.md
+++ b/agents/Aevatar.GAgents.NyxidChat/Skills/system-prompt.md
@@ -29,7 +29,40 @@ Rules:
- Only ask the user a follow-up question when required inputs are genuinely missing and cannot be inferred.
- After tool results arrive, continue to the next required tool call or give the user the concrete result.
-## Capability Tools (Doing Things)
+## Skills (CRITICAL — NyxID and Ornn knowledge lives here)
+
+This prompt deliberately keeps the NyxID and Ornn user manuals **out of the system prompt** and on the Ornn skill platform instead, so curators can update those manuals without redeploying the bot. You learn the canonical, up-to-date usage by loading the relevant skill.
+
+**Before doing any of the following, call `use_skill(skill="nyxid")` first** to load the authoritative NyxID manual:
+- Account / profile / MFA / sessions / consents
+- Service catalog browsing, connecting a new service (OAuth / device-code / API key flows)
+- API key, node, organization, approval, notification management
+- Diagnosing NyxID error codes (`approval_required`, `unauthorized`, `node_offline`, etc.)
+- Anything that would otherwise need `nyxid_account`, `nyxid_status`, `nyxid_profile`, `nyxid_mfa`, `nyxid_sessions`, `nyxid_catalog`, `nyxid_services`, `nyxid_endpoints`, `nyxid_external_keys`, `nyxid_api_keys`, `nyxid_nodes`, `nyxid_approvals`, `nyxid_notifications`, `nyxid_providers`, `nyxid_orgs`, `nyxid_admin`, `nyxid_search_capabilities`, `nyxid_proxy_execute`
+
+**Before driving the Ornn API directly via the AI Agent CLI, call `use_skill(skill="ornn-agent-manual-cli")`** to load the Ornn agent manual.
+
+`use_skill` caches the loaded instructions in-process for ~5 minutes; after that window the next call refetches from Ornn so curator updates land within 5 minutes without a redeploy.
+
+### Proactive skill discovery
+
+When the user mentions a named skill or asks for a specialized capability (translation, summarization, network/device inventory, scraping, scheduling, content drafting, code review, domain workflows, etc.), call `ornn_search_skills` to find a matching skill and then `use_skill` to load it. Treat the loaded skill's instructions as authoritative for that task.
+
+Triggers:
+- User quotes a skill name (`'translate-pro'`, `"sg-office-network"`)
+- User uses a slug-like or Title Case identifier that could be a skill name
+- User issues a `/` slash command that isn't a known command (known ones: the in-tree relay commands `/route`, `/models`, `/model`, `/agents`, `/agent-status`, `/run-agent`, `/disable-agent`, `/enable-agent`, `/delete-agent`, and the agent-builder commands `/daily`, `/social-media`) — treat the unknown command name as the skill query (`/translate` → search "translate")
+- User says "挂载/mount/use/load this skill" or names a domain workflow
+
+Only fall back to `nyxid_proxy` / generic API discovery when no skill matches.
+
+### Quick reference
+
+- **Search**: `ornn_search_skills` — keywords or skill name; `scope=public|private|mixed`
+- **Activate**: `use_skill skill="<skill-name>"` — loads instructions + associated files
+- **Follow**: once loaded, the skill's instructions take precedence over generic guidance for that task
+
+## Capability Tools (the universal primitives)
### code_execute — Run Code
Execute Python, JavaScript, TypeScript, or Bash in a sandboxed environment. Returns stdout, stderr, and exit code. Use this for calculations, data processing, format conversion, testing code snippets, etc.
@@ -39,45 +72,16 @@ Make HTTP requests to any connected service. NyxID injects credentials automatic
- Omit slug → discover all proxyable services with proxy URLs
- Provide slug + path + method + body → make the proxied request
-**Critical**: Proxy paths are relative to the service's base URL (shown in ``). Do NOT duplicate version prefixes already in the base URL.
+**Critical**: Proxy paths are relative to the service's base URL (shown in ``). Do NOT duplicate version prefixes already in the base URL. For NyxID-specific service paths, OAuth/device/API-key connection flows, error code semantics, and conventions, **load `use_skill(skill="nyxid")` first** instead of guessing.
### Channel Bots — Messaging
Use `nyxid_proxy` with a Telegram/Discord bot's slug to send messages. For Telegram: POST `/sendMessage` with `{"chat_id":"...","text":"..."}`.
-## Account & Service Management Tools
-
-### Account
-- **nyxid_account** — View user profile and account status
-- **nyxid_status** — Comprehensive overview (user + services + API keys + nodes)
-- **nyxid_profile** — Update display name, delete account, manage OAuth consents
-- **nyxid_mfa** — Setup/verify TOTP multi-factor authentication
-- **nyxid_sessions** — List active login sessions
-
-### Services
-- **nyxid_catalog** — Browse service templates (list all, or show details for a slug)
-- **nyxid_services** — Manage connected services: list, show, create, update, delete, rotate_credential, route
-- **nyxid_endpoints** — Manage service base URLs: list, update, delete
-- **nyxid_external_keys** — Manage external API credentials: list, rotate, delete
-
-### Security & Access
-- **nyxid_api_keys** — Manage NyxID API keys: list, show, create, rotate, delete, update
-- **nyxid_nodes** — Manage on-premise nodes: list, show, delete, register_token, rotate_token
-- **nyxid_approvals** — Manage approvals: list/show requests, approve/deny, grants, per-service config
-- **nyxid_notifications** — Notification settings & Telegram integration
-- **nyxid_llm_status** — Check available LLM providers and models
-- **nyxid_providers** — Manage OAuth provider connections: list, connect, disconnect, credentials
-
-### Organizations
-- **nyxid_orgs** — Manage NyxID organizations (shared credentials): list, show, create, update, delete, join, set_primary, member management (list/add/update/remove), invites (list/create/cancel)
-
-### Channel Bots & Events
-- **channel_registrations** — List, provision, rebuild, repair, and delete Aevatar's local Lark relay registrations. Use this for Aevatar-managed Lark setup, for rebuilding the local read model from the authoritative actor state, and for restoring the local mirror when Nyx relay resources already exist
-- **agent_delivery_targets** — Manage agent delivery target mappings used by workflow human approval/input cards and other outbound channel delivery
-- **agent_builder** — Create and manage Day One persistent automation agents in Feishu private chat. Internal tool actions: `list_templates`, `create_agent`, `list_agents`, `agent_status`, `run_agent`, `disable_agent`, `enable_agent`, `delete_agent`. Internal template names (used only inside `create_agent` arguments): `daily_report`, `social_media`. **When talking to the user, always use the slash-command names — never surface the internal template names `daily_report` / `social_media`.** User-facing slash commands: `/daily [github_username]`, `/social-media `, `/agents`, `/agent-status `, `/run-agent `, `/disable-agent `, `/enable-agent `, `/delete-agent confirm`.
-- **nyxid_channel_bots** — NyxID-native channel bot management: inspect/register/verify/delete bots and manage conversation routes directly via NyxID API. Use this to inspect existing Nyx Lark bot/route state or register Nyx-native fields such as `verification_token`
-- **nyxid_channel_events** — Push device/analyzer events through the NyxID HTTP Event Gateway to agent conversations
-
-### LLM Route Selection
+## Aevatar-specific tools
+
+These are **Aevatar-internal** tools, not covered by Ornn's `nyxid` skill — they manage state local to this Aevatar deployment.
+
+### LLM Route Selection (slash commands)
The relay handles LLM route selection deterministically, without an LLM round-trip. User-facing commands:
- `/route` or `/models` — list NyxID services that NyxID says are usable as LLM providers, including status/source/model hints.
@@ -85,235 +89,74 @@ The relay handles LLM route selection deterministically, without an LLM round-tr
- `/model use <model>` — keep the current route and only override the model.
- `/model reset` — clear the sender's route/model preference and fall back to the bot default.
-### Admin
-- **nyxid_admin** — Administrative commands (admin role required): manage invite codes (list, create, deactivate)
-
-### API Discovery (Fallback)
-- **nyxid_search_capabilities** — Search NyxID API capabilities by natural language query. Returns matching operations with method, path, and parameters. Use this to discover endpoints not covered by specialized tools
-- **nyxid_proxy_execute** — Execute a NyxID API operation discovered via nyxid_search_capabilities. Validates parameters against cached OpenAPI spec before sending
-
-## Connecting New Services
-
-All connection info comes from the catalog entry. Use `nyxid_catalog action=show slug=` and read:
-
-| Field | Meaning |
-|-------|---------|
-| `provider_type` | Connection method: `oauth2`, `device_code`, `api_key` |
-| `credential_mode` | Who provides OAuth app: `admin` (platform) or `user` (user must provide) |
-| `provider_config_id` | Provider ID for OAuth/device-code |
-| `api_key_instructions` | How to get an API key (display as-is) |
-| `api_key_url` | Where to get the key (clickable link) |
-| `requires_gateway_url` | If true, user must also provide endpoint URL |
-
-### OAuth Flow
-1. Check `nyxid_providers action=list` for existing connection
-2. If `credential_mode=user`: check/set credentials via `nyxid_providers action=get_credentials/set_credentials`
- - Callback URL: `https://nyx-api.chrono-ai.fun/api/v1/providers/callback`
-3. `nyxid_providers action=connect_oauth provider_id=` → give user the authorization URL
-4. Verify with `nyxid_providers action=list`
-
-### Device Code Flow
-1. `nyxid_providers action=connect_device_code provider_id=` → tell user to visit URL and enter code
-2. Poll: `nyxid_providers action=poll_device_code provider_id= state=`
-3. Verify with `nyxid_providers action=list`
-
-### API Key Flow
-1. Guide user with catalog's `api_key_instructions` and `api_key_url`
-2. `nyxid_services action=create service_slug= credential= label=`
-3. Test with a simple read-only proxy request
-
-If user asks to connect a service and you don't know the slug, browse with `nyxid_catalog action=list`.
-
-## Channel Bot Setup (Lark via Nyx Relay)
+### channel_registrations (Aevatar's local Lark mirror)
Aevatar owns the local runtime and registration mirror.
For Lark, webhook ingress goes through NyxID first, then NyxID relays callbacks into Aevatar.
Nyx owns the platform bot, route, and relay API key; Aevatar owns the local registration mirror used by the runtime.
Do not assume `channel_registrations action=list` being empty means the Nyx bot is missing.
-### Lark Stage 1: New provisioning
-
-Use this stage when the user wants the bot connected for inbound Lark messages and basic relay replies.
-Do not block this stage on typed Lark tools, delivery target bindings, or proactive outbound setup.
-
-Register channel bot in Aevatar:
+**Stage 1: New provisioning** — when the user wants the bot connected for inbound Lark messages and basic relay replies. Do not block on typed Lark tools or proactive outbound setup.
`channel_registrations action=register_lark_via_nyx app_id=<app_id> app_secret=<app_secret> verification_token=<verification_token> webhook_base_url=https://<your-domain>`
-`verification_token` is optional in the tool contract, but when the user has it or the Nyx backend requires it, pass it through.
-
-→ This returns the registration ID, the Nyx relay callback URL, and the Nyx webhook URL that must be configured in the Lark developer console.
-
-Configure the platform webhook:
-
-**Lark/Feishu:** 开发者后台 → 事件与回调 → 事件配置 → 请求地址:
-``
-
-Add events:
-- `im.message.receive_v1`
-- `card.action.trigger`
-
-### Lark Stage 2: Repair an existing bot
+→ Returns the registration ID, the Nyx relay callback URL, and the Nyx webhook URL that must be configured in 开发者后台 → 事件与回调 → 事件配置 → 请求地址.
-Use this stage when Nyx already has the Lark bot and route, but Aevatar no longer replies or `channel_registrations action=list` is empty.
+Add events: `im.message.receive_v1`, `card.action.trigger`.
-First try rebuilding the local registration read model from the authoritative actor state:
+**Stage 2: Repair an existing bot** — when Nyx already has the Lark bot/route but Aevatar no longer replies or `channel_registrations action=list` is empty.
-`channel_registrations action=rebuild_projection`
+1. `channel_registrations action=rebuild_projection` — rebuild local read model from authoritative actor state.
+2. Inspect Nyx-side first: `nyxid_channel_bots action=list` / `show` / `routes`. (For NyxID-side details, `use_skill(skill="nyxid")`.)
+3. If Nyx is healthy but local list still empty, restore the local mirror:
+   `channel_registrations action=repair_lark_mirror registration_id=<registration_id> credential_ref=<credential_ref> webhook_base_url=https://<your-domain> nyx_channel_bot_id=<id> nyx_agent_api_key_id=<id> nyx_conversation_route_id=<id>`
+ `repair_lark_mirror` must preserve the existing relay credential reference. Reuse `registration_id` when its `vault://.../relay-hmac` secret still exists, or pass `credential_ref` explicitly. If neither is available, do not claim repair succeeded; tell the user to re-provision instead.
-Inspect the Nyx side first:
+**Stage 3: Advanced Lark capabilities** — only when the user needs proactive sends, typed Lark tools, delivery target bindings, spreadsheet appends, approval actions, or active chat lookup. Ensure NyxID has a usable Lark outbound provider slug (typically `api-lark-bot`); if not, `use_skill(skill="nyxid")` to drive the catalog connection flow.
-- `nyxid_channel_bots action=list`
-- `nyxid_channel_bots action=show id=`
-- `nyxid_channel_bots action=routes channel_bot_id=`
-- `nyxid_api_keys action=show id=`
+For advanced Lark API operations outside the current relay reply, prefer typed tools: `lark_messages_send`, `lark_messages_search`, `lark_messages_batch_get`, `lark_messages_reactions_list`, `lark_messages_reactions_delete`, `lark_chats_lookup`, `lark_sheets_append_rows`, `lark_approvals_list`, `lark_approvals_act`. Fall back to `nyxid_proxy_execute` only when typed tools don't cover.
-If the Nyx bot, route, and relay callback are correct but rebuild did not restore the local list, restore the local Aevatar mirror:
+For inbound Lark relay turns that represent a fresh user message, do **not** call `lark_messages_reply`, `lark_messages_react`, or `nyxid_proxy_execute` to deliver the answer. Produce the final text reply directly; the channel runtime will send it through the Nyx relay reply token.
-`channel_registrations action=repair_lark_mirror registration_id= credential_ref= webhook_base_url=https:// nyx_channel_bot_id= nyx_agent_api_key_id= nyx_conversation_route_id=`
+Managing registrations: `list`, `rebuild_projection`, `repair_lark_mirror`, `delete id=<id> confirm=true`.
-`repair_lark_mirror` must preserve the existing relay credential reference. Reuse the old `registration_id` when its `vault://.../relay-hmac` secret still exists, or pass `credential_ref` explicitly. If neither is available, do not claim repair succeeded; tell the user to re-provision instead.
+### agent_delivery_targets
-If rebuild and mirror repair both succeed but `channel_registrations action=list` still stays empty, tell the user the local Aevatar registration projection/read model is unhealthy.
+Workflow `human_approval`, `human_input`, `secure_input` steps can send Feishu delivery messages when the workflow step includes `delivery_target_id=`. For the Nyx relay path, these arrive as interactive cards in Lark/Feishu (with `/approve`, `/reject`, `/submit` as fallback commands).
-### Lark Stage 3: Advanced Lark capabilities
+Bind `agent_id` to the real outbound route:
+- `agent_delivery_targets action=list`
+- `agent_delivery_targets action=upsert agent_id=<agent_id> conversation_id=<conversation_id> nyx_provider_slug=<slug> nyx_api_key=<key>`
+- `agent_delivery_targets action=delete agent_id=<agent_id> confirm=true`
-Only use this stage when the user needs proactive sends, typed Lark tools, delivery target bindings, spreadsheet appends, approval actions, or active chat lookup.
+`channel_registrations` configures inbound bot callbacks; `agent_delivery_targets` configures outbound agent delivery. Today the human-interaction delivery path supports `lark`.
-Ensure NyxID has a usable Lark outbound provider slug, typically `api-lark-bot`:
-`nyxid_services action=list` → check if the service exists
-If not: `nyxid_catalog action=list` → find the slug → guide user to add it
+### agent_builder (Day One persistent automation lifecycle)
-For advanced Lark API operations that are not the current inbound relay reply, prefer typed tools such as:
-- `lark_messages_send`
-- `lark_messages_search`
-- `lark_messages_batch_get`
-- `lark_messages_reactions_list`
-- `lark_messages_reactions_delete`
-- `lark_chats_lookup`
-- `lark_sheets_append_rows`
-- `lark_approvals_list`
-- `lark_approvals_act`
+`agent_builder` manages the lifecycle of agents the user has already created. Recipes for *new* agents live as Ornn skills — match the user's intent against `ornn_search_skills` and follow the SKILL.md verbatim. `agent_builder` itself does not create agents.
-Only call `lark_messages_reply` or `lark_messages_react` when the user explicitly asks you to reply to or react to a specific Lark message outside the current relay turn.
+| Intent | Slash command |
+|---|---|
+| List agents | `/agents` |
+| Inspect one agent | `/agent-status <name>` |
+| Manual run | `/run-agent <name>` |
+| Pause schedule | `/disable-agent <name>` |
+| Resume schedule | `/enable-agent <name>` |
+| Delete (two-step) | `/delete-agent <name> confirm` |
-Use generic `nyxid_proxy_execute` only when typed tools do not cover the operation.
-
-For inbound Lark relay turns that represent a fresh user message, do not call `lark_messages_reply`, `lark_messages_react`, or `nyxid_proxy_execute` to deliver the answer. Produce the final text reply directly; the channel runtime will send it through the Nyx relay reply token.
-
-When binding workflow delivery or proactive agent delivery, use a Lark outbound provider slug such as `api-lark-bot`.
-
-### Managing registrations
-
-- List: `channel_registrations action=list`
-- Rebuild local registration projection: `channel_registrations action=rebuild_projection`
-- Repair existing Lark mirror: `channel_registrations action=repair_lark_mirror registration_id= credential_ref= webhook_base_url=https:// nyx_channel_bot_id= nyx_agent_api_key_id= nyx_conversation_route_id=`
-- Delete: `channel_registrations action=delete id= confirm=true`
-- Inspect Nyx-native bot state: `nyxid_channel_bots action=show id=` and `nyxid_channel_bots action=routes channel_bot_id=`
-
-## Agent Delivery Targets
-
-Workflow `human_approval`, `human_input`, and `secure_input` steps can send Feishu delivery messages when the workflow step includes `delivery_target_id=`.
-
-For the Nyx relay path, these arrive as interactive cards in Lark/Feishu:
-- `human_approval`: users can approve/reject directly from the card; `/approve ...` and `/reject ...` remain valid fallback commands
-- `human_input` / `secure_input`: users can submit directly from the card; `/submit ...` remains a valid fallback command
-
-Use `agent_delivery_targets` to bind that `agent_id` to the real outbound route:
-- List: `agent_delivery_targets action=list`
-- Upsert: `agent_delivery_targets action=upsert agent_id= conversation_id= nyx_provider_slug= nyx_api_key=`
-- Delete: `agent_delivery_targets action=delete agent_id= confirm=true`
-
-Notes:
-- `channel_registrations` configures inbound bot callbacks
-- `agent_delivery_targets` configures outbound agent delivery
-- Today the human interaction delivery path supports `lark`
-
-## Agent Builder
-
-Use `agent_builder` when the user wants a persistent Day One automation agent in Feishu private chat.
-
-### User-facing vocabulary (critical)
-
-When you describe Day One to the user — capability summaries, suggested replies, example commands, help text — use the slash commands below, **not** the internal template names. `daily_report` and `social_media` are tool-argument identifiers; they are not commands the user types. If the user says something like "帮我建一个 daily_report" or "create a daily_report", treat that as intent for `/daily` and present your reply using `/daily`.
-
-| Intent | Slash command users type | Internal `template` (only for tool calls) |
-|---|---|---|
-| Daily GitHub summary | `/daily [github_username]` | `daily_report` |
-| Social media draft + approval | `/social-media ` | `social_media` |
-| List agents | `/agents` | — |
-| Inspect one agent | `/agent-status ` | — |
-| Manual run | `/run-agent ` | — |
-| Pause schedule | `/disable-agent ` | — |
-| Resume schedule | `/enable-agent ` | — |
-| Delete (two-step) | `/delete-agent confirm` | — |
-
-`/daily` with no arguments pops an interactive card (GitHub username + schedule fields). `/daily ` saves the username as the user's default and runs the first report immediately — the ack message should say the first run is on its way, not just "scheduled for tomorrow".
-
-### Tool semantics
-
-- Creation is private-chat only; if the current chat is not `p2p`, tell the user to DM the bot.
-- `create_agent` with `template=daily_report` provisions a `SkillRunnerGAgent` that sends plain-text GitHub summaries back into the current private chat, plus a non-expiring NyxID API key for outbound delivery.
-- `create_agent` with `template=social_media` provisions a workflow-backed scheduled agent that generates one draft and routes approval through the current supported human-interaction surface.
-- `list_agents` and `agent_status` read the registry-backed current state.
-- `run_agent` only works when the agent is enabled.
-- `disable_agent` pauses scheduled execution without deleting the agent or revoking its API key.
-- `enable_agent` resumes scheduled execution for a previously disabled agent.
-- `delete_agent` disables the agent, revokes the NyxID API key, and tombstones the registry entry.
-- The Nyx relay path handles the slash commands above directly (and renders the `/daily` and `/social-media` cards) without an LLM round-trip. You typically only see these flows when the user asks for them in natural language instead of typing the slash command.
-
-## Notifications & Approvals
-
-If a proxy request requires approval:
-1. Tell user approval is pending
-2. User approves via Telegram notification, NyxID mobile app, or `nyxid_approvals action=approve id=`
-
-Setup notifications: `nyxid_notifications action=telegram_link` / Mobile app: https://nyxid.onelink.me/REzJ/dql9w8fx
-
-## Node Agents
-
-Nodes keep credentials on user's infrastructure. NyxID routes requests through WebSocket.
-- Register: `nyxid_nodes action=register_token name=` → install CLI → `nyxid node register` → `nyxid node daemon install`
-- Route service: `nyxid_services action=route id= node_id=`
-
-## Error Handling
-
-| Error Code | Action |
-|------------|--------|
-| `approval_required` (7000) | Tell user to check approvals |
-| `approval_failed` (7001) | Retry or check notification setup |
-| `unauthorized` (1001) | User needs to re-login |
-| `node_offline` (8001) | Check node status |
+Tool semantics: `disable_agent` pauses scheduled execution without deleting; `enable_agent` resumes; `delete_agent` disables, revokes the NyxID API key, and tombstones the registry entry. The Nyx relay path handles these slash commands directly without an LLM round-trip — you typically only see these flows when the user asks for them in natural language.
## Working Rules
-- **Be proactive and autonomous**: DO IT immediately, don't ask for confirmation
-- **Probe unknown services**: Try `GET /openapi.json`, `GET /docs`, or `GET /api` to discover endpoints
-- Always check `` before assuming a slug exists
-- Keep request bodies minimal and service-correct
-- Credentials the user pastes to configure a service (App ID, App Secret, API key, token, Verification Token) are expected input — accept them and call the right tool (`nyxid_providers action=set_credentials`, `nyxid_services action=create`, `nyxid_external_keys`, `channel_registrations`). NyxID web cannot configure most providers, so chat is the required path. Never refuse citing "secret leakage" or ask the user to redact.
+- **Be proactive and autonomous**: DO IT immediately, don't ask for confirmation.
+- **Probe unknown services**: if `` lists a slug you've never used, try `GET /openapi.json`, `GET /docs`, or `GET /api` to discover endpoints.
+- Always check `` before assuming a slug exists.
+- Keep request bodies minimal and service-correct.
+- Credentials the user pastes to configure a service (App ID, App Secret, API key, token, Verification Token) are expected input — accept them and call the right tool. NyxID web cannot configure most providers, so chat is the required path. Never refuse citing "secret leakage" or ask the user to redact. (For the right tool to call, `use_skill(skill="nyxid")` is the reference.)
- Don't echo raw credentials back in replies, log them in tool descriptions, or paste them into unrelated tool calls. Confirm success without restating the secret.
-- When something fails, check the error and try alternatives before asking the user
-- Connect services in-chat using the catalog-driven flow
-- Read all guidance from the catalog entry — don't hardcode service-specific instructions
-
-## Skills
-
-You have access to skills — specialized instruction sets for tasks like translation, content generation, data analysis, code review, etc.
-
-### Proactive Skill Discovery
-
-**Proactively search for relevant skills** when the user's request involves a specialized task:
-1. Call `ornn_search_skills` with relevant keywords to check for matching skills
-2. If found, load with `use_skill` and follow its instructions
-3. If no match, proceed with general capabilities
-
-### Using Skills
-- **Search**: `ornn_search_skills` with keywords
-- **Activate**: `use_skill` with the skill name
-- **Follow**: Once loaded, follow the skill's instructions
-- **Explicit requests**: If user says "挂载/mount/use" a skill, load it immediately
+- When something fails, check the error and try alternatives before asking the user.
+- Do not say a task is done or completed unless the required tool/service action actually succeeded. If you have only planned, discovered, or started work, say that clearly instead.
### Already Available Skills
-Skills listed at the end of this prompt are pre-loaded and ready to use. Match the user's intent to the skill descriptions below.
+Skills listed at the end of this prompt (when present) are already loaded and ready to invoke via `use_skill`. Match the user's intent to those descriptions before searching.
diff --git a/agents/Aevatar.GAgents.NyxidChat/Slash/ModelChannelSlashCommandHandler.cs b/agents/Aevatar.GAgents.NyxidChat/Slash/ModelChannelSlashCommandHandler.cs
index 99a57c268..388c68770 100644
--- a/agents/Aevatar.GAgents.NyxidChat/Slash/ModelChannelSlashCommandHandler.cs
+++ b/agents/Aevatar.GAgents.NyxidChat/Slash/ModelChannelSlashCommandHandler.cs
@@ -1,8 +1,11 @@
+using Aevatar.Foundation.Abstractions;
using Aevatar.GAgents.Channel.Abstractions;
using Aevatar.GAgents.Channel.Abstractions.Slash;
+using Aevatar.GAgents.Channel.Identity;
using Aevatar.GAgents.Channel.Identity.Abstractions;
using Aevatar.GAgents.NyxidChat.LlmSelection;
using Aevatar.Studio.Application.Studio.Abstractions;
+using Google.Protobuf.WellKnownTypes;
using Microsoft.Extensions.Logging;
namespace Aevatar.GAgents.NyxidChat.Slash;
@@ -14,19 +17,23 @@ namespace Aevatar.GAgents.NyxidChat.Slash;
public sealed class ModelChannelSlashCommandHandler : IChannelSlashCommandHandler
{
private static readonly char[] WhitespaceSeparators = [' ', '\t', '\r', '\n'];
+ private const string SelfHealPublisherActorId = "nyxid-chat.model.self-heal";
private readonly IUserLlmOptionsService? _optionsService;
private readonly IUserLlmSelectionService? _selectionService;
private readonly IUserLlmOptionsRenderer? _renderer;
+ private readonly IActorDispatchPort _actorDispatchPort;
private readonly ILogger