From 25b344dd3b232919e46d43e6c8134b93584f26a2 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sat, 23 May 2026 08:33:49 -0700 Subject: [PATCH 01/32] substrate wip Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 77 ++++ .../crd/bases/kagent.dev_agentharnesses.yaml | 141 +++++++ go/api/httpapi/types.go | 14 +- go/api/v1alpha2/agentharness_types.go | 99 +++++ go/api/v1alpha2/zz_generated.deepcopy.go | 93 +++++ .../controller/agentharness_controller.go | 117 +++++- .../handlers/agentharness_gateway.go | 365 ++++++++++++++++++ .../handlers/agentharness_gateway_rewrite.go | 235 +++++++++++ .../agentharness_gateway_rewrite_test.go | 165 ++++++++ .../handlers/agentharness_gateway_test.go | 132 +++++++ .../internal/httpserver/handlers/agents.go | 52 ++- .../internal/httpserver/handlers/handlers.go | 17 +- go/core/internal/httpserver/middleware.go | 17 +- go/core/internal/httpserver/server.go | 45 ++- go/core/pkg/app/app.go | 180 ++++++++- .../pkg/sandboxbackend/openshell/openclaw.go | 2 +- .../openshell/openclaw/bootstrap.go | 147 +++++-- .../openclaw/bootstrap_substrate_test.go | 21 + .../openshell/openclaw/bootstrap_test.go | 40 +- .../openshell/openclaw/constants.go | 7 +- .../openshell/openclaw/provider.go | 11 +- .../openshell/openclaw/types.go | 21 +- .../pkg/sandboxbackend/substrate/client.go | 114 ++++++ .../pkg/sandboxbackend/substrate/config.go | 22 ++ .../sandboxbackend/substrate/delete_actor.go | 127 ++++++ .../substrate/delete_actor_test.go | 18 + .../substrate/delete_provision.go | 109 ++++++ .../substrate/delete_provision_test.go | 61 +++ .../pkg/sandboxbackend/substrate/openclaw.go | 231 +++++++++++ .../sandboxbackend/substrate/openclaw_test.go | 53 +++ .../pkg/sandboxbackend/substrate/provision.go | 301 +++++++++++++++ .../substrate/provision_openclaw.go | 88 +++++ .../substrate/provision_openclaw_test.go | 153 ++++++++ .../substrate/provision_test.go | 47 +++ go/go.mod | 38 +- go/go.sum | 88 +++-- 
.../templates/kagent.dev_agentharnesses.yaml | 141 +++++++ .../templates/controller-deployment.yaml | 26 ++ helm/kagent/templates/rbac/getter-role.yaml | 19 + helm/kagent/values.yaml | 28 +- ui/next.config.ts | 15 + .../app/openshell/OpenshellTerminalPage.tsx | 42 +- ui/src/components/AgentCard.tsx | 37 +- ui/src/components/AgentListView.tsx | 43 ++- .../agent-form/OpenClawSandboxFields.tsx | 91 +++++ ui/src/lib/agentHarness.ts | 23 +- ui/src/lib/openClawSandboxForm.ts | 46 +++ ui/src/lib/openshellSandboxAgents.ts | 8 + ui/src/types/index.ts | 13 + 49 files changed, 3779 insertions(+), 201 deletions(-) create mode 100644 examples/substrate-openclaw/README.md create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_test.go create mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/client.go create mode 100644 go/core/pkg/sandboxbackend/substrate/config.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_actor.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_actor_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_provision.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_provision_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/openclaw.go create mode 100644 go/core/pkg/sandboxbackend/substrate/openclaw_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_openclaw.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_test.go diff --git 
a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md new file mode 100644 index 0000000000..1b27550895 --- /dev/null +++ b/examples/substrate-openclaw/README.md @@ -0,0 +1,77 @@ +# OpenClaw on Agent Substrate + +## 1. Install Substrate on your Kind cluster + +Uses cluster `kind` (`KIND_CLUSTER_NAME=kind`; or set `KUBECONFIG` / context accordingly). + +```bash +cd substrate + +./hack/create-kind-cluster.sh +./hack/install-ate-kind.sh --deploy-ate-system +``` + +`--deploy-ate-system` installs the **control plane only** (ate-api, ate-controller, atelet, atenet, …). Your registry catalog will show `ateapi-*`, `atelet-*`, etc., but **not** ateom until you build it. + +Build and push **ateom-gvisor** (required for kagent `workerPool.ateomImage`): + +```bash +# build the ateom-gvisor image from the substrate folder +export KO_DOCKER_REPO=localhost:5001 +export KO_DEFAULTPLATFORMS=linux/$(go env GOARCH) +./hack/ko.sh build -B ./cmd/servers/ateom-gvisor +``` + +## 2. Load nemoclaw image + +The image is a multi-arch manifest list. On Apple Silicon, `kind load docker-image` often fails with `content digest ... not found` because Docker stores only the host-architecture image locally while kind imports with `--all-platforms`. Use `docker save` + `ctr import` instead (replace `kind-control-plane` with your cluster's node container, e.g. `agent-control-plane` for context `kind-agent`): + +```bash +docker pull --platform linux/arm64 ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 +docker save ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 | \ + docker exec -i kind-control-plane ctr --namespace=k8s.io images import - +``` + +On amd64 hosts, use `--platform linux/amd64` in the pull step. + +## 3. kagent AgentHarness with substrate runtime + +kagent **auto-provisions** a per-harness `ActorTemplate` (and optionally a `WorkerPool`). 
+ +Install kagent (Substrate must already be running in the cluster): + +```bash +export KIND_CLUSTER_NAME=kind +make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true" +``` + +Create a harness, setting only the fields you must choose: + +- **`snapshotsConfig.location`** — GCS `gs://` prefix (Substrate snapshots are GCS-only today) +- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool` + **`ateomImage`**) +- **`workerPool.ateomImage`** — the ateom-gvisor image built in step 1 (e.g. `localhost:5001/ateom-gvisor:latest`) + +```yaml +apiVersion: kagent.dev/v1alpha2 +kind: AgentHarness +metadata: + name: peterj-claw + namespace: kagent +spec: + runtime: substrate + backend: openclaw + description: OpenClaw on Agent Substrate + modelConfigRef: default-model-config + substrate: + snapshotsConfig: + location: gs://ate-snapshots/kagent/kagent/my-claw/ + workerPool: + replicas: 1 + ateomImage: localhost:5001/ateom-gvisor:latest + # Optional: adopt existing resources instead of auto-create + # workerPoolRef: + # name: my-pool + # namespace: ate-system +``` + +Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as the OpenClaw gateway token. diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 308d7ba0f2..f82ff2a1d4 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -19,6 +19,9 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string - jsonPath: .spec.backend name: Backend type: string @@ -511,6 +514,106 @@ spec: type: string type: array type: object + runtime: + default: openshell + description: Runtime selects the harness provisioning stack. Defaults + to openshell when unset. 
+ enum: + - openshell + - substrate + type: string + substrate: + description: Substrate is required when runtime is substrate. + properties: + actorTemplateRef: + description: |- + ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + gatewayPort: + default: 80 + description: GatewayPort is the port OpenClaw listens on inside + the actor (Substrate routes to :80 today). + format: int32 + type: integer + gatewayTokenSecretRef: + description: |- + GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + When unset, the controller falls back to --substrate-gateway-token(-file). + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + snapshotsConfig: + description: SnapshotsConfig is required for auto-provisioned + templates (GCS gs:// location). + properties: + location: + description: |- + Location is the GCS URI prefix for golden and incremental snapshots. + Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + type: string + required: + - location + type: object + workerPool: + description: WorkerPool creates a dedicated WorkerPool in the + harness namespace when workerPoolRef is unset. + properties: + ateomImage: + description: |- + AteomImage is the ateom herder image (pullable registry ref, not ko://). + Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + type: string + replicas: + default: 2 + description: Replicas is the number of ateom worker pods. + Defaults to 2 when unset or zero. 
+ format: int32 + type: integer + type: object + workerPoolRef: + description: |- + WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + Mutually exclusive with workerPool. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workloadImage: + description: WorkloadImage overrides the default nemoclaw/openclaw + sandbox image in the ActorTemplate. + type: string + required: + - snapshotsConfig + type: object required: - backend type: object @@ -612,6 +715,44 @@ spec: observedGeneration: format: int64 type: integer + substrate: + description: Substrate records auto-provisioned Substrate CR references. + properties: + actorTemplateReady: + description: ActorTemplateReady is true when the template phase + is Ready (golden snapshot taken). + type: boolean + actorTemplateRef: + description: ActorTemplateRef is the ActorTemplate used when creating + the actor. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workerPoolRef: + description: WorkerPoolRef is the WorkerPool used by the harness + ActorTemplate. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + type: object type: object type: object served: true diff --git a/go/api/httpapi/types.go b/go/api/httpapi/types.go index ec80d49ea1..0107e5ffe0 100644 --- a/go/api/httpapi/types.go +++ b/go/api/httpapi/types.go @@ -144,6 +144,17 @@ type OpenshellAgentHarnessListEntry struct { Endpoint string `json:"endpoint,omitempty"` } +// SubstrateAgentHarnessListEntry is set when runtime is substrate. 
+type SubstrateAgentHarnessListEntry struct { + Backend v1alpha2.AgentHarnessBackendType `json:"backend"` + Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` + ActorID string `json:"actorId,omitempty"` + GatewayUIPath string `json:"gatewayUIPath,omitempty"` + ModelConfigRef string `json:"modelConfigRef,omitempty"` + BackendRefID string `json:"backendRefId,omitempty"` + Endpoint string `json:"endpoint,omitempty"` +} + type AgentResponse struct { ID string `json:"id"` Agent *AgentResource `json:"agent"` @@ -156,7 +167,8 @@ type AgentResponse struct { DeploymentReady bool `json:"deploymentReady"` Accepted bool `json:"accepted"` WorkloadMode v1alpha2.WorkloadMode `json:"workloadMode,omitempty"` - OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` } // Session types diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index c6a43f6c02..26c0109069 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -37,6 +37,76 @@ func IsKnownAgentHarnessBackend(b AgentHarnessBackendType) bool { } } +// AgentHarnessRuntime selects which control plane provisions the harness VM. +// +kubebuilder:validation:Enum=openshell;substrate +type AgentHarnessRuntime string + +const ( + AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" + AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" +) + +// AgentHarnessSubstrateSnapshotsConfig points at a GCS prefix for actor memory snapshots. +// Substrate currently expects a gs:// location (see Agent Substrate SnapshotsConfig). +type AgentHarnessSubstrateSnapshotsConfig struct { + // Location is the GCS URI prefix for golden and incremental snapshots. 
+ // Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + // +required + Location string `json:"location"` +} + +// AgentHarnessSubstrateWorkerPoolSpec creates a dedicated WorkerPool for this harness. +// Mutually exclusive with workerPoolRef. +type AgentHarnessSubstrateWorkerPoolSpec struct { + // Replicas is the number of ateom worker pods. Defaults to 2 when unset or zero. + // +optional + // +kubebuilder:default=2 + Replicas int32 `json:"replicas,omitempty"` + + // AteomImage is the ateom herder image (pullable registry ref, not ko://). + // Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + // +optional + AteomImage string `json:"ateomImage,omitempty"` +} + +// AgentHarnessSubstrateSpec configures Agent Substrate (WorkerPool + ActorTemplate + Actor). +// +// By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). +// Set actorTemplateRef only to adopt an existing template (advanced / legacy). +type AgentHarnessSubstrateSpec struct { + // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + // Mutually exclusive with workerPool. + // +optional + WorkerPoolRef *TypedReference `json:"workerPoolRef,omitempty"` + + // WorkerPool creates a dedicated WorkerPool in the harness namespace when workerPoolRef is unset. + // +optional + WorkerPool *AgentHarnessSubstrateWorkerPoolSpec `json:"workerPool,omitempty"` + + // SnapshotsConfig is required for auto-provisioned templates (GCS gs:// location). + // +required + SnapshotsConfig AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig"` + + // WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. + // +optional + WorkloadImage string `json:"workloadImage,omitempty"` + + // ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + // When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. 
+ // +optional + ActorTemplateRef *TypedReference `json:"actorTemplateRef,omitempty"` + + // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). + // +optional + // +kubebuilder:default=80 + GatewayPort int32 `json:"gatewayPort,omitempty"` + + // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + // When unset, the controller falls back to --substrate-gateway-token(-file). + // +optional + GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` +} + // AgentHarnessChannelType selects a messenger integration for OpenClaw harness VMs. // +kubebuilder:validation:Enum=telegram;slack type AgentHarnessChannelType string @@ -158,6 +228,15 @@ type AgentHarnessSpec struct { // +required Backend AgentHarnessBackendType `json:"backend"` + // Runtime selects the harness provisioning stack. Defaults to openshell when unset. + // +optional + // +kubebuilder:default=openshell + Runtime AgentHarnessRuntime `json:"runtime,omitempty"` + + // Substrate is required when runtime is substrate. + // +optional + Substrate *AgentHarnessSubstrateSpec `json:"substrate,omitempty"` + // Description is a short human-readable summary shown in the UI (e.g. agents list). // +optional Description string `json:"description,omitempty"` @@ -230,6 +309,25 @@ type AgentHarnessStatus struct { // Connection is populated by the controller when the harness is ready. // +optional Connection *AgentHarnessConnection `json:"connection,omitempty"` + + // Substrate records auto-provisioned Substrate CR references. + // +optional + Substrate *AgentHarnessSubstrateStatus `json:"substrate,omitempty"` +} + +// AgentHarnessSubstrateStatus is observed Substrate control-plane state for this harness. +type AgentHarnessSubstrateStatus struct { + // WorkerPoolRef is the WorkerPool used by the harness ActorTemplate. 
+ // +optional + WorkerPoolRef TypedReference `json:"workerPoolRef,omitempty"` + + // ActorTemplateRef is the ActorTemplate used when creating the actor. + // +optional + ActorTemplateRef TypedReference `json:"actorTemplateRef,omitempty"` + + // ActorTemplateReady is true when the template phase is Ready (golden snapshot taken). + // +optional + ActorTemplateReady bool `json:"actorTemplateReady,omitempty"` } // AgentHarnessConditionType enumerates the condition types an AgentHarness may report. @@ -241,6 +339,7 @@ const ( // +kubebuilder:object:root=true // +kubebuilder:resource:path=agentharnesses,singular=agentharness,shortName=ahr,categories=kagent // +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Runtime",type="string",JSONPath=".spec.runtime" // +kubebuilder:printcolumn:name="Backend",type="string",JSONPath=".spec.backend" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" // +kubebuilder:printcolumn:name="ID",type="string",JSONPath=".status.backendRef.id" diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 52d10ed714..dd1b350ccb 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -295,6 +295,11 @@ func (in *AgentHarnessSlackChannelSpec) DeepCopy() *AgentHarnessSlackChannelSpec // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AgentHarnessSpec) DeepCopyInto(out *AgentHarnessSpec) { *out = *in + if in.Substrate != nil { + in, out := &in.Substrate, &out.Substrate + *out = new(AgentHarnessSubstrateSpec) + (*in).DeepCopyInto(*out) + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) @@ -346,6 +351,11 @@ func (in *AgentHarnessStatus) DeepCopyInto(out *AgentHarnessStatus) { *out = new(AgentHarnessConnection) **out = **in } + if in.Substrate != nil { + in, out := &in.Substrate, &out.Substrate + *out = new(AgentHarnessSubstrateStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessStatus. @@ -373,6 +383,89 @@ func (in *AgentHarnessStatusRef) DeepCopy() *AgentHarnessStatusRef { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentHarnessSubstrateSnapshotsConfig) DeepCopyInto(out *AgentHarnessSubstrateSnapshotsConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateSnapshotsConfig. +func (in *AgentHarnessSubstrateSnapshotsConfig) DeepCopy() *AgentHarnessSubstrateSnapshotsConfig { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateSnapshotsConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec) { + *out = *in + if in.WorkerPoolRef != nil { + in, out := &in.WorkerPoolRef, &out.WorkerPoolRef + *out = new(TypedReference) + **out = **in + } + if in.WorkerPool != nil { + in, out := &in.WorkerPool, &out.WorkerPool + *out = new(AgentHarnessSubstrateWorkerPoolSpec) + **out = **in + } + out.SnapshotsConfig = in.SnapshotsConfig + if in.ActorTemplateRef != nil { + in, out := &in.ActorTemplateRef, &out.ActorTemplateRef + *out = new(TypedReference) + **out = **in + } + if in.GatewayTokenSecretRef != nil { + in, out := &in.GatewayTokenSecretRef, &out.GatewayTokenSecretRef + *out = new(TypedReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateSpec. +func (in *AgentHarnessSubstrateSpec) DeepCopy() *AgentHarnessSubstrateSpec { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentHarnessSubstrateStatus) DeepCopyInto(out *AgentHarnessSubstrateStatus) { + *out = *in + out.WorkerPoolRef = in.WorkerPoolRef + out.ActorTemplateRef = in.ActorTemplateRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateStatus. +func (in *AgentHarnessSubstrateStatus) DeepCopy() *AgentHarnessSubstrateStatus { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopyInto(out *AgentHarnessSubstrateWorkerPoolSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateWorkerPoolSpec. +func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopy() *AgentHarnessSubstrateWorkerPoolSpec { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateWorkerPoolSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AgentHarnessTelegramChannelSpec) DeepCopyInto(out *AgentHarnessTelegramChannelSpec) { *out = *in diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index a73d7a7835..94b857d9ef 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -28,6 +28,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) const ( @@ -49,14 +50,38 @@ const ( // harness VMs are a generic exec/SSH-able environment with no in-cluster // workload owned by kagent. 
type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - Backends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + Client client.Client + Recorder events.EventRecorder + OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateProvisioner *substrate.Provisioner +} + +func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + switch runtime { + case v1alpha2.AgentHarnessRuntimeSubstrate: + if r.SubstrateBackends == nil { + return nil + } + return r.SubstrateBackends[ah.Spec.Backend] + default: + if r.OpenshellBackends == nil { + return nil + } + return r.OpenshellBackends[ah.Spec.Backend] + } } // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/finalizers,verbs=update +// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) @@ -80,11 +105,15 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{Requeue: true}, nil } - backend := r.Backends[ah.Spec.Backend] + backend := r.backendFor(&ah) if backend == nil { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = 
v1alpha2.AgentHarnessRuntimeOpenshell + } setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, "BackendUnavailable", - fmt.Sprintf("no backend configured for %q", ah.Spec.Backend)) + fmt.Sprintf("no %s backend configured for %q", runtime, ah.Spec.Backend)) setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "BackendUnavailable", "") if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { @@ -93,6 +122,59 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, nil } + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + provRes, err := r.SubstrateProvisioner.Ensure(ctx, &ah) + if err != nil { + log.Error(err, "substrate provision failed") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "SubstrateProvisionFailed", err.Error()) + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "SubstrateProvisionFailed", "") + if perr := r.patchAgentHarnessStatus(ctx, &ah); perr != nil { + return ctrl.Result{}, perr + } + return ctrl.Result{}, err + } + if ah.Status.Substrate == nil { + ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + } + if provRes.WorkerPoolRef.Name != "" { + ah.Status.Substrate.WorkerPoolRef = v1alpha2.TypedReference{ + Name: provRes.WorkerPoolRef.Name, + Namespace: provRes.WorkerPoolRef.Namespace, + } + } + ah.Status.Substrate.ActorTemplateRef = v1alpha2.TypedReference{ + Name: provRes.ActorTemplateRef.Name, + Namespace: provRes.ActorTemplateRef.Namespace, + } + ah.Status.Substrate.ActorTemplateReady = provRes.ActorTemplateReady + // Persist status before metadata annotation patch (client Patch can refresh ah and drop in-memory status). 
+ if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + if err := r.patchAgentHarnessProvisionAnnotations(ctx, &ah, provRes); err != nil { + return ctrl.Result{}, err + } + if !provRes.ActorTemplateReady { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, + "SubstrateProvisioning", "waiting for ActorTemplate golden snapshot") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "ActorTemplateNotReady", "ActorTemplate is not Ready yet") + if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { + return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate provision: %w", err) + } + } + res, err := backend.EnsureAgentHarness(ctx, &ah) if err != nil { log.Error(err, "EnsureAgentHarness failed") @@ -192,7 +274,7 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } if ah.Status.BackendRef != nil && ah.Status.BackendRef.ID != "" { - del := r.Backends[ah.Status.BackendRef.Backend] + del := r.backendFor(ah) if del != nil { if err := del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: ah.Status.BackendRef.ID}); err != nil { if r.Recorder != nil { @@ -203,6 +285,12 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } } + if r.SubstrateProvisioner != nil { + if err := r.SubstrateProvisioner.Delete(ctx, ah); err != nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("delete substrate resources: %w", err) + } + } + controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) if err := r.Client.Update(ctx, ah); err != nil { return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) @@ -217,6 +305,23 @@ func (r *AgentHarnessController) patchAgentHarnessStatus(ctx 
context.Context, ah return nil } +func (r *AgentHarnessController) patchAgentHarnessProvisionAnnotations(ctx context.Context, ah *v1alpha2.AgentHarness, prov substrate.EnsureResult) error { + base := ah.DeepCopy() + if ah.Annotations == nil { + ah.Annotations = map[string]string{} + } + if prov.ManagedWorkerPool { + ah.Annotations[substrate.AnnotationManagedWorkerPool] = "true" + } + if prov.ManagedActorTemplate { + ah.Annotations[substrate.AnnotationManagedActorTemplate] = "true" + } + if err := r.Client.Patch(ctx, ah, client.MergeFrom(base)); err != nil { + return fmt.Errorf("patch AgentHarness substrate annotations: %w", err) + } + return nil +} + func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { now := metav1.Now() for i := range ah.Status.Conditions { diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go new file mode 100644 index 0000000000..453ec5d907 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -0,0 +1,365 @@ +package handlers + +import ( + "bytes" + "compress/gzip" + "context" + "fmt" + "io" + "net" + "net/http" + "net/http/httputil" + "net/url" + "os" + "strings" + "time" + + "github.com/gorilla/mux" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + substrateGatewayTokenSecretKey = "token" + // OpenClaw 2026.3.28+ returns 403 without operator scopes on HTTP/WS when only Bearer token is sent. + openclawDefaultOperatorScopes = "operator.admin" + // Origin OpenClaw accepts by default for bind=lan port=80 (localhost/127.0.0.1 on gateway port). 
+ openclawLoopbackOrigin = "http://127.0.0.1:80" +) + +// AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. +// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). +type AgentHarnessGatewayConfig struct { + GatewayToken string + GatewayTokenFile string + AteAPIEndpoint string + AteAPIInsecure bool + DialTimeout time.Duration + CallTimeout time.Duration +} + +func (c *AgentHarnessGatewayConfig) resolveToken() (string, error) { + if c == nil { + return "", nil + } + if c.GatewayTokenFile != "" { + data, err := os.ReadFile(c.GatewayTokenFile) + if err != nil { + return "", fmt.Errorf("read substrate gateway token file: %w", err) + } + return strings.TrimSpace(string(data)), nil + } + return strings.TrimSpace(c.GatewayToken), nil +} + +// HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway (pod IP when available). +func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Request) { + log := ctrllog.FromContext(r.Context()).WithName("agentharness-gateway") + if h.AgentHarnessGateway == nil { + http.Error(w, "substrate gateway proxy is not configured", http.StatusServiceUnavailable) + return + } + + vars := mux.Vars(r) + namespace := strings.TrimSpace(vars["namespace"]) + name := strings.TrimSpace(vars["name"]) + if namespace == "" || name == "" { + http.Error(w, "namespace and name are required", http.StatusBadRequest) + return + } + + var ah v1alpha2.AgentHarness + if err := h.KubeClient.Get(r.Context(), types.NamespacedName{Namespace: namespace, Name: name}, &ah); err != nil { + if apierrors.IsNotFound(err) { + http.Error(w, "AgentHarness not found", http.StatusNotFound) + return + } + log.Error(err, "get AgentHarness") + http.Error(w, "failed to load AgentHarness", http.StatusInternalServerError) + return + } + + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime != 
v1alpha2.AgentHarnessRuntimeSubstrate { + http.Error(w, "gateway proxy is only available for runtime=substrate", http.StatusBadRequest) + return + } + if ah.Status.BackendRef == nil || ah.Status.BackendRef.ID == "" { + http.Error(w, "harness has no substrate actor yet", http.StatusServiceUnavailable) + return + } + + token, err := h.resolveHarnessGatewayToken(r.Context(), &ah) + if err != nil { + log.Error(err, "resolve gateway token") + http.Error(w, "gateway token not configured", http.StatusInternalServerError) + return + } + + target, upstreamHost, err := h.resolveSubstrateGatewayTarget(r.Context(), &ah) + if err != nil { + log.Info("resolve substrate gateway target failed", "error", err) + http.Error(w, err.Error(), http.StatusServiceUnavailable) + return + } + + publicPrefix := agentHarnessGatewayPublicPrefix(namespace, name) + + _, redirectTo, ok := resolveGatewayUpstreamPath(r.URL.Path, namespace, name, isWebSocketUpgrade(r)) + if !ok { + http.NotFound(w, r) + return + } + // Browsers do not complete WebSocket handshakes through 30x redirects. + if redirectTo != "" && !isWebSocketUpgrade(r) { + dest := redirectTo + if r.URL.RawQuery != "" { + dest += "?" 
+ r.URL.RawQuery + } + http.Redirect(w, r, dest, http.StatusPermanentRedirect) + return + } + + proxy := newAgentHarnessGatewayProxy(target, upstreamHost, token, publicPrefix, namespace, name, log) + proxy.ServeHTTP(w, r) +} + +func (h *Handlers) resolveSubstrateGatewayTarget(ctx context.Context, ah *v1alpha2.AgentHarness) (*url.URL, string, error) { + cfg := h.AgentHarnessGateway + if cfg == nil { + return nil, "", fmt.Errorf("substrate gateway is not configured") + } + if cfg.AteAPIEndpoint == "" { + return nil, "", fmt.Errorf("substrate ate-api is not configured on the controller") + } + + ateClient, err := substrate.Dial(ctx, substrate.Config{ + AteAPIEndpoint: cfg.AteAPIEndpoint, + Insecure: cfg.AteAPIInsecure, + DialTimeout: cfg.DialTimeout, + CallTimeout: cfg.CallTimeout, + }) + if err != nil { + return nil, "", fmt.Errorf("dial ate-api: %w", err) + } + defer ateClient.Close() + + actorID := ah.Status.BackendRef.ID + actor, err := ateClient.GetActor(ctx, actorID) + if err != nil { + return nil, "", fmt.Errorf("get substrate actor %q: %w", actorID, err) + } + podIP := strings.TrimSpace(actor.GetAteomPodIp()) + if podIP == "" { + return nil, "", fmt.Errorf("substrate actor %q has no pod IP (status %s; resume the actor and wait until running)", actorID, actor.GetStatus()) + } + target, host, err := substrateGatewayPodTarget(podIP) + if err != nil { + return nil, "", fmt.Errorf("substrate actor %q pod IP %q: %w", actorID, podIP, err) + } + ctrllog.FromContext(ctx).WithName("agentharness-gateway").Info( + "proxying via actor pod IP", + "actor", actorID, + "podIP", host, + ) + return target, host, nil +} + +func substrateGatewayPodTarget(podIP string) (*url.URL, string, error) { + ip := strings.TrimSpace(podIP) + if ip == "" || net.ParseIP(ip) == nil { + return nil, "", fmt.Errorf("invalid actor pod IP %q", podIP) + } + target, err := url.Parse("http://" + net.JoinHostPort(ip, "80")) + if err != nil { + return nil, "", fmt.Errorf("parse actor pod target: %w", err) 
+ } + return target, ip, nil +} + +func agentHarnessHarnessBase(namespace, name string) string { + return "/api/agentharnesses/" + namespace + "/" + name +} + +func agentHarnessGatewayPublicPrefix(namespace, name string) string { + return agentHarnessHarnessBase(namespace, name) + "/gateway/" +} + +// resolveGatewayUpstreamPath maps the public URL to the upstream path on the actor. +// redirectTo is set when the browser should use a trailing slash under /gateway/. +// HTTP and WebSocket upgrades to the gateway entry both proxy to upstream / (OpenClaw gateway UI). +func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade bool) (upstreamPath, redirectTo string, ok bool) { + base := agentHarnessHarnessBase(namespace, name) + if !strings.HasPrefix(requestPath, base) { + return "", "", false + } + rel := strings.TrimPrefix(requestPath, base) + if rel == "" { + return "", agentHarnessGatewayPublicPrefix(namespace, name), true + } + + switch { + case rel == "/gateway": + _ = wsUpgrade + return "/", agentHarnessGatewayPublicPrefix(namespace, name), true + case strings.HasPrefix(rel, "/gateway/"): + sub := strings.TrimPrefix(rel, "/gateway") + if sub == "" { + sub = "/" + } + return sub, "", true + case isHarnessStaticAssetPath(rel): + return rel, "", true + default: + return "", "", false + } +} + +func isHarnessStaticAssetPath(rel string) bool { + if strings.HasPrefix(rel, "/assets/") { + return true + } + switch rel { + case "/manifest.webmanifest", "/vite.svg", "/favicon.ico": + return true + } + return strings.HasPrefix(rel, "/favicon") +} + +// normalizeOpenClawBrowserOrigin rewrites Origin/Referer so OpenClaw accepts WS/API from kagent-ui +// (e.g. http://localhost:8001) while the gateway listens on the actor pod :80. 
+func normalizeOpenClawBrowserOrigin(req *http.Request) { + if req == nil { + return + } + if req.Header.Get("Origin") != "" { + req.Header.Set("Origin", openclawLoopbackOrigin) + } + if req.Header.Get("Referer") != "" { + req.Header.Set("Referer", openclawLoopbackOrigin+"/") + } +} + +func isWebSocketUpgrade(r *http.Request) bool { + if r == nil { + return false + } + return strings.EqualFold(r.Header.Get("Upgrade"), "websocket") && + strings.Contains(strings.ToLower(r.Header.Get("Connection")), "upgrade") +} + +func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, log interface { + Error(error, string, ...any) +}) *httputil.ReverseProxy { + proxy := httputil.NewSingleHostReverseProxy(target) + proxy.FlushInterval = -1 + proxy.Transport = &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: 0, + IdleConnTimeout: 90 * time.Second, + } + origDirector := proxy.Director + proxy.Director = func(req *http.Request) { + origDirector(req) + req.Host = upstreamHost + req.Header.Set("Host", upstreamHost) + if token != "" { + req.Header.Set("Authorization", "Bearer "+token) + } + req.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) + normalizeOpenClawBrowserOrigin(req) + subPath, _, pathOK := resolveGatewayUpstreamPath(req.URL.Path, namespace, name, isWebSocketUpgrade(req)) + if !pathOK { + subPath = "/" + } + if subPath == "" { + subPath = "/" + } else if !strings.HasPrefix(subPath, "/") { + subPath = "/" + subPath + } + req.URL.Path = subPath + req.URL.RawPath = subPath + } + proxy.ModifyResponse = func(resp *http.Response) error { + // Do not read or rewrite WebSocket upgrade responses (would break 101 handshakes). 
+ if resp.StatusCode == http.StatusSwitchingProtocols { + return nil + } + + resp.Header.Del("Content-Security-Policy") + resp.Header.Del("Content-Security-Policy-Report-Only") + + if loc := resp.Header.Get("Location"); loc != "" { + if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicPrefix) { + resp.Header.Set("Location", strings.TrimSuffix(publicPrefix, "/")+loc) + } + } + + ct := resp.Header.Get("Content-Type") + if !shouldRewriteGatewayBody(ct) { + return nil + } + body, err := readGatewayResponseBody(resp) + if err != nil { + return err + } + rewritten := rewriteGatewayBody(body, ct, publicPrefix) + if strings.Contains(strings.ToLower(ct), "text/html") { + rewritten = injectGatewayClientShim(rewritten, token) + } + resp.Header.Del("Content-Encoding") + resp.Header.Del("Content-Length") + resp.ContentLength = int64(len(rewritten)) + resp.Body = io.NopCloser(bytes.NewReader(rewritten)) + return nil + } + proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, proxyErr error) { + log.Error(proxyErr, "gateway proxy error", "host", upstreamHost) + http.Error(rw, "gateway proxy error", http.StatusBadGateway) + } + return proxy +} + +func readGatewayResponseBody(resp *http.Response) ([]byte, error) { + var reader io.Reader = resp.Body + if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") { + gz, err := gzip.NewReader(resp.Body) + if err != nil { + return nil, err + } + defer gz.Close() + reader = gz + } + defer resp.Body.Close() + return io.ReadAll(reader) +} + +func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { + if ah.Spec.Substrate != nil && ah.Spec.Substrate.GatewayTokenSecretRef != nil { + ref := ah.Spec.Substrate.GatewayTokenSecretRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + var secret corev1.Secret + if err := h.KubeClient.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get 
gateway token secret %s/%s: %w", ns, ref.Name, err) + } + if secret.Data == nil { + return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + } + val, ok := secret.Data[substrateGatewayTokenSecretKey] + if !ok { + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, substrateGatewayTokenSecretKey) + } + return strings.TrimSpace(string(val)), nil + } + return h.AgentHarnessGateway.resolveToken() +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go new file mode 100644 index 0000000000..13818acb39 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go @@ -0,0 +1,235 @@ +package handlers + +import ( + "bytes" + "encoding/json" + "fmt" + "strings" +) + +// shouldRewriteGatewayQuotedPath returns true for root-absolute app paths we proxy, +// not for short tokens like "/g" (RegExp flags) or other non-asset paths. +func shouldRewriteGatewayQuotedPath(path string) bool { + if path == "" || !strings.HasPrefix(path, "/") || strings.HasPrefix(path, "//") { + return false + } + switch { + case strings.HasPrefix(path, "/assets"): + return true + case strings.HasPrefix(path, "/manifest"): + return true + case strings.HasPrefix(path, "/favicon"): + return true + case path == "/vite.svg": + return true + default: + return false + } +} + +// rewriteGatewayRootPaths prefixes root-absolute URLs in HTML/JS/CSS so assets load under +// /api/agentharnesses/{ns}/{name}/gateway/ (OpenClaw CSP blocks ; base-uri 'none'). 
+func rewriteGatewayRootPaths(body []byte, prefix string) []byte { + if len(body) == 0 || prefix == "" { + return body + } + if !strings.HasPrefix(prefix, "/") { + prefix = "/" + prefix + } + if !strings.HasSuffix(prefix, "/") { + prefix += "/" + } + + var out bytes.Buffer + out.Grow(len(body) + len(prefix)*4) + s := string(body) + for i := 0; i < len(s); i++ { + c := s[i] + if (c == '"' || c == '\'') && i+1 < len(s) && s[i+1] == '/' { + if i+2 < len(s) && s[i+2] == '/' { + out.WriteByte(c) + continue + } + quote := c + j := i + 1 + for j < len(s) && s[j] != quote { + j++ + } + path := s[i+1 : j] + out.WriteByte(quote) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + if j < len(s) { + out.WriteByte(quote) + } + i = j + continue + } + if i+4 < len(s) && strings.EqualFold(s[i:i+4], "url(") { + j := i + 4 + for j < len(s) && (s[j] == ' ' || s[j] == '\t') { + j++ + } + if j < len(s) && (s[j] == '"' || s[j] == '\'') { + quote := s[j] + if j+1 < len(s) && s[j+1] == '/' && !(j+2 < len(s) && s[j+2] == '/') { + k := j + 1 + for k < len(s) && s[k] != quote { + k++ + } + path := s[j+1 : k] + out.WriteString(s[i : j+1]) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + if k < len(s) { + out.WriteByte(quote) + } + i = k + continue + } + } else if j < len(s) && s[j] == '/' && !(j+1 < len(s) && s[j+1] == '/') { + k := j + 1 + for k < len(s) && s[k] != ')' && s[k] != ' ' && s[k] != '\t' && s[k] != '"' && s[k] != '\'' { + k++ + } + path := s[j:k] + out.WriteString(s[i:j]) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + i = k - 1 + continue + } + } + out.WriteByte(c) + } + return out.Bytes() +} + +func stripGatewayBaseTag(body []byte) []byte { + lower := 
bytes.ToLower(body) + for { + idx := bytes.Index(lower, []byte("")) + if end < 0 { + break + } + endIdx := idx + end + 1 + body = append(append(body[:idx], body[endIdx:]...)) + lower = bytes.ToLower(body) + } + return body +} + +func stripGatewayCSP(body []byte) []byte { + lower := bytes.ToLower(body) + for _, tag := range []string{ + `")) + if end < 0 { + break + } + endIdx := idx + end + 1 + body = append(append(body[:idx], body[endIdx:]...)) + lower = bytes.ToLower(body) + } + } + return body +} + +func rewriteGatewayBody(body []byte, contentType, prefix string) []byte { + body = stripGatewayCSP(body) + ct := strings.ToLower(contentType) + if strings.Contains(ct, "text/html") { + body = stripGatewayBaseTag(body) + } + if shouldRewriteGatewayBody(contentType) { + body = rewriteGatewayRootPaths(body, prefix) + return rewriteGatewayWebSocketPaths(body, prefix) + } + return body +} + +// injectGatewayClientShim patches WebSocket URLs (trailing slash + ?token= for OpenClaw Control UI). +func injectGatewayClientShim(body []byte, gatewayToken string) []byte { + tokenJSON, _ := json.Marshal(gatewayToken) + shim := fmt.Sprintf(``, tokenJSON) + lower := bytes.ToLower(body) + for _, tag := range []string{"", ""} { + if idx := bytes.Index(lower, []byte(strings.ToLower(tag))); idx >= 0 { + out := make([]byte, 0, len(body)+len(shim)) + out = append(out, body[:idx]...) + out = append(out, shim...) + out = append(out, body[idx:]...) + return out + } + } + return append(bytes.Clone(body), shim...) +} + +// rewriteGatewayWebSocketPaths ensures bundled/runtime WS URLs use .../gateway/ (trailing slash). +// Only rewrites occurrences not already followed by '/' (avoids breaking .../gateway/assets/...). 
+func rewriteGatewayWebSocketPaths(body []byte, prefix string) []byte { + gatewayWithSlash := strings.TrimSuffix(prefix, "/") + "/" + gatewayNoSlash := strings.TrimSuffix(gatewayWithSlash, "/") + if gatewayNoSlash == "" || gatewayNoSlash == gatewayWithSlash { + return body + } + needle := []byte(gatewayNoSlash) + var out bytes.Buffer + out.Grow(len(body) + 16) + for i := 0; i < len(body); { + idx := bytes.Index(body[i:], needle) + if idx < 0 { + out.Write(body[i:]) + break + } + idx += i + out.Write(body[i:idx]) + end := idx + len(needle) + if end < len(body) && body[end] == '/' { + out.Write(needle) + } else { + out.Write([]byte(gatewayWithSlash)) + } + i = end + } + return out.Bytes() +} + +func shouldRewriteGatewayBody(contentType string) bool { + ct := strings.ToLower(contentType) + return strings.Contains(ct, "text/html") || + strings.Contains(ct, "javascript") || + strings.Contains(ct, "text/css") || + strings.Contains(ct, "application/json") +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go new file mode 100644 index 0000000000..eaab469051 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go @@ -0,0 +1,165 @@ +package handlers + +import ( + "net/http" + "strings" + "testing" +) + +func TestResolveGatewayUpstreamPath(t *testing.T) { + t.Parallel() + ns, name := "kagent", "my-claw" + public := agentHarnessGatewayPublicPrefix(ns, name) + + tests := []struct { + name string + path string + wsUpgrade bool + wantUp string + wantRedir string + wantOK bool + }{ + { + name: "harness root redirects", + path: "/api/agentharnesses/kagent/my-claw", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash redirects", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wantUp: "/", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash websocket", + path: 
"/api/agentharnesses/kagent/my-claw/gateway", + wsUpgrade: true, + wantUp: "/", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway index", + path: "/api/agentharnesses/kagent/my-claw/gateway/", + wantUp: "/", + wantOK: true, + }, + { + name: "gateway asset", + path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantUp: "/assets/foo.js", + wantOK: true, + }, + { + name: "mis-resolved asset shim", + path: "/api/agentharnesses/kagent/my-claw/assets/foo.js", + wantUp: "/assets/foo.js", + wantOK: true, + }, + { + name: "manifest shim", + path: "/api/agentharnesses/kagent/my-claw/manifest.webmanifest", + wantUp: "/manifest.webmanifest", + wantOK: true, + }, + { + name: "unknown path", + path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", + wantOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) + if ok != tt.wantOK { + t.Fatalf("ok = %v, want %v", ok, tt.wantOK) + } + if up != tt.wantUp { + t.Fatalf("upstream = %q, want %q", up, tt.wantUp) + } + if redir != tt.wantRedir { + t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) + } + }) + } +} + +func TestRewriteGatewayRootPaths(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `` + out := string(rewriteGatewayRootPaths([]byte(in), prefix)) + if !strings.Contains(out, `src="/api/agentharnesses/kagent/my-claw/gateway/assets/index.js"`) { + t.Fatalf("script src not rewritten: %s", out) + } + if !strings.Contains(out, `href="/api/agentharnesses/kagent/my-claw/gateway/manifest.webmanifest"`) { + t.Fatalf("link href not rewritten: %s", out) + } +} + +func TestIsWebSocketUpgrade(t *testing.T) { + t.Parallel() + req, _ := http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + if !isWebSocketUpgrade(req) { + 
t.Fatal("expected websocket upgrade") + } + req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) + if isWebSocketUpgrade(req2) { + t.Fatal("expected not websocket upgrade") + } +} + +func TestRewriteGatewayWebSocketPaths(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `const u="ws://localhost:8001/api/agentharnesses/kagent/my-claw/gateway"; const v='wss://host/api/agentharnesses/kagent/my-claw/gateway'` + out := string(rewriteGatewayWebSocketPaths([]byte(in), prefix)) + want := "/api/agentharnesses/kagent/my-claw/gateway/" + if !strings.Contains(out, "ws://localhost:8001"+want) { + t.Fatalf("ws URL not rewritten: %s", out) + } + if !strings.Contains(out, "wss://host"+want) { + t.Fatalf("wss URL not rewritten: %s", out) + } +} + +func TestRewriteGatewayBodyStripsBaseAndCSP(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `` + out := string(rewriteGatewayBody([]byte(in), "text/html", prefix)) + if strings.Contains(strings.ToLower(out), "ok")) + })) + defer upstream.Close() + + target, err := url.Parse(upstream.URL) + if err != nil { + t.Fatal(err) + } + + proxy := newAgentHarnessGatewayProxy(target, podIP, token, publicPrefix, ns, name, testLog{t}) + req := httptest.NewRequest(http.MethodGet, publicPrefix, nil) + rec := httptest.NewRecorder() + proxy.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + if gotHost != podIP { + t.Fatalf("upstream Host = %q, want %q", gotHost, podIP) + } + if gotAuth != "Bearer "+token { + t.Fatalf("Authorization = %q", gotAuth) + } + if gotScopes != openclawDefaultOperatorScopes { + t.Fatalf("x-openclaw-scopes = %q", gotScopes) + } + if gotPath != "/" { + t.Fatalf("upstream path = %q, want /", gotPath) + } + body, _ := io.ReadAll(rec.Body) + if !strings.Contains(string(body), "ok") { + t.Fatalf("response body missing upstream content: %s", body) + } 
+} + +func TestGatewayProxyDirectorTargetsPodIPOnWebSocketPath(t *testing.T) { + t.Parallel() + const podIP = "10.244.0.29" + ns, name := "kagent", "my-claw" + publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) + + target, err := url.Parse("http://" + podIP + ":80") + if err != nil { + t.Fatal(err) + } + proxy := newAgentHarnessGatewayProxy(target, podIP, "tok", publicPrefix, ns, name, testLog{t}) + req := httptest.NewRequest(http.MethodGet, strings.TrimSuffix(publicPrefix, "/"), nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + req.Header.Set("Origin", "http://localhost:8001") + req.Header.Set("Referer", "http://localhost:8001/api/agentharnesses/kagent/my-claw/gateway/") + + proxy.Director(req) + + if req.Host != podIP { + t.Fatalf("Host = %q, want pod IP", req.Host) + } + if req.URL.Host != podIP+":80" { + t.Fatalf("URL.Host = %q", req.URL.Host) + } + if req.URL.Path != "/" { + t.Fatalf("URL.Path = %q, want /", req.URL.Path) + } + if req.Header.Get("Authorization") != "Bearer tok" { + t.Fatalf("missing Authorization") + } + if req.Header.Get("x-openclaw-scopes") != openclawDefaultOperatorScopes { + t.Fatalf("missing scopes header") + } + if req.Header.Get("Origin") != openclawLoopbackOrigin { + t.Fatalf("Origin = %q, want %q", req.Header.Get("Origin"), openclawLoopbackOrigin) + } + if req.Header.Get("Referer") != openclawLoopbackOrigin+"/" { + t.Fatalf("Referer = %q", req.Header.Get("Referer")) + } +} + +type testLog struct { + t *testing.T +} + +func (l testLog) Error(err error, msg string, _ ...any) { + l.t.Helper() + l.t.Logf("%s: %v", msg, err) +} diff --git a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index 59c68ce27f..76056660c8 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -160,19 +160,13 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, } } + runtime := 
sb.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + gatewayName := fmt.Sprintf("%s-%s", sb.Namespace, sb.Name) desc := strings.TrimSpace(sb.Spec.Description) - entry := &api.OpenshellAgentHarnessListEntry{ - Backend: sb.Spec.Backend, - GatewaySandboxName: gatewayName, - ModelConfigRef: sb.Spec.ModelConfigRef, - } - if sb.Status.BackendRef != nil { - entry.BackendRefID = sb.Status.BackendRef.ID - } - if sb.Status.Connection != nil { - entry.Endpoint = sb.Status.Connection.Endpoint - } resp := api.AgentResponse{ ID: id, @@ -184,9 +178,39 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, Description: desc, }, }, - DeploymentReady: ready, - Accepted: accepted, - OpenshellAgentHarness: entry, + DeploymentReady: ready, + Accepted: accepted, + } + + switch runtime { + case v1alpha2.AgentHarnessRuntimeSubstrate: + subEntry := &api.SubstrateAgentHarnessListEntry{ + Backend: sb.Spec.Backend, + Runtime: runtime, + ModelConfigRef: sb.Spec.ModelConfigRef, + GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), + } + if sb.Status.BackendRef != nil { + subEntry.BackendRefID = sb.Status.BackendRef.ID + subEntry.ActorID = sb.Status.BackendRef.ID + } + if sb.Status.Connection != nil { + subEntry.Endpoint = sb.Status.Connection.Endpoint + } + resp.SubstrateAgentHarness = subEntry + default: + entry := &api.OpenshellAgentHarnessListEntry{ + Backend: sb.Spec.Backend, + GatewaySandboxName: gatewayName, + ModelConfigRef: sb.Spec.ModelConfigRef, + } + if sb.Status.BackendRef != nil { + entry.BackendRefID = sb.Status.BackendRef.ID + } + if sb.Status.Connection != nil { + entry.Endpoint = sb.Status.Connection.Endpoint + } + resp.OpenshellAgentHarness = entry } mcRef := strings.TrimSpace(sb.Spec.ModelConfigRef) diff --git a/go/core/internal/httpserver/handlers/handlers.go b/go/core/internal/httpserver/handlers/handlers.go index 13a66adeb9..3d854bd134 100644 --- 
a/go/core/internal/httpserver/handlers/handlers.go +++ b/go/core/internal/httpserver/handlers/handlers.go @@ -12,6 +12,9 @@ import ( // Handlers holds all the HTTP handler components type Handlers struct { + KubeClient client.Client + AgentHarnessGateway *AgentHarnessGatewayConfig + Health *HealthHandler ModelConfig *ModelConfigHandler Model *ModelHandler @@ -43,7 +46,17 @@ type Base struct { } // NewHandlers creates a new Handlers instance with all handler components. -func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedName, dbService database.Client, watchedNamespaces []string, authorizer auth.Authorizer, proxyURL string, rcnclr reconciler.KagentReconciler, sandboxBackend sandboxbackend.Backend) *Handlers { +func NewHandlers( + kubeClient client.Client, + defaultModelConfig types.NamespacedName, + dbService database.Client, + watchedNamespaces []string, + authorizer auth.Authorizer, + proxyURL string, + rcnclr reconciler.KagentReconciler, + sandboxBackend sandboxbackend.Backend, + agentHarnessGateway *AgentHarnessGatewayConfig, +) *Handlers { base := &Base{ KubeClient: kubeClient, DefaultModelConfig: defaultModelConfig, @@ -55,6 +68,8 @@ func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedNa } return &Handlers{ + KubeClient: kubeClient, + AgentHarnessGateway: agentHarnessGateway, Health: NewHealthHandler(), ModelConfig: NewModelConfigHandler(base), Model: NewModelHandler(base), diff --git a/go/core/internal/httpserver/middleware.go b/go/core/internal/httpserver/middleware.go index 2f3a329378..fa112a8fa7 100644 --- a/go/core/internal/httpserver/middleware.go +++ b/go/core/internal/httpserver/middleware.go @@ -5,6 +5,7 @@ import ( "fmt" "net" "net/http" + "strings" "time" "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" @@ -77,9 +78,23 @@ func (w *statusResponseWriter) RespondWithError(err error) { } } +func isAgentHarnessGatewayPath(path string) bool { + if !strings.HasPrefix(path, 
"/api/agentharnesses/") { + return false + } + for _, marker := range []string{"/gateway", "/assets/", "/manifest.webmanifest", "/favicon"} { + if strings.Contains(path, marker) { + return true + } + } + return false +} + func contentTypeMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if len(r.URL.Path) >= 4 && r.URL.Path[:4] == "/api" && r.URL.Path != APIPathSandboxSSH { + if len(r.URL.Path) >= 4 && r.URL.Path[:4] == "/api" && + r.URL.Path != APIPathSandboxSSH && + !isAgentHarnessGatewayPath(r.URL.Path) { w.Header().Set("Content-Type", "application/json") } next.ServeHTTP(w, r) diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index aac7e831ab..40e624a3c5 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -50,6 +50,7 @@ const ( APIPathLangGraph = "/api/langgraph" APIPathCrewAI = "/api/crewai" APIPathSandboxSSH = "/api/sandbox/ssh" + APIPathAgentHarnessHarness = "/api/agentharnesses/{namespace}/{name}/" ) var defaultModelConfig = types.NamespacedName{ @@ -70,7 +71,8 @@ type ServerConfig struct { Authorizer auth.Authorizer ProxyURL string Reconciler reconciler.KagentReconciler - SandboxBackend sandboxbackend.Backend + SandboxBackend sandboxbackend.Backend + AgentHarnessGateway *handlers.AgentHarnessGatewayConfig } // HTTPServer is the structure that manages the HTTP server @@ -89,7 +91,17 @@ func NewHTTPServer(config ServerConfig) (*HTTPServer, error) { return &HTTPServer{ config: config, router: config.Router, - handlers: handlers.NewHandlers(config.KubeClient, defaultModelConfig, config.DbClient, config.WatchedNamespaces, config.Authorizer, config.ProxyURL, config.Reconciler, config.SandboxBackend), + handlers: handlers.NewHandlers( + config.KubeClient, + defaultModelConfig, + config.DbClient, + config.WatchedNamespaces, + config.Authorizer, + config.ProxyURL, + config.Reconciler, + config.SandboxBackend, + 
config.AgentHarnessGateway, + ), authenticator: config.Authenticator, }, nil } @@ -303,6 +315,12 @@ func (s *HTTPServer) setupRoutes() { // OpenShell sandbox PTY (browser WebSocket → gateway CONNECT → SSH). Authenticated like other /api routes. s.router.HandleFunc(APIPathSandboxSSH, adaptHandler(s.handlers.HandleSandboxSSHWebSocket)).Methods(http.MethodGet) + // Substrate OpenClaw gateway proxy (HTTP + WebSocket) to the actor pod IP :80. + // Includes /gateway/* and mis-resolved static paths (/assets/, manifest, etc.). + s.router.PathPrefix(APIPathAgentHarnessHarness).Handler( + adaptHandler(s.handlers.HandleAgentHarnessGateway), + ) + // A2A s.router.PathPrefix(APIPathA2A + "/{namespace}/{name}").Handler(s.config.A2AHandler) s.router.PathPrefix(APIPathA2ASandboxes + "/{namespace}/{name}").Handler(s.config.A2AHandler) @@ -313,21 +331,30 @@ func (s *HTTPServer) setupRoutes() { } // Use middleware for common functionality (first registered runs outermost on incoming requests). - s.router.Use(wsSandboxSSHAuthQueryMiddleware) + s.router.Use(wsAuthQueryMiddleware) s.router.Use(auth.AuthnMiddleware(s.authenticator)) s.router.Use(contentTypeMiddleware) s.router.Use(loggingMiddleware) s.router.Use(errorHandlerMiddleware) } -// wsSandboxSSHAuthQueryMiddleware maps access_token query → Authorization for browser WebSocket upgrades +// wsAuthQueryMiddleware maps token query params → Authorization for browser WebSocket upgrades // (fetch can send headers; WebSocket cannot). 
-func wsSandboxSSHAuthQueryMiddleware(next http.Handler) http.Handler { +func wsAuthQueryMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path == APIPathSandboxSSH && r.Header.Get("Authorization") == "" { - if t := r.URL.Query().Get("access_token"); t != "" { - r.Header.Set("Authorization", "Bearer "+strings.TrimSpace(t)) - } + if r.Header.Get("Authorization") != "" { + next.ServeHTTP(w, r) + return + } + var token string + switch { + case r.URL.Path == APIPathSandboxSSH || strings.HasSuffix(r.URL.Path, "/ssh"): + token = r.URL.Query().Get("access_token") + case isAgentHarnessGatewayPath(r.URL.Path): + token = r.URL.Query().Get("token") + } + if token != "" { + r.Header.Set("Authorization", "Bearer "+strings.TrimSpace(token)) } next.ServeHTTP(w, r) }) diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index ddad07d546..7885292985 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -54,10 +54,13 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" dbpkg "github.com/kagent-dev/kagent/go/api/database" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -99,6 +102,7 @@ func init() { utilruntime.Must(v1alpha1.AddToScheme(scheme)) utilruntime.Must(v1alpha2.AddToScheme(scheme)) utilruntime.Must(agentsandboxv1.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } @@ -148,6 +152,21 @@ type Config struct 
{ DialTimeout time.Duration CallTimeout time.Duration } + Substrate struct { + AteAPIEndpoint string + Insecure bool + DialTimeout time.Duration + CallTimeout time.Duration + DefaultActorTemplateNamespace string + DefaultActorTemplateName string + GatewayToken string + GatewayTokenFile string + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + } } func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { @@ -207,6 +226,20 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.DurationVar(&cfg.Openshell.DialTimeout, "openshell-dial-timeout", 10*time.Second, "Timeout for the initial dial to the OpenShell gateway.") commandLine.DurationVar(&cfg.Openshell.CallTimeout, "openshell-call-timeout", 30*time.Second, "Per-RPC timeout for OpenShell gateway calls.") + commandLine.StringVar(&cfg.Substrate.AteAPIEndpoint, "substrate-ate-api-endpoint", "", "gRPC target for Agent Substrate ate-api (e.g. dns:///api.ate-system.svc:443). 
Enables substrate AgentHarness runtime when set.") + commandLine.BoolVar(&cfg.Substrate.Insecure, "substrate-ate-api-insecure", false, "Dial ate-api without TLS (local dev only).") + commandLine.DurationVar(&cfg.Substrate.DialTimeout, "substrate-dial-timeout", 10*time.Second, "Timeout for the initial dial to ate-api.") + commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") + commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateNamespace, "substrate-default-actor-template-namespace", "", "Legacy fallback ActorTemplate namespace when adopting an external template (set spec.substrate.actorTemplateRef instead).") + commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateName, "substrate-default-actor-template-name", "", "Legacy fallback ActorTemplate name when adopting an external template (set spec.substrate.actorTemplateRef instead).") + commandLine.StringVar(&cfg.Substrate.GatewayToken, "substrate-gateway-token", "", "OpenClaw gateway Bearer token for substrate proxy. 
Prefer --substrate-gateway-token-file.") + commandLine.StringVar(&cfg.Substrate.GatewayTokenFile, "substrate-gateway-token-file", "", "File containing OpenClaw gateway Bearer token for substrate harness proxy.") + commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for auto-provisioned ActorTemplates.") + commandLine.StringVar(&cfg.Substrate.RunscAMD64URL, "substrate-runsc-amd64-url", "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc", "gVisor runsc URL for amd64.") + commandLine.StringVar(&cfg.Substrate.RunscAMD64SHA256, "substrate-runsc-amd64-sha256", "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63", "gVisor runsc sha256 for amd64.") + commandLine.StringVar(&cfg.Substrate.RunscARM64URL, "substrate-runsc-arm64-url", "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc", "gVisor runsc URL for arm64.") + commandLine.StringVar(&cfg.Substrate.RunscARM64SHA256, "substrate-runsc-arm64-sha256", "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9", "gVisor runsc sha256 for arm64.") + commandLine.StringVar(&agent_translator.DefaultServiceAccountName, "default-service-account-name", "", "Global default ServiceAccount name for agent pods. When set, agents without an explicit serviceAccountName will use this instead of creating a per-agent ServiceAccount.") commandLine.Var(&MapValue{Target: &agent_translator.DefaultAgentPodLabels}, "default-agent-pod-labels", "Comma-separated key=value pairs of labels to apply to all agent pod templates (e.g. 'team=platform,env=prod'). 
Per-agent labels take precedence.") @@ -563,23 +596,43 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne os.Exit(1) } + kubeClient := mgr.GetClient() + var openshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + var substrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend if cfg.Openshell.GatewayURL != "" { - kubeClient := mgr.GetClient() - openshellBackends, err := buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) + var err error + openshellBackends, err = buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) if err != nil { setupLog.Error(err, "unable to build openshell sandbox backends") os.Exit(1) } + } + var substrateAteClient *substrate.Client + if cfg.Substrate.AteAPIEndpoint != "" { + var err error + substrateBackends, substrateAteClient, err = buildSubstrateSandboxBackends(ctx, &cfg) + if err != nil { + setupLog.Error(err, "unable to build substrate sandbox backends") + os.Exit(1) + } + } + if len(openshellBackends) > 0 || len(substrateBackends) > 0 { + var substrateProvisioner *substrate.Provisioner + if len(substrateBackends) > 0 { + substrateProvisioner = substrateProvisionerFromConfig(kubeClient, &cfg, substrateAteClient) + } if err := (&controller.AgentHarnessController{ - Client: kubeClient, - Recorder: mgr.GetEventRecorder("agentharness-controller"), - Backends: openshellBackends, + Client: kubeClient, + Recorder: mgr.GetEventRecorder("agentharness-controller"), + OpenshellBackends: openshellBackends, + SubstrateBackends: substrateBackends, + SubstrateProvisioner: substrateProvisioner, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "AgentHarness") os.Exit(1) } } else { - setupLog.Info("AgentHarness controller disabled: --openshell-gateway-url not set") + setupLog.Info("AgentHarness controller disabled: set --openshell-gateway-url and/or --substrate-ate-api-endpoint") } if err = 
(&controller.ModelConfigController{ @@ -677,19 +730,40 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne os.Exit(1) } + var agentHarnessGateway *handlers.AgentHarnessGatewayConfig + if cfg.Substrate.AteAPIEndpoint != "" { + gwToken := cfg.Substrate.GatewayToken + if cfg.Substrate.GatewayTokenFile != "" { + data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) + if err != nil { + setupLog.Error(err, "unable to read substrate gateway token file") + os.Exit(1) + } + gwToken = strings.TrimSpace(string(data)) + } + agentHarnessGateway = &handlers.AgentHarnessGatewayConfig{ + GatewayToken: gwToken, + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + AteAPIInsecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, + } + } + httpServer, err := httpserver.NewHTTPServer(httpserver.ServerConfig{ - Router: router, - BindAddr: cfg.HttpServerAddr, - KubeClient: mgr.GetClient(), - A2AHandler: a2aHandler, - MCPHandler: mcpHandler, - WatchedNamespaces: watchNamespacesList, - DbClient: dbClient, - Authorizer: extensionCfg.Authorizer, - Authenticator: extensionCfg.Authenticator, - ProxyURL: cfg.Proxy.URL, - Reconciler: rcnclr, - SandboxBackend: extensionCfg.SandboxBackend, + Router: router, + BindAddr: cfg.HttpServerAddr, + KubeClient: mgr.GetClient(), + A2AHandler: a2aHandler, + MCPHandler: mcpHandler, + WatchedNamespaces: watchNamespacesList, + DbClient: dbClient, + Authorizer: extensionCfg.Authorizer, + Authenticator: extensionCfg.Authenticator, + ProxyURL: cfg.Proxy.URL, + Reconciler: rcnclr, + SandboxBackend: extensionCfg.SandboxBackend, + AgentHarnessGateway: agentHarnessGateway, }) if err != nil { setupLog.Error(err, "unable to create HTTP server") @@ -747,12 +821,80 @@ func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient ocl := openshell.NewOpenClawBackend(kubeClient, clients, oc, nil) hermesBackend := openshell.NewHermesBackend(kubeClient, clients, oc, 
nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ocl, v1alpha2.AgentHarnessBackendHermes: hermesBackend, }, nil } +func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, *substrate.Client, error) { + sc, _, err := substrateAppConfig(cfg) + if err != nil { + return nil, nil, err + } + client, err := substrate.Dial(ctx, sc) + if err != nil { + return nil, nil, err + } + + ocl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendOpenClaw, nil) + ncl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendNemoClaw, nil) + return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendNemoClaw: ncl, + }, client, nil +} + +func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { + gwToken := cfg.Substrate.GatewayToken + if cfg.Substrate.GatewayTokenFile != "" { + data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) + if err != nil { + return substrate.Config{}, "", fmt.Errorf("read substrate gateway token file: %w", err) + } + gwToken = strings.TrimSpace(string(data)) + } + sc := substrate.Config{ + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + Insecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, + DefaultActorTemplateNamespace: cfg.Substrate.DefaultActorTemplateNamespace, + DefaultActorTemplateName: cfg.Substrate.DefaultActorTemplateName, + GatewayToken: gwToken, + ProvisionDefaults: substrate.ProvisionDefaults{ + PauseImage: cfg.Substrate.PauseImage, + RunscAMD64URL: cfg.Substrate.RunscAMD64URL, + RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, + RunscARM64URL: cfg.Substrate.RunscARM64URL, + RunscARM64SHA256: 
cfg.Substrate.RunscARM64SHA256, + DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + GatewayToken: gwToken, + }, + } + return sc, gwToken, nil +} + +func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Provisioner { + _, gwToken, err := substrateAppConfig(cfg) + if err != nil { + gwToken = cfg.Substrate.GatewayToken + } + return &substrate.Provisioner{ + Client: kubeClient, + Ate: ate, + Defaults: substrate.ProvisionDefaults{ + PauseImage: cfg.Substrate.PauseImage, + RunscAMD64URL: cfg.Substrate.RunscAMD64URL, + RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, + RunscARM64URL: cfg.Substrate.RunscARM64URL, + RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + GatewayToken: gwToken, + }, + } +} + // configureNamespaceWatching sets up the controller manager to watch specific namespaces // based on the provided configuration. It returns the list of namespaces being watched, // or nil if watching all namespaces. 
diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw.go b/go/core/pkg/sandboxbackend/openshell/openclaw.go index 9f95a407a5..f8032b4235 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw.go @@ -87,7 +87,7 @@ func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.Agen gwPort := defaultOpenclawGatewayPort token := b.cfg.Token - jsonBytes, env, err := openclaw.BuildBootstrapJSON(ctx, b.kubeClient, ah.Namespace, ah, mc, gwPort) + jsonBytes, env, err := openclaw.BuildBootstrapJSON(ctx, b.kubeClient, ah.Namespace, ah, mc, openclaw.OpenshellGatewayBootstrap(gwPort), openclaw.DefaultInferenceBaseURL) if err != nil { return fmt.Errorf("build openclaw config: %w", err) } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go index db2fdb373e..23d61fdbfb 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go @@ -11,9 +11,46 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// GatewayBootstrapConfig describes the gateway section of openclaw.json for a harness runtime. +type GatewayBootstrapConfig struct { + Port int + Bind string // loopback | lan + AuthMode string // none | token + Token string // required when AuthMode is token + ControlUI *ControlUIBootstrapConfig +} + +// ControlUIBootstrapConfig maps to gateway.controlUi in openclaw.json. +type ControlUIBootstrapConfig struct { + AllowedOrigins []string + DangerouslyDisableDeviceAuth bool +} + +// OpenshellGatewayBootstrap is the default gateway profile for OpenShell sandboxes. +func OpenshellGatewayBootstrap(port int) GatewayBootstrapConfig { + return GatewayBootstrapConfig{Port: port, Bind: "loopback", AuthMode: "none"} +} + +// SubstrateGatewayBootstrap is the gateway profile for Agent Substrate actors (port 80, token auth, proxied Control UI). 
+func SubstrateGatewayBootstrap(token string, port int) GatewayBootstrapConfig { + return GatewayBootstrapConfig{ + Port: port, + Bind: "lan", + AuthMode: "token", + Token: strings.TrimSpace(token), + ControlUI: &ControlUIBootstrapConfig{ + AllowedOrigins: []string{"*"}, + DangerouslyDisableDeviceAuth: true, + }, + } +} + // BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when // OpenClaw resolves openshell:resolve:env: (API key + channel tokens). -func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gwPort int) ([]byte, map[string]string, error) { +// +// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. +// OpenShell callers should pass DefaultInferenceBaseURL; Substrate should pass SubstrateBootstrapDefaultBaseURL. +func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { if mc == nil { return nil, nil, fmt.Errorf("ModelConfig is required") } @@ -37,7 +74,7 @@ func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace strin } providerRecord := GatewayProviderRecordName(mc.Spec.Provider) - doc := buildCoreBootstrapDocument(mc, gwPort, apiKeyEnv, providerRecord, modelID, apiAdapter) + doc := buildCoreBootstrapDocument(mc, gw, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Backend, sbx.Spec.Channels, env) if err != nil { @@ -54,29 +91,19 @@ func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace strin return raw, env, nil } -func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gwPort int, apiKeyEnv, providerRecord, modelID, apiAdapter string) bootstrapDocument { - baseURL := 
bootstrapProviderBaseURL(mc) - return bootstrapDocument{ - Gateway: gatewaySection{ - Mode: "local", - Bind: "loopback", - Auth: gatewayAuth{Mode: "none"}, - Port: gwPort, - }, - Models: modelsSection{ - Mode: "merge", - Providers: map[string]providerSettings{ - providerRecord: { - BaseURL: baseURL, - APIKey: openshellResolveEnv(apiKeyEnv), - Auth: providerAuth(mc), - API: apiAdapter, - Models: []modelSlot{ - {ID: modelID, Name: modelID}, - }, - }, - }, - }, +// BuildGatewayOnlyBootstrapJSON returns a minimal openclaw.json with gateway settings only (no models/channels). +func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { + doc := bootstrapDocument{Gateway: buildGatewaySection(gw)} + raw, err := json.Marshal(doc) + if err != nil { + return nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, nil +} + +func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { + doc := bootstrapDocument{ + Gateway: buildGatewaySection(gw), Agents: agentsSection{ Defaults: agentDefaults{ Model: defaultModelPick{ @@ -85,6 +112,76 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gwPort int, apiKeyEnv, }, }, } + + // Substrate: do not emit models.providers without baseUrl (OpenClaw rejects undefined baseUrl). + // Rely on agents.defaults + API key env unless the user set an explicit URL on ModelConfig. 
+ if defaultBaseURLWhenUnset == SubstrateBootstrapDefaultBaseURL { + if explicit := modelConfigExplicitBaseURL(mc); explicit != "" { + doc.Models = &modelsSection{ + Mode: "merge", + Providers: map[string]providerSettings{ + providerRecord: { + BaseURL: explicit, + APIKey: openshellResolveEnv(apiKeyEnv), + Auth: providerAuth(mc), + API: apiAdapter, + Models: []modelSlot{ + {ID: modelID, Name: modelID}, + }, + }, + }, + } + } + return doc + } + + baseURL := bootstrapProviderBaseURL(mc, defaultBaseURLWhenUnset) + doc.Models = &modelsSection{ + Mode: "merge", + Providers: map[string]providerSettings{ + providerRecord: { + BaseURL: baseURL, + APIKey: openshellResolveEnv(apiKeyEnv), + Auth: providerAuth(mc), + API: apiAdapter, + Models: []modelSlot{ + {ID: modelID, Name: modelID}, + }, + }, + }, + } + return doc +} + +func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { + port := gw.Port + if port <= 0 { + port = 18800 + } + bind := strings.TrimSpace(gw.Bind) + if bind == "" { + bind = "loopback" + } + authMode := strings.TrimSpace(gw.AuthMode) + if authMode == "" { + authMode = "none" + } + section := gatewaySection{ + Mode: "local", + Bind: bind, + Auth: gatewayAuth{Mode: authMode}, + Port: port, + } + if authMode == "token" { + section.Auth.Token = gw.Token + } + if gw.ControlUI != nil { + section.ControlUi = &controlUiSection{ + AllowedOrigins: gw.ControlUI.AllowedOrigins, + DangerouslyDisableDeviceAuth: gw.ControlUI.DangerouslyDisableDeviceAuth, + } + } + return section } func applySecretsAllowlist(doc *bootstrapDocument, env map[string]string) { diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go new file mode 100644 index 0000000000..4fd9ff2e72 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go @@ -0,0 +1,21 @@ +package openclaw_test + +import ( + "encoding/json" + "testing" + + 
"github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/stretchr/testify/require" +) + +func TestSubstrateGatewayBootstrap(t *testing.T) { + t.Parallel() + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80)) + require.NoError(t, err) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + cui := gw["controlUi"].(map[string]any) + require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go index 4dfa1a2633..3ffbd7c9ca 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go @@ -39,7 +39,7 @@ func TestBuildBootstrapJSON_OpenAIDefaultBaseURLInferenceLocal(t *testing.T) { sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, 18800) + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.OpenshellGatewayBootstrap(18800), openclaw.DefaultInferenceBaseURL) require.NoError(t, err) var root map[string]any @@ -56,6 +56,42 @@ func TestBuildBootstrapJSON_OpenAIDefaultBaseURLInferenceLocal(t *testing.T) { require.Contains(t, kagent["allowlist"], "OPENAI_API_KEY") } +func TestBuildBootstrapJSON_SubstrateOmitsModelsWhenNoExplicitBaseURL(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: 
map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80), openclaw.SubstrateBootstrapDefaultBaseURL) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + _, hasModels := root["models"] + require.False(t, hasModels) + agents := root["agents"].(map[string]any) + defaults := agents["defaults"].(map[string]any) + model := defaults["model"].(map[string]any) + require.Equal(t, "openai/gpt-4o", model["primary"]) +} + func TestBuildBootstrapJSON_OpenAIAndTelegram(t *testing.T) { scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -92,7 +128,7 @@ func TestBuildBootstrapJSON_OpenAIAndTelegram(t *testing.T) { } kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, env, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, 18800) + raw, env, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.OpenshellGatewayBootstrap(18800), openclaw.DefaultInferenceBaseURL) require.NoError(t, err) require.Equal(t, "sk-test", env["OPENAI_API_KEY"]) require.Equal(t, "telegram-bot-token", env["TELEGRAM_BOT_TOKEN_TG1"]) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go b/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go index dd0f98cdc8..e94d0789f1 100644 --- 
a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go @@ -7,6 +7,11 @@ const ( // bootstrapSecretProviderID is the secrets.providers key written into openclaw.json. bootstrapSecretProviderID = "kagent" - // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream. + // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream (OpenShell). DefaultInferenceBaseURL = "https://inference.local/v1" + + // SubstrateBootstrapDefaultBaseURL is passed to BuildBootstrapJSON for Substrate harnesses. + // When ModelConfig has no explicit provider URL, the models section is omitted entirely so + // OpenClaw is not given a partial providers.* block (baseUrl is required when present). + SubstrateBootstrapDefaultBaseURL = "" ) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go b/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go index 70a075a272..8c4183e0e9 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go @@ -7,7 +7,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" ) -func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig) string { +func modelConfigExplicitBaseURL(mc *v1alpha2.ModelConfig) string { switch mc.Spec.Provider { case v1alpha2.ModelProviderOpenAI: if mc.Spec.OpenAI != nil && strings.TrimSpace(mc.Spec.OpenAI.BaseURL) != "" { @@ -30,7 +30,14 @@ func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig) string { return strings.TrimSpace(mc.Spec.SAPAICore.BaseURL) } } - return DefaultInferenceBaseURL + return "" +} + +func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig, defaultWhenUnset string) string { + if u := modelConfigExplicitBaseURL(mc); u != "" { + return u + } + return defaultWhenUnset } func providerAuth(mc *v1alpha2.ModelConfig) string { diff --git 
a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go index da73347668..bf6dd73760 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go @@ -5,21 +5,28 @@ package openclaw type bootstrapDocument struct { Gateway gatewaySection `json:"gateway"` - Models modelsSection `json:"models"` + Models *modelsSection `json:"models,omitempty"` Agents agentsSection `json:"agents"` Channels *channelsConfig `json:"channels,omitempty"` Secrets secretsSection `json:"secrets"` } type gatewaySection struct { - Mode string `json:"mode"` - Bind string `json:"bind"` - Auth gatewayAuth `json:"auth"` - Port int `json:"port"` + Mode string `json:"mode"` + Bind string `json:"bind"` + Auth gatewayAuth `json:"auth"` + Port int `json:"port"` + ControlUi *controlUiSection `json:"controlUi,omitempty"` } type gatewayAuth struct { - Mode string `json:"mode"` + Mode string `json:"mode"` + Token string `json:"token,omitempty"` +} + +type controlUiSection struct { + AllowedOrigins []string `json:"allowedOrigins,omitempty"` + DangerouslyDisableDeviceAuth bool `json:"dangerouslyDisableDeviceAuth,omitempty"` } type modelsSection struct { @@ -28,7 +35,7 @@ type modelsSection struct { } type providerSettings struct { - BaseURL string `json:"baseUrl"` + BaseURL string `json:"baseUrl,omitempty"` APIKey string `json:"apiKey"` Auth string `json:"auth"` API string `json:"api"` diff --git a/go/core/pkg/sandboxbackend/substrate/client.go b/go/core/pkg/sandboxbackend/substrate/client.go new file mode 100644 index 0000000000..70291c7bb8 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/client.go @@ -0,0 +1,114 @@ +package substrate + +import ( + "context" + "crypto/tls" + "fmt" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "google.golang.org/grpc/credentials/insecure" +) + +// Client 
wraps ate-api Control gRPC. +type Client struct { + ateapipb.ControlClient + conn *grpc.ClientConn + cfg Config +} + +// Dial connects to the ate-api server. +func Dial(ctx context.Context, cfg Config) (*Client, error) { + if cfg.AteAPIEndpoint == "" { + return nil, fmt.Errorf("substrate: ate-api endpoint is required") + } + dialTimeout := cfg.DialTimeout + if dialTimeout <= 0 { + dialTimeout = 10 * time.Second + } + dialCtx, cancel := context.WithTimeout(ctx, dialTimeout) + defer cancel() + + var opts []grpc.DialOption + if cfg.Insecure { + opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) + } else { + opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}))) + } + + conn, err := grpc.NewClient(cfg.AteAPIEndpoint, opts...) + if err != nil { + return nil, fmt.Errorf("substrate: dial ate-api %q: %w", cfg.AteAPIEndpoint, err) + } + _ = dialCtx + + return &Client{ + ControlClient: ateapipb.NewControlClient(conn), + conn: conn, + cfg: cfg, + }, nil +} + +func (c *Client) Close() error { + if c.conn != nil { + return c.conn.Close() + } + return nil +} + +func (c *Client) callCtx(ctx context.Context) (context.Context, context.CancelFunc) { + if c.cfg.CallTimeout <= 0 { + return ctx, func() {} + } + return context.WithTimeout(ctx, c.cfg.CallTimeout) +} + +func (c *Client) GetActor(ctx context.Context, actorID string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.GetActor(ctx, &ateapipb.GetActorRequest{ActorId: actorID}) + if err != nil { + return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) CreateActor(ctx context.Context, actorID, tmplNS, tmplName string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.CreateActor(ctx, &ateapipb.CreateActorRequest{ + ActorId: actorID, + ActorTemplateNamespace: tmplNS, + ActorTemplateName: tmplName, + }) + if err != nil { 
+ return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) ResumeActor(ctx context.Context, actorID string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.ResumeActor(ctx, &ateapipb.ResumeActorRequest{ActorId: actorID}) + if err != nil { + return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) SuspendActor(ctx context.Context, actorID string) error { + ctx, cancel := c.callCtx(ctx) + defer cancel() + _, err := c.ControlClient.SuspendActor(ctx, &ateapipb.SuspendActorRequest{ActorId: actorID}) + return err +} + +func (c *Client) DeleteActor(ctx context.Context, actorID string) error { + ctx, cancel := c.callCtx(ctx) + defer cancel() + _, err := c.ControlClient.DeleteActor(ctx, &ateapipb.DeleteActorRequest{ActorId: actorID}) + return err +} diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go new file mode 100644 index 0000000000..45b5cb2b48 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -0,0 +1,22 @@ +package substrate + +import "time" + +// Config holds connection settings for Agent Substrate ate-api. +type Config struct { + // AteAPIEndpoint is a gRPC target (e.g. dns:///api.ate-system.svc:443). + AteAPIEndpoint string + Insecure bool + DialTimeout time.Duration + CallTimeout time.Duration + + // DefaultActorTemplateNamespace/name is a legacy fallback when status/spec refs are unset. + DefaultActorTemplateNamespace string + DefaultActorTemplateName string + + // ProvisionDefaults configures auto-created WorkerPool/ActorTemplate resources. + ProvisionDefaults ProvisionDefaults + + // GatewayToken is the OpenClaw gateway Bearer token injected by the HTTP proxy. 
+ GatewayToken string +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor.go b/go/core/pkg/sandboxbackend/substrate/delete_actor.go new file mode 100644 index 0000000000..c7a36e8409 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor.go @@ -0,0 +1,127 @@ +package substrate + +import ( + "context" + "fmt" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +const ( + actorDeletePollInterval = 2 * time.Second + actorDeleteTimeout = 5 * time.Minute +) + +// deleteActorSequenced suspends the actor, waits until suspended, deletes it, and waits until gone. +func (c *Client) deleteActorSequenced(ctx context.Context, actorID string) error { + if actorID == "" { + return nil + } + deadline := time.Now().Add(actorDeleteTimeout) + + actor, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + + if err := c.ensureActorSuspended(ctx, actorID, actor.GetStatus(), deadline); err != nil { + return err + } + + if err := c.DeleteActor(ctx, actorID); err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + if status.Code(err) == codes.FailedPrecondition { + // ate-api requires STATUS_SUSPENDED; re-check and surface current status. 
+ actor, getErr := c.GetActor(ctx, actorID) + if getErr == nil { + return fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) + } + } + return fmt.Errorf("delete actor %q: %w", actorID, err) + } + + return c.waitForActorDeleted(ctx, actorID, deadline) +} + +func (c *Client) ensureActorSuspended(ctx context.Context, actorID string, st ateapipb.Actor_Status, deadline time.Time) error { + switch st { + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + return nil + case ateapipb.Actor_STATUS_SUSPENDING: + // Retry suspend periodically; stuck checkpoint may need manual worker pod deletion. + _ = c.SuspendActor(ctx, actorID) + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: + if err := c.SuspendActor(ctx, actorID); err != nil && status.Code(err) != codes.NotFound { + return fmt.Errorf("suspend actor %q: %w", actorID, err) + } + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + default: + // Best-effort suspend for unknown/intermediate states before delete. 
+ _ = c.SuspendActor(ctx, actorID) + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + } +} + +func (c *Client) waitForActorStatus(ctx context.Context, actorID string, want ateapipb.Actor_Status, deadline time.Time) error { + for time.Now().Before(deadline) { + actor, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + if want == ateapipb.Actor_STATUS_UNSPECIFIED { + return nil + } + return fmt.Errorf("actor %q not found while waiting for %s", actorID, want) + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + if actor.GetStatus() == want { + return nil + } + if want == ateapipb.Actor_STATUS_SUSPENDED && actor.GetStatus() == ateapipb.Actor_STATUS_SUSPENDING { + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + continue + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for actor %q status %s", actorID, want) +} + +func (c *Client) waitForActorDeleted(ctx context.Context, actorID string, deadline time.Time) error { + for time.Now().Before(deadline) { + _, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for actor %q deletion", actorID) +} + +func sleepOrDone(ctx context.Context, d time.Duration) error { + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + return nil + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go new file mode 100644 index 0000000000..38b9bdae39 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go @@ -0,0 +1,18 @@ +package substrate + +import 
( + "testing" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" +) + +func TestEnsureActorSuspendedAlreadySuspended(t *testing.T) { + t.Parallel() + c := &Client{} + deadline := time.Now().Add(time.Minute) + err := c.ensureActorSuspended(t.Context(), "ahr-test", ateapipb.Actor_STATUS_SUSPENDED, deadline) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go new file mode 100644 index 0000000000..0b74d786f7 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -0,0 +1,109 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + "time" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" +) + +const workerPoolDrainTimeout = 3 * time.Minute + +// Delete removes kagent-managed Substrate CRs after the harness actor has been removed. +// Order: golden snapshot actor (from ActorTemplate status), ActorTemplate, WorkerPool. 
+func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) error { + if ah == nil || ah.Annotations == nil { + return nil + } + if ah.Annotations[annotationManagedActorTemplate] == "true" { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + if err := p.deleteGoldenActor(ctx, key); err != nil { + return err + } + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err == nil { + if err := p.Client.Delete(ctx, &tmpl); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete ActorTemplate %s: %w", key, err) + } + } else if !apierrors.IsNotFound(err) { + return err + } + } + if ah.Annotations[annotationManagedWorkerPool] == "true" { + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err == nil { + if err := p.Client.Delete(ctx, &wp); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete WorkerPool %s: %w", key, err) + } + } else if !apierrors.IsNotFound(err) { + return err + } + if err := p.waitForWorkerPoolDeploymentGone(ctx, key); err != nil { + return err + } + } + return nil +} + +func (p *Provisioner) deleteGoldenActor(ctx context.Context, tmplKey types.NamespacedName) error { + if p.Ate == nil || p.Client == nil { + return nil + } + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) + } + goldenID := strings.TrimSpace(tmpl.Status.GoldenActorID) + if goldenID == "" { + return nil + } + if err := p.Ate.deleteActorSequenced(ctx, goldenID); err != nil { + return fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) + } + return nil +} + +func workerPoolDeploymentName(wpName string) string { + return wpName + "-deployment" +} + +func (p 
*Provisioner) waitForWorkerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) error { + if p.Client == nil { + return nil + } + deployKey := types.NamespacedName{Namespace: wpKey.Namespace, Name: workerPoolDeploymentName(wpKey.Name)} + deadline := time.Now().Add(workerPoolDrainTimeout) + for time.Now().Before(deadline) { + var deploy appsv1.Deployment + err := p.Client.Get(ctx, deployKey, &deploy) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) + } + if deploy.DeletionTimestamp != nil { + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + continue + } + if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { + return nil + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for WorkerPool deployment %s to drain", deployKey) +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go new file mode 100644 index 0000000000..dc4b8e338d --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -0,0 +1,61 @@ +package substrate + +import ( + "context" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +type recordingActorDeleter struct { + deleted []string +} + +func (r *recordingActorDeleter) deleteActorSequenced(_ context.Context, actorID string) error { + r.deleted = append(r.deleted, actorID) + return nil +} + +func 
TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + ns := "kagent" + tmpl := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + Status: atev1alpha1.ActorTemplateStatus{ + GoldenActorID: "golden-actor-uuid", + Phase: atev1alpha1.PhaseReady, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Name: "peterj-claw", + Namespace: ns, + Annotations: map[string]string{ + annotationManagedActorTemplate: "true", + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmpl).Build() + rec := &recordingActorDeleter{} + p := &Provisioner{Client: kube, Ate: rec} + + require.NoError(t, p.Delete(context.Background(), ah)) + require.Equal(t, []string{"golden-actor-uuid"}, rec.deleted) + + var got atev1alpha1.ActorTemplate + require.Error(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go new file mode 100644 index 0000000000..08d5b7d7a3 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -0,0 +1,231 @@ +package substrate + +import ( + "context" + "fmt" + "regexp" + "strings" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" +) + +const ( + defaultActorHostSuffix = "actors.resources.substrate.ate.dev" + defaultSubstrateGWPort = int32(80) + actorIDPrefix = "ahr" +) + +var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) + +// ClawBackend 
implements AsyncBackend for OpenClaw/NemoClaw on Agent Substrate. +type ClawBackend struct { + client *Client + cfg Config + backend v1alpha2.AgentHarnessBackendType + recorder record.EventRecorder +} + +var _ sandboxbackend.AsyncBackend = (*ClawBackend)(nil) + +// NewOpenClawBackend returns a substrate backend for openclaw/nemoclaw harness types. +func NewOpenClawBackend(client *Client, cfg Config, backend v1alpha2.AgentHarnessBackendType, recorder record.EventRecorder) *ClawBackend { + return &ClawBackend{ + client: client, + cfg: cfg, + backend: backend, + recorder: recorder, + } +} + +func (b *ClawBackend) Name() v1alpha2.AgentHarnessBackendType { + return b.backend +} + +func (b *ClawBackend) EnsureAgentHarness(ctx context.Context, ah *v1alpha2.AgentHarness) (sandboxbackend.EnsureResult, error) { + if ah == nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("AgentHarness is required") + } + if err := validateSubstrateSpec(ah); err != nil { + return sandboxbackend.EnsureResult{}, err + } + + actorID := ActorID(ah) + tmplNS, tmplName := actorTemplateRef(ah, b.cfg) + + actor, err := b.client.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) != codes.NotFound { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate GetActor %q: %w", actorID, err) + } + actor, err = b.client.CreateActor(ctx, actorID, tmplNS, tmplName) + if err != nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate CreateActor %q: %w", actorID, err) + } + } + + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: + // already active or waking + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + actor, err = b.client.ResumeActor(ctx, actorID) + if err != nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate ResumeActor %q: %w", actorID, err) + } + default: + // suspending — wait for next reconcile + } + + endpoint := substrateConnectionEndpoint(ah.Namespace, ah.Name, actor) + + 
return sandboxbackend.EnsureResult{ + Handle: sandboxbackend.Handle{ID: actorID}, + Endpoint: endpoint, + }, nil +} + +func (b *ClawBackend) GetStatus(ctx context.Context, h sandboxbackend.Handle) (metav1.ConditionStatus, string, string) { + if h.ID == "" { + return metav1.ConditionUnknown, "ActorHandleMissing", "no substrate actor id recorded yet" + } + actor, err := b.client.GetActor(ctx, h.ID) + if err != nil { + if status.Code(err) == codes.NotFound { + return metav1.ConditionUnknown, "ActorNotFound", fmt.Sprintf("substrate actor %q not found", h.ID) + } + return metav1.ConditionUnknown, "ActorGetFailed", err.Error() + } + return actorStatusToCondition(actor) +} + +func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) error { + if h.ID == "" { + return nil + } + if err := b.client.deleteActorSequenced(ctx, h.ID); err != nil { + return fmt.Errorf("substrate delete actor %q: %w", h.ID, err) + } + return nil +} + +func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { + // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time + // (see substrate.Provisioner.buildOpenClawActorStartup — same openclaw.BuildBootstrapJSON as OpenShell). + _ = ctx + _ = ah + _ = h + return nil +} + +// ActorID returns a stable DNS-1123 actor id for this harness. +func ActorID(ah *v1alpha2.AgentHarness) string { + raw := fmt.Sprintf("%s-%s-%s", actorIDPrefix, ah.Namespace, ah.Name) + raw = strings.ToLower(raw) + raw = strings.ReplaceAll(raw, "_", "-") + if len(raw) > 63 { + raw = raw[:63] + raw = strings.TrimRight(raw, "-") + } + if !dns1123Label.MatchString(raw) { + // fallback: prefix-namespace-name is not a valid DNS-1123 label, so build the id from the object UID instead + raw = fmt.Sprintf("%s-%s", actorIDPrefix, ah.UID) + if len(raw) > 63 { + raw = raw[:63] + } + } + return raw +} + +// ActorHost returns the atenet router Host header value for the actor.
+func ActorHost(actorID string, suffix string) string { + if suffix == "" { + suffix = defaultActorHostSuffix + } + return actorID + "." + suffix +} + +func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { + if ah.Status.Substrate != nil && ah.Status.Substrate.ActorTemplateRef.Name != "" { + ref := ah.Status.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + return ns, ref.Name + } + if ah.Spec.Substrate != nil && ah.Spec.Substrate.ActorTemplateRef != nil { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + if ref.Name != "" { + return ns, ref.Name + } + } + // Auto-provisioned template in the harness namespace (also when status was not persisted yet). + if ah.Annotations != nil && ah.Annotations[AnnotationManagedActorTemplate] == "true" { + return ah.Namespace, actorTemplateName(ah) + } + if cfg.DefaultActorTemplateNamespace != "" && cfg.DefaultActorTemplateName != "" { + return cfg.DefaultActorTemplateNamespace, cfg.DefaultActorTemplateName + } + return ah.Namespace, actorTemplateName(ah) +} + +func substrateConnectionEndpoint(namespace, name string, actor *ateapipb.Actor) string { + gw := fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", namespace, name) + if actor == nil { + return "kagent gateway: " + gw + } + if podIP := strings.TrimSpace(actor.GetAteomPodIp()); podIP != "" { + return fmt.Sprintf("http://%s:80 (pod IP; UI via kagent %s)", podIP, gw) + } + return fmt.Sprintf("kagent gateway: %s (actor status %s)", gw, actor.GetStatus()) +} + +func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime != v1alpha2.AgentHarnessRuntimeSubstrate { + return fmt.Errorf("substrate backend called for runtime %q", runtime) + } + if ah.Spec.Substrate == nil { + return fmt.Errorf("spec.substrate is required when runtime is 
substrate") + } + if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { + return nil + } + if strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location) == "" { + return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + } + return nil +} + +func actorStatusToCondition(actor *ateapipb.Actor) (metav1.ConditionStatus, string, string) { + if actor == nil { + return metav1.ConditionUnknown, "ActorMissing", "empty actor response" + } + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_RUNNING: + if ip := actor.GetAteomPodIp(); ip != "" { + return metav1.ConditionTrue, "ActorRunning", fmt.Sprintf("actor running on %s", ip) + } + return metav1.ConditionTrue, "ActorRunning", "actor is running" + case ateapipb.Actor_STATUS_RESUMING: + return metav1.ConditionFalse, "ActorResuming", "actor is resuming" + case ateapipb.Actor_STATUS_SUSPENDING: + return metav1.ConditionFalse, "ActorSuspending", "actor is suspending" + case ateapipb.Actor_STATUS_SUSPENDED: + return metav1.ConditionFalse, "ActorSuspended", "actor is suspended" + default: + return metav1.ConditionUnknown, "ActorStatusUnknown", actor.GetStatus().String() + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go new file mode 100644 index 0000000000..fa7c6c8d75 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go @@ -0,0 +1,53 @@ +package substrate + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestActorID(t *testing.T) { + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "kagent", + Name: "my-claw", + UID: "00000000-0000-0000-0000-000000000001", + }, + } + id := ActorID(ah) + if !dns1123Label.MatchString(id) { + t.Fatalf("ActorID %q is not DNS-1123", id) + } + if id == "" { + t.Fatal("expected 
non-empty actor id") + } +} + +func TestActorHost(t *testing.T) { + got := ActorHost("ahr-kagent-my-claw", "") + if got != "ahr-kagent-my-claw.actors.resources.substrate.ate.dev" { + t.Fatalf("ActorHost = %q", got) + } +} + +func TestActorTemplateRefManagedProvisioner(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "kagent", + Name: "peterj-claw", + Annotations: map[string]string{ + AnnotationManagedActorTemplate: "true", + }, + }, + } + ns, name := actorTemplateRef(ah, Config{ + DefaultActorTemplateNamespace: "ate-demo-openclaw", + DefaultActorTemplateName: "openclaw", + }) + if ns != "kagent" || name != "peterj-claw" { + t.Fatalf("got %s/%s, want kagent/peterj-claw", ns, name) + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go new file mode 100644 index 0000000000..d8a63de188 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -0,0 +1,301 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +const ( + AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" + AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" + + annotationManagedWorkerPool = AnnotationManagedWorkerPool + annotationManagedActorTemplate = AnnotationManagedActorTemplate + + defaultWorkerPoolReplicas = int32(2) + defaultOpenClawContainer = "openclaw" +) + +// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. 
+type ProvisionDefaults struct { + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + DefaultWorkloadImage string + GatewayToken string +} + +// ateActorDeleter removes actors from ate-api during harness teardown. +type ateActorDeleter interface { + deleteActorSequenced(ctx context.Context, actorID string) error +} + +// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. +type Provisioner struct { + Client client.Client + Defaults ProvisionDefaults + // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. + Ate ateActorDeleter +} + +// EnsureResult describes provisioned Substrate resources. +type EnsureResult struct { + WorkerPoolRef types.NamespacedName + ActorTemplateRef types.NamespacedName + ActorTemplateReady bool + ManagedWorkerPool bool + ManagedActorTemplate bool +} + +// Ensure creates or updates Substrate CRs and reports, without blocking, whether the ActorTemplate is Ready (ActorTemplateReady). +func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { + if ah == nil || ah.Spec.Substrate == nil { + return EnsureResult{}, fmt.Errorf("spec.substrate is required") + } + if err := validateSubstrateProvisionSpec(ah); err != nil { + return EnsureResult{}, err + } + + // Legacy / advanced: user supplied an existing template.
+ if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + ready, err := p.actorTemplateReady(ctx, tmplKey) + if err != nil { + return EnsureResult{}, err + } + return EnsureResult{ + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedActorTemplate: false, + }, nil + } + + wpKey, managedWP, err := p.ensureWorkerPool(ctx, ah) + if err != nil { + return EnsureResult{}, err + } + + tmplKey, err := p.ensureActorTemplate(ctx, ah, wpKey) + if err != nil { + return EnsureResult{}, err + } + + ready, err := p.actorTemplateReady(ctx, tmplKey) + if err != nil { + return EnsureResult{}, err + } + + // managedWP is false when spec.substrate.workerPoolRef names an existing pool, true when kagent owns the pool. + return EnsureResult{ + WorkerPoolRef: wpKey, + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedWorkerPool: managedWP, + ManagedActorTemplate: true, + }, nil +} + +func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { + sub := ah.Spec.Substrate + if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { + return nil + } + loc := strings.TrimSpace(sub.SnapshotsConfig.Location) + if loc == "" { + return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + } + if !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") + } + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { + return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") + } + return nil +} + +func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { + sub := ah.Spec.Substrate + if sub.WorkerPoolRef != nil &&
strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { + ns := sub.WorkerPoolRef.Namespace + if ns == "" { + ns = ah.Namespace + } + key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) + } + return key, false, nil + } + + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + replicas := defaultWorkerPoolReplicas + ateomImage := "" + if sub.WorkerPool != nil { + if sub.WorkerPool.Replicas > 0 { + replicas = sub.WorkerPool.Replicas + } + ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) + } + if ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set spec.substrate.workerPool.ateomImage)") + } + + desired := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: replicas, + AteomImage: ateomImage, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) + } + + var existing atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) + } + return key, true, nil + } else if err != nil { + return types.NamespacedName{}, false, err + } + existing.Spec.Replicas = desired.Spec.Replicas + existing.Spec.AteomImage = desired.Spec.AteomImage + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) + } + return key, true, nil +} + +func (p *Provisioner) ensureActorTemplate(ctx 
context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) + if workloadImage == "" { + workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) + } + if workloadImage == "" { + workloadImage = openshell.NemoclawSandboxBaseImage + } + startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + if err != nil { + return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) + } + + desired := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.ActorTemplateSpec{ + PauseImage: p.Defaults.PauseImage, + Runsc: defaultRunscConfig(p.Defaults), + Containers: []atev1alpha1.Container{ + { + Name: defaultOpenClawContainer, + Image: workloadImage, + Ports: []corev1.ContainerPort{{ContainerPort: 80}}, + Command: []string{ + "/bin/sh", + "-c", + startupScript, + }, + Env: containerEnv, + }, + }, + WorkerPoolRef: corev1.ObjectReference{ + Name: wpKey.Name, + Namespace: wpKey.Namespace, + }, + SnapshotsConfig: atev1alpha1.SnapshotsConfig{ + Location: strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location), + }, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) + } + + var existing atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) + } + return key, nil + } else if err != nil { + return types.NamespacedName{}, err + } + existing.Spec = desired.Spec + if err := p.Client.Update(ctx, &existing); err != nil 
{ + return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) + } + return key, nil +} + +func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err != nil { + return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) + } + return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil +} + +func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { + return atev1alpha1.RunscConfig{ + AMD64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscAMD64URL, + SHA256Hash: d.RunscAMD64SHA256, + }, + ARM64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscARM64URL, + SHA256Hash: d.RunscARM64SHA256, + }, + } +} + +func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { + return map[string]string{ + "app.kubernetes.io/managed-by": "kagent", + "kagent.dev/agent-harness": ah.Name, + } +} + +func workerPoolName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name + "-wp") +} + +func actorTemplateName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name) +} + +func truncateDNS1123(s string) string { + s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) + if len(s) > 63 { + s = strings.TrimRight(s[:63], "-") + } + return s +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go new file mode 100644 index 0000000000..b2d53e405a --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -0,0 +1,88 @@ +package substrate + +import ( + "context" + "encoding/base64" + "fmt" + "sort" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/internal/utils" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + corev1 "k8s.io/api/core/v1" +) + +const defaultSubstrateOpenClawGatewayPort = 80 + +// 
buildOpenClawActorStartup returns the ateom workload startup script and container env for OpenClaw on Substrate. +// When spec.modelConfigRef is set, openclaw.json includes models/agents/channels like the OpenShell bootstrap path. +func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { + if ah == nil { + return "", nil, fmt.Errorf("AgentHarness is required") + } + if p.Client == nil { + return "", nil, fmt.Errorf("substrate provisioner kubernetes client is required") + } + + token := strings.TrimSpace(p.Defaults.GatewayToken) + gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) + + var jsonBytes []byte + var envMap map[string]string + + ref := strings.TrimSpace(ah.Spec.ModelConfigRef) + if ref != "" { + mcRef, parseErr := utils.ParseRefString(ref, ah.Namespace) + if parseErr != nil { + return "", nil, fmt.Errorf("parse modelConfigRef %q: %w", ref, parseErr) + } + mc := &v1alpha2.ModelConfig{} + if getErr := p.Client.Get(ctx, mcRef, mc); getErr != nil { + return "", nil, fmt.Errorf("get ModelConfig %s: %w", mcRef, getErr) + } + jsonBytes, envMap, err = openclaw.BuildBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw, openclaw.SubstrateBootstrapDefaultBaseURL) + if err != nil { + return "", nil, fmt.Errorf("build openclaw bootstrap json: %w", err) + } + } else { + jsonBytes, err = openclaw.BuildGatewayOnlyBootstrapJSON(gw) + if err != nil { + return "", nil, fmt.Errorf("build gateway-only openclaw json: %w", err) + } + envMap = map[string]string{} + } + + containerEnv := openClawEnvVars(envMap) + script = openClawStartupScript(jsonBytes, gw.Port) + return script, containerEnv, nil +} + +func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { + keys := make([]string, 0, len(envMap)) + for k := range envMap { + keys = append(keys, k) + } + sort.Strings(keys) + out := make([]corev1.EnvVar, 0, len(keys)+1) + for _, k := range keys { + 
out = append(out, corev1.EnvVar{Name: k, Value: envMap[k]}) + } + out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) + return out +} + +func openClawStartupScript(jsonBytes []byte, gwPort int) string { + b64 := base64.StdEncoding.EncodeToString(jsonBytes) + return strings.Join([]string{ + "set -e", + `mkdir -p "${HOME}/.openclaw"`, + fmt.Sprintf(`echo '%s' | base64 -d > "${HOME}/.openclaw/openclaw.json"`, b64), + fmt.Sprintf("openclaw gateway run --port %d --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 &", gwPort), + `for i in $(seq 1 60); do`, + fmt.Sprintf(` curl -sf http://127.0.0.1:%d/ >/dev/null 2>&1 && echo "gateway up" && break`, gwPort), // probe the same port the gateway was started on + " sleep 1", + "done", + "tail -f /tmp/openclaw-gateway.log /dev/null", + }, "\n") +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go new file mode 100644 index 0000000000..f4ac28d3f9 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -0,0 +1,153 @@ +package substrate + +import ( + "context" + "encoding/base64" + "encoding/json" + "strings" + "testing" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name:
"default-model-config", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + ModelConfigRef: "default-model-config", + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + p := &Provisioner{ + Client: kube, + Defaults: ProvisionDefaults{GatewayToken: "some-token"}, + } + + script, env, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + require.Contains(t, script, "base64 -d") + require.Contains(t, script, "openclaw gateway run --port 80") + + var foundKey bool + for _, e := range env { + if e.Name == "OPENAI_API_KEY" && e.Value == "sk-test" { + foundKey = true + } + } + require.True(t, foundKey, "expected OPENAI_API_KEY in container env") + + // Decode embedded JSON from the base64 line in the startup script. 
+ var payload string + for _, line := range strings.Split(script, "\n") { + if !strings.Contains(line, "base64 -d") { + continue + } + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + require.Greater(t, end, start) + payload = line[start:end] + break + } + require.NotEmpty(t, payload) + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + require.Equal(t, float64(80), gw["port"]) + auth := gw["auth"].(map[string]any) + require.Equal(t, "token", auth["mode"]) + require.Equal(t, "some-token", auth["token"]) + _, hasModels := root["models"] + require.False(t, hasModels, "substrate bootstrap should omit models unless ModelConfig sets an explicit baseUrl") + require.Contains(t, root, "agents") +} + +func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{BaseURL: "https://api.example/v1"}, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + ModelConfigRef: "mc", + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + + kube := 
fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + p := &Provisioner{Client: kube, Defaults: ProvisionDefaults{}} + script, _, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + + var payload string + for _, line := range strings.Split(script, "\n") { + if strings.Contains(line, "base64 -d") { + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + payload = line[start:end] + break + } + } + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + openai := root["models"].(map[string]any)["providers"].(map[string]any)["openai"].(map[string]any) + require.Equal(t, "https://api.example/v1", openai["baseUrl"]) +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go new file mode 100644 index 0000000000..4878d40a99 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -0,0 +1,47 @@ +package substrate + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestValidateSubstrateProvisionSpec(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + if err := validateSubstrateProvisionSpec(ah); err != nil { + t.Fatalf("expected valid: %v", err) + } + + ah.Spec.Substrate.SnapshotsConfig.Location = "s3://nope" + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error for non-gs location") + } + + ah.Spec.Substrate.SnapshotsConfig.Location = "gs://ok" + 
ah.Spec.Substrate.WorkerPoolRef = &v1alpha2.TypedReference{Name: "pool"} + ah.Spec.Substrate.WorkerPool = &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 2} + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error for workerPoolRef and workerPool together") + } +} + +func TestActorTemplateName(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} + if got := actorTemplateName(ah); got != "my-claw" { + t.Fatalf("got %q", got) + } +} diff --git a/go/go.mod b/go/go.mod index 1b498ee8d8..b2d0f7fa2b 100644 --- a/go/go.mod +++ b/go/go.mod @@ -61,6 +61,7 @@ require ( ) require ( + github.com/agent-substrate/substrate v0.0.0 github.com/aws/aws-sdk-go-v2 v1.41.7 github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.50.6 github.com/golang/protobuf v1.5.4 @@ -85,7 +86,7 @@ require ( cel.dev/expr v0.25.1 // indirect charm.land/lipgloss/v2 v2.0.3 // indirect cloud.google.com/go v0.123.0 // indirect - cloud.google.com/go/auth v0.18.2 // indirect + cloud.google.com/go/auth v0.19.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect codeberg.org/chavacava/garif v0.2.0 // indirect @@ -195,20 +196,20 @@ require ( github.com/go-critic/go-critic v0.14.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/go-openapi/jsonpointer v0.22.1 // indirect - github.com/go-openapi/jsonreference v0.21.2 // indirect - github.com/go-openapi/swag v0.25.1 // indirect - github.com/go-openapi/swag/cmdutils v0.25.1 // indirect - github.com/go-openapi/swag/conv v0.25.1 // indirect - github.com/go-openapi/swag/fileutils v0.25.1 // indirect - github.com/go-openapi/swag/jsonname v0.25.1 // indirect - github.com/go-openapi/swag/jsonutils v0.25.1 // indirect - github.com/go-openapi/swag/loading v0.25.1 // indirect - github.com/go-openapi/swag/mangling v0.25.1 // indirect - 
github.com/go-openapi/swag/netutils v0.25.1 // indirect - github.com/go-openapi/swag/stringutils v0.25.1 // indirect - github.com/go-openapi/swag/typeutils v0.25.1 // indirect - github.com/go-openapi/swag/yamlutils v0.25.1 // indirect + github.com/go-openapi/jsonpointer v0.22.4 // indirect + github.com/go-openapi/jsonreference v0.21.4 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-toolsmith/astcast v1.1.0 // indirect github.com/go-toolsmith/astcopy v1.1.0 // indirect github.com/go-toolsmith/astequal v1.2.0 // indirect @@ -240,7 +241,7 @@ require ( github.com/google/s2a-go v0.1.9 // indirect github.com/google/safehtml v0.1.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect - github.com/googleapis/gax-go/v2 v2.18.0 // indirect + github.com/googleapis/gax-go/v2 v2.21.0 // indirect github.com/gordonklaus/ineffassign v0.2.0 // indirect github.com/gostaticanalysis/analysisutil v0.7.1 // indirect github.com/gostaticanalysis/comment v1.5.0 // indirect @@ -303,6 +304,7 @@ require ( github.com/moby/moby/api v1.54.1 // indirect github.com/moby/moby/client v0.4.0 // indirect github.com/moby/patternmatcher v0.6.1 // indirect + github.com/moby/spdystream v0.5.1 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect 
github.com/moby/sys/userns v0.1.0 // indirect @@ -391,7 +393,7 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/bridges/prometheus v0.68.0 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.42.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect @@ -417,7 +419,7 @@ require ( golang.org/x/time v0.15.0 // indirect golang.org/x/tools v0.45.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/api v0.272.0 // indirect + google.golang.org/api v0.274.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect diff --git a/go/go.sum b/go/go.sum index 7392185179..1deb8cbab3 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ charm.land/lipgloss/v2 v2.0.3 h1:yM2zJ4Cf5Y51b7RHIwioil4ApI/aypFXXVHSwlM6RzU= charm.land/lipgloss/v2 v2.0.3/go.mod h1:7myLU9iG/3xluAWzpY/fSxYYHCgoKTie7laxk6ATwXA= cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE= cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU= -cloud.google.com/go/auth v0.18.2 h1:+Nbt5Ev0xEqxlNjd6c+yYUeosQ5TtEUaNcN/3FozlaM= -cloud.google.com/go/auth v0.18.2/go.mod h1:xD+oY7gcahcu7G2SG2DsBerfFxgPAJz17zz2joOFF3M= +cloud.google.com/go/auth v0.19.0 h1:DGYwtbcsGsT1ywuxsIoWi1u/vlks0moIblQHgSDgQkQ= +cloud.google.com/go/auth v0.19.0/go.mod h1:2Aph7BT2KnaSFOM0JDPyiYgNh6PL9vGMiP8CUIXZ+IY= cloud.google.com/go/auth/oauth2adapt v0.2.8 
h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= @@ -70,6 +70,8 @@ github.com/abiosoft/ishell/v2 v2.0.2 h1:5qVfGiQISaYM8TkbBl7RFO6MddABoXpATrsFbVI+ github.com/abiosoft/ishell/v2 v2.0.2/go.mod h1:E4oTCXfo6QjoCart0QYa5m9w4S+deXs/P/9jA77A9Bs= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db h1:CjPUSXOiYptLbTdr1RceuZgSFDQ7U15ITERUGrUORx8= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530= +github.com/agent-substrate/substrate v0.0.0 h1:XEX4QAjzaIcv4amBqBvPE/f40WV5WHRWo7u04xvqv/g= +github.com/agent-substrate/substrate v0.0.0/go.mod h1:8Z4SJqPWDMPBa76JgIdpiX0jTY1JXcfLTXEAtkUv7go= github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8vS6K3D0= github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= github.com/alecthomas/chroma/v2 v2.24.1 h1:m5ffpfZbIb++k8AqFEKy9uVgY12xIQtBsQlc6DfZJQM= @@ -92,6 +94,8 @@ github.com/anthropics/anthropic-sdk-go v1.43.0 h1:ShY3C7lafzHP0ze1dCxL3ZFZzvkGfX github.com/anthropics/anthropic-sdk-go v1.43.0/go.mod h1:5cEaslQ6A9ajdL5YUvhNW57LKxEz0OAZ7WEzgZWLD7k= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/ashanbrown/forbidigo/v2 v2.3.1 h1:KAZijvQ7zeIBKbhikT4jCm0TLYXC4u78bTiLh/8JROI= github.com/ashanbrown/forbidigo/v2 v2.3.1/go.mod h1:2QDkLTzU6TV937eFROamXrW92M3paehdae4HCDCOZCM= github.com/ashanbrown/makezero/v2 v2.2.1 
h1:A7uU8dgB1PA9aelTxHMfHIQ8Qev8AB3JLxJUBUsejqM= @@ -300,36 +304,40 @@ github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk= -github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM= -github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU= -github.com/go-openapi/jsonreference v0.21.2/go.mod h1:pp3PEjIsJ9CZDGCNOyXIQxsNuroxm8FAJ/+quA0yKzQ= -github.com/go-openapi/swag v0.25.1 h1:6uwVsx+/OuvFVPqfQmOOPsqTcm5/GkBhNwLqIR916n8= -github.com/go-openapi/swag v0.25.1/go.mod h1:bzONdGlT0fkStgGPd3bhZf1MnuPkf2YAys6h+jZipOo= -github.com/go-openapi/swag/cmdutils v0.25.1 h1:nDke3nAFDArAa631aitksFGj2omusks88GF1VwdYqPY= -github.com/go-openapi/swag/cmdutils v0.25.1/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= -github.com/go-openapi/swag/conv v0.25.1 h1:+9o8YUg6QuqqBM5X6rYL/p1dpWeZRhoIt9x7CCP+he0= -github.com/go-openapi/swag/conv v0.25.1/go.mod h1:Z1mFEGPfyIKPu0806khI3zF+/EUXde+fdeksUl2NiDs= -github.com/go-openapi/swag/fileutils v0.25.1 h1:rSRXapjQequt7kqalKXdcpIegIShhTPXx7yw0kek2uU= -github.com/go-openapi/swag/fileutils v0.25.1/go.mod h1:+NXtt5xNZZqmpIpjqcujqojGFek9/w55b3ecmOdtg8M= -github.com/go-openapi/swag/jsonname v0.25.1 h1:Sgx+qbwa4ej6AomWC6pEfXrA6uP2RkaNjA9BR8a1RJU= -github.com/go-openapi/swag/jsonname v0.25.1/go.mod h1:71Tekow6UOLBD3wS7XhdT98g5J5GR13NOTQ9/6Q11Zo= -github.com/go-openapi/swag/jsonutils v0.25.1 h1:AihLHaD0brrkJoMqEZOBNzTLnk81Kg9cWr+SPtxtgl8= -github.com/go-openapi/swag/jsonutils v0.25.1/go.mod h1:JpEkAjxQXpiaHmRO04N1zE4qbUEg3b7Udll7AMGTNOo= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1 
h1:DSQGcdB6G0N9c/KhtpYc71PzzGEIc/fZ1no35x4/XBY= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1/go.mod h1:kjmweouyPwRUEYMSrbAidoLMGeJ5p6zdHi9BgZiqmsg= -github.com/go-openapi/swag/loading v0.25.1 h1:6OruqzjWoJyanZOim58iG2vj934TysYVptyaoXS24kw= -github.com/go-openapi/swag/loading v0.25.1/go.mod h1:xoIe2EG32NOYYbqxvXgPzne989bWvSNoWoyQVWEZicc= -github.com/go-openapi/swag/mangling v0.25.1 h1:XzILnLzhZPZNtmxKaz/2xIGPQsBsvmCjrJOWGNz/ync= -github.com/go-openapi/swag/mangling v0.25.1/go.mod h1:CdiMQ6pnfAgyQGSOIYnZkXvqhnnwOn997uXZMAd/7mQ= -github.com/go-openapi/swag/netutils v0.25.1 h1:2wFLYahe40tDUHfKT1GRC4rfa5T1B4GWZ+msEFA4Fl4= -github.com/go-openapi/swag/netutils v0.25.1/go.mod h1:CAkkvqnUJX8NV96tNhEQvKz8SQo2KF0f7LleiJwIeRE= -github.com/go-openapi/swag/stringutils v0.25.1 h1:Xasqgjvk30eUe8VKdmyzKtjkVjeiXx1Iz0zDfMNpPbw= -github.com/go-openapi/swag/stringutils v0.25.1/go.mod h1:JLdSAq5169HaiDUbTvArA2yQxmgn4D6h4A+4HqVvAYg= -github.com/go-openapi/swag/typeutils v0.25.1 h1:rD/9HsEQieewNt6/k+JBwkxuAHktFtH3I3ysiFZqukA= -github.com/go-openapi/swag/typeutils v0.25.1/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8= -github.com/go-openapi/swag/yamlutils v0.25.1 h1:mry5ez8joJwzvMbaTGLhw8pXUnhDK91oSJLDPF1bmGk= -github.com/go-openapi/swag/yamlutils v0.25.1/go.mod h1:cm9ywbzncy3y6uPm/97ysW8+wZ09qsks+9RS8fLWKqg= +github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4= +github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80= +github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8= +github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= 
+github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils 
v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= github.com/go-pg/pg/v10 v10.11.0 h1:CMKJqLgTrfpE/aOVeLdybezR2om071Vh38OLZjsyMI0= github.com/go-pg/pg/v10 v10.11.0/go.mod h1:4BpHRoxE61y4Onpof3x1a2SQvi9c+q1dJnrNdMjsroA= github.com/go-pg/zerochecker v0.2.0 h1:pp7f72c3DobMWOb2ErtZsnrPaSvHd2W4o9//8HtF4mU= @@ -426,8 +434,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8= github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg= -github.com/googleapis/gax-go/v2 v2.18.0 h1:jxP5Uuo3bxm3M6gGtV94P4lliVetoCB4Wk2x8QA86LI= -github.com/googleapis/gax-go/v2 v2.18.0/go.mod h1:uSzZN4a356eRG985CzJ3WfbFSpqkLTjsnhWGJR6EwrE= +github.com/googleapis/gax-go/v2 v2.21.0 h1:h45NjjzEO3faG9Lg/cFrBh2PgegVVgzqKzuZl/wMbiI= +github.com/googleapis/gax-go/v2 v2.21.0/go.mod h1:But/NJU6TnZsrLai/xBAQLLz+Hc7fHZJt/hsCz3Fih4= github.com/gordonklaus/ineffassign v0.2.0 h1:Uths4KnmwxNJNzq87fwQQDDnbNb7De00VOk9Nu0TySs= github.com/gordonklaus/ineffassign v0.2.0/go.mod h1:TIpymnagPSexySzs7F9FnO1XFTy8IT3a59vmZp5Y9Lw= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -595,6 +603,8 @@ github.com/moby/moby/client v0.4.0 
h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjI github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= @@ -869,8 +879,8 @@ go.opentelemetry.io/contrib/detectors/gcp v1.42.0 h1:kpt2PEJuOuqYkPcktfJqWWDjTEd go.opentelemetry.io/contrib/detectors/gcp v1.42.0/go.mod h1:W9zQ439utxymRrXsUOzZbFX4JhLxXU4+ZnCt8GG7yA8= go.opentelemetry.io/contrib/exporters/autoexport v0.68.0 h1:0D3GFvELGIwQGfC6agLsbrEYSGWZTRTxIXxcQUqrOuk= go.opentelemetry.io/contrib/exporters/autoexport v0.68.0/go.mod h1:DM2NV7Zb8CcGeVPt6glouY0FAiwZQ/iqgcWExhgWeN8= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0 h1:XmiuHzgJt067+a6kwyAzkhXooYVv3/TOw9cM2VfJgUM= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0/go.mod h1:KDgtbWKTQs4bM+VPUr6WlL9m/WXcmkCcBlIzqxPGzmI= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 h1:0Qx7VGBacMm9ZENQ7TnNObTYI4ShC+lHI16seduaxZo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0/go.mod h1:Sje3i3MjSPKTSPvVWCaL8ugBzJwik3u4smCjUeuupqg= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= go.opentelemetry.io/otel v1.43.0 
h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= @@ -1027,10 +1037,12 @@ gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= google.golang.org/adk v1.2.0 h1:MfQD1/GqPfIsFNBcozNykkjdqNIdCrPH/SNqKPZF/yM= google.golang.org/adk v1.2.0/go.mod h1:6QY5jQI7awU4WYtJqvyIkJQheCvqsGWweU6BX63USEc= -google.golang.org/api v0.272.0 h1:eLUQZGnAS3OHn31URRf9sAmRk3w2JjMx37d2k8AjJmA= -google.golang.org/api v0.272.0/go.mod h1:wKjowi5LNJc5qarNvDCvNQBn3rVK8nSy6jg2SwRwzIA= +google.golang.org/api v0.274.0 h1:aYhycS5QQCwxHLwfEHRRLf9yNsfvp1JadKKWBE54RFA= +google.golang.org/api v0.274.0/go.mod h1:JbAt7mF+XVmWu6xNP8/+CTiGH30ofmCmk9nM8d8fHew= google.golang.org/genai v1.57.0 h1:qTyG2ynz5dQy2jF4CvZdLHHVslhR0heMue+zM1a4GNM= google.golang.org/genai v1.57.0/go.mod h1:A3kkl0nyBjyFlNjgxIwKq70julKbIxpSxqKO5gw/gmk= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:L43LFes82YgSonw6iTXTxXUX1OlULt4AQtkik4ULL/I= google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d h1:/aDRtSZJjyLQzm75d+a1wOJaqyKBMvIAfeQmoa3ORiI= google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d/go.mod h1:etfGUgejTiadZAUaEP14NP97xi1RGeawqkjDARA/UOs= google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d h1:wT2n40TBqFY6wiwazVK9/iTWbsQrgk5ZfCSVFLO9LQA= diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 308d7ba0f2..f82ff2a1d4 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -19,6 +19,9 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string - jsonPath: .spec.backend name: Backend type: string @@ -511,6 
+514,106 @@ spec: type: string type: array type: object + runtime: + default: openshell + description: Runtime selects the harness provisioning stack. Defaults + to openshell when unset. + enum: + - openshell + - substrate + type: string + substrate: + description: Substrate is required when runtime is substrate. + properties: + actorTemplateRef: + description: |- + ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + gatewayPort: + default: 80 + description: GatewayPort is the port OpenClaw listens on inside + the actor (Substrate routes to :80 today). + format: int32 + type: integer + gatewayTokenSecretRef: + description: |- + GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + When unset, the controller falls back to --substrate-gateway-token(-file). + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + snapshotsConfig: + description: SnapshotsConfig is required for auto-provisioned + templates (GCS gs:// location). + properties: + location: + description: |- + Location is the GCS URI prefix for golden and incremental snapshots. + Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + type: string + required: + - location + type: object + workerPool: + description: WorkerPool creates a dedicated WorkerPool in the + harness namespace when workerPoolRef is unset. + properties: + ateomImage: + description: |- + AteomImage is the ateom herder image (pullable registry ref, not ko://). + Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). 
+ type: string + replicas: + default: 2 + description: Replicas is the number of ateom worker pods. + Defaults to 2 when unset or zero. + format: int32 + type: integer + type: object + workerPoolRef: + description: |- + WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + Mutually exclusive with workerPool. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workloadImage: + description: WorkloadImage overrides the default nemoclaw/openclaw + sandbox image in the ActorTemplate. + type: string + required: + - snapshotsConfig + type: object required: - backend type: object @@ -612,6 +715,44 @@ spec: observedGeneration: format: int64 type: integer + substrate: + description: Substrate records auto-provisioned Substrate CR references. + properties: + actorTemplateReady: + description: ActorTemplateReady is true when the template phase + is Ready (golden snapshot taken). + type: boolean + actorTemplateRef: + description: ActorTemplateRef is the ActorTemplate used when creating + the actor. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workerPoolRef: + description: WorkerPoolRef is the WorkerPool used by the harness + ActorTemplate. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + type: object type: object type: object served: true diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index ee7119b8ea..c4610056c3 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -87,6 +87,32 @@ spec: {{- with .Values.controller.env }} {{- toYaml . 
| nindent 12 }} {{- end }} + {{- if .Values.controller.substrate.enabled }} + - name: SUBSTRATE_ATE_API_ENDPOINT + value: {{ .Values.controller.substrate.ateApiEndpoint | quote }} + {{- if .Values.controller.substrate.ateApiInsecure }} + - name: SUBSTRATE_ATE_API_INSECURE + value: "true" + {{- end }} + - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAMESPACE + value: {{ .Values.controller.substrate.defaultActorTemplateNamespace | quote }} + - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAME + value: {{ .Values.controller.substrate.defaultActorTemplateName | quote }} + {{- if .Values.controller.substrate.gatewayTokenSecretName }} + - name: SUBSTRATE_GATEWAY_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.controller.substrate.gatewayTokenSecretName | quote }} + key: token + {{- else if .Values.controller.substrate.gatewayToken }} + - name: SUBSTRATE_GATEWAY_TOKEN + value: {{ .Values.controller.substrate.gatewayToken | quote }} + {{- end }} + {{- with .Values.controller.substrate.pauseImage }} + - name: SUBSTRATE_PAUSE_IMAGE + value: {{ . | quote }} + {{- end }} + {{- end }} envFrom: - configMapRef: name: {{ include "kagent.fullname" . 
}}-controller diff --git a/helm/kagent/templates/rbac/getter-role.yaml b/helm/kagent/templates/rbac/getter-role.yaml index f0ed9614fe..cafe9d0f5c 100644 --- a/helm/kagent/templates/rbac/getter-role.yaml +++ b/helm/kagent/templates/rbac/getter-role.yaml @@ -53,6 +53,25 @@ - get - list - watch +- apiGroups: + - ate.dev + resources: + - workerpools + - actortemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - ate.dev + resources: + - actortemplates/status + verbs: + - get - apiGroups: - "apps" resources: diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index d6e5dff723..4942879450 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -223,12 +223,26 @@ controller: ports: port: 8083 targetPort: 8083 - # TODO: NEED TO MAKE SURE THESE GET RENDERED IN controller-deployment.yaml - # Extra controller env. Examples — OpenShell: - # env: - # - name: OPENSHELL_GRPC_ADDR - # value: "openshell.my-namespace.svc.cluster.local:8080" - env: [] + # Extra controller env (mapped to flags via SUBSTRATE_* / OPENSHELL_* env names). + # OpenShell AgentHarness: set OPENSHELL_GATEWAY_URL (or leave defaults below). + env: + # - name: OPENSHELL_GATEWAY_URL + # value: openshell.openshell.svc.cluster.local:8080 + # - name: OPENSHELL_INSECURE + # value: "true" + + # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. + # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool); users set + # spec.substrate.snapshotsConfig.location (gs://) and worker pool ref or create spec. + substrate: + enabled: false + ateApiEndpoint: "dns:///api.ate-system.svc:443" + ateApiInsecure: false + gatewayToken: "test-token" + gatewayTokenSecretName: "" + gatewayTokenSecretNamespace: "" + pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + envFrom: [] # Additional volumes on the output Deployment definition. 
@@ -645,7 +659,7 @@ oauth2-proxy: # Skip authentication for kagent's branded login page, health checks, and static assets # This allows unauthenticated users to see the landing page and k8s probes to work skip-auth-route: "^/(health|login)$" - skip-auth-regex: "^/(login|_next/static|_next/image|login-bg\\.(jpg|png|webp)|logo-.*\\.png|favicon\\.ico).*$" + skip-auth-regex: "^/(login|_next/static|_next/image|login-bg\\.(jpg|png|webp)|logo-.*\\.png|favicon\\.ico|api/agentharnesses/.*/gateway).*$" # Use custom templates that redirect to kagent's branded /login page custom-templates-dir: "/templates" diff --git a/ui/next.config.ts b/ui/next.config.ts index e816991448..cc63e385f6 100644 --- a/ui/next.config.ts +++ b/ui/next.config.ts @@ -1,7 +1,22 @@ import type { NextConfig } from "next"; +const controllerDevURL = + process.env.KAGENT_DEV_CONTROLLER_URL ?? "http://127.0.0.1:8083"; + const nextConfig: NextConfig = { output: "standalone", + // Proxy /api to the controller in local dev (next dev :8001 → controller :8083). + async rewrites() { + if (process.env.NODE_ENV === "production") { + return []; + } + return [ + { + source: "/api/:path*", + destination: `${controllerDevURL}/api/:path*`, + }, + ]; + }, logging: { fetches: { fullUrl: true, diff --git a/ui/src/app/openshell/OpenshellTerminalPage.tsx b/ui/src/app/openshell/OpenshellTerminalPage.tsx index ab7a77a618..e937186f38 100644 --- a/ui/src/app/openshell/OpenshellTerminalPage.tsx +++ b/ui/src/app/openshell/OpenshellTerminalPage.tsx @@ -50,9 +50,9 @@ export function OpenshellTerminalPage() { : undefined; const clawHarnessSession = searchParams.get("clawHarness") === "1"; const harnessTerminalSession = clawHarnessSession || harnessBackend === "hermes"; - const autoConnect = Boolean(gatewaySandboxName); const namespace = searchParams.get("ns")?.trim() ?? ""; const crName = searchParams.get("name")?.trim() ?? 
""; + const autoConnect = Boolean(gatewaySandboxName); const modelConfigRef = searchParams.get("modelConfigRef")?.trim() ?? ""; const [plainShellOnly, setPlainShellOnly] = useState(() => searchParams.get("plainShell") === "1"); /** Plain-shell mode the active SSH session was opened with (null when disconnected). */ @@ -63,7 +63,7 @@ export function OpenshellTerminalPage() { const [termError, setTermError] = useState(null); const [sessionActive, setSessionActive] = useState(false); - const [connecting, setConnecting] = useState(() => Boolean(autoConnect && gatewaySandboxName)); + const [connecting, setConnecting] = useState(() => Boolean(autoConnect)); const termHostRef = useRef(null); const termRef = useRef(null); @@ -118,25 +118,28 @@ export function OpenshellTerminalPage() { wsRef.current?.close(); }, []); - const connectTerminal = useCallback( - (gatewayName: string) => { + const connectTerminal = useCallback(() => { const term = termRef.current; if (!term) { setConnecting(false); return; } - const name = gatewayName.trim(); - if (!name) { - setTermError("Missing gateway sandbox name."); - return; - } setTermError(null); setConnecting(true); setSessionActive(false); wsRef.current?.close(); - const url = sandboxSshWebSocketURL(terminalApiBase()); + const name = gatewaySandboxName.trim(); + if (!name) { + setConnecting(false); + setTermError("Missing gateway sandbox name."); + return; + } + + const apiBase = terminalApiBase(); + const url = sandboxSshWebSocketURL(apiBase); + let ws: WebSocket; try { ws = new WebSocket(url); @@ -210,24 +213,22 @@ export function OpenshellTerminalPage() { } }; }, - [plainShellOnly, harnessBackend], + [plainShellOnly, harnessBackend, gatewaySandboxName], ); const restartSession = useCallback(() => { - const name = gatewaySandboxName.trim(); - if (!name) return; wsRef.current?.close(); - window.setTimeout(() => connectTerminal(name), 120); - }, [gatewaySandboxName, connectTerminal]); + window.setTimeout(() => connectTerminal(), 120); 
+ }, [connectTerminal]); useEffect(() => { - if (!autoConnect || !gatewaySandboxName) return; + if (!autoConnect) return; const t = window.setTimeout(() => { if (!termRef.current) return; - connectTerminal(gatewaySandboxName); + connectTerminal(); }, 400); return () => window.clearTimeout(t); - }, [autoConnect, gatewaySandboxName, connectTerminal]); + }, [autoConnect, connectTerminal]); const showReconnect = Boolean(gatewaySandboxName) && !sessionActive && !connecting; const plainShellPendingRestart = @@ -284,7 +285,7 @@ export function OpenshellTerminalPage() { ) : null}
{showReconnect ? ( - ) : null} @@ -304,8 +305,7 @@ export function OpenshellTerminalPage() { {!gatewaySandboxName ? (

- Open an OpenShell sandbox from the Agents list to start a terminal - session. + Open a harness from the Agents list to start a terminal session.

) : null} diff --git a/ui/src/components/AgentCard.tsx b/ui/src/components/AgentCard.tsx index 7128d9f658..da99efea20 100644 --- a/ui/src/components/AgentCard.tsx +++ b/ui/src/components/AgentCard.tsx @@ -20,11 +20,17 @@ import { Brain, MoreHorizontal, Pencil, Terminal, Trash2 } from "lucide-react"; import { k8sRefUtils } from "@/lib/k8sUtils"; import { agentHarnessIcon, + agentHarnessRuntimeLabel, agentHarnessTypeLabel, getAgentHarnessBackend, + getAgentHarnessRuntime, isAgentHarness, } from "@/lib/agentHarness"; -import { isOpenshellSandboxRow, openshellTerminalHref } from "@/lib/openshellSandboxAgents"; +import { + isOpenshellSandboxRow, + isSubstrateHarnessRow, + openshellTerminalHref, +} from "@/lib/openshellSandboxAgents"; import { cn } from "@/lib/utils"; interface AgentCardProps { @@ -38,8 +44,10 @@ export function AgentCard({ agentResponse }: AgentCardProps) { const [deleteOpen, setDeleteOpen] = useState(false); const sshSandbox = isOpenshellSandboxRow(agentResponse); + const substrateHarness = isSubstrateHarnessRow(agentResponse); const agentHarness = isAgentHarness(agentResponse); const harnessBackend = getAgentHarnessBackend(agentResponse); + const harnessRuntime = getAgentHarnessRuntime(agentResponse); const agentRef = k8sRefUtils.toRef( agent.metadata.namespace || '', @@ -88,7 +96,11 @@ export function AgentCard({ agentResponse }: AgentCardProps) { {harnessBackend ? agentHarnessIcon(harnessBackend) : "🦞"} @@ -172,16 +184,19 @@ export function AgentCard({ agentResponse }: AgentCardProps) { ); + const substrateGatewayPath = agentResponse.substrateAgentHarness?.gatewayUIPath; const chatHref = - sshSandbox && agentResponse.openshellAgentHarness - ? 
openshellTerminalHref({ - gatewaySandboxName: agentResponse.openshellAgentHarness.gatewaySandboxName, - namespace: agent.metadata.namespace, - crName: agent.metadata.name, - modelConfigRef: agentResponse.modelConfigRef, - harnessBackend: harnessBackend, - }) - : `/agents/${agent.metadata.namespace}/${agent.metadata.name}/chat`; + substrateHarness && substrateGatewayPath + ? substrateGatewayPath + : sshSandbox && agentResponse.openshellAgentHarness + ? openshellTerminalHref({ + gatewaySandboxName: agentResponse.openshellAgentHarness.gatewaySandboxName, + namespace: agent.metadata.namespace, + crName: agent.metadata.name, + modelConfigRef: agentResponse.modelConfigRef, + harnessBackend, + }) + : `/agents/${agent.metadata.namespace}/${agent.metadata.name}/chat`; return ( <> diff --git a/ui/src/components/AgentListView.tsx b/ui/src/components/AgentListView.tsx index 501d23169c..3e0cfe19c7 100644 --- a/ui/src/components/AgentListView.tsx +++ b/ui/src/components/AgentListView.tsx @@ -24,7 +24,11 @@ import { getAgentHarnessBackend, isAgentHarness, } from "@/lib/agentHarness"; -import { isOpenshellSandboxRow, openshellTerminalHref } from "@/lib/openshellSandboxAgents"; +import { + isOpenshellSandboxRow, + isSubstrateHarnessRow, + openshellTerminalHref, +} from "@/lib/openshellSandboxAgents"; interface AgentListViewProps { agentResponse: AgentResponse[]; @@ -221,6 +225,7 @@ function AgentListRow({ item }: { item: AgentResponse }) { const [deleteOpen, setDeleteOpen] = useState(false); const sshSandbox = isOpenshellSandboxRow(item); + const substrateHarness = isSubstrateHarnessRow(item); const agentHarness = isAgentHarness(item); const harnessBackend = getAgentHarnessBackend(item); @@ -232,26 +237,38 @@ function AgentListRow({ item }: { item: AgentResponse }) { const nTools = countAgentToolBindings(item); const nSkills = countSkills(agent); + const substrateGatewayPath = item.substrateAgentHarness?.gatewayUIPath; const gatewaySandboxName = 
item.openshellAgentHarness?.gatewaySandboxName; const chatPath = useMemo( () => - sshSandbox && gatewaySandboxName - ? openshellTerminalHref({ - gatewaySandboxName, - namespace, - crName: name, - modelConfigRef: item.modelConfigRef, - harnessBackend, - }) - : `/agents/${encodeURIComponent(namespace)}/${encodeURIComponent(name)}/chat`, - [sshSandbox, gatewaySandboxName, namespace, name, item.modelConfigRef, harnessBackend], + substrateHarness && substrateGatewayPath + ? substrateGatewayPath + : sshSandbox && gatewaySandboxName + ? openshellTerminalHref({ + gatewaySandboxName, + namespace, + crName: name, + modelConfigRef: item.modelConfigRef, + harnessBackend, + }) + : `/agents/${encodeURIComponent(namespace)}/${encodeURIComponent(name)}/chat`, + [ + substrateHarness, + substrateGatewayPath, + sshSandbox, + gatewaySandboxName, + namespace, + name, + item.modelConfigRef, + harnessBackend, + ], ); - const goChat = () => { + const goChat = useCallback(() => { if (isReady) { router.push(chatPath); } - }; + }, [isReady, router, chatPath]); const handleEdit = (e: React.MouseEvent) => { e.preventDefault(); diff --git a/ui/src/components/agent-form/OpenClawSandboxFields.tsx b/ui/src/components/agent-form/OpenClawSandboxFields.tsx index 8af459e329..54e7cc6cf4 100644 --- a/ui/src/components/agent-form/OpenClawSandboxFields.tsx +++ b/ui/src/components/agent-form/OpenClawSandboxFields.tsx @@ -162,6 +162,97 @@ export function OpenClawSandboxFields({ {section === "general" ? validationError?.message : null} + + + Control plane + + + {value.runtime === "substrate" ? ( +
+ + Snapshot location (GCS) + set({ substrateSnapshotsLocation: e.target.value })} + /> +

+ Substrate stores golden and incremental snapshots at this gs:// prefix (GCS only today). +

+
+ + Worker pool + + + {value.substrateWorkerPoolMode === "existing" ? ( +
+ + WorkerPool namespace + set({ substrateWorkerPoolRefNamespace: e.target.value })} + /> + + + WorkerPool name + set({ substrateWorkerPoolRefName: e.target.value })} + /> + +
+ ) : ( + + Worker replicas + set({ substrateWorkerPoolReplicas: e.target.value })} + /> + + )} +
+ ) : null} +
+ b === value); } +export function getAgentHarnessRuntime(item: AgentResponse): "openshell" | "substrate" | undefined { + if (!isHarnessListRow(item)) { + return undefined; + } + if (isSubstrateHarnessRow(item)) { + return "substrate"; + } + return "openshell"; +} + /** * When this agent row represents an agent harness, returns the AgentHarness CR backend discriminator (e.g. openclaw vs nemoclaw). * Use {@link isAgentHarness} for a simple boolean check. */ export function getAgentHarnessBackend(item: AgentResponse): AgentHarnessBackend | undefined { - if (!isOpenshellSandboxRow(item)) { + if (!isHarnessListRow(item)) { return undefined; } - const backend = item.openshellAgentHarness?.backend; + const backend = + item.substrateAgentHarness?.backend ?? item.openshellAgentHarness?.backend; return isAgentHarnessBackend(backend) ? backend : undefined; } -/** True when the agents-list row is an agent harness (OpenShell sandbox whose backend is a known harness runtime). */ +/** True when the agents-list row is an agent harness. */ export function isAgentHarness(item: AgentResponse): boolean { return getAgentHarnessBackend(item) !== undefined; } @@ -80,3 +91,7 @@ export function agentHarnessTypeLabel(backend: AgentHarnessBackend): string { } } } + +export function agentHarnessRuntimeLabel(runtime: "openshell" | "substrate"): string { + return runtime === "substrate" ? "Substrate" : "OpenShell"; +} diff --git a/ui/src/lib/openClawSandboxForm.ts b/ui/src/lib/openClawSandboxForm.ts index 50b0c83cac..46608384f1 100644 --- a/ui/src/lib/openClawSandboxForm.ts +++ b/ui/src/lib/openClawSandboxForm.ts @@ -65,7 +65,18 @@ export function isClawHarnessBackend(backend: AgentHarnessSandboxBackend | undef return backend === "openclaw" || backend === "nemoclaw"; } +export type HarnessRuntimeForm = "openshell" | "substrate"; + export interface OpenClawSandboxFormSlice { + /** Harness control plane: OpenShell (default) or Agent Substrate. 
*/ + runtime: HarnessRuntimeForm; + /** Use an existing Substrate WorkerPool or let kagent create one per harness. */ + substrateWorkerPoolMode: "create" | "existing"; + substrateWorkerPoolRefNamespace: string; + substrateWorkerPoolRefName: string; + substrateWorkerPoolReplicas: string; + /** GCS snapshot prefix (gs://bucket/path/) — required for auto-provisioned templates. */ + substrateSnapshotsLocation: string; /** Optional override for Sandbox.spec.image (OpenShell VM template image). Empty → controller default. */ image: string; channels: OpenClawChannelRow[]; @@ -80,6 +91,12 @@ export interface OpenClawSandboxFormSlice { export function defaultOpenClawSandboxFormSlice(): OpenClawSandboxFormSlice { return { + runtime: "openshell", + substrateWorkerPoolMode: "create", + substrateWorkerPoolRefNamespace: "", + substrateWorkerPoolRefName: "", + substrateWorkerPoolReplicas: "2", + substrateSnapshotsLocation: "gs://ate-snapshots/kagent/", image: "", channels: [], allowedDomains: "", @@ -361,11 +378,40 @@ export function buildSandboxCRDraft(args: { } const backend = resolveSandboxBackend(args.backend); + const runtime = args.openClaw.runtime?.trim() || "openshell"; + const spec: Record = { backend, + runtime, modelConfigRef, }; + if (runtime === "substrate") { + const snapshots = args.openClaw.substrateSnapshotsLocation?.trim(); + if (!snapshots) { + return { error: "Substrate snapshots location (gs://…) is required." }; + } + const substrate: Record = { + snapshotsConfig: { location: snapshots }, + }; + if (args.openClaw.substrateWorkerPoolMode === "existing") { + const wpName = args.openClaw.substrateWorkerPoolRefName?.trim(); + if (!wpName) { + return { error: "WorkerPool name is required when using an existing pool." 
}; + } + substrate.workerPoolRef = { + name: wpName, + namespace: args.openClaw.substrateWorkerPoolRefNamespace?.trim() || args.namespace.trim(), + }; + } else { + const replicas = Number.parseInt(args.openClaw.substrateWorkerPoolReplicas?.trim() || "2", 10); + substrate.workerPool = { + replicas: Number.isFinite(replicas) && replicas > 0 ? replicas : 2, + }; + } + spec.substrate = substrate; + } + const desc = args.description.trim(); if (desc) { spec.description = desc; diff --git a/ui/src/lib/openshellSandboxAgents.ts b/ui/src/lib/openshellSandboxAgents.ts index 64c7e45d9d..64f79770c2 100644 --- a/ui/src/lib/openshellSandboxAgents.ts +++ b/ui/src/lib/openshellSandboxAgents.ts @@ -5,6 +5,14 @@ export function isOpenshellSandboxRow(item: AgentResponse): boolean { return Boolean(item.openshellAgentHarness?.gatewaySandboxName); } +export function isSubstrateHarnessRow(item: AgentResponse): boolean { + return Boolean(item.substrateAgentHarness?.gatewayUIPath); +} + +export function isHarnessListRow(item: AgentResponse): boolean { + return isOpenshellSandboxRow(item) || isSubstrateHarnessRow(item); +} + export type OpenshellTerminalLinkParams = { gatewaySandboxName: string; namespace?: string; diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index b4a441ce8e..7f50f04e5b 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -427,6 +427,18 @@ export interface OpenshellAgentHarnessListEntry { endpoint?: string; } +/** Merged into GET /api/agents when AgentHarness.spec.runtime is substrate. */ +export interface SubstrateAgentHarnessListEntry { + backend: string; + runtime: "substrate"; + actorId?: string; + /** Same-origin path for OpenClaw UI (HTTP + WebSocket via kagent proxy to actor pod IP). 
*/ + gatewayUIPath?: string; + modelConfigRef?: string; + backendRefId?: string; + endpoint?: string; +} + export interface AgentResponse { id: number | string; agent: Agent; @@ -438,6 +450,7 @@ export interface AgentResponse { accepted: boolean; workloadMode?: "deployment" | "sandbox"; openshellAgentHarness?: OpenshellAgentHarnessListEntry; + substrateAgentHarness?: SubstrateAgentHarnessListEntry; } export interface RemoteMCPServer { From 76cae6c8f3eb56068bd17fca1ae3e2b6e9560c1b Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 26 May 2026 10:58:54 -0700 Subject: [PATCH 02/32] fix up the optional/non-optional types in the crd/values file Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 92 +++++++++++++++++-- .../crd/bases/kagent.dev_agentharnesses.yaml | 26 ++++-- go/api/v1alpha2/agentharness_types.go | 26 ++++-- go/api/v1alpha2/zz_generated.deepcopy.go | 6 +- .../handlers/agentharness_gateway.go | 48 +--------- go/core/pkg/app/app.go | 56 +++-------- .../pkg/sandboxbackend/substrate/config.go | 3 - .../sandboxbackend/substrate/gateway_token.go | 65 +++++++++++++ .../pkg/sandboxbackend/substrate/openclaw.go | 11 ++- .../pkg/sandboxbackend/substrate/provision.go | 36 ++++++-- .../substrate/provision_openclaw.go | 5 +- .../substrate/provision_openclaw_test.go | 85 ++++++++++++++++- .../substrate/provision_test.go | 84 ++++++++++++++++- .../templates/kagent.dev_agentharnesses.yaml | 26 ++++-- .../templates/controller-deployment.yaml | 30 ++++-- helm/kagent/values.yaml | 14 +-- 16 files changed, 454 insertions(+), 159 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/gateway_token.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index 1b27550895..eebeb17e29 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -42,14 +42,23 @@ Install kagent (Substrate must already be running in the cluster): ```bash export KIND_CLUSTER_NAME=kind 
-make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true" +make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" ``` -Create a harness with only what you must choose: +The generated `ActorTemplate` uses `controller.substrate.pauseImage`, +`controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, +`controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` +from the Helm values. Override them with `--set` or a values file when you need +to pin a different gVisor build. -- **`snapshotsConfig.location`** — GCS `gs://` prefix (Substrate snapshots are GCS-only today) -- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool` + **`ateomImage`**) -- **`workerPool.ateomImage`** — (`localhost:5001/ateom-gvisor:latest`) +Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to +`gs://ate-snapshots/<namespace>/<name>`. If Helm sets +`controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can +be omitted unless you want to override it. 
+ +- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool`) +- **`workerPool.ateomImage`** — optional override for the Helm/controller default (`localhost:5001/ateom-gvisor:latest`) +- **Gateway token** — required per harness with either `gatewayToken` or `gatewayTokenSecretRef` ```yaml apiVersion: kagent.dev/v1alpha2 @@ -63,15 +72,78 @@ spec: description: OpenClaw on Agent Substrate modelConfigRef: default-model-config substrate: - snapshotsConfig: - location: gs://ate-snapshots/kagent/kagent/my-claw/ - workerPool: - replicas: 1 - ateomImage: localhost:5001/ateom-gvisor:latest + # Optional: defaults to gs://ate-snapshots/kagent/peterj-claw + # snapshotsConfig: + # location: gs://ate-snapshots/kagent/peterj-claw + + # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. + # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. + # workerPool: + # replicas: 1 + # ateomImage: localhost:5001/ateom-gvisor:latest + + # Required: configure the OpenClaw gateway token for this harness. + # Use either gatewayToken or gatewayTokenSecretRef. The Secret must contain key "token". + gatewayToken: test-token + # gatewayTokenSecretRef: + # name: openclaw-gateway-token + # namespace: kagent + + # Optional: override the sandbox image used in the ActorTemplate. 
+ # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + # Optional: adopt existing resources instead of auto-create # workerPoolRef: # name: my-pool # namespace: ate-system + # actorTemplateRef: + # name: my-template + # namespace: ate-system +``` + +When `actorTemplateRef` is not set, kagent creates an `ActorTemplate` that looks roughly like this: + +```yaml +apiVersion: ate.dev/v1alpha1 +kind: ActorTemplate +metadata: + name: peterj-claw + namespace: kagent + labels: + app.kubernetes.io/managed-by: kagent + kagent.dev/agent-harness: peterj-claw +spec: + pauseImage: gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da + runsc: + amd64: + url: gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc + sha256Hash: a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63 + arm64: + url: gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc + sha256Hash: 1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9 + workerPoolRef: + name: peterj-claw-wp + namespace: kagent + snapshotsConfig: + location: gs://ate-snapshots/kagent/peterj-claw + containers: + - name: openclaw + image: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + ports: + - containerPort: 80 + command: + - /bin/sh + - -c + - | + # Generated by kagent: + # 1. writes ~/.openclaw/openclaw.json from modelConfigRef/channels/gateway token + # 2. starts `openclaw gateway run --port 80 --allow-unconfigured` + # 3. waits for the gateway and tails the log + env: + - name: HOME + value: /root ``` +The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. 
+ Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as a gateway token to OpenClaw. diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index f82ff2a1d4..2c2f18ff71 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -547,10 +547,16 @@ spec: the actor (Substrate routes to :80 today). format: int32 type: integer + gatewayToken: + description: |- + GatewayToken is the OpenClaw gateway Bearer token for this harness. + Prefer gatewayTokenSecretRef for production secrets. + minLength: 1 + type: string gatewayTokenSecretRef: description: |- GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - When unset, the controller falls back to --substrate-gateway-token(-file). + The Secret must contain a "token" key. properties: apiGroup: type: string @@ -564,8 +570,9 @@ spec: - name type: object snapshotsConfig: - description: SnapshotsConfig is required for auto-provisioned - templates (GCS gs:// location). + description: |- + SnapshotsConfig configures actor memory snapshots. Defaults to + gs://ate-snapshots/<namespace>/<name> when unset. properties: location: description: |- @@ -582,12 +589,12 @@ spec: ateomImage: description: |- AteomImage is the ateom herder image (pullable registry ref, not ko://). - Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + Overrides the controller-wide substrate ateom image default for this WorkerPool. type: string replicas: - default: 2 + default: 1 description: Replicas is the number of ateom worker pods. - Defaults to 2 when unset or zero. + Defaults to 1 when unset or zero. format: int32 type: integer type: object @@ -611,9 +618,12 @@ spec: description: WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. 
type: string - required: - - snapshotsConfig type: object + x-kubernetes-validations: + - message: Exactly one of gatewayToken or gatewayTokenSecretRef must + be specified + rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) + || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) required: - backend type: object diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index 26c0109069..44902e1ffb 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -42,8 +42,8 @@ func IsKnownAgentHarnessBackend(b AgentHarnessBackendType) bool { type AgentHarnessRuntime string const ( - AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" - AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" + AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" + AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" ) // AgentHarnessSubstrateSnapshotsConfig points at a GCS prefix for actor memory snapshots. @@ -58,13 +58,13 @@ type AgentHarnessSubstrateSnapshotsConfig struct { // AgentHarnessSubstrateWorkerPoolSpec creates a dedicated WorkerPool for this harness. // Mutually exclusive with workerPoolRef. type AgentHarnessSubstrateWorkerPoolSpec struct { - // Replicas is the number of ateom worker pods. Defaults to 2 when unset or zero. + // Replicas is the number of ateom worker pods. Defaults to 1 when unset or zero. // +optional - // +kubebuilder:default=2 + // +kubebuilder:default=1 Replicas int32 `json:"replicas,omitempty"` // AteomImage is the ateom herder image (pullable registry ref, not ko://). - // Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + // Overrides the controller-wide substrate ateom image default for this WorkerPool. 
// +optional AteomImage string `json:"ateomImage,omitempty"` } @@ -73,6 +73,7 @@ type AgentHarnessSubstrateWorkerPoolSpec struct { // // By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). // Set actorTemplateRef only to adopt an existing template (advanced / legacy). +// +kubebuilder:validation:XValidation:rule="(has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef))",message="Exactly one of gatewayToken or gatewayTokenSecretRef must be specified" type AgentHarnessSubstrateSpec struct { // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). // Mutually exclusive with workerPool. @@ -83,9 +84,10 @@ type AgentHarnessSubstrateSpec struct { // +optional WorkerPool *AgentHarnessSubstrateWorkerPoolSpec `json:"workerPool,omitempty"` - // SnapshotsConfig is required for auto-provisioned templates (GCS gs:// location). - // +required - SnapshotsConfig AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig"` + // SnapshotsConfig configures actor memory snapshots. Defaults to + // gs://ate-snapshots/<namespace>/<name> when unset. + // +optional + SnapshotsConfig *AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig,omitempty"` // WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. // +optional @@ -101,8 +103,14 @@ type AgentHarnessSubstrateSpec struct { // +kubebuilder:default=80 GatewayPort int32 `json:"gatewayPort,omitempty"` + // GatewayToken is the OpenClaw gateway Bearer token for this harness. + // Prefer gatewayTokenSecretRef for production secrets. + // +optional + // +kubebuilder:validation:MinLength=1 + GatewayToken string `json:"gatewayToken,omitempty"` + // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - // When unset, the controller falls back to --substrate-gateway-token(-file). + // The Secret must contain a "token" key. 
// +optional GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` } diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index dd1b350ccb..6acf8938f6 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -411,7 +411,11 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec *out = new(AgentHarnessSubstrateWorkerPoolSpec) **out = **in } - out.SnapshotsConfig = in.SnapshotsConfig + if in.SnapshotsConfig != nil { + in, out := &in.SnapshotsConfig, &out.SnapshotsConfig + *out = new(AgentHarnessSubstrateSnapshotsConfig) + **out = **in + } if in.ActorTemplateRef != nil { in, out := &in.ActorTemplateRef, &out.ActorTemplateRef *out = new(TypedReference) diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 453ec5d907..551a8b8981 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -10,21 +10,18 @@ import ( "net/http" "net/http/httputil" "net/url" - "os" "strings" "time" "github.com/gorilla/mux" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" ) const ( - substrateGatewayTokenSecretKey = "token" // OpenClaw 2026.3.28+ returns 403 without operator scopes on HTTP/WS when only Bearer token is sent. openclawDefaultOperatorScopes = "operator.admin" // Origin OpenClaw accepts by default for bind=lan port=80 (localhost/127.0.0.1 on gateway port). @@ -34,26 +31,10 @@ const ( // AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. 
// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). type AgentHarnessGatewayConfig struct { - GatewayToken string - GatewayTokenFile string - AteAPIEndpoint string - AteAPIInsecure bool - DialTimeout time.Duration - CallTimeout time.Duration -} - -func (c *AgentHarnessGatewayConfig) resolveToken() (string, error) { - if c == nil { - return "", nil - } - if c.GatewayTokenFile != "" { - data, err := os.ReadFile(c.GatewayTokenFile) - if err != nil { - return "", fmt.Errorf("read substrate gateway token file: %w", err) - } - return strings.TrimSpace(string(data)), nil - } - return strings.TrimSpace(c.GatewayToken), nil + AteAPIEndpoint string + AteAPIInsecure bool + DialTimeout time.Duration + CallTimeout time.Duration } // HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway (pod IP when available). @@ -342,24 +323,5 @@ func readGatewayResponseBody(resp *http.Response) ([]byte, error) { } func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { - if ah.Spec.Substrate != nil && ah.Spec.Substrate.GatewayTokenSecretRef != nil { - ref := ah.Spec.Substrate.GatewayTokenSecretRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - var secret corev1.Secret - if err := h.KubeClient.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { - return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) - } - if secret.Data == nil { - return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) - } - val, ok := secret.Data[substrateGatewayTokenSecretKey] - if !ok { - return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, substrateGatewayTokenSecretKey) - } - return strings.TrimSpace(string(val)), nil - } - return h.AgentHarnessGateway.resolveToken() + return substrate.ResolveGatewayToken(ctx, h.KubeClient, ah) } diff --git a/go/core/pkg/app/app.go 
b/go/core/pkg/app/app.go index 7885292985..2389660463 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -53,13 +53,13 @@ import ( // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" dbpkg "github.com/kagent-dev/kagent/go/api/database" "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" "k8s.io/apimachinery/pkg/runtime" @@ -159,13 +159,12 @@ type Config struct { CallTimeout time.Duration DefaultActorTemplateNamespace string DefaultActorTemplateName string - GatewayToken string - GatewayTokenFile string PauseImage string RunscAMD64URL string RunscAMD64SHA256 string RunscARM64URL string RunscARM64SHA256 string + AteomImage string } } @@ -232,13 +231,12 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateNamespace, "substrate-default-actor-template-namespace", "", "Legacy fallback ActorTemplate namespace when adopting an external template (set spec.substrate.actorTemplateRef instead).") commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateName, "substrate-default-actor-template-name", "", "Legacy fallback ActorTemplate name when adopting an external template (set spec.substrate.actorTemplateRef instead).") - commandLine.StringVar(&cfg.Substrate.GatewayToken, "substrate-gateway-token", "", "OpenClaw gateway 
Bearer token for substrate proxy. Prefer --substrate-gateway-token-file.") - commandLine.StringVar(&cfg.Substrate.GatewayTokenFile, "substrate-gateway-token-file", "", "File containing OpenClaw gateway Bearer token for substrate harness proxy.") commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for auto-provisioned ActorTemplates.") commandLine.StringVar(&cfg.Substrate.RunscAMD64URL, "substrate-runsc-amd64-url", "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc", "gVisor runsc URL for amd64.") commandLine.StringVar(&cfg.Substrate.RunscAMD64SHA256, "substrate-runsc-amd64-sha256", "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63", "gVisor runsc sha256 for amd64.") commandLine.StringVar(&cfg.Substrate.RunscARM64URL, "substrate-runsc-arm64-url", "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc", "gVisor runsc URL for arm64.") commandLine.StringVar(&cfg.Substrate.RunscARM64SHA256, "substrate-runsc-arm64-sha256", "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9", "gVisor runsc sha256 for arm64.") + commandLine.StringVar(&cfg.Substrate.AteomImage, "substrate-ateom-image", "", "Default ateom herder image for auto-provisioned Substrate WorkerPools. Per-harness spec.substrate.workerPool.ateomImage overrides this.") commandLine.StringVar(&agent_translator.DefaultServiceAccountName, "default-service-account-name", "", "Global default ServiceAccount name for agent pods. 
When set, agents without an explicit serviceAccountName will use this instead of creating a per-agent ServiceAccount.") @@ -732,21 +730,11 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne var agentHarnessGateway *handlers.AgentHarnessGatewayConfig if cfg.Substrate.AteAPIEndpoint != "" { - gwToken := cfg.Substrate.GatewayToken - if cfg.Substrate.GatewayTokenFile != "" { - data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) - if err != nil { - setupLog.Error(err, "unable to read substrate gateway token file") - os.Exit(1) - } - gwToken = strings.TrimSpace(string(data)) - } agentHarnessGateway = &handlers.AgentHarnessGatewayConfig{ - GatewayToken: gwToken, - AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, - AteAPIInsecure: cfg.Substrate.Insecure, - DialTimeout: cfg.Substrate.DialTimeout, - CallTimeout: cfg.Substrate.CallTimeout, + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + AteAPIInsecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, } } @@ -821,17 +809,14 @@ func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient ocl := openshell.NewOpenClawBackend(kubeClient, clients, oc, nil) hermesBackend := openshell.NewHermesBackend(kubeClient, clients, oc, nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ocl, v1alpha2.AgentHarnessBackendHermes: hermesBackend, }, nil } func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, *substrate.Client, error) { - sc, _, err := substrateAppConfig(cfg) - if err != nil { - return nil, nil, err - } + sc := substrateAppConfig(cfg) client, err := substrate.Dial(ctx, sc) if err != nil { return nil, nil, err @@ -840,20 +825,12 @@ func buildSubstrateSandboxBackends(ctx context.Context, 
cfg *Config) (map[v1alph ocl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendOpenClaw, nil) ncl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendNemoClaw, nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ncl, }, client, nil } -func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { - gwToken := cfg.Substrate.GatewayToken - if cfg.Substrate.GatewayTokenFile != "" { - data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) - if err != nil { - return substrate.Config{}, "", fmt.Errorf("read substrate gateway token file: %w", err) - } - gwToken = strings.TrimSpace(string(data)) - } +func substrateAppConfig(cfg *Config) substrate.Config { sc := substrate.Config{ AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, Insecure: cfg.Substrate.Insecure, @@ -861,25 +838,20 @@ func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { CallTimeout: cfg.Substrate.CallTimeout, DefaultActorTemplateNamespace: cfg.Substrate.DefaultActorTemplateNamespace, DefaultActorTemplateName: cfg.Substrate.DefaultActorTemplateName, - GatewayToken: gwToken, ProvisionDefaults: substrate.ProvisionDefaults{ PauseImage: cfg.Substrate.PauseImage, RunscAMD64URL: cfg.Substrate.RunscAMD64URL, RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultAteomImage: cfg.Substrate.AteomImage, DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, - GatewayToken: gwToken, }, } - return sc, gwToken, nil + return sc } func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Provisioner { - _, gwToken, err := substrateAppConfig(cfg) - if err != nil { - gwToken = cfg.Substrate.GatewayToken - } return &substrate.Provisioner{ Client: kubeClient, 
Ate: ate, @@ -889,8 +861,8 @@ func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate * RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultAteomImage: cfg.Substrate.AteomImage, DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, - GatewayToken: gwToken, }, } } diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go index 45b5cb2b48..092e68ef92 100644 --- a/go/core/pkg/sandboxbackend/substrate/config.go +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -16,7 +16,4 @@ type Config struct { // ProvisionDefaults configures auto-created WorkerPool/ActorTemplate resources. ProvisionDefaults ProvisionDefaults - - // GatewayToken is the OpenClaw gateway Bearer token injected by the HTTP proxy. - GatewayToken string } diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token.go b/go/core/pkg/sandboxbackend/substrate/gateway_token.go new file mode 100644 index 0000000000..abe4b0ba53 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token.go @@ -0,0 +1,65 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// GatewayTokenSecretKey is the Secret data key used for per-harness OpenClaw gateway tokens. +const GatewayTokenSecretKey = "token" + +// ValidateGatewayTokenSpec requires exactly one per-harness OpenClaw gateway token source. 
+func ValidateGatewayTokenSpec(sub *v1alpha2.AgentHarnessSubstrateSpec) error { + if sub == nil { + return fmt.Errorf("spec.substrate is required") + } + hasToken := strings.TrimSpace(sub.GatewayToken) != "" + hasSecretRef := sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" + if hasToken == hasSecretRef { + return fmt.Errorf("exactly one of spec.substrate.gatewayToken or gatewayTokenSecretRef must be specified") + } + return nil +} + +// ResolveGatewayToken returns the per-harness gateway token. +func ResolveGatewayToken(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness) (string, error) { + if ah == nil || ah.Spec.Substrate == nil { + return "", fmt.Errorf("spec.substrate is required") + } + if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { + return "", err + } + sub := ah.Spec.Substrate + if sub.GatewayTokenSecretRef != nil { + return resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) + } + return strings.TrimSpace(sub.GatewayToken), nil +} + +func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, defaultNamespace string, ref *v1alpha2.TypedReference) (string, error) { + if kube == nil { + return "", fmt.Errorf("kubernetes client is required to resolve gateway token secret") + } + ns := ref.Namespace + if ns == "" { + ns = defaultNamespace + } + var secret corev1.Secret + if err := kube.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) + } + if secret.Data == nil { + return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + } + val, ok := secret.Data[GatewayTokenSecretKey] + if !ok { + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, GatewayTokenSecretKey) + } + return strings.TrimSpace(string(val)), nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go 
b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 08d5b7d7a3..0d269a45d8 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -17,8 +17,8 @@ import ( const ( defaultActorHostSuffix = "actors.resources.substrate.ate.dev" - defaultSubstrateGWPort = int32(80) - actorIDPrefix = "ahr" + defaultSubstrateGWPort = int32(80) + actorIDPrefix = "ahr" ) var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) @@ -200,11 +200,14 @@ func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { if ah.Spec.Substrate == nil { return fmt.Errorf("spec.substrate is required when runtime is substrate") } + if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { + return err + } if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { return nil } - if strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location) == "" { - return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + if loc := substrateSnapshotsLocation(ah); !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") } return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index d8a63de188..d704baa927 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -23,7 +23,8 @@ const ( annotationManagedWorkerPool = AnnotationManagedWorkerPool annotationManagedActorTemplate = AnnotationManagedActorTemplate - defaultWorkerPoolReplicas = int32(2) + defaultWorkerPoolReplicas = int32(1) + defaultSnapshotsBucket = "ate-snapshots" defaultOpenClawContainer = "openclaw" ) @@ -34,8 +35,8 @@ type ProvisionDefaults struct { RunscAMD64SHA256 string RunscARM64URL string RunscARM64SHA256 string + 
DefaultAteomImage string DefaultWorkloadImage string - GatewayToken string } // ateActorDeleter removes actors from ate-api during harness teardown. @@ -115,13 +116,13 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { sub := ah.Spec.Substrate + if err := ValidateGatewayTokenSpec(sub); err != nil { + return err + } if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { return nil } - loc := strings.TrimSpace(sub.SnapshotsConfig.Location) - if loc == "" { - return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") - } + loc := substrateSnapshotsLocation(ah) if !strings.HasPrefix(loc, "gs://") { return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") } @@ -156,7 +157,10 @@ func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHa ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) } if ateomImage == "" { - return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set spec.substrate.workerPool.ateomImage)") + ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) + } + if ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") } desired := &atev1alpha1.WorkerPool{ @@ -232,7 +236,7 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen Namespace: wpKey.Namespace, }, SnapshotsConfig: atev1alpha1.SnapshotsConfig{ - Location: strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location), + Location: substrateSnapshotsLocation(ah), }, }, } @@ -277,6 +281,22 @@ func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { } } +func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { + if ah == nil { + 
return defaultSubstrateSnapshotsLocation("", "") + } + if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { + if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { + return loc + } + } + return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) +} + +func defaultSubstrateSnapshotsLocation(namespace, name string) string { + return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) +} + func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { return map[string]string{ "app.kubernetes.io/managed-by": "kagent", diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index b2d53e405a..b5e9903e30 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -25,7 +25,10 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha return "", nil, fmt.Errorf("substrate provisioner kubernetes client is required") } - token := strings.TrimSpace(p.Defaults.GatewayToken) + token, err := ResolveGatewayToken(ctx, p.Client, ah) + if err != nil { + return "", nil, fmt.Errorf("resolve gateway token: %w", err) + } gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) var jsonBytes []byte diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index f4ac28d3f9..9c16ece575 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -43,7 +43,8 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ ModelConfigRef: "default-model-config", Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "some-token", + 
SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -52,8 +53,7 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() p := &Provisioner{ - Client: kube, - Defaults: ProvisionDefaults{GatewayToken: "some-token"}, + Client: kube, } script, env, err := p.buildOpenClawActorStartup(context.Background(), ah) @@ -97,6 +97,58 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { require.Contains(t, root, "agents") } +func TestBuildOpenClawActorStartup_WithHarnessGatewayToken(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openclaw-token", Namespace: ns}, + Data: map[string][]byte{GatewayTokenSecretKey: []byte("secret-token")}, + } + for _, tt := range []struct { + name string + substrate *v1alpha2.AgentHarnessSubstrateSpec + wantToken string + }{ + { + name: "inline token", + substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "inline-token", + }, + wantToken: "inline-token", + }, + { + name: "secret token", + substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayTokenSecretRef: &v1alpha2.TypedReference{Name: "openclaw-token"}, + }, + wantToken: "secret-token", + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret.DeepCopy()).Build() + p := &Provisioner{ + Client: kube, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: tt.substrate, + }, + } + + script, _, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + require.Equal(t, tt.wantToken, gatewayTokenFromStartup(t, script)) + }) 
+ } +} + func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() @@ -123,7 +175,8 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ ModelConfigRef: "mc", Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "some-token", + SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -151,3 +204,27 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { openai := root["models"].(map[string]any)["providers"].(map[string]any)["openai"].(map[string]any) require.Equal(t, "https://api.example/v1", openai["baseUrl"]) } + +func gatewayTokenFromStartup(t *testing.T, script string) string { + t.Helper() + + var payload string + for _, line := range strings.Split(script, "\n") { + if strings.Contains(line, "base64 -d") { + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + require.Greater(t, end, start) + payload = line[start:end] + break + } + } + require.NotEmpty(t, payload) + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + auth := gw["auth"].(map[string]any) + token, _ := auth["token"].(string) + return token +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go index 4878d40a99..e0e767e458 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -1,9 +1,15 @@ package substrate import ( + "context" "testing" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + 
utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/kagent-dev/kagent/go/api/v1alpha2" ) @@ -15,7 +21,8 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "test-token", + SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -25,7 +32,21 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { t.Fatalf("expected valid: %v", err) } - ah.Spec.Substrate.SnapshotsConfig.Location = "s3://nope" + ah.Spec.Substrate.SnapshotsConfig = nil + if err := validateSubstrateProvisionSpec(ah); err != nil { + t.Fatalf("expected default snapshots config to be valid: %v", err) + } + if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { + t.Fatalf("got default snapshots location %q", got) + } + + ah.Spec.Substrate.GatewayToken = "" + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error when gateway token is not configured") + } + + ah.Spec.Substrate.GatewayToken = "test-token" + ah.Spec.Substrate.SnapshotsConfig = &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{Location: "s3://nope"} if err := validateSubstrateProvisionSpec(ah); err == nil { t.Fatal("expected error for non-gs location") } @@ -38,6 +59,65 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { } } +func TestEnsureWorkerPoolUsesDefaultAteomImage(t *testing.T) { + t.Parallel() + + for _, tt := range []struct { + name string + defaultImg string + workerPool *v1alpha2.AgentHarnessSubstrateWorkerPoolSpec + wantImage string + wantReplica int32 + }{ + { + name: "defaults omitted replicas", + defaultImg: "registry.example/ateom:default", + workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{}, + wantImage: 
"registry.example/ateom:default", + wantReplica: 1, + }, + { + name: "workerpool override", + defaultImg: "registry.example/ateom:default", + workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 3, AteomImage: "registry.example/ateom:override"}, + wantImage: "registry.example/ateom:override", + wantReplica: 3, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + ah := &v1alpha2.AgentHarness{ + TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + WorkerPool: tt.workerPool, + }, + }, + } + p := &Provisioner{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Defaults: ProvisionDefaults{DefaultAteomImage: tt.defaultImg}, + } + + key, managed, err := p.ensureWorkerPool(context.Background(), ah) + require.NoError(t, err) + require.True(t, managed) + + var wp atev1alpha1.WorkerPool + require.NoError(t, p.Client.Get(context.Background(), key, &wp)) + require.Equal(t, tt.wantImage, wp.Spec.AteomImage) + require.Equal(t, tt.wantReplica, wp.Spec.Replicas) + }) + } +} + func TestActorTemplateName(t *testing.T) { t.Parallel() ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index f82ff2a1d4..2c2f18ff71 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -547,10 +547,16 @@ spec: the actor (Substrate routes to :80 today). 
format: int32 type: integer + gatewayToken: + description: |- + GatewayToken is the OpenClaw gateway Bearer token for this harness. + Prefer gatewayTokenSecretRef for production secrets. + minLength: 1 + type: string gatewayTokenSecretRef: description: |- GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - When unset, the controller falls back to --substrate-gateway-token(-file). + The Secret must contain a "token" key. properties: apiGroup: type: string @@ -564,8 +570,9 @@ spec: - name type: object snapshotsConfig: - description: SnapshotsConfig is required for auto-provisioned - templates (GCS gs:// location). + description: |- + SnapshotsConfig configures actor memory snapshots. Defaults to + gs://ate-snapshots// when unset. properties: location: description: |- @@ -582,12 +589,12 @@ spec: ateomImage: description: |- AteomImage is the ateom herder image (pullable registry ref, not ko://). - Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + Overrides the controller-wide substrate ateom image default for this WorkerPool. type: string replicas: - default: 2 + default: 1 description: Replicas is the number of ateom worker pods. - Defaults to 2 when unset or zero. + Defaults to 1 when unset or zero. format: int32 type: integer type: object @@ -611,9 +618,12 @@ spec: description: WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. 
type: string - required: - - snapshotsConfig type: object + x-kubernetes-validations: + - message: Exactly one of gatewayToken or gatewayTokenSecretRef must + be specified + rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) + || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) required: - backend type: object diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index c4610056c3..8810fa44e8 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -98,20 +98,30 @@ spec: value: {{ .Values.controller.substrate.defaultActorTemplateNamespace | quote }} - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAME value: {{ .Values.controller.substrate.defaultActorTemplateName | quote }} - {{- if .Values.controller.substrate.gatewayTokenSecretName }} - - name: SUBSTRATE_GATEWAY_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Values.controller.substrate.gatewayTokenSecretName | quote }} - key: token - {{- else if .Values.controller.substrate.gatewayToken }} - - name: SUBSTRATE_GATEWAY_TOKEN - value: {{ .Values.controller.substrate.gatewayToken | quote }} - {{- end }} {{- with .Values.controller.substrate.pauseImage }} - name: SUBSTRATE_PAUSE_IMAGE value: {{ . | quote }} {{- end }} + {{- with .Values.controller.substrate.runscAMD64URL }} + - name: SUBSTRATE_RUNSC_AMD64_URL + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscAMD64SHA256 }} + - name: SUBSTRATE_RUNSC_AMD64_SHA256 + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscARM64URL }} + - name: SUBSTRATE_RUNSC_ARM64_URL + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscARM64SHA256 }} + - name: SUBSTRATE_RUNSC_ARM64_SHA256 + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.ateomImage }} + - name: SUBSTRATE_ATEOM_IMAGE + value: {{ . 
| quote }} + {{- end }} {{- end }} envFrom: - configMapRef: diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 4942879450..06aaf3ca01 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -232,16 +232,18 @@ controller: # value: "true" # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. - # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool); users set - # spec.substrate.snapshotsConfig.location (gs://) and worker pool ref or create spec. + # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness + # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. substrate: - enabled: false + enabled: true ateApiEndpoint: "dns:///api.ate-system.svc:443" ateApiInsecure: false - gatewayToken: "test-token" - gatewayTokenSecretName: "" - gatewayTokenSecretNamespace: "" pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + ateomImage: "localhost:5001/ateom-gvisor:latest" envFrom: [] From 85b7b563b2ff5dbe73cfde4faf28d49c96890aa3 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 26 May 2026 15:05:38 -0700 Subject: [PATCH 03/32] clean up ui/gateway stuff (use base path from openclaw) Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 35 +-- .../handlers/agentharness_gateway.go | 127 +++------- .../agentharness_gateway_path_test.go | 89 +++++++ .../handlers/agentharness_gateway_rewrite.go | 235 ------------------ .../agentharness_gateway_rewrite_test.go | 165 ------------ .../handlers/agentharness_gateway_test.go | 34 +-- 
go/core/internal/httpserver/middleware.go | 7 +- go/core/internal/httpserver/server.go | 29 ++- .../openshell/openclaw/bootstrap.go | 24 +- .../openclaw/bootstrap_substrate_test.go | 3 +- .../openshell/openclaw/bootstrap_test.go | 2 +- .../openshell/openclaw/types.go | 11 +- .../substrate/provision_openclaw.go | 9 +- .../substrate/provision_openclaw_test.go | 2 + 14 files changed, 218 insertions(+), 554 deletions(-) create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go delete mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go delete mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index eebeb17e29..f2587b17a6 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -45,16 +45,9 @@ export KIND_CLUSTER_NAME=kind make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" ``` -The generated `ActorTemplate` uses `controller.substrate.pauseImage`, -`controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, -`controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` -from the Helm values. Override them with `--set` or a values file when you need -to pin a different gVisor build. +The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values. Override them with `--set` or a values file when you need to pin a different gVisor build. -Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to -`gs://ate-snapshots//`.
If Helm sets -`controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can -be omitted unless you want to override it. +Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to `gs://ate-snapshots//`. If Helm sets `controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can be omitted unless you want to override it. - **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool`) - **`workerPool.ateomImage`** — optional override for the Helm/controller default (`localhost:5001/ateom-gvisor:latest`) @@ -78,8 +71,8 @@ spec: # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. - # workerPool: - # replicas: 1 + workerPool: + replicas: 2 # ateomImage: localhost:5001/ateom-gvisor:latest # Required: configure the OpenClaw gateway token for this harness. @@ -137,13 +130,25 @@ spec: - | # Generated by kagent: # 1. writes ~/.openclaw/openclaw.json from modelConfigRef/channels/gateway token - # 2. starts `openclaw gateway run --port 80 --allow-unconfigured` - # 3. waits for the gateway and tails the log + # 2. configures gateway.controlUi.basePath for the kagent proxy path + # 3. starts `openclaw gateway run --port 80 --allow-unconfigured` + # 4. waits for the gateway and tails the log env: - name: HOME value: /root ``` -The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. +The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. 
`pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. kagent also sets `gateway.controlUi.basePath` to `/api/agentharnesses///gateway` so OpenClaw serves the Control UI under the same path kagent proxies. -Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as a gateway token to OpenClaw. +Port-forward the UI: + +```bash +kubectl port-forward -n kagent svc/kagent-ui 8001:8080 +``` + +Navigate to the deployed agent harness. If the OpenClaw Control UI asks for a gateway connection, use: + +- Gateway URL: `http://localhost:8001/api/agentharnesses/kagent/peterj-claw/gateway/` +- Gateway token: `test-token` + +The gateway URL must include the trailing slash. The token is the value configured in `spec.substrate.gatewayToken`, or the Secret value referenced by `spec.substrate.gatewayTokenSecretRef`; enter it in the token/credentials field rather than relying on a `token` query parameter. diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 551a8b8981..5215fe82cb 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -1,11 +1,8 @@ package handlers import ( - "bytes" - "compress/gzip" "context" "fmt" - "io" "net" "net/http" "net/http/httputil" @@ -175,7 +172,8 @@ func agentHarnessGatewayPublicPrefix(namespace, name string) string { // resolveGatewayUpstreamPath maps the public URL to the upstream path on the actor. // redirectTo is set when the browser should use a trailing slash under /gateway/. -// HTTP and WebSocket upgrades to the gateway entry both proxy to upstream / (OpenClaw gateway UI). 
+// OpenClaw is configured with the same controlUi.basePath, so the proxy preserves +// the public gateway base path when forwarding to the actor. func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade bool) (upstreamPath, redirectTo string, ok bool) { base := agentHarnessHarnessBase(namespace, name) if !strings.HasPrefix(requestPath, base) { @@ -188,32 +186,18 @@ func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade b switch { case rel == "/gateway": - _ = wsUpgrade - return "/", agentHarnessGatewayPublicPrefix(namespace, name), true - case strings.HasPrefix(rel, "/gateway/"): - sub := strings.TrimPrefix(rel, "/gateway") - if sub == "" { - sub = "/" + upstream := agentHarnessGatewayPublicPrefix(namespace, name) + if wsUpgrade { + return upstream, "", true } - return sub, "", true - case isHarnessStaticAssetPath(rel): - return rel, "", true + return upstream, upstream, true + case strings.HasPrefix(rel, "/gateway/"): + return requestPath, "", true default: return "", "", false } } -func isHarnessStaticAssetPath(rel string) bool { - if strings.HasPrefix(rel, "/assets/") { - return true - } - switch rel { - case "/manifest.webmanifest", "/vite.svg", "/favicon.ico": - return true - } - return strings.HasPrefix(rel, "/favicon") -} - // normalizeOpenClawBrowserOrigin rewrites Origin/Referer so OpenClaw accepts WS/API from kagent-ui // (e.g. http://localhost:8001) while the gateway listens on the actor pod :80. 
func normalizeOpenClawBrowserOrigin(req *http.Request) { @@ -239,66 +223,45 @@ func isWebSocketUpgrade(r *http.Request) bool { func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, log interface { Error(error, string, ...any) }) *httputil.ReverseProxy { - proxy := httputil.NewSingleHostReverseProxy(target) - proxy.FlushInterval = -1 - proxy.Transport = &http.Transport{ - Proxy: http.ProxyFromEnvironment, - ResponseHeaderTimeout: 0, - IdleConnTimeout: 90 * time.Second, - } - origDirector := proxy.Director - proxy.Director = func(req *http.Request) { - origDirector(req) - req.Host = upstreamHost - req.Header.Set("Host", upstreamHost) - if token != "" { - req.Header.Set("Authorization", "Bearer "+token) - } - req.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) - normalizeOpenClawBrowserOrigin(req) - subPath, _, pathOK := resolveGatewayUpstreamPath(req.URL.Path, namespace, name, isWebSocketUpgrade(req)) - if !pathOK { - subPath = "/" - } - if subPath == "" { - subPath = "/" - } else if !strings.HasPrefix(subPath, "/") { - subPath = "/" + subPath - } - req.URL.Path = subPath - req.URL.RawPath = subPath + proxy := &httputil.ReverseProxy{ + FlushInterval: -1, + Transport: &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: 0, + IdleConnTimeout: 90 * time.Second, + }, + Rewrite: func(pr *httputil.ProxyRequest) { + pr.SetURL(target) + pr.Out.Host = upstreamHost + if token != "" { + pr.Out.Header.Set("Authorization", "Bearer "+token) + } + pr.Out.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) + normalizeOpenClawBrowserOrigin(pr.Out) + subPath, _, pathOK := resolveGatewayUpstreamPath(pr.In.URL.Path, namespace, name, isWebSocketUpgrade(pr.In)) + if !pathOK { + subPath = "/" + } + if subPath == "" { + subPath = "/" + } else if !strings.HasPrefix(subPath, "/") { + subPath = "/" + subPath + } + pr.Out.URL.Path = subPath + pr.Out.URL.RawPath = subPath + }, } 
proxy.ModifyResponse = func(resp *http.Response) error { - // Do not read or rewrite WebSocket upgrade responses (would break 101 handshakes). if resp.StatusCode == http.StatusSwitchingProtocols { return nil } - resp.Header.Del("Content-Security-Policy") - resp.Header.Del("Content-Security-Policy-Report-Only") - if loc := resp.Header.Get("Location"); loc != "" { - if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicPrefix) { - resp.Header.Set("Location", strings.TrimSuffix(publicPrefix, "/")+loc) + publicBase := strings.TrimSuffix(publicPrefix, "/") + if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicBase) { + resp.Header.Set("Location", publicBase+loc) } } - - ct := resp.Header.Get("Content-Type") - if !shouldRewriteGatewayBody(ct) { - return nil - } - body, err := readGatewayResponseBody(resp) - if err != nil { - return err - } - rewritten := rewriteGatewayBody(body, ct, publicPrefix) - if strings.Contains(strings.ToLower(ct), "text/html") { - rewritten = injectGatewayClientShim(rewritten, token) - } - resp.Header.Del("Content-Encoding") - resp.Header.Del("Content-Length") - resp.ContentLength = int64(len(rewritten)) - resp.Body = io.NopCloser(bytes.NewReader(rewritten)) return nil } proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, proxyErr error) { @@ -308,20 +271,6 @@ func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPre return proxy } -func readGatewayResponseBody(resp *http.Response) ([]byte, error) { - var reader io.Reader = resp.Body - if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") { - gz, err := gzip.NewReader(resp.Body) - if err != nil { - return nil, err - } - defer gz.Close() - reader = gz - } - defer resp.Body.Close() - return io.ReadAll(reader) -} - func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { return substrate.ResolveGatewayToken(ctx, h.KubeClient, ah) } diff --git 
a/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go new file mode 100644 index 0000000000..433bcd5205 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go @@ -0,0 +1,89 @@ +package handlers + +import ( + "net/http" + "testing" +) + +func TestResolveGatewayUpstreamPath(t *testing.T) { + t.Parallel() + ns, name := "kagent", "my-claw" + public := agentHarnessGatewayPublicPrefix(ns, name) + + tests := []struct { + name string + path string + wsUpgrade bool + wantUp string + wantRedir string + wantOK bool + }{ + { + name: "harness root redirects", + path: "/api/agentharnesses/kagent/my-claw", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash redirects", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wantUp: public, + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash websocket", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wsUpgrade: true, + wantUp: public, + wantOK: true, + }, + { + name: "gateway index", + path: "/api/agentharnesses/kagent/my-claw/gateway/", + wantUp: public, + wantOK: true, + }, + { + name: "gateway asset", + path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantUp: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantOK: true, + }, + { + name: "unknown path", + path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", + wantOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) + if ok != tt.wantOK { + t.Fatalf("ok = %v, want %v", ok, tt.wantOK) + } + if up != tt.wantUp { + t.Fatalf("upstream = %q, want %q", up, tt.wantUp) + } + if redir != tt.wantRedir { + t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) + } + }) + } +} + +func TestIsWebSocketUpgrade(t *testing.T) { + t.Parallel() + req, _ 
:= http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + if !isWebSocketUpgrade(req) { + t.Fatal("expected websocket upgrade") + } + req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) + if isWebSocketUpgrade(req2) { + t.Fatal("expected not websocket upgrade") + } +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go deleted file mode 100644 index 13818acb39..0000000000 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go +++ /dev/null @@ -1,235 +0,0 @@ -package handlers - -import ( - "bytes" - "encoding/json" - "fmt" - "strings" -) - -// shouldRewriteGatewayQuotedPath returns true for root-absolute app paths we proxy, -// not for short tokens like "/g" (RegExp flags) or other non-asset paths. -func shouldRewriteGatewayQuotedPath(path string) bool { - if path == "" || !strings.HasPrefix(path, "/") || strings.HasPrefix(path, "//") { - return false - } - switch { - case strings.HasPrefix(path, "/assets"): - return true - case strings.HasPrefix(path, "/manifest"): - return true - case strings.HasPrefix(path, "/favicon"): - return true - case path == "/vite.svg": - return true - default: - return false - } -} - -// rewriteGatewayRootPaths prefixes root-absolute URLs in HTML/JS/CSS so assets load under -// /api/agentharnesses/{ns}/{name}/gateway/ (OpenClaw CSP blocks ; base-uri 'none'). 
-func rewriteGatewayRootPaths(body []byte, prefix string) []byte { - if len(body) == 0 || prefix == "" { - return body - } - if !strings.HasPrefix(prefix, "/") { - prefix = "/" + prefix - } - if !strings.HasSuffix(prefix, "/") { - prefix += "/" - } - - var out bytes.Buffer - out.Grow(len(body) + len(prefix)*4) - s := string(body) - for i := 0; i < len(s); i++ { - c := s[i] - if (c == '"' || c == '\'') && i+1 < len(s) && s[i+1] == '/' { - if i+2 < len(s) && s[i+2] == '/' { - out.WriteByte(c) - continue - } - quote := c - j := i + 1 - for j < len(s) && s[j] != quote { - j++ - } - path := s[i+1 : j] - out.WriteByte(quote) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - if j < len(s) { - out.WriteByte(quote) - } - i = j - continue - } - if i+4 < len(s) && strings.EqualFold(s[i:i+4], "url(") { - j := i + 4 - for j < len(s) && (s[j] == ' ' || s[j] == '\t') { - j++ - } - if j < len(s) && (s[j] == '"' || s[j] == '\'') { - quote := s[j] - if j+1 < len(s) && s[j+1] == '/' && !(j+2 < len(s) && s[j+2] == '/') { - k := j + 1 - for k < len(s) && s[k] != quote { - k++ - } - path := s[j+1 : k] - out.WriteString(s[i : j+1]) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - if k < len(s) { - out.WriteByte(quote) - } - i = k - continue - } - } else if j < len(s) && s[j] == '/' && !(j+1 < len(s) && s[j+1] == '/') { - k := j + 1 - for k < len(s) && s[k] != ')' && s[k] != ' ' && s[k] != '\t' && s[k] != '"' && s[k] != '\'' { - k++ - } - path := s[j:k] - out.WriteString(s[i:j]) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - i = k - 1 - continue - } - } - out.WriteByte(c) - } - return out.Bytes() -} - -func stripGatewayBaseTag(body []byte) []byte { - lower := 
bytes.ToLower(body) - for { - idx := bytes.Index(lower, []byte("")) - if end < 0 { - break - } - endIdx := idx + end + 1 - body = append(append(body[:idx], body[endIdx:]...)) - lower = bytes.ToLower(body) - } - return body -} - -func stripGatewayCSP(body []byte) []byte { - lower := bytes.ToLower(body) - for _, tag := range []string{ - `")) - if end < 0 { - break - } - endIdx := idx + end + 1 - body = append(append(body[:idx], body[endIdx:]...)) - lower = bytes.ToLower(body) - } - } - return body -} - -func rewriteGatewayBody(body []byte, contentType, prefix string) []byte { - body = stripGatewayCSP(body) - ct := strings.ToLower(contentType) - if strings.Contains(ct, "text/html") { - body = stripGatewayBaseTag(body) - } - if shouldRewriteGatewayBody(contentType) { - body = rewriteGatewayRootPaths(body, prefix) - return rewriteGatewayWebSocketPaths(body, prefix) - } - return body -} - -// injectGatewayClientShim patches WebSocket URLs (trailing slash + ?token= for OpenClaw Control UI). -func injectGatewayClientShim(body []byte, gatewayToken string) []byte { - tokenJSON, _ := json.Marshal(gatewayToken) - shim := fmt.Sprintf(``, tokenJSON) - lower := bytes.ToLower(body) - for _, tag := range []string{"", ""} { - if idx := bytes.Index(lower, []byte(strings.ToLower(tag))); idx >= 0 { - out := make([]byte, 0, len(body)+len(shim)) - out = append(out, body[:idx]...) - out = append(out, shim...) - out = append(out, body[idx:]...) - return out - } - } - return append(bytes.Clone(body), shim...) -} - -// rewriteGatewayWebSocketPaths ensures bundled/runtime WS URLs use .../gateway/ (trailing slash). -// Only rewrites occurrences not already followed by '/' (avoids breaking .../gateway/assets/...). 
-func rewriteGatewayWebSocketPaths(body []byte, prefix string) []byte { - gatewayWithSlash := strings.TrimSuffix(prefix, "/") + "/" - gatewayNoSlash := strings.TrimSuffix(gatewayWithSlash, "/") - if gatewayNoSlash == "" || gatewayNoSlash == gatewayWithSlash { - return body - } - needle := []byte(gatewayNoSlash) - var out bytes.Buffer - out.Grow(len(body) + 16) - for i := 0; i < len(body); { - idx := bytes.Index(body[i:], needle) - if idx < 0 { - out.Write(body[i:]) - break - } - idx += i - out.Write(body[i:idx]) - end := idx + len(needle) - if end < len(body) && body[end] == '/' { - out.Write(needle) - } else { - out.Write([]byte(gatewayWithSlash)) - } - i = end - } - return out.Bytes() -} - -func shouldRewriteGatewayBody(contentType string) bool { - ct := strings.ToLower(contentType) - return strings.Contains(ct, "text/html") || - strings.Contains(ct, "javascript") || - strings.Contains(ct, "text/css") || - strings.Contains(ct, "application/json") -} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go deleted file mode 100644 index eaab469051..0000000000 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go +++ /dev/null @@ -1,165 +0,0 @@ -package handlers - -import ( - "net/http" - "strings" - "testing" -) - -func TestResolveGatewayUpstreamPath(t *testing.T) { - t.Parallel() - ns, name := "kagent", "my-claw" - public := agentHarnessGatewayPublicPrefix(ns, name) - - tests := []struct { - name string - path string - wsUpgrade bool - wantUp string - wantRedir string - wantOK bool - }{ - { - name: "harness root redirects", - path: "/api/agentharnesses/kagent/my-claw", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway without slash redirects", - path: "/api/agentharnesses/kagent/my-claw/gateway", - wantUp: "/", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway without slash websocket", - path: 
"/api/agentharnesses/kagent/my-claw/gateway", - wsUpgrade: true, - wantUp: "/", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway index", - path: "/api/agentharnesses/kagent/my-claw/gateway/", - wantUp: "/", - wantOK: true, - }, - { - name: "gateway asset", - path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", - wantUp: "/assets/foo.js", - wantOK: true, - }, - { - name: "mis-resolved asset shim", - path: "/api/agentharnesses/kagent/my-claw/assets/foo.js", - wantUp: "/assets/foo.js", - wantOK: true, - }, - { - name: "manifest shim", - path: "/api/agentharnesses/kagent/my-claw/manifest.webmanifest", - wantUp: "/manifest.webmanifest", - wantOK: true, - }, - { - name: "unknown path", - path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", - wantOK: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) - if ok != tt.wantOK { - t.Fatalf("ok = %v, want %v", ok, tt.wantOK) - } - if up != tt.wantUp { - t.Fatalf("upstream = %q, want %q", up, tt.wantUp) - } - if redir != tt.wantRedir { - t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) - } - }) - } -} - -func TestRewriteGatewayRootPaths(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `` - out := string(rewriteGatewayRootPaths([]byte(in), prefix)) - if !strings.Contains(out, `src="/api/agentharnesses/kagent/my-claw/gateway/assets/index.js"`) { - t.Fatalf("script src not rewritten: %s", out) - } - if !strings.Contains(out, `href="/api/agentharnesses/kagent/my-claw/gateway/manifest.webmanifest"`) { - t.Fatalf("link href not rewritten: %s", out) - } -} - -func TestIsWebSocketUpgrade(t *testing.T) { - t.Parallel() - req, _ := http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) - req.Header.Set("Connection", "Upgrade") - req.Header.Set("Upgrade", "websocket") - if !isWebSocketUpgrade(req) { - 
t.Fatal("expected websocket upgrade") - } - req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) - if isWebSocketUpgrade(req2) { - t.Fatal("expected not websocket upgrade") - } -} - -func TestRewriteGatewayWebSocketPaths(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `const u="ws://localhost:8001/api/agentharnesses/kagent/my-claw/gateway"; const v='wss://host/api/agentharnesses/kagent/my-claw/gateway'` - out := string(rewriteGatewayWebSocketPaths([]byte(in), prefix)) - want := "/api/agentharnesses/kagent/my-claw/gateway/" - if !strings.Contains(out, "ws://localhost:8001"+want) { - t.Fatalf("ws URL not rewritten: %s", out) - } - if !strings.Contains(out, "wss://host"+want) { - t.Fatalf("wss URL not rewritten: %s", out) - } -} - -func TestRewriteGatewayBodyStripsBaseAndCSP(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `` - out := string(rewriteGatewayBody([]byte(in), "text/html", prefix)) - if strings.Contains(strings.ToLower(out), " (API key + channel tokens). 
// @@ -177,6 +190,7 @@ func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { } if gw.ControlUI != nil { section.ControlUi = &controlUiSection{ + BasePath: normalizeControlUIBasePath(gw.ControlUI.BasePath), AllowedOrigins: gw.ControlUI.AllowedOrigins, DangerouslyDisableDeviceAuth: gw.ControlUI.DangerouslyDisableDeviceAuth, } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go index 4fd9ff2e72..e30f64daf8 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go @@ -10,12 +10,13 @@ import ( func TestSubstrateGatewayBootstrap(t *testing.T) { t.Parallel() - raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80)) + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) require.NoError(t, err) var root map[string]any require.NoError(t, json.Unmarshal(raw, &root)) gw := root["gateway"].(map[string]any) require.Equal(t, "lan", gw["bind"]) cui := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go index 3ffbd7c9ca..424e850219 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go @@ -79,7 +79,7 @@ func TestBuildBootstrapJSON_SubstrateOmitsModelsWhenNoExplicitBaseURL(t *testing sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, _, err := 
openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80), openclaw.SubstrateBootstrapDefaultBaseURL) + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/default/s1/gateway"), openclaw.SubstrateBootstrapDefaultBaseURL) require.NoError(t, err) var root map[string]any diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go index bf6dd73760..40b3e75b4c 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go @@ -12,11 +12,11 @@ type bootstrapDocument struct { } type gatewaySection struct { - Mode string `json:"mode"` - Bind string `json:"bind"` - Auth gatewayAuth `json:"auth"` - Port int `json:"port"` - ControlUi *controlUiSection `json:"controlUi,omitempty"` + Mode string `json:"mode"` + Bind string `json:"bind"` + Auth gatewayAuth `json:"auth"` + Port int `json:"port"` + ControlUi *controlUiSection `json:"controlUi,omitempty"` } type gatewayAuth struct { @@ -25,6 +25,7 @@ type gatewayAuth struct { } type controlUiSection struct { + BasePath string `json:"basePath,omitempty"` AllowedOrigins []string `json:"allowedOrigins,omitempty"` DangerouslyDisableDeviceAuth bool `json:"dangerouslyDisableDeviceAuth,omitempty"` } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index b5e9903e30..95c561d48b 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -29,7 +29,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if err != nil { return "", nil, fmt.Errorf("resolve gateway token: %w", err) } - gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) + gw 
:= openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort, openClawControlUIBasePath(ah)) var jsonBytes []byte var envMap map[string]string @@ -61,6 +61,13 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha return script, containerEnv, nil } +func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { + if ah == nil { + return "" + } + return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" +} + func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { keys := make([]string, 0, len(envMap)) for k := range envMap { diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index 9c16ece575..1de21a3685 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -92,6 +92,8 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { auth := gw["auth"].(map[string]any) require.Equal(t, "token", auth["mode"]) require.Equal(t, "some-token", auth["token"]) + controlUI := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/peterj-claw/gateway", controlUI["basePath"]) _, hasModels := root["models"] require.False(t, hasModels, "substrate bootstrap should omit models unless ModelConfig sets an explicit baseUrl") require.Contains(t, root, "agents") From ebcaf63d7ead1ccc4fd2d2e77d5d0bf2728f8e98 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 10:14:35 -0700 Subject: [PATCH 04/32] split substrate and openshell, use the secrets in substrate fork for modelconfig/channels Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 7 +- go/core/pkg/app/app.go | 5 +- .../openclaw/bootstrap_openshell.go | 83 +++++ .../bootstrap_openshell_test.go} | 2 +- .../bootstrap_shared.go} | 73 +---- .../openclaw/bootstrap_substrate.go | 87 ++++++ 
.../openclaw/bootstrap_substrate_test.go | 145 +++++++++ .../openclaw/channels_openshell.go | 123 ++++++++ .../openclaw/channels_shared.go | 105 +++++++ .../openclaw/channels_substrate.go | 114 +++++++ .../{openshell => }/openclaw/constants.go | 9 +- .../sandboxbackend/openclaw/credentials.go | 93 ++++++ .../{openshell => }/openclaw/defaults.go | 0 .../{openshell => }/openclaw/modelconfig.go | 25 ++ .../{openshell => }/openclaw/policy.go | 0 .../{openshell => }/openclaw/provider.go | 0 .../pkg/sandboxbackend/openclaw/secrets.go | 33 ++ .../{openshell => }/openclaw/ssh_test.go | 2 +- .../{openshell => }/openclaw/types.go | 37 ++- .../pkg/sandboxbackend/openshell/openclaw.go | 2 +- .../openclaw/bootstrap_substrate_test.go | 22 -- .../openshell/openclaw/channels.go | 97 ------ .../openshell/openshell_test.go | 2 +- .../pkg/sandboxbackend/openshell/policy.go | 2 +- .../sandboxbackend/openshell/ssh_terminal.go | 2 +- .../openshell/ssh_terminal_test.go | 2 +- .../pkg/sandboxbackend/openshell/translate.go | 2 +- .../openshell/translate_test.go | 2 +- .../substrate/delete_provision.go | 4 +- .../substrate/delete_provision_test.go | 2 +- .../pkg/sandboxbackend/substrate/openclaw.go | 2 +- .../pkg/sandboxbackend/substrate/provision.go | 284 +----------------- .../substrate/provision_actortemplate.go | 89 ++++++ .../substrate/provision_openclaw.go | 25 +- .../substrate/provision_openclaw_test.go | 12 +- .../substrate/provision_shared.go | 124 ++++++++ .../substrate/provision_workerpool.go | 77 +++++ 37 files changed, 1181 insertions(+), 514 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go rename go/core/pkg/sandboxbackend/{openshell/openclaw/bootstrap_test.go => openclaw/bootstrap_openshell_test.go} (98%) rename go/core/pkg/sandboxbackend/{openshell/openclaw/bootstrap.go => openclaw/bootstrap_shared.go} (64%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go create mode 100644 
go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_openshell.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_shared.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_substrate.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/constants.go (61%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/credentials.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/defaults.go (100%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/modelconfig.go (58%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/policy.go (100%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/provider.go (100%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/secrets.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/ssh_test.go (72%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/types.go (77%) delete mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go delete mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/channels.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_shared.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_workerpool.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index f2587b17a6..bb5964f663 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -71,13 +71,14 @@ spec: # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. 
+ # NOTE: use single worker for now due to https://github.com/agent-substrate/substrate/issues/50 + gatewayToken: test-token workerPool: - replicas: 2 + replicas: 1 # ateomImage: localhost:5001/ateom-gvisor:latest # Required: configure the OpenClaw gateway token for this harness. # Use either gatewayToken or gatewayTokenSecretRef. The Secret must contain key "token". - gatewayToken: test-token # gatewayTokenSecretRef: # name: openclaw-gateway-token # namespace: kagent @@ -140,6 +141,8 @@ spec: The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. kagent also sets `gateway.controlUi.basePath` to `/api/agentharnesses/<namespace>/<name>/gateway` so OpenClaw serves the Control UI under the same path kagent proxies. +When `modelConfigRef` or `spec.channels` are set, credentials are **not** copied into the ActorTemplate or `openclaw.json` as plaintext. kagent writes `valueFrom.secretKeyRef` (or inline `value` for harness inline tokens) on the ActorTemplate container env; Substrate `ate-api` resolves those refs at actor resume. In `openclaw.json`, kagent uses OpenClaw [env SecretRefs](https://docs.openclaw.ai/gateway/secrets) (`{source:"env",provider:"default",id:"<ENV_VAR_NAME>"}`) for `models.providers.*.apiKey`, `channels.telegram.accounts.*.botToken`, and `channels.slack.accounts.*.botToken` / `appToken`. Rotate a Secret and recreate the ActorTemplate golden snapshot when keys change. 
+ Port-forward the UI: ```bash diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 2389660463..017c1ce7ee 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -59,6 +59,7 @@ import ( "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" @@ -845,7 +846,7 @@ func substrateAppConfig(cfg *Config) substrate.Config { RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, }, } return sc @@ -862,7 +863,7 @@ func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate * RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, }, } } diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go new file mode 100644 index 0000000000..ede55d15b8 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go @@ -0,0 +1,83 @@ +package openclaw + +import ( + "context" + "encoding/json" + "fmt" + "slices" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when +// OpenClaw resolves openshell:resolve:env: (API key + channel tokens). 
+// +// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. +// OpenShell callers should pass DefaultInferenceBaseURL. +func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { + if mc == nil { + return nil, nil, fmt.Errorf("ModelConfig is required") + } + apiKey, err := ResolveModelConfigAPIKey(ctx, kube, mc) + if err != nil { + return nil, nil, fmt.Errorf("resolve model API key: %w", err) + } + apiAdapter, err := providerAPI(mc) + if err != nil { + return nil, nil, err + } + + apiKeyEnv := DefaultAPIKeyEnvVar(mc.Spec.Provider) + env := map[string]string{ + apiKeyEnv: apiKey, + } + + modelID, err := requiredModelID(mc) + if err != nil { + return nil, nil, err + } + + providerRecord := GatewayProviderRecordName(mc.Spec.Provider) + doc := buildCoreBootstrapDocument(mc, gw, credentialValue{literal: openshellResolveEnv(apiKeyEnv)}, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) + + chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Channels, env) + if err != nil { + return nil, nil, err + } + doc.Channels = chState.channelsJSON() + + applyOpenshellSecretsAllowlist(&doc, env) + + raw, err := json.Marshal(doc) + if err != nil { + return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, env, nil +} + +func applyOpenshellSecretsAllowlist(doc *bootstrapDocument, env map[string]string, extraEnvNames ...string) { + seen := make(map[string]struct{}, len(env)+len(extraEnvNames)) + secretAllow := make([]string, 0, len(env)+len(extraEnvNames)) + for k := range env { + if _, ok := seen[k]; !ok { + seen[k] = struct{}{} + secretAllow = append(secretAllow, k) + } + } + for _, k := range extraEnvNames { + if _, ok := seen[k]; !ok { + seen[k] = struct{}{} + secretAllow = append(secretAllow, k) + } + } + 
slices.Sort(secretAllow) + doc.Secrets = secretsSection{ + Providers: map[string]secretProvider{ + openshellSecretProviderID: { + Source: "env", + Allowlist: secretAllow, + }, + }, + } +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go similarity index 98% rename from go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go rename to go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go index 424e850219..18f7b1ce42 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go @@ -6,7 +6,7 @@ import ( "testing" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go similarity index 64% rename from go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go rename to go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go index 9e72db95c3..b547c83f37 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go @@ -1,14 +1,11 @@ package openclaw import ( - "context" "encoding/json" "fmt" - "slices" "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "sigs.k8s.io/controller-runtime/pkg/client" ) // GatewayBootstrapConfig describes the gateway section of openclaw.json for a harness runtime. 
@@ -58,52 +55,6 @@ func normalizeControlUIBasePath(path string) string { return strings.TrimRight(path, "/") } -// BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when -// OpenClaw resolves openshell:resolve:env: (API key + channel tokens). -// -// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. -// OpenShell callers should pass DefaultInferenceBaseURL; Substrate should pass SubstrateBootstrapDefaultBaseURL. -func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { - if mc == nil { - return nil, nil, fmt.Errorf("ModelConfig is required") - } - apiKey, err := ResolveModelConfigAPIKey(ctx, kube, mc) - if err != nil { - return nil, nil, fmt.Errorf("resolve model API key: %w", err) - } - apiAdapter, err := providerAPI(mc) - if err != nil { - return nil, nil, err - } - - apiKeyEnv := DefaultAPIKeyEnvVar(mc.Spec.Provider) - env := map[string]string{ - apiKeyEnv: apiKey, - } - - modelID := strings.TrimSpace(mc.Spec.Model) - if modelID == "" { - return nil, nil, fmt.Errorf("ModelConfig.spec.model is required for OpenClaw bootstrap JSON") - } - - providerRecord := GatewayProviderRecordName(mc.Spec.Provider) - doc := buildCoreBootstrapDocument(mc, gw, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) - - chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Backend, sbx.Spec.Channels, env) - if err != nil { - return nil, nil, err - } - doc.Channels = chState.channelsJSON() - - applySecretsAllowlist(&doc, env) - - raw, err := json.Marshal(doc) - if err != nil { - return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) - } - return raw, env, nil -} - // BuildGatewayOnlyBootstrapJSON returns a minimal openclaw.json with gateway settings only (no models/channels). 
func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { doc := bootstrapDocument{Gateway: buildGatewaySection(gw)} @@ -114,7 +65,7 @@ func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { return raw, nil } -func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { +func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKey credentialValue, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { doc := bootstrapDocument{ Gateway: buildGatewaySection(gw), Agents: agentsSection{ @@ -135,7 +86,7 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapCon Providers: map[string]providerSettings{ providerRecord: { BaseURL: explicit, - APIKey: openshellResolveEnv(apiKeyEnv), + APIKey: apiKey, Auth: providerAuth(mc), API: apiAdapter, Models: []modelSlot{ @@ -154,7 +105,7 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapCon Providers: map[string]providerSettings{ providerRecord: { BaseURL: baseURL, - APIKey: openshellResolveEnv(apiKeyEnv), + APIKey: apiKey, Auth: providerAuth(mc), API: apiAdapter, Models: []modelSlot{ @@ -198,18 +149,10 @@ func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { return section } -func applySecretsAllowlist(doc *bootstrapDocument, env map[string]string) { - secretAllow := make([]string, 0, len(env)) - for k := range env { - secretAllow = append(secretAllow, k) - } - slices.Sort(secretAllow) - doc.Secrets = secretsSection{ - Providers: map[string]secretProvider{ - bootstrapSecretProviderID: { - Source: "env", - Allowlist: secretAllow, - }, - }, +func requiredModelID(mc *v1alpha2.ModelConfig) (string, error) { + modelID := strings.TrimSpace(mc.Spec.Model) + if modelID == "" { + return "", fmt.Errorf("ModelConfig.spec.model is required for 
OpenClaw bootstrap JSON") } + return modelID, nil } diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go new file mode 100644 index 0000000000..f20e1294b6 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go @@ -0,0 +1,87 @@ +package openclaw + +import ( + "context" + "encoding/json" + "fmt" + "slices" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// BuildSubstrateBootstrapJSON builds openclaw.json and ActorTemplate container env for Agent Substrate. +// Model and channel credentials use OpenClaw env SecretRefs in openclaw.json ({source:"env",provider:"default",id:"..."}) +// and ActorTemplate container env (literal value or valueFrom secretKeyRef/configMapKeyRef, resolved by ate-api at resume). +func BuildSubstrateBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig) ([]byte, []corev1.EnvVar, error) { + if mc == nil { + return nil, nil, fmt.Errorf("ModelConfig is required") + } + apiKeyEnvVar, err := ModelConfigAPIKeyEnvVar(mc) + if err != nil { + return nil, nil, err + } + apiAdapter, err := providerAPI(mc) + if err != nil { + return nil, nil, err + } + + modelID, err := requiredModelID(mc) + if err != nil { + return nil, nil, err + } + + apiKeyEnv := apiKeyEnvVar.Name + providerRecord := GatewayProviderRecordName(mc.Spec.Provider) + apiKeyRef := openclawEnvSecretRef(apiKeyEnv) + doc := buildCoreBootstrapDocument(mc, gw, credentialValue{envSecret: &apiKeyRef}, providerRecord, modelID, apiAdapter, SubstrateBootstrapDefaultBaseURL) + + chState, channelEnv, err := accumulateSubstrateHarnessChannels(ctx, kube, namespace, sbx.Spec.Channels) + if err != nil { + return nil, nil, err + } + doc.Channels = chState.channelsJSON() + + applySubstrateSecretsAllowlist(&doc, apiKeyEnv, 
channelEnv) + + raw, err := json.Marshal(doc) + if err != nil { + return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, substrateContainerEnv(apiKeyEnvVar, channelEnv), nil +} + +func substrateContainerEnv(apiKey corev1.EnvVar, extra []corev1.EnvVar) []corev1.EnvVar { + out := make([]corev1.EnvVar, 0, len(extra)+2) + out = append(out, apiKey) + out = append(out, extra...) + out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) + return out +} + +func applySubstrateSecretsAllowlist(doc *bootstrapDocument, apiKeyEnv string, channelEnv []corev1.EnvVar) { + seen := make(map[string]struct{}, len(channelEnv)+1) + secretAllow := make([]string, 0, len(channelEnv)+1) + add := func(name string) { + if _, ok := seen[name]; ok { + return + } + seen[name] = struct{}{} + secretAllow = append(secretAllow, name) + } + add(apiKeyEnv) + for _, env := range channelEnv { + add(env.Name) + } + slices.Sort(secretAllow) + doc.Secrets = secretsSection{ + Providers: map[string]secretProvider{ + substrateSecretProviderID: { + Source: "env", + Allowlist: secretAllow, + }, + }, + Defaults: &secretsDefaults{Env: substrateSecretProviderID}, + } +} diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go new file mode 100644 index 0000000000..a5136bc81f --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go @@ -0,0 +1,145 @@ +package openclaw_test + +import ( + "context" + "encoding/json" + "testing" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func 
TestSubstrateGatewayBootstrap(t *testing.T) { + t.Parallel() + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) + require.NoError(t, err) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + cui := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) + require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) +} + +func TestBuildSubstrateBootstrapJSON_ModelConfigAPIKeyUsesSecretRef(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(mc).Build() + raw, env, err := openclaw.BuildSubstrateBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/gw/")) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + secRoot := root["secrets"].(map[string]any) + secProvs := secRoot["providers"].(map[string]any) + defaultProv := secProvs["default"].(map[string]any) + require.Contains(t, defaultProv["allowlist"], "OPENAI_API_KEY") + defaults := secRoot["defaults"].(map[string]any) + require.Equal(t, "default", defaults["env"]) + + var apiKeyEnv *corev1.EnvVar + for i := range env { + if env[i].Name == "OPENAI_API_KEY" { + apiKeyEnv = &env[i] + break + } + } + 
require.NotNil(t, apiKeyEnv) + require.NotNil(t, apiKeyEnv.ValueFrom) + require.NotNil(t, apiKeyEnv.ValueFrom.SecretKeyRef) + require.Equal(t, "openai-key", apiKeyEnv.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "OPENAI_API_KEY", apiKeyEnv.ValueFrom.SecretKeyRef.Key) + require.Empty(t, apiKeyEnv.Value) +} + +func TestBuildSubstrateBootstrapJSON_TelegramUsesEnvSecretRef(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "tg-token", Namespace: ns}, + Data: map[string][]byte{"token": []byte("telegram-bot-token")}, + } + sbx := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Channels: []v1alpha2.AgentHarnessChannel{ + { + Name: "tg1", + Type: v1alpha2.AgentHarnessChannelTypeTelegram, + Telegram: &v1alpha2.AgentHarnessTelegramChannelSpec{ + BotToken: v1alpha2.AgentHarnessChannelCredential{ + ValueFrom: &v1alpha2.ValueSource{ + Type: v1alpha2.SecretValueSource, + Name: "tg-token", + Key: "token", + }, + }, + }, + }, + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(mc, secret).Build() + raw, env, err := openclaw.BuildSubstrateBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/gw/")) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + tg := root["channels"].(map[string]any)["telegram"].(map[string]any) + tg1 := tg["accounts"].(map[string]any)["tg1"].(map[string]any) + botToken := 
tg1["botToken"].(map[string]any) + require.Equal(t, "env", botToken["source"]) + require.Equal(t, "default", botToken["provider"]) + require.Equal(t, "KAGENT_SB_CH_TG1_TELEGRAM_BOT", botToken["id"]) + require.NotEqual(t, "telegram-bot-token", tg1["botToken"]) + + var tgEnv *corev1.EnvVar + for i := range env { + if env[i].Name == "KAGENT_SB_CH_TG1_TELEGRAM_BOT" { + tgEnv = &env[i] + break + } + } + require.NotNil(t, tgEnv) + require.NotNil(t, tgEnv.ValueFrom) + require.NotNil(t, tgEnv.ValueFrom.SecretKeyRef) + require.Equal(t, "tg-token", tgEnv.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "token", tgEnv.ValueFrom.SecretKeyRef.Key) +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go b/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go new file mode 100644 index 0000000000..a4000aa0b7 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go @@ -0,0 +1,123 @@ +package openclaw + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/channels" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func accumulateHarnessChannels(ctx context.Context, kube client.Client, namespace string, specChannels []v1alpha2.AgentHarnessChannel, env map[string]string) (*harnessChannels, error) { + a := newHarnessChannels() + for _, ch := range specChannels { + switch ch.Type { + case v1alpha2.AgentHarnessChannelTypeTelegram: + if err := a.addTelegram(ctx, kube, namespace, ch, env); err != nil { + return nil, err + } + case v1alpha2.AgentHarnessChannelTypeSlack: + if err := a.addSlack(ctx, kube, namespace, ch, env); err != nil { + return nil, err + } + default: + return nil, unsupportedChannelType(ch.Name, ch.Type) + } + } + return a, nil +} + +func (a *harnessChannels) addTelegram(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel, env map[string]string) error { + spec := ch.Telegram + if 
spec == nil { + return fmt.Errorf("channel %q: telegram spec is required", ch.Name) + } + botEnv := channels.TelegramBotTokenEnvKey(ch.Name) + if err := putChannelCredential(ctx, kube, namespace, spec.BotToken, botEnv, env); err != nil { + return fmt.Errorf("channel %q telegram bot token: %w", ch.Name, err) + } + allowFrom, err := telegramAllowFrom(ctx, kube, namespace, spec) + if err != nil { + return fmt.Errorf("channel %q telegram allowlist: %w", ch.Name, err) + } + acc := telegramAccount{ + Name: ch.Name, + Enabled: true, + BotToken: credentialValue{literal: openshellResolveEnv(botEnv)}, + } + if len(allowFrom) > 0 { + acc.DMPolicy = "allowlist" + acc.AllowFrom = allowFrom + } else { + acc.DMPolicy = "pairing" + } + a.telegram[ch.Name] = acc + if a.tgDef == "" { + a.tgDef = ch.Name + } + return nil +} + +func (a *harnessChannels) addSlack(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel, env map[string]string) error { + spec := ch.Slack + if spec == nil { + return fmt.Errorf("channel %q: slack spec is required", ch.Name) + } + botEnv := channels.SlackBotTokenEnvKey(ch.Name) + appEnv := channels.SlackAppTokenEnvKey(ch.Name) + if err := putChannelCredential(ctx, kube, namespace, spec.BotToken, botEnv, env); err != nil { + return fmt.Errorf("channel %q slack bot token: %w", ch.Name, err) + } + if err := putChannelCredential(ctx, kube, namespace, spec.AppToken, appEnv, env); err != nil { + return fmt.Errorf("channel %q slack app token: %w", ch.Name, err) + } + opts := openClawSlackOptions(spec) + access := openClawSlackChannelAccess(opts) + acc := slackAccount{ + Name: ch.Name, + Enabled: true, + Mode: "socket", + BotToken: credentialValue{literal: channels.SlackBotTokenPlaceholder(botEnv)}, + AppToken: credentialValue{literal: channels.SlackAppTokenPlaceholder(appEnv)}, + UserTokenReadOnly: true, + GroupPolicy: string(access), + Capabilities: slackCaps{ + InteractiveReplies: slackInteractiveReplies(opts), + }, + } + if 
chans := trimNonEmptyStrings(opts.AllowlistChannels); len(chans) > 0 { + acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} + } + a.slack[ch.Name] = acc + if a.slDef == "" { + a.slDef = ch.Name + } + if !a.slackSeen { + a.slackRootPolicy = access + a.slackSeen = true + } + return nil +} + +func telegramAllowFrom(ctx context.Context, kube client.Client, namespace string, spec *v1alpha2.AgentHarnessTelegramChannelSpec) ([]string, error) { + if len(spec.AllowedUserIDs) > 0 { + out := make([]string, 0, len(spec.AllowedUserIDs)) + for _, id := range spec.AllowedUserIDs { + s := strings.TrimSpace(id) + if s != "" { + out = append(out, s) + } + } + return out, nil + } + if spec.AllowedUserIDsFrom != nil { + raw, err := spec.AllowedUserIDsFrom.Resolve(ctx, kube, namespace) + if err != nil { + return nil, fmt.Errorf("resolve allowedUserIDsFrom: %w", err) + } + return splitAllowedList(raw), nil + } + return nil, nil +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_shared.go b/go/core/pkg/sandboxbackend/openclaw/channels_shared.go new file mode 100644 index 0000000000..75aa66872c --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_shared.go @@ -0,0 +1,105 @@ +package openclaw + +import ( + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +type harnessChannels struct { + telegram map[string]telegramAccount + tgDef string + + slack map[string]slackAccount + slDef string + + slackRootPolicy v1alpha2.AgentHarnessChannelAccess + slackSeen bool +} + +func newHarnessChannels() *harnessChannels { + return &harnessChannels{ + telegram: make(map[string]telegramAccount), + slack: make(map[string]slackAccount), + } +} + +func (a *harnessChannels) channelsJSON() *channelsConfig { + if len(a.telegram) == 0 && len(a.slack) == 0 { + return nil + } + out := &channelsConfig{} + if len(a.telegram) > 0 { + out.Telegram = &telegramBundle{ + Enabled: true, + Accounts: a.telegram, + DefaultAccount: a.tgDef, + } + } + if len(a.slack) > 0 { + 
out.Slack = &slackBundle{ + Enabled: true, + Mode: "socket", + WebhookPath: "/slack/events", + UserTokenReadOnly: true, + GroupPolicy: string(a.slackRootPolicy), + Accounts: a.slack, + DefaultAccount: a.slDef, + } + } + return out +} + +func openClawSlackOptions(spec *v1alpha2.AgentHarnessSlackChannelSpec) *v1alpha2.AgentHarnessOpenClawSlackOptions { + if spec == nil || spec.OpenClaw == nil { + return &v1alpha2.AgentHarnessOpenClawSlackOptions{} + } + return spec.OpenClaw +} + +func slackInteractiveReplies(opts *v1alpha2.AgentHarnessOpenClawSlackOptions) bool { + if opts == nil || opts.InteractiveReplies == nil { + return true + } + return *opts.InteractiveReplies +} + +func openClawSlackChannelAccess(opts *v1alpha2.AgentHarnessOpenClawSlackOptions) v1alpha2.AgentHarnessChannelAccess { + if opts == nil || opts.ChannelAccess == "" { + return v1alpha2.AgentHarnessChannelAccessOpen + } + return opts.ChannelAccess +} + +func splitAllowedList(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + var out []string + for _, part := range strings.FieldsFunc(raw, func(r rune) bool { + return r == ',' || r == '\n' || r == ';' + }) { + s := strings.TrimSpace(part) + if s != "" { + out = append(out, s) + } + } + return out +} + +func trimNonEmptyStrings(ss []string) []string { + out := make([]string, 0, len(ss)) + for _, s := range ss { + s = strings.TrimSpace(s) + if s != "" { + out = append(out, s) + } + } + return out +} + +func unsupportedChannelType(name string, typ v1alpha2.AgentHarnessChannelType) error { + return fmt.Errorf("channel %q: unsupported type %q", name, typ) +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go b/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go new file mode 100644 index 0000000000..594661fb59 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go @@ -0,0 +1,114 @@ +package openclaw + +import ( + "context" + "fmt" + + 
"github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// accumulateSubstrateHarnessChannels configures channels with OpenClaw env SecretRefs in openclaw.json +// and returns container env vars (inline value or Kubernetes valueFrom refs) for the ActorTemplate. +func accumulateSubstrateHarnessChannels(ctx context.Context, kube client.Client, namespace string, channels []v1alpha2.AgentHarnessChannel) (*harnessChannels, []corev1.EnvVar, error) { + a := newHarnessChannels() + var containerEnv []corev1.EnvVar + for _, ch := range channels { + switch ch.Type { + case v1alpha2.AgentHarnessChannelTypeTelegram: + env, err := a.addSubstrateTelegram(ctx, kube, namespace, ch) + if err != nil { + return nil, nil, err + } + containerEnv = append(containerEnv, env...) + case v1alpha2.AgentHarnessChannelTypeSlack: + env, err := a.addSubstrateSlack(ctx, kube, namespace, ch) + if err != nil { + return nil, nil, err + } + containerEnv = append(containerEnv, env...) 
+ default: + return nil, nil, unsupportedChannelType(ch.Name, ch.Type) + } + } + return a, containerEnv, nil +} + +func (a *harnessChannels) addSubstrateTelegram(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel) ([]corev1.EnvVar, error) { + spec := ch.Telegram + if spec == nil { + return nil, fmt.Errorf("channel %q: telegram spec is required", ch.Name) + } + botEnv := channelSecretEnvVar(ch.Name, "TELEGRAM_BOT") + botEnvVar, err := channelCredentialContainerEnv(spec.BotToken, botEnv) + if err != nil { + return nil, fmt.Errorf("channel %q telegram bot token: %w", ch.Name, err) + } + allowFrom, err := telegramAllowFrom(ctx, kube, namespace, spec) + if err != nil { + return nil, fmt.Errorf("channel %q telegram allowlist: %w", ch.Name, err) + } + ref := openclawEnvSecretRef(botEnv) + acc := telegramAccount{ + Name: ch.Name, + Enabled: true, + BotToken: credentialValue{envSecret: &ref}, + } + if len(allowFrom) > 0 { + acc.DMPolicy = "allowlist" + acc.AllowFrom = allowFrom + } else { + acc.DMPolicy = "pairing" + } + a.telegram[ch.Name] = acc + if a.tgDef == "" { + a.tgDef = ch.Name + } + return []corev1.EnvVar{botEnvVar}, nil +} + +func (a *harnessChannels) addSubstrateSlack(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel) ([]corev1.EnvVar, error) { + spec := ch.Slack + if spec == nil { + return nil, fmt.Errorf("channel %q: slack spec is required", ch.Name) + } + botEnv := channelSecretEnvVar(ch.Name, "SLACK_BOT") + appEnv := channelSecretEnvVar(ch.Name, "SLACK_APP") + botEnvVar, err := channelCredentialContainerEnv(spec.BotToken, botEnv) + if err != nil { + return nil, fmt.Errorf("channel %q slack bot token: %w", ch.Name, err) + } + appEnvVar, err := channelCredentialContainerEnv(spec.AppToken, appEnv) + if err != nil { + return nil, fmt.Errorf("channel %q slack app token: %w", ch.Name, err) + } + botRef := openclawEnvSecretRef(botEnv) + appRef := openclawEnvSecretRef(appEnv) + opts := 
openClawSlackOptions(spec) + access := openClawSlackChannelAccess(opts) + acc := slackAccount{ + Name: ch.Name, + Enabled: true, + Mode: "socket", + BotToken: credentialValue{envSecret: &botRef}, + AppToken: credentialValue{envSecret: &appRef}, + UserTokenReadOnly: true, + GroupPolicy: string(access), + Capabilities: slackCaps{ + InteractiveReplies: slackInteractiveReplies(opts), + }, + } + if chans := trimNonEmptyStrings(opts.AllowlistChannels); len(chans) > 0 { + acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} + } + a.slack[ch.Name] = acc + if a.slDef == "" { + a.slDef = ch.Name + } + if !a.slackSeen { + a.slackRootPolicy = access + a.slackSeen = true + } + return []corev1.EnvVar{botEnvVar, appEnvVar}, nil +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go b/go/core/pkg/sandboxbackend/openclaw/constants.go similarity index 61% rename from go/core/pkg/sandboxbackend/openshell/openclaw/constants.go rename to go/core/pkg/sandboxbackend/openclaw/constants.go index e94d0789f1..bf696bd59d 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openclaw/constants.go @@ -4,13 +4,16 @@ const ( // NemoclawSandboxBaseImage is the default OpenShell VM image for OpenClaw/NemoClaw harnesses. NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4" - // bootstrapSecretProviderID is the secrets.providers key written into openclaw.json. - bootstrapSecretProviderID = "kagent" + // openshellSecretProviderID is the secrets.providers key written into openclaw.json for OpenShell sandboxes. + openshellSecretProviderID = "kagent" + + // substrateSecretProviderID is the env SecretRef provider id for native OpenClaw on Substrate. + substrateSecretProviderID = "default" // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream (OpenShell). 
DefaultInferenceBaseURL = "https://inference.local/v1" - // SubstrateBootstrapDefaultBaseURL is passed to BuildBootstrapJSON for Substrate harnesses. + // SubstrateBootstrapDefaultBaseURL is passed when building openclaw.json for Substrate harnesses. // When ModelConfig has no explicit provider URL, the models section is omitted entirely so // OpenClaw is not given a partial providers.* block (baseUrl is required when present). SubstrateBootstrapDefaultBaseURL = "" diff --git a/go/core/pkg/sandboxbackend/openclaw/credentials.go b/go/core/pkg/sandboxbackend/openclaw/credentials.go new file mode 100644 index 0000000000..b167802c96 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/credentials.go @@ -0,0 +1,93 @@ +package openclaw + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func sandboxChannelEnvSuffix(name string) string { + var b strings.Builder + for _, r := range strings.ToUpper(strings.TrimSpace(name)) { + switch { + case r >= 'A' && r <= 'Z', r >= '0' && r <= '9': + b.WriteRune(r) + default: + b.WriteByte('_') + } + } + s := strings.Trim(b.String(), "_") + if s == "" { + return "CH" + } + return s +} + +func channelSecretEnvVar(channelName, tokenRole string) string { + return fmt.Sprintf("KAGENT_SB_CH_%s_%s", sandboxChannelEnvSuffix(channelName), tokenRole) +} + +func putChannelCredential(ctx context.Context, kube client.Client, namespace string, cred v1alpha2.AgentHarnessChannelCredential, envKey string, env map[string]string) error { + if strings.TrimSpace(cred.Value) != "" { + env[envKey] = strings.TrimSpace(cred.Value) + return nil + } + if cred.ValueFrom == nil { + return fmt.Errorf("channel credential requires value or valueFrom") + } + v, err := cred.ValueFrom.Resolve(ctx, kube, namespace) + if err != nil { + return fmt.Errorf("resolve credential %s: %w", envKey, err) + } + env[envKey] = v + return nil +} + +// 
channelCredentialContainerEnv maps a harness channel credential to an ActorTemplate env var. +// Inline values use env.Value; Secret/ConfigMap sources use valueFrom refs resolved by Substrate ate-api at resume. +func channelCredentialContainerEnv(cred v1alpha2.AgentHarnessChannelCredential, envKey string) (corev1.EnvVar, error) { + if v := strings.TrimSpace(cred.Value); v != "" { + return corev1.EnvVar{Name: envKey, Value: v}, nil + } + if cred.ValueFrom == nil { + return corev1.EnvVar{}, fmt.Errorf("channel credential requires value or valueFrom") + } + switch cred.ValueFrom.Type { + case v1alpha2.SecretValueSource: + return corev1.EnvVar{ + Name: envKey, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: cred.ValueFrom.Name}, + Key: cred.ValueFrom.Key, + }, + }, + }, nil + case v1alpha2.ConfigMapValueSource: + return corev1.EnvVar{ + Name: envKey, + ValueFrom: &corev1.EnvVarSource{ + ConfigMapKeyRef: &corev1.ConfigMapKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: cred.ValueFrom.Name}, + Key: cred.ValueFrom.Key, + }, + }, + }, nil + default: + return corev1.EnvVar{}, fmt.Errorf("unknown value source type %q", cred.ValueFrom.Type) + } +} + +// resolvedChannelSecret returns the plaintext value putChannelCredential stored in env. +// OpenShell bootstrap still inlines channel tokens in openclaw.json; Substrate uses OpenClaw env SecretRefs instead. 
+func resolvedChannelSecret(env map[string]string, envKey string) (string, error) { + v := strings.TrimSpace(env[envKey]) + if v == "" { + return "", fmt.Errorf("credential %s is missing or empty after resolve", envKey) + } + return v, nil +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/defaults.go b/go/core/pkg/sandboxbackend/openclaw/defaults.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/defaults.go rename to go/core/pkg/sandboxbackend/openclaw/defaults.go diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go b/go/core/pkg/sandboxbackend/openclaw/modelconfig.go similarity index 58% rename from go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go rename to go/core/pkg/sandboxbackend/openclaw/modelconfig.go index 3bb29e88fd..a83e5871b6 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go +++ b/go/core/pkg/sandboxbackend/openclaw/modelconfig.go @@ -16,6 +16,31 @@ func GatewayProviderRecordName(provider v1alpha2.ModelProvider) string { return strings.ToLower(string(provider)) } +// ModelConfigAPIKeyEnvVar returns a container env var that references the ModelConfig API key Secret. +// Substrate ate-api resolves secretKeyRef when resuming an actor (see workload_spec in substrate ate-api). 
+func ModelConfigAPIKeyEnvVar(mc *v1alpha2.ModelConfig) (corev1.EnvVar, error) { + if mc == nil { + return corev1.EnvVar{}, fmt.Errorf("ModelConfig is required") + } + if mc.Spec.APIKeyPassthrough { + return corev1.EnvVar{}, fmt.Errorf("APIKeyPassthrough is not supported for Substrate OpenClaw provisioning from ModelConfig") + } + if mc.Spec.APIKeySecret == "" || mc.Spec.APIKeySecretKey == "" { + return corev1.EnvVar{}, fmt.Errorf("modelConfig %s/%s requires apiKeySecret and apiKeySecretKey", mc.Namespace, mc.Name) + } + return corev1.EnvVar{ + Name: DefaultAPIKeyEnvVar(mc.Spec.Provider), + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mc.Spec.APIKeySecret, + }, + Key: mc.Spec.APIKeySecretKey, + }, + }, + }, nil +} + // ResolveModelConfigAPIKey reads the API key from the Secret referenced by ModelConfig. func ResolveModelConfigAPIKey(ctx context.Context, kube client.Client, mc *v1alpha2.ModelConfig) (string, error) { if mc.Spec.APIKeyPassthrough { diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/policy.go b/go/core/pkg/sandboxbackend/openclaw/policy.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/policy.go rename to go/core/pkg/sandboxbackend/openclaw/policy.go diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go b/go/core/pkg/sandboxbackend/openclaw/provider.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/provider.go rename to go/core/pkg/sandboxbackend/openclaw/provider.go diff --git a/go/core/pkg/sandboxbackend/openclaw/secrets.go b/go/core/pkg/sandboxbackend/openclaw/secrets.go new file mode 100644 index 0000000000..82bf3b8e97 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/secrets.go @@ -0,0 +1,33 @@ +package openclaw + +import ( + "encoding/json" +) + +// envSecretRef is OpenClaw's structured env SecretRef (see https://docs.openclaw.ai/gateway/secrets). 
+type envSecretRef struct { + Source string `json:"source"` + Provider string `json:"provider"` + ID string `json:"id"` +} + +func openclawEnvSecretRef(envVar string) envSecretRef { + return envSecretRef{ + Source: "env", + Provider: substrateSecretProviderID, + ID: envVar, + } +} + +// credentialValue marshals as either a plaintext string (OpenShell) or an OpenClaw env SecretRef (Substrate). +type credentialValue struct { + literal string + envSecret *envSecretRef +} + +func (c credentialValue) MarshalJSON() ([]byte, error) { + if c.envSecret != nil { + return json.Marshal(c.envSecret) + } + return json.Marshal(c.literal) +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go b/go/core/pkg/sandboxbackend/openclaw/ssh_test.go similarity index 72% rename from go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go rename to go/core/pkg/sandboxbackend/openclaw/ssh_test.go index a2f5d32aef..0d2773a544 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go +++ b/go/core/pkg/sandboxbackend/openclaw/ssh_test.go @@ -3,7 +3,7 @@ package openclaw_test import ( "testing" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" ) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openclaw/types.go similarity index 77% rename from go/core/pkg/sandboxbackend/openshell/openclaw/types.go rename to go/core/pkg/sandboxbackend/openclaw/types.go index 40b3e75b4c..2fac8ba330 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openclaw/types.go @@ -36,11 +36,11 @@ type modelsSection struct { } type providerSettings struct { - BaseURL string `json:"baseUrl,omitempty"` - APIKey string `json:"apiKey"` - Auth string `json:"auth"` - API string `json:"api"` - Models []modelSlot `json:"models"` + BaseURL string 
`json:"baseUrl,omitempty"` + APIKey credentialValue `json:"apiKey"` + Auth string `json:"auth"` + API string `json:"api"` + Models []modelSlot `json:"models"` } type modelSlot struct { @@ -72,11 +72,11 @@ type telegramBundle struct { } type telegramAccount struct { - Name string `json:"name"` - Enabled bool `json:"enabled"` - BotToken string `json:"botToken"` - DMPolicy string `json:"dmPolicy"` - AllowFrom []string `json:"allowFrom,omitempty"` + Name string `json:"name"` + Enabled bool `json:"enabled"` + BotToken credentialValue `json:"botToken"` + DMPolicy string `json:"dmPolicy"` + AllowFrom []string `json:"allowFrom,omitempty"` } type slackBundle struct { @@ -90,11 +90,11 @@ type slackBundle struct { } type slackAccount struct { - Name string `json:"name"` - Enabled bool `json:"enabled"` - Mode string `json:"mode"` - BotToken string `json:"botToken"` - AppToken string `json:"appToken"` + Name string `json:"name"` + Enabled bool `json:"enabled"` + Mode string `json:"mode"` + BotToken credentialValue `json:"botToken"` + AppToken credentialValue `json:"appToken"` UserTokenReadOnly bool `json:"userTokenReadOnly"` GroupPolicy string `json:"groupPolicy"` Capabilities slackCaps `json:"capabilities"` @@ -112,9 +112,14 @@ type groupDM struct { type secretsSection struct { Providers map[string]secretProvider `json:"providers"` + Defaults *secretsDefaults `json:"defaults,omitempty"` } type secretProvider struct { Source string `json:"source"` - Allowlist []string `json:"allowlist"` + Allowlist []string `json:"allowlist,omitempty"` +} + +type secretsDefaults struct { + Env string `json:"env"` } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw.go b/go/core/pkg/sandboxbackend/openshell/openclaw.go index f8032b4235..44a3508e10 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw.go @@ -10,7 +10,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" 
"github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go deleted file mode 100644 index e30f64daf8..0000000000 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go +++ /dev/null @@ -1,22 +0,0 @@ -package openclaw_test - -import ( - "encoding/json" - "testing" - - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" - "github.com/stretchr/testify/require" -) - -func TestSubstrateGatewayBootstrap(t *testing.T) { - t.Parallel() - raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) - require.NoError(t, err) - var root map[string]any - require.NoError(t, json.Unmarshal(raw, &root)) - gw := root["gateway"].(map[string]any) - require.Equal(t, "lan", gw["bind"]) - cui := gw["controlUi"].(map[string]any) - require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) - require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) -} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go b/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go deleted file mode 100644 index 1223a30f2f..0000000000 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go +++ /dev/null @@ -1,97 +0,0 @@ -package openclaw - -import ( - "context" - "maps" - - "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/channels" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type harnessChannels struct { - resolved *channels.Resolved 
-} - -func accumulateHarnessChannels(ctx context.Context, kube client.Client, namespace string, backend v1alpha2.AgentHarnessBackendType, specChannels []v1alpha2.AgentHarnessChannel, env map[string]string) (*harnessChannels, error) { - resolved, err := channels.Resolve(ctx, kube, namespace, backend, specChannels) - if err != nil { - return nil, err - } - maps.Copy(env, resolved.Secrets) - return &harnessChannels{resolved: resolved}, nil -} - -func (a *harnessChannels) channelsJSON() *channelsConfig { - if a == nil || a.resolved == nil { - return nil - } - r := a.resolved - if len(r.Telegram) == 0 && len(r.Slack) == 0 { - return nil - } - out := &channelsConfig{} - if len(r.Telegram) > 0 { - accounts := make(map[string]telegramAccount, len(r.Telegram)) - var def string - for _, tg := range r.Telegram { - acc := telegramAccount{ - Name: tg.Name, - Enabled: true, - BotToken: openshellResolveEnv(channels.TelegramBotTokenEnvKey(tg.Name)), - } - if len(tg.AllowFrom) > 0 { - acc.DMPolicy = "allowlist" - acc.AllowFrom = tg.AllowFrom - } else { - acc.DMPolicy = "pairing" - } - accounts[tg.Name] = acc - if def == "" { - def = tg.Name - } - } - out.Telegram = &telegramBundle{ - Enabled: true, - Accounts: accounts, - DefaultAccount: def, - } - } - if len(r.Slack) > 0 { - accounts := make(map[string]slackAccount, len(r.Slack)) - var def string - for _, sl := range r.Slack { - botKey := channels.SlackBotTokenEnvKey(sl.Name) - appKey := channels.SlackAppTokenEnvKey(sl.Name) - acc := slackAccount{ - Name: sl.Name, - Enabled: true, - Mode: "socket", - BotToken: channels.SlackBotTokenPlaceholder(botKey), - AppToken: channels.SlackAppTokenPlaceholder(appKey), - UserTokenReadOnly: true, - GroupPolicy: string(sl.ChannelAccess), - Capabilities: slackCaps{ - InteractiveReplies: sl.InteractiveReplies, - }, - } - if chans := sl.AllowlistChannels; len(chans) > 0 { - acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} - } - accounts[sl.Name] = acc - if def == "" { - def = sl.Name - } 
- } - out.Slack = &slackBundle{ - Enabled: true, - Mode: "socket", - WebhookPath: "/slack/events", - UserTokenReadOnly: true, - GroupPolicy: string(r.SlackRootGroupPolicy()), - Accounts: accounts, - DefaultAccount: def, - } - } - return out -} diff --git a/go/core/pkg/sandboxbackend/openshell/openshell_test.go b/go/core/pkg/sandboxbackend/openshell/openshell_test.go index 1f21c0d161..7c55ea45ab 100644 --- a/go/core/pkg/sandboxbackend/openshell/openshell_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openshell_test.go @@ -13,7 +13,7 @@ import ( openshellv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/openshellv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/codes" diff --git a/go/core/pkg/sandboxbackend/openshell/policy.go b/go/core/pkg/sandboxbackend/openshell/policy.go index b01b3a20e5..5f82cac698 100644 --- a/go/core/pkg/sandboxbackend/openshell/policy.go +++ b/go/core/pkg/sandboxbackend/openshell/policy.go @@ -6,8 +6,8 @@ import ( sandboxv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/sandboxv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" "google.golang.org/protobuf/proto" ) diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go index 2299a76dc8..c7e6c2033c 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go @@ -5,7 +5,7 @@ import ( 
"github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) // ResolveSSHRemoteCommand decides whether to run an interactive shell or a harness CLI. diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go index d9d1451dea..a1f4849711 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go @@ -5,7 +5,7 @@ import ( "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) func TestResolveSSHRemoteCommand(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/openshell/translate.go b/go/core/pkg/sandboxbackend/openshell/translate.go index c6ae5a0d0d..d29c1fd8b1 100644 --- a/go/core/pkg/sandboxbackend/openshell/translate.go +++ b/go/core/pkg/sandboxbackend/openshell/translate.go @@ -9,7 +9,7 @@ import ( openshellv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/openshellv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" diff --git a/go/core/pkg/sandboxbackend/openshell/translate_test.go b/go/core/pkg/sandboxbackend/openshell/translate_test.go index 463ad5aa14..88f7124f42 100644 --- a/go/core/pkg/sandboxbackend/openshell/translate_test.go 
+++ b/go/core/pkg/sandboxbackend/openshell/translate_test.go @@ -4,8 +4,8 @@ import ( "testing" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 0b74d786f7..47d641a2cf 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -21,7 +21,7 @@ func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) err if ah == nil || ah.Annotations == nil { return nil } - if ah.Annotations[annotationManagedActorTemplate] == "true" { + if ah.Annotations[AnnotationManagedActorTemplate] == "true" { key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} if err := p.deleteGoldenActor(ctx, key); err != nil { return err @@ -35,7 +35,7 @@ func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) err return err } } - if ah.Annotations[annotationManagedWorkerPool] == "true" { + if ah.Annotations[AnnotationManagedWorkerPool] == "true" { key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} var wp atev1alpha1.WorkerPool if err := p.Client.Get(ctx, key, &wp); err == nil { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go index dc4b8e338d..cfa632157e 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -44,7 +44,7 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { Name: "peterj-claw", Namespace: ns, 
Annotations: map[string]string{ - annotationManagedActorTemplate: "true", + AnnotationManagedActorTemplate: "true", }, }, } diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 0d269a45d8..7022085dea 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -115,7 +115,7 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time - // (see substrate.Provisioner.buildOpenClawActorStartup — same openclaw.BuildBootstrapJSON as OpenShell). + // (see substrate/provision_openclaw.go — openclaw.BuildSubstrateBootstrapJSON with secretKeyRef env). _ = ctx _ = ah _ = h diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index d704baa927..836e48e065 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -5,62 +5,10 @@ import ( "fmt" "strings" - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -const ( - AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" - AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" - - annotationManagedWorkerPool = AnnotationManagedWorkerPool - annotationManagedActorTemplate = AnnotationManagedActorTemplate - - 
defaultWorkerPoolReplicas = int32(1) - defaultSnapshotsBucket = "ate-snapshots" - defaultOpenClawContainer = "openclaw" -) - -// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. -type ProvisionDefaults struct { - PauseImage string - RunscAMD64URL string - RunscAMD64SHA256 string - RunscARM64URL string - RunscARM64SHA256 string - DefaultAteomImage string - DefaultWorkloadImage string -} - -// ateActorDeleter removes actors from ate-api during harness teardown. -type ateActorDeleter interface { - deleteActorSequenced(ctx context.Context, actorID string) error -} - -// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. -type Provisioner struct { - Client client.Client - Defaults ProvisionDefaults - // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. - Ate ateActorDeleter -} - -// EnsureResult describes provisioned Substrate resources. -type EnsureResult struct { - WorkerPoolRef types.NamespacedName - ActorTemplateRef types.NamespacedName - ActorTemplateReady bool - ManagedWorkerPool bool - ManagedActorTemplate bool -} - // Ensure creates or updates Substrate CRs and waits for ActorTemplate Ready. func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { if ah == nil || ah.Spec.Substrate == nil { @@ -70,23 +18,8 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En return EnsureResult{}, err } - // Legacy / advanced: user supplied an existing template. 
if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { - ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} - ready, err := p.actorTemplateReady(ctx, tmplKey) - if err != nil { - return EnsureResult{}, err - } - return EnsureResult{ - ActorTemplateRef: tmplKey, - ActorTemplateReady: ready, - ManagedActorTemplate: false, - }, nil + return p.ensureAdoptedActorTemplate(ctx, ah) } wpKey, managedWP, err := p.ensureWorkerPool(ctx, ah) @@ -104,7 +37,6 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En return EnsureResult{}, err } - _ = managedWP return EnsureResult{ WorkerPoolRef: wpKey, ActorTemplateRef: tmplKey, @@ -114,208 +46,20 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En }, nil } -func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { - sub := ah.Spec.Substrate - if err := ValidateGatewayTokenSpec(sub); err != nil { - return err - } - if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { - return nil +func (p *Provisioner) ensureAdoptedActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace } - loc := substrateSnapshotsLocation(ah) - if !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { - return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") - } - return nil -} - -func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { - sub 
:= ah.Spec.Substrate - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { - ns := sub.WorkerPoolRef.Namespace - if ns == "" { - ns = ah.Namespace - } - key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} - var wp atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &wp); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) - } - return key, false, nil - } - - key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} - replicas := defaultWorkerPoolReplicas - ateomImage := "" - if sub.WorkerPool != nil { - if sub.WorkerPool.Replicas > 0 { - replicas = sub.WorkerPool.Replicas - } - ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) - } - if ateomImage == "" { - ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) - } - if ateomImage == "" { - return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") - } - - desired := &atev1alpha1.WorkerPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, - Labels: provisionLabels(ah), - }, - Spec: atev1alpha1.WorkerPoolSpec{ - Replicas: replicas, - AteomImage: ateomImage, - }, - } - if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) - } - - var existing atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) - } - return key, true, nil - } else if err != nil { - return types.NamespacedName{}, false, err - } - existing.Spec.Replicas = desired.Spec.Replicas - existing.Spec.AteomImage = desired.Spec.AteomImage - if err := p.Client.Update(ctx, &existing); err 
!= nil { - return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) - } - return key, true, nil -} - -func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { - key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} - workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) - if workloadImage == "" { - workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) - } - if workloadImage == "" { - workloadImage = openshell.NemoclawSandboxBaseImage - } - startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + ready, err := p.actorTemplateReady(ctx, tmplKey) if err != nil { - return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) - } - - desired := &atev1alpha1.ActorTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, - Labels: provisionLabels(ah), - }, - Spec: atev1alpha1.ActorTemplateSpec{ - PauseImage: p.Defaults.PauseImage, - Runsc: defaultRunscConfig(p.Defaults), - Containers: []atev1alpha1.Container{ - { - Name: defaultOpenClawContainer, - Image: workloadImage, - Ports: []corev1.ContainerPort{{ContainerPort: 80}}, - Command: []string{ - "/bin/sh", - "-c", - startupScript, - }, - Env: containerEnv, - }, - }, - WorkerPoolRef: corev1.ObjectReference{ - Name: wpKey.Name, - Namespace: wpKey.Namespace, - }, - SnapshotsConfig: atev1alpha1.SnapshotsConfig{ - Location: substrateSnapshotsLocation(ah), - }, - }, - } - if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) - } - - var existing atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil 
{ - return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) - } - return key, nil - } else if err != nil { - return types.NamespacedName{}, err - } - existing.Spec = desired.Spec - if err := p.Client.Update(ctx, &existing); err != nil { - return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) - } - return key, nil -} - -func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { - var tmpl atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &tmpl); err != nil { - return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) - } - return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil -} - -func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { - return atev1alpha1.RunscConfig{ - AMD64: &atev1alpha1.RunscPlatformConfig{ - URL: d.RunscAMD64URL, - SHA256Hash: d.RunscAMD64SHA256, - }, - ARM64: &atev1alpha1.RunscPlatformConfig{ - URL: d.RunscARM64URL, - SHA256Hash: d.RunscARM64SHA256, - }, - } -} - -func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { - if ah == nil { - return defaultSubstrateSnapshotsLocation("", "") - } - if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { - if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { - return loc - } - } - return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) -} - -func defaultSubstrateSnapshotsLocation(namespace, name string) string { - return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) -} - -func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { - return map[string]string{ - "app.kubernetes.io/managed-by": "kagent", - "kagent.dev/agent-harness": ah.Name, - } -} - -func workerPoolName(ah *v1alpha2.AgentHarness) string { - return truncateDNS1123(ah.Name + "-wp") -} - -func actorTemplateName(ah *v1alpha2.AgentHarness) string { - return truncateDNS1123(ah.Name) -} - -func truncateDNS1123(s string) 
string { - s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) - if len(s) > 63 { - s = strings.TrimRight(s[:63], "-") + return EnsureResult{}, err } - return s + return EnsureResult{ + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedActorTemplate: false, + }, nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go new file mode 100644 index 0000000000..c1ae943125 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go @@ -0,0 +1,89 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) + if workloadImage == "" { + workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) + } + if workloadImage == "" { + workloadImage = openclaw.NemoclawSandboxBaseImage + } + startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + if err != nil { + return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) + } + + desired := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.ActorTemplateSpec{ + PauseImage: p.Defaults.PauseImage, + Runsc: 
defaultRunscConfig(p.Defaults), + Containers: []atev1alpha1.Container{ + { + Name: defaultOpenClawContainer, + Image: workloadImage, + Ports: []corev1.ContainerPort{{ContainerPort: 80}}, + Command: []string{ + "/bin/sh", + "-c", + startupScript, + }, + Env: containerEnv, + }, + }, + WorkerPoolRef: corev1.ObjectReference{ + Name: wpKey.Name, + Namespace: wpKey.Namespace, + }, + SnapshotsConfig: atev1alpha1.SnapshotsConfig{ + Location: substrateSnapshotsLocation(ah), + }, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) + } + + var existing atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) + } + return key, nil + } else if err != nil { + return types.NamespacedName{}, err + } + existing.Spec = desired.Spec + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) + } + return key, nil +} + +func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err != nil { + return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) + } + return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index 95c561d48b..6892389127 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -4,12 +4,11 @@ import ( "context" "encoding/base64" "fmt" - "sort" "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" 
"github.com/kagent-dev/kagent/go/core/internal/utils" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" corev1 "k8s.io/api/core/v1" ) @@ -32,7 +31,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort, openClawControlUIBasePath(ah)) var jsonBytes []byte - var envMap map[string]string + var containerEnv []corev1.EnvVar ref := strings.TrimSpace(ah.Spec.ModelConfigRef) if ref != "" { @@ -44,7 +43,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if getErr := p.Client.Get(ctx, mcRef, mc); getErr != nil { return "", nil, fmt.Errorf("get ModelConfig %s: %w", mcRef, getErr) } - jsonBytes, envMap, err = openclaw.BuildBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw, openclaw.SubstrateBootstrapDefaultBaseURL) + jsonBytes, containerEnv, err = openclaw.BuildSubstrateBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw) if err != nil { return "", nil, fmt.Errorf("build openclaw bootstrap json: %w", err) } @@ -53,10 +52,8 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if err != nil { return "", nil, fmt.Errorf("build gateway-only openclaw json: %w", err) } - envMap = map[string]string{} + containerEnv = []corev1.EnvVar{{Name: "HOME", Value: "/root"}} } - - containerEnv := openClawEnvVars(envMap) script = openClawStartupScript(jsonBytes, gw.Port) return script, containerEnv, nil } @@ -68,20 +65,6 @@ func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" } -func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { - keys := make([]string, 0, len(envMap)) - for k := range envMap { - keys = append(keys, k) - } - sort.Strings(keys) - out := make([]corev1.EnvVar, 0, len(keys)+1) - for _, k := range keys { 
- out = append(out, corev1.EnvVar{Name: k, Value: envMap[k]}) - } - out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) - return out -} - func openClawStartupScript(jsonBytes []byte, gwPort int) string { b64 := base64.StdEncoding.EncodeToString(jsonBytes) return strings.Join([]string{ diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index 1de21a3685..bd24ca7b1e 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -63,11 +63,17 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { var foundKey bool for _, e := range env { - if e.Name == "OPENAI_API_KEY" && e.Value == "sk-test" { - foundKey = true + if e.Name != "OPENAI_API_KEY" { + continue } + require.NotNil(t, e.ValueFrom) + require.NotNil(t, e.ValueFrom.SecretKeyRef) + require.Equal(t, "openai-key", e.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "OPENAI_API_KEY", e.ValueFrom.SecretKeyRef.Key) + require.Empty(t, e.Value, "API key must not be inlined in ActorTemplate env") + foundKey = true } - require.True(t, foundKey, "expected OPENAI_API_KEY in container env") + require.True(t, foundKey, "expected OPENAI_API_KEY secretKeyRef in container env") // Decode embedded JSON from the base64 line in the startup script. 
var payload string diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go new file mode 100644 index 0000000000..c0d4b842e3 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -0,0 +1,124 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" + AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" + + defaultWorkerPoolReplicas = int32(1) + defaultSnapshotsBucket = "ate-snapshots" + defaultOpenClawContainer = "openclaw" +) + +// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. +type ProvisionDefaults struct { + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + DefaultAteomImage string + DefaultWorkloadImage string +} + +// ateActorDeleter removes actors from ate-api during harness teardown. +type ateActorDeleter interface { + deleteActorSequenced(ctx context.Context, actorID string) error +} + +// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. +type Provisioner struct { + Client client.Client + Defaults ProvisionDefaults + // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. + Ate ateActorDeleter +} + +// EnsureResult describes provisioned Substrate resources. 
+type EnsureResult struct { + WorkerPoolRef types.NamespacedName + ActorTemplateRef types.NamespacedName + ActorTemplateReady bool + ManagedWorkerPool bool + ManagedActorTemplate bool +} + +func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { + sub := ah.Spec.Substrate + if err := ValidateGatewayTokenSpec(sub); err != nil { + return err + } + if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { + return nil + } + loc := substrateSnapshotsLocation(ah) + if !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") + } + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { + return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") + } + return nil +} + +func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { + return atev1alpha1.RunscConfig{ + AMD64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscAMD64URL, + SHA256Hash: d.RunscAMD64SHA256, + }, + ARM64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscARM64URL, + SHA256Hash: d.RunscARM64SHA256, + }, + } +} + +func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { + if ah == nil { + return defaultSubstrateSnapshotsLocation("", "") + } + if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { + if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { + return loc + } + } + return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) +} + +func defaultSubstrateSnapshotsLocation(namespace, name string) string { + return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) +} + +func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { + return map[string]string{ + "app.kubernetes.io/managed-by": "kagent", + "kagent.dev/agent-harness": ah.Name, + } +} + +func workerPoolName(ah *v1alpha2.AgentHarness) 
string { + return truncateDNS1123(ah.Name + "-wp") +} + +func actorTemplateName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name) +} + +func truncateDNS1123(s string) string { + s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) + if len(s) > 63 { + s = strings.TrimRight(s[:63], "-") + } + return s +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go new file mode 100644 index 0000000000..3504c7f651 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go @@ -0,0 +1,77 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { + sub := ah.Spec.Substrate + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { + ns := sub.WorkerPoolRef.Namespace + if ns == "" { + ns = ah.Namespace + } + key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) + } + return key, false, nil + } + + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + replicas := defaultWorkerPoolReplicas + ateomImage := "" + if sub.WorkerPool != nil { + if sub.WorkerPool.Replicas > 0 { + replicas = sub.WorkerPool.Replicas + } + ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) + } + if ateomImage == "" { + ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) + } + if 
ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") + } + + desired := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: replicas, + AteomImage: ateomImage, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) + } + + var existing atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) + } + return key, true, nil + } else if err != nil { + return types.NamespacedName{}, false, err + } + existing.Spec.Replicas = desired.Spec.Replicas + existing.Spec.AteomImage = desired.Spec.AteomImage + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) + } + return key, true, nil +} From 25bf681585a32db2556a1bc3ff350ea5efe4cb17 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 10:27:34 -0700 Subject: [PATCH 05/32] move startup script to a template Signed-off-by: Peter Jausovec --- .../substrate/provision_openclaw.go | 40 ++++++++++++------- .../templates/openclaw_startup.sh.tmpl | 9 +++++ 2 files changed, 35 insertions(+), 14 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index 6892389127..96927611b7 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ 
b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -1,10 +1,13 @@ package substrate import ( + "bytes" "context" + _ "embed" "encoding/base64" "fmt" "strings" + "text/template" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" @@ -14,6 +17,16 @@ import ( const defaultSubstrateOpenClawGatewayPort = 80 +//go:embed templates/openclaw_startup.sh.tmpl +var openClawStartupScriptTmplContent string + +var openClawStartupScriptTmpl = template.Must(template.New("openclaw_startup").Parse(openClawStartupScriptTmplContent)) + +type openClawStartupScriptData struct { + OpenClawJSONBase64 string + GatewayPort int +} + // buildOpenClawActorStartup returns the ateom workload startup script and container env for OpenClaw on Substrate. // When spec.modelConfigRef is set, openclaw.json includes models/agents/channels like the OpenShell bootstrap path. func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { @@ -54,7 +67,10 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha } containerEnv = []corev1.EnvVar{{Name: "HOME", Value: "/root"}} } - script = openClawStartupScript(jsonBytes, gw.Port) + script, err = openClawStartupScript(jsonBytes, gw.Port) + if err != nil { + return "", nil, err + } return script, containerEnv, nil } @@ -65,17 +81,13 @@ func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" } -func openClawStartupScript(jsonBytes []byte, gwPort int) string { - b64 := base64.StdEncoding.EncodeToString(jsonBytes) - return strings.Join([]string{ - "set -e", - `mkdir -p "${HOME}/.openclaw"`, - fmt.Sprintf(`echo '%s' | base64 -d > "${HOME}/.openclaw/openclaw.json"`, b64), - fmt.Sprintf("openclaw gateway run --port %d --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 &", gwPort), - `for i in $(seq 1 60); do`, - 
` curl -sf http://127.0.0.1:80/ >/dev/null 2>&1 && echo "gateway up" && break`, - " sleep 1", - "done", - "tail -f /tmp/openclaw-gateway.log /dev/null", - }, "\n") +func openClawStartupScript(jsonBytes []byte, gwPort int) (string, error) { + var buf bytes.Buffer + if err := openClawStartupScriptTmpl.Execute(&buf, openClawStartupScriptData{ + OpenClawJSONBase64: base64.StdEncoding.EncodeToString(jsonBytes), + GatewayPort: gwPort, + }); err != nil { + return "", fmt.Errorf("render openclaw startup script: %w", err) + } + return strings.TrimRight(buf.String(), "\n"), nil } diff --git a/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl new file mode 100644 index 0000000000..184ad91c74 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl @@ -0,0 +1,9 @@ +set -e +mkdir -p "${HOME}/.openclaw" +echo '{{.OpenClawJSONBase64}}' | base64 -d > "${HOME}/.openclaw/openclaw.json" +openclaw gateway run --port {{.GatewayPort}} --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 & +for i in $(seq 1 60); do + curl -sf http://127.0.0.1:{{.GatewayPort}}/ >/dev/null 2>&1 && echo "gateway up" && break + sleep 1 +done +tail -f /tmp/openclaw-gateway.log /dev/null From 5c470ce6552e3af85f8af232fab03dec1ca54ace Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 14:32:51 -0700 Subject: [PATCH 06/32] pr feedback Signed-off-by: Peter Jausovec --- .../crd/bases/kagent.dev_agentharnesses.yaml | 111 ++++++++----- go/api/v1alpha2/agentharness_types.go | 33 ++-- go/api/v1alpha2/zz_generated.deepcopy.go | 17 +- .../controller/agentharness_controller.go | 157 +++++++++++++----- .../agentharness_substrate_watches.go | 97 +++++++++++ .../sandboxbackend/substrate/delete_actor.go | 116 +++---------- .../substrate/delete_actor_test.go | 15 +- .../substrate/delete_provision.go | 157 +++++++++++------- .../substrate/delete_provision_test.go | 33 +++- 
.../sandboxbackend/substrate/gateway_token.go | 33 +--- .../pkg/sandboxbackend/substrate/openclaw.go | 35 +--- .../pkg/sandboxbackend/substrate/provision.go | 11 +- .../substrate/provision_openclaw_test.go | 2 +- .../substrate/provision_shared.go | 20 +-- .../substrate/provision_test.go | 31 +--- .../substrate/provision_workerpool.go | 6 +- .../templates/kagent.dev_agentharnesses.yaml | 111 ++++++++----- 17 files changed, 554 insertions(+), 431 deletions(-) create mode 100644 go/core/internal/controller/agentharness_substrate_watches.go diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 2c2f18ff71..52f814c1aa 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -536,8 +536,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -564,8 +562,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -578,6 +574,7 @@ spec: description: |- Location is the GCS URI prefix for golden and incremental snapshots. 
Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + pattern: ^gs:// type: string required: - location @@ -609,8 +606,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -624,6 +619,8 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) + - message: workerPoolRef and workerPool are mutually exclusive + rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -633,6 +630,10 @@ spec: || (has(c.slack) && ((self.backend == ''hermes'' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == ''openclaw'' || self.backend == ''nemoclaw'') && has(c.slack.openclaw) && !has(c.slack.hermes)))))' + - message: spec.substrate may only be set when runtime is substrate + rule: '!has(self.substrate) || self.runtime == ''substrate''' + - message: spec.substrate is required when runtime is substrate + rule: self.runtime != 'substrate' || has(self.substrate) status: description: AgentHarnessStatus is the observed state of an AgentHarness. properties: @@ -726,42 +727,70 @@ spec: format: int64 type: integer substrate: - description: Substrate records auto-provisioned Substrate CR references. + description: Substrate records observed Substrate provisioning state. properties: - actorTemplateReady: - description: ActorTemplateReady is true when the template phase - is Ready (golden snapshot taken). - type: boolean - actorTemplateRef: - description: ActorTemplateRef is the ActorTemplate used when creating - the actor. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object - workerPoolRef: - description: WorkerPoolRef is the WorkerPool used by the harness - ActorTemplate. 
- properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object + conditions: + description: Conditions describe substrate provisioning progress + (e.g. ActorTemplate golden snapshot). + items: + description: Condition contains details for one aspect of the + current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object type: object diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index 44902e1ffb..f0181a569b 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -52,6 +52,7 @@ type AgentHarnessSubstrateSnapshotsConfig struct { // Location is the GCS URI prefix for golden and incremental snapshots. // Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ // +required + // +kubebuilder:validation:Pattern=`^gs://` Location string `json:"location"` } @@ -74,11 +75,12 @@ type AgentHarnessSubstrateWorkerPoolSpec struct { // By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). // Set actorTemplateRef only to adopt an existing template (advanced / legacy). // +kubebuilder:validation:XValidation:rule="(has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef))",message="Exactly one of gatewayToken or gatewayTokenSecretRef must be specified" +// +kubebuilder:validation:XValidation:rule="!(has(self.workerPoolRef) && has(self.workerPool))",message="workerPoolRef and workerPool are mutually exclusive" type AgentHarnessSubstrateSpec struct { // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). // Mutually exclusive with workerPool. // +optional - WorkerPoolRef *TypedReference `json:"workerPoolRef,omitempty"` + WorkerPoolRef *TypedLocalReference `json:"workerPoolRef,omitempty"` // WorkerPool creates a dedicated WorkerPool in the harness namespace when workerPoolRef is unset. 
// +optional @@ -96,7 +98,7 @@ type AgentHarnessSubstrateSpec struct { // ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. // When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. // +optional - ActorTemplateRef *TypedReference `json:"actorTemplateRef,omitempty"` + ActorTemplateRef *TypedLocalReference `json:"actorTemplateRef,omitempty"` // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). // +optional @@ -112,7 +114,7 @@ type AgentHarnessSubstrateSpec struct { // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. // The Secret must contain a "token" key. // +optional - GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` + GatewayTokenSecretRef *TypedLocalReference `json:"gatewayTokenSecretRef,omitempty"` } // AgentHarnessChannelType selects a messenger integration for OpenClaw harness VMs. @@ -231,6 +233,8 @@ type AgentHarnessChannel struct { // in. The backend is responsible for provisioning an environment that stays // ready to accept incoming commands. // +kubebuilder:validation:XValidation:rule="!has(self.channels) || self.channels.all(c, c.type != 'slack' || (has(c.slack) && ((self.backend == 'hermes' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == 'openclaw' || self.backend == 'nemoclaw') && has(c.slack.openclaw) && !has(c.slack.hermes)))))",message="slack backend-specific settings must match spec.backend" +// +kubebuilder:validation:XValidation:rule="!has(self.substrate) || self.runtime == 'substrate'",message="spec.substrate may only be set when runtime is substrate" +// +kubebuilder:validation:XValidation:rule="self.runtime != 'substrate' || has(self.substrate)",message="spec.substrate is required when runtime is substrate" type AgentHarnessSpec struct { // Backend selects the control plane to use. Required. 
// +required @@ -318,26 +322,27 @@ type AgentHarnessStatus struct { // +optional Connection *AgentHarnessConnection `json:"connection,omitempty"` - // Substrate records auto-provisioned Substrate CR references. + // Substrate records observed Substrate provisioning state. // +optional Substrate *AgentHarnessSubstrateStatus `json:"substrate,omitempty"` } // AgentHarnessSubstrateStatus is observed Substrate control-plane state for this harness. type AgentHarnessSubstrateStatus struct { - // WorkerPoolRef is the WorkerPool used by the harness ActorTemplate. + // Conditions describe substrate provisioning progress (e.g. ActorTemplate golden snapshot). // +optional - WorkerPoolRef TypedReference `json:"workerPoolRef,omitempty"` - - // ActorTemplateRef is the ActorTemplate used when creating the actor. - // +optional - ActorTemplateRef TypedReference `json:"actorTemplateRef,omitempty"` - - // ActorTemplateReady is true when the template phase is Ready (golden snapshot taken). - // +optional - ActorTemplateReady bool `json:"actorTemplateReady,omitempty"` + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty"` } +// AgentHarnessSubstrateConditionType enumerates substrate-specific condition types. +const ( + AgentHarnessSubstrateConditionTypeActorTemplateReady = "ActorTemplateReady" + // AgentHarnessSubstrateConditionTypeResourcesCleaned is True when managed Substrate CRs are gone during delete. + AgentHarnessSubstrateConditionTypeResourcesCleaned = "ResourcesCleaned" +) + // AgentHarnessConditionType enumerates the condition types an AgentHarness may report. 
const ( AgentHarnessConditionTypeReady = "Ready" diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 6acf8938f6..9694e72608 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -354,7 +354,7 @@ func (in *AgentHarnessStatus) DeepCopyInto(out *AgentHarnessStatus) { if in.Substrate != nil { in, out := &in.Substrate, &out.Substrate *out = new(AgentHarnessSubstrateStatus) - **out = **in + (*in).DeepCopyInto(*out) } } @@ -403,7 +403,7 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec *out = *in if in.WorkerPoolRef != nil { in, out := &in.WorkerPoolRef, &out.WorkerPoolRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } if in.WorkerPool != nil { @@ -418,12 +418,12 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec } if in.ActorTemplateRef != nil { in, out := &in.ActorTemplateRef, &out.ActorTemplateRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } if in.GatewayTokenSecretRef != nil { in, out := &in.GatewayTokenSecretRef, &out.GatewayTokenSecretRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } } @@ -441,8 +441,13 @@ func (in *AgentHarnessSubstrateSpec) DeepCopy() *AgentHarnessSubstrateSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AgentHarnessSubstrateStatus) DeepCopyInto(out *AgentHarnessSubstrateStatus) { *out = *in - out.WorkerPoolRef = in.WorkerPoolRef - out.ActorTemplateRef = in.ActorTemplateRef + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateStatus. 
diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index 94b857d9ef..e371dbb0b3 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -40,6 +40,9 @@ const ( // status while the sandbox is still provisioning. agentHarnessNotReadyRequeue = 10 * time.Second + // substrateDeleteTimeout is the maximum time to wait for substrate cleanup during delete. + substrateDeleteTimeout = 5 * time.Minute + // annotationAgentHarnessBootstrapGeneration records the AgentHarness metadata.generation for which // post-ready bootstrap (backend OnAgentHarnessReady, e.g. exec hooks) already completed. annotationAgentHarnessBootstrapGeneration = "kagent.dev/agent-harness-bootstrap-generation" @@ -82,6 +85,7 @@ func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxba // +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) @@ -122,11 +126,20 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, nil } - runtime := ah.Spec.Runtime - if runtime == "" { - runtime = v1alpha2.AgentHarnessRuntimeOpenshell - } - if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + runtime := effectiveAgentHarnessRuntime(&ah) + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { + if r.SubstrateProvisioner == nil { + log.Error(nil, "substrate provisioner not configured") + 
setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "SubstrateProvisionerUnavailable", + "substrate runtime requires a configured substrate provisioner (set --substrate-ate-api-endpoint)") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "SubstrateProvisionerUnavailable", "") + if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } provRes, err := r.SubstrateProvisioner.Ensure(ctx, &ah) if err != nil { log.Error(err, "substrate provision failed") @@ -139,20 +152,13 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request } return ctrl.Result{}, err } - if ah.Status.Substrate == nil { - ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + if provRes.ActorTemplateReady { + setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + metav1.ConditionTrue, "Ready", "ActorTemplate golden snapshot is ready") + } else { + setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + metav1.ConditionFalse, "NotReady", "waiting for ActorTemplate golden snapshot") } - if provRes.WorkerPoolRef.Name != "" { - ah.Status.Substrate.WorkerPoolRef = v1alpha2.TypedReference{ - Name: provRes.WorkerPoolRef.Name, - Namespace: provRes.WorkerPoolRef.Namespace, - } - } - ah.Status.Substrate.ActorTemplateRef = v1alpha2.TypedReference{ - Name: provRes.ActorTemplateRef.Name, - Namespace: provRes.ActorTemplateRef.Namespace, - } - ah.Status.Substrate.ActorTemplateReady = provRes.ActorTemplateReady // Persist status before metadata annotation patch (client Patch can refresh ah and drop in-memory status). 
if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { return ctrl.Result{}, err @@ -273,22 +279,74 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph return ctrl.Result{}, nil } - if ah.Status.BackendRef != nil && ah.Status.BackendRef.ID != "" { - del := r.backendFor(ah) - if del != nil { - if err := del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: ah.Status.BackendRef.ID}); err != nil { - if r.Recorder != nil { - r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + if substrateDeleteTimedOut(ah) { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "DeleteTimeout", "substrate cleanup exceeded timeout") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, fmt.Errorf("substrate cleanup timed out for AgentHarness %s", ah.Name) + } + + runtime := effectiveAgentHarnessRuntime(ah) + actorID := "" + if ah.Status.BackendRef != nil { + actorID = ah.Status.BackendRef.ID + } + + if actorID != "" { + var actorDone bool + var err error + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + actorDone, err = r.SubstrateProvisioner.AdvanceActorDelete(ctx, actorID) + } else if del := r.backendFor(ah); del != nil { + err = del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) + actorDone = err == nil + } else { + actorDone = true + } + if err != nil { + if r.Recorder != nil { + r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } + if !actorDone { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for substrate actor 
%q deletion", actorID)) + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + ah.Status.BackendRef = nil + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err } } - if r.SubstrateProvisioner != nil { - if err := r.SubstrateProvisioner.Delete(ctx, ah); err != nil { + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { + if r.SubstrateProvisioner == nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, + fmt.Errorf("substrate provisioner is not configured") + } + complete, err := r.SubstrateProvisioner.AdvanceDelete(ctx, ah) + if err != nil { return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("delete substrate resources: %w", err) } + if !complete { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "CleanupInProgress", "waiting for managed Substrate resources to be removed") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionTrue, "Cleaned", "managed Substrate resources removed") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } } controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) @@ -298,6 +356,13 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph return ctrl.Result{}, nil } +func substrateDeleteTimedOut(ah *v1alpha2.AgentHarness) bool { + if ah == nil || ah.DeletionTimestamp.IsZero() { + return false + } + return time.Since(ah.DeletionTimestamp.Time) > substrateDeleteTimeout +} + func (r *AgentHarnessController) patchAgentHarnessStatus(ctx context.Context, ah *v1alpha2.AgentHarness) error { if err := 
r.Client.Status().Update(ctx, ah); err != nil { return fmt.Errorf("update AgentHarness status: %w", err) @@ -322,10 +387,28 @@ func (r *AgentHarnessController) patchAgentHarnessProvisionAnnotations(ctx conte return nil } +func effectiveAgentHarnessRuntime(ah *v1alpha2.AgentHarness) v1alpha2.AgentHarnessRuntime { + if ah.Spec.Runtime == "" { + return v1alpha2.AgentHarnessRuntimeOpenshell + } + return ah.Spec.Runtime +} + func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { + setConditions(&ah.Status.Conditions, ah.Generation, t, s, reason, msg) +} + +func setSubstrateCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { + if ah.Status.Substrate == nil { + ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + } + setConditions(&ah.Status.Substrate.Conditions, ah.Generation, t, s, reason, msg) +} + +func setConditions(conditions *[]metav1.Condition, generation int64, t string, s metav1.ConditionStatus, reason, msg string) { now := metav1.Now() - for i := range ah.Status.Conditions { - c := &ah.Status.Conditions[i] + for i := range *conditions { + c := &(*conditions)[i] if c.Type != t { continue } @@ -335,27 +418,27 @@ func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.Cond c.Status = s c.Reason = reason c.Message = msg - c.ObservedGeneration = ah.Generation + c.ObservedGeneration = generation return } - ah.Status.Conditions = append(ah.Status.Conditions, metav1.Condition{ + *conditions = append(*conditions, metav1.Condition{ Type: t, Status: s, Reason: reason, Message: msg, LastTransitionTime: now, - ObservedGeneration: ah.Generation, + ObservedGeneration: generation, }) } // SetupWithManager registers the controller with the manager. func (r *AgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). + b := ctrl.NewControllerManagedBy(mgr). 
WithOptions(controller.Options{NeedLeaderElection: new(true)}). For(&v1alpha2.AgentHarness{}, builder.WithPredicates(predicate.Or( predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, - ))). - Named("agentharness"). - Complete(r) + ))) + b = r.substrateWatches(b) + return b.Named("agentharness").Complete(r) } diff --git a/go/core/internal/controller/agentharness_substrate_watches.go b/go/core/internal/controller/agentharness_substrate_watches.go new file mode 100644 index 0000000000..14dbf9b59e --- /dev/null +++ b/go/core/internal/controller/agentharness_substrate_watches.go @@ -0,0 +1,97 @@ +package controller + +import ( + "context" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" +) + +func (r *AgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx context.Context, obj client.Object) []reconcile.Request { + harnessName := substrate.HarnessNameFromLabels(obj.GetLabels()) + if harnessName == "" { + return nil + } + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{ + Namespace: obj.GetNamespace(), + Name: harnessName, + }, + }} +} + +func (r *AgentHarnessController) enqueueAgentHarnessForWorkerPoolDeployment(ctx context.Context, obj client.Object) []reconcile.Request { + deploy, ok := obj.(*appsv1.Deployment) + if !ok { + return nil + } + harnessName := substrate.HarnessNameFromLabels(deploy.GetLabels()) + if harnessName == "" { + harnessName = r.harnessNameFromWorkerPoolDeployment(ctx, deploy) + } + if harnessName == "" { + return nil + } + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{ + 
Namespace: deploy.Namespace, + Name: harnessName, + }, + }} +} + +// harnessNameFromWorkerPoolDeployment resolves the harness via the owning WorkerPool's labels. +// Substrate names deployments "{workerPool}-deployment" and does not copy harness labels onto them. +func (r *AgentHarnessController) harnessNameFromWorkerPoolDeployment(ctx context.Context, deploy *appsv1.Deployment) string { + if r == nil || r.Client == nil || deploy == nil { + return "" + } + for _, ref := range deploy.GetOwnerReferences() { + if ref.Kind != "WorkerPool" || ref.Controller == nil || !*ref.Controller { + continue + } + if !strings.Contains(ref.APIVersion, "ate.dev") { + continue + } + var wp atev1alpha1.WorkerPool + key := types.NamespacedName{Namespace: deploy.Namespace, Name: ref.Name} + if err := r.Client.Get(ctx, key, &wp); err != nil { + if apierrors.IsNotFound(err) { + continue + } + return "" + } + if name := substrate.HarnessNameFromLabels(wp.GetLabels()); name != "" { + return name + } + } + return "" +} + +func (r *AgentHarnessController) substrateWatches(b *builder.Builder) *builder.Builder { + if r == nil || r.SubstrateProvisioner == nil { + return b + } + return b. + Watches( + &atev1alpha1.WorkerPool{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), + ). + Watches( + &atev1alpha1.ActorTemplate{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), + ). 
+ Watches( + &appsv1.Deployment{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForWorkerPoolDeployment), + ) +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor.go b/go/core/pkg/sandboxbackend/substrate/delete_actor.go index c7a36e8409..462a57de26 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor.go @@ -3,125 +3,49 @@ package substrate import ( "context" "fmt" - "time" "github.com/agent-substrate/substrate/proto/ateapipb" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) -const ( - actorDeletePollInterval = 2 * time.Second - actorDeleteTimeout = 5 * time.Minute -) - -// deleteActorSequenced suspends the actor, waits until suspended, deletes it, and waits until gone. -func (c *Client) deleteActorSequenced(ctx context.Context, actorID string) error { +// AdvanceActorDelete performs at most one mutating ate-api step per call. +// Returns true when the actor no longer exists. Callers should requeue until true. +func (c *Client) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { if actorID == "" { - return nil + return true, nil } - deadline := time.Now().Add(actorDeleteTimeout) actor, err := c.GetActor(ctx, actorID) if err != nil { if status.Code(err) == codes.NotFound { - return nil + return true, nil } - return fmt.Errorf("get actor %q: %w", actorID, err) + return false, fmt.Errorf("get actor %q: %w", actorID, err) } - if err := c.ensureActorSuspended(ctx, actorID, actor.GetStatus(), deadline); err != nil { - return err - } - - if err := c.DeleteActor(ctx, actorID); err != nil { - if status.Code(err) == codes.NotFound { - return nil - } - if status.Code(err) == codes.FailedPrecondition { - // ate-api requires STATUS_SUSPENDED; re-check and surface current status. 
- actor, getErr := c.GetActor(ctx, actorID) - if getErr == nil { - return fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + if err := c.DeleteActor(ctx, actorID); err != nil { + if status.Code(err) == codes.NotFound { + return true, nil + } + if status.Code(err) == codes.FailedPrecondition { + return false, fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) } + return false, fmt.Errorf("delete actor %q: %w", actorID, err) } - return fmt.Errorf("delete actor %q: %w", actorID, err) - } - - return c.waitForActorDeleted(ctx, actorID, deadline) -} - -func (c *Client) ensureActorSuspended(ctx context.Context, actorID string, st ateapipb.Actor_Status, deadline time.Time) error { - switch st { - case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: - return nil + return false, nil case ateapipb.Actor_STATUS_SUSPENDING: - // Retry suspend periodically; stuck checkpoint may need manual worker pod deletion. _ = c.SuspendActor(ctx, actorID) - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + return false, nil case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: if err := c.SuspendActor(ctx, actorID); err != nil && status.Code(err) != codes.NotFound { - return fmt.Errorf("suspend actor %q: %w", actorID, err) + return false, fmt.Errorf("suspend actor %q: %w", actorID, err) } - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + return false, nil default: - // Best-effort suspend for unknown/intermediate states before delete. 
_ = c.SuspendActor(ctx, actorID) - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) - } -} - -func (c *Client) waitForActorStatus(ctx context.Context, actorID string, want ateapipb.Actor_Status, deadline time.Time) error { - for time.Now().Before(deadline) { - actor, err := c.GetActor(ctx, actorID) - if err != nil { - if status.Code(err) == codes.NotFound { - if want == ateapipb.Actor_STATUS_UNSPECIFIED { - return nil - } - return fmt.Errorf("actor %q not found while waiting for %s", actorID, want) - } - return fmt.Errorf("get actor %q: %w", actorID, err) - } - if actor.GetStatus() == want { - return nil - } - if want == ateapipb.Actor_STATUS_SUSPENDED && actor.GetStatus() == ateapipb.Actor_STATUS_SUSPENDING { - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - continue - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - } - return fmt.Errorf("timeout waiting for actor %q status %s", actorID, want) -} - -func (c *Client) waitForActorDeleted(ctx context.Context, actorID string, deadline time.Time) error { - for time.Now().Before(deadline) { - _, err := c.GetActor(ctx, actorID) - if err != nil { - if status.Code(err) == codes.NotFound { - return nil - } - return fmt.Errorf("get actor %q: %w", actorID, err) - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - } - return fmt.Errorf("timeout waiting for actor %q deletion", actorID) -} - -func sleepOrDone(ctx context.Context, d time.Duration) error { - t := time.NewTimer(d) - defer t.Stop() - select { - case <-ctx.Done(): - return ctx.Err() - case <-t.C: - return nil + return false, nil } } diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go index 38b9bdae39..9453005fc9 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go +++ 
b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go @@ -1,18 +1,15 @@ package substrate -import ( - "testing" - "time" +import "testing" - "github.com/agent-substrate/substrate/proto/ateapipb" -) - -func TestEnsureActorSuspendedAlreadySuspended(t *testing.T) { +func TestAdvanceActorDeleteEmptyID(t *testing.T) { t.Parallel() c := &Client{} - deadline := time.Now().Add(time.Minute) - err := c.ensureActorSuspended(t.Context(), "ahr-test", ateapipb.Actor_STATUS_SUSPENDED, deadline) + done, err := c.AdvanceActorDelete(t.Context(), "") if err != nil { t.Fatalf("unexpected error: %v", err) } + if !done { + t.Fatal("expected done for empty actor id") + } } diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 47d641a2cf..780fcd4b54 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -4,106 +4,135 @@ import ( "context" "fmt" "strings" - "time" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" ) -const workerPoolDrainTimeout = 3 * time.Minute +// AdvanceActorDelete deletes a harness actor via ate-api (one RPC step per call). +func (p *Provisioner) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { + if p == nil || p.Ate == nil || strings.TrimSpace(actorID) == "" { + return true, nil + } + return p.Ate.AdvanceActorDelete(ctx, actorID) +} -// Delete removes kagent-managed Substrate CRs after the harness actor has been removed. -// Order: golden snapshot actor (from ActorTemplate status), ActorTemplate, WorkerPool. 
-func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) error { +// AdvanceDelete issues delete requests and observes substrate cleanup progress without blocking. +// Returns true when all kagent-managed Substrate resources for this harness are gone. +func (p *Provisioner) AdvanceDelete(ctx context.Context, ah *v1alpha2.AgentHarness) (bool, error) { if ah == nil || ah.Annotations == nil { - return nil + return true, nil } + if p.Client == nil { + return true, nil + } + if ah.Annotations[AnnotationManagedActorTemplate] == "true" { - key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} - if err := p.deleteGoldenActor(ctx, key); err != nil { - return err + tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + goldenID, err := p.goldenActorID(ctx, tmplKey) + if err != nil { + return false, err } - var tmpl atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &tmpl); err == nil { - if err := p.Client.Delete(ctx, &tmpl); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("delete ActorTemplate %s: %w", key, err) + if goldenID != "" { + if p.Ate == nil { + return false, fmt.Errorf("substrate ate-api client is required to delete golden actor %q", goldenID) + } + done, err := p.Ate.AdvanceActorDelete(ctx, goldenID) + if err != nil { + return false, fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) } - } else if !apierrors.IsNotFound(err) { - return err + if !done { + return false, nil + } + } + var tmpl atev1alpha1.ActorTemplate + if done, err := p.advanceDeleteCR(ctx, tmplKey, &tmpl); err != nil || !done { + return false, err } } + if ah.Annotations[AnnotationManagedWorkerPool] == "true" { - key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + wpKey := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} var wp atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &wp); err == nil { - 
if err := p.Client.Delete(ctx, &wp); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("delete WorkerPool %s: %w", key, err) - } - } else if !apierrors.IsNotFound(err) { - return err + if done, err := p.advanceDeleteCR(ctx, wpKey, &wp); err != nil || !done { + return false, err + } + gone, err := p.workerPoolDeploymentGone(ctx, wpKey) + if err != nil { + return false, err } - if err := p.waitForWorkerPoolDeploymentGone(ctx, key); err != nil { - return err + if !gone { + return false, nil } } - return nil + + return true, nil } -func (p *Provisioner) deleteGoldenActor(ctx context.Context, tmplKey types.NamespacedName) error { - if p.Ate == nil || p.Client == nil { - return nil - } +func (p *Provisioner) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { var tmpl atev1alpha1.ActorTemplate if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { if apierrors.IsNotFound(err) { - return nil + return "", nil } - return fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) + return "", fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) } - goldenID := strings.TrimSpace(tmpl.Status.GoldenActorID) - if goldenID == "" { - return nil + return strings.TrimSpace(tmpl.Status.GoldenActorID), nil +} + +// advanceDeleteCR deletes obj when present; returns true when the object is gone. 
+func (p *Provisioner) advanceDeleteCR(ctx context.Context, key types.NamespacedName, obj client.Object) (bool, error) { + if err := p.Client.Get(ctx, key, obj); err != nil { + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err } - if err := p.Ate.deleteActorSequenced(ctx, goldenID); err != nil { - return fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) + if obj.GetDeletionTimestamp().IsZero() { + if err := p.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete %s: %w", key, err) + } + return false, nil } - return nil + return false, nil } func workerPoolDeploymentName(wpName string) string { return wpName + "-deployment" } -func (p *Provisioner) waitForWorkerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) error { - if p.Client == nil { - return nil - } +// workerPoolDeploymentGone reports whether the substrate WorkerPool deployment is absent or fully drained. 
+func (p *Provisioner) workerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) (bool, error) { deployKey := types.NamespacedName{Namespace: wpKey.Namespace, Name: workerPoolDeploymentName(wpKey.Name)} - deadline := time.Now().Add(workerPoolDrainTimeout) - for time.Now().Before(deadline) { - var deploy appsv1.Deployment - err := p.Client.Get(ctx, deployKey, &deploy) - if apierrors.IsNotFound(err) { - return nil - } - if err != nil { - return fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) - } - if deploy.DeletionTimestamp != nil { - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - continue - } - if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { - return nil - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } + var deploy appsv1.Deployment + err := p.Client.Get(ctx, deployKey, &deploy) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) + } + if !deploy.DeletionTimestamp.IsZero() { + return false, nil + } + if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { + return true, nil } - return fmt.Errorf("timeout waiting for WorkerPool deployment %s to drain", deployKey) + return false, nil } + +// HarnessLabelKey labels substrate resources managed for an AgentHarness. +const HarnessLabelKey = "kagent.dev/agent-harness" + +// HarnessNameFromLabels returns the AgentHarness name from provision labels. 
+func HarnessNameFromLabels(labels map[string]string) string { + if labels == nil { + return "" + } + return strings.TrimSpace(labels[HarnessLabelKey]) +} + diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go index cfa632157e..ae316c43be 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" @@ -19,12 +20,12 @@ type recordingActorDeleter struct { deleted []string } -func (r *recordingActorDeleter) deleteActorSequenced(_ context.Context, actorID string) error { +func (r *recordingActorDeleter) AdvanceActorDelete(_ context.Context, actorID string) (bool, error) { r.deleted = append(r.deleted, actorID) - return nil + return true, nil } -func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { +func TestProvisionerAdvanceDelete_DeletesGoldenActor(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -33,7 +34,9 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { ns := "kagent" tmpl := &atev1alpha1.ActorTemplate{ - ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns, Labels: map[string]string{ + HarnessLabelKey: "peterj-claw", + }}, Status: atev1alpha1.ActorTemplateStatus{ GoldenActorID: "golden-actor-uuid", Phase: atev1alpha1.PhaseReady, @@ -53,9 +56,29 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { rec := &recordingActorDeleter{} p := &Provisioner{Client: kube, Ate: rec} - require.NoError(t, 
p.Delete(context.Background(), ah)) + var complete bool + var err error + for range 5 { + complete, err = p.AdvanceDelete(context.Background(), ah) + require.NoError(t, err) + if complete { + break + } + } + require.True(t, complete, "AdvanceDelete should finish within a few reconcile passes") require.Equal(t, []string{"golden-actor-uuid"}, rec.deleted) var got atev1alpha1.ActorTemplate require.Error(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) } + +func TestWorkerPoolDeploymentGoneNotFound(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + p := &Provisioner{Client: kube} + gone, err := p.workerPoolDeploymentGone(context.Background(), types.NamespacedName{Namespace: "kagent", Name: "claw-wp"}) + require.NoError(t, err) + require.True(t, gone) +} diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token.go b/go/core/pkg/sandboxbackend/substrate/gateway_token.go index abe4b0ba53..fbb5c51634 100644 --- a/go/core/pkg/sandboxbackend/substrate/gateway_token.go +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token.go @@ -14,52 +14,33 @@ import ( // GatewayTokenSecretKey is the Secret data key used for per-harness OpenClaw gateway tokens. const GatewayTokenSecretKey = "token" -// ValidateGatewayTokenSpec requires exactly one per-harness OpenClaw gateway token source. -func ValidateGatewayTokenSpec(sub *v1alpha2.AgentHarnessSubstrateSpec) error { - if sub == nil { - return fmt.Errorf("spec.substrate is required") - } - hasToken := strings.TrimSpace(sub.GatewayToken) != "" - hasSecretRef := sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" - if hasToken == hasSecretRef { - return fmt.Errorf("exactly one of spec.substrate.gatewayToken or gatewayTokenSecretRef must be specified") - } - return nil -} - // ResolveGatewayToken returns the per-harness gateway token. 
+// Token source is validated at admission via AgentHarnessSubstrateSpec CEL rules. func ResolveGatewayToken(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness) (string, error) { if ah == nil || ah.Spec.Substrate == nil { return "", fmt.Errorf("spec.substrate is required") } - if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { - return "", err - } sub := ah.Spec.Substrate - if sub.GatewayTokenSecretRef != nil { + if sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" { return resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) } return strings.TrimSpace(sub.GatewayToken), nil } -func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, defaultNamespace string, ref *v1alpha2.TypedReference) (string, error) { +func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, namespace string, ref *v1alpha2.TypedLocalReference) (string, error) { if kube == nil { return "", fmt.Errorf("kubernetes client is required to resolve gateway token secret") } - ns := ref.Namespace - if ns == "" { - ns = defaultNamespace - } var secret corev1.Secret - if err := kube.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { - return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) + if err := kube.Get(ctx, types.NamespacedName{Namespace: namespace, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get gateway token secret %s/%s: %w", namespace, ref.Name, err) } if secret.Data == nil { - return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + return "", fmt.Errorf("gateway token secret %s/%s is empty", namespace, ref.Name) } val, ok := secret.Data[GatewayTokenSecretKey] if !ok { - return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, GatewayTokenSecretKey) + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", namespace, ref.Name, 
GatewayTokenSecretKey) } return strings.TrimSpace(string(val)), nil } diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 7022085dea..1909a1183b 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -107,9 +107,13 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H if h.ID == "" { return nil } - if err := b.client.deleteActorSequenced(ctx, h.ID); err != nil { + done, err := b.client.AdvanceActorDelete(ctx, h.ID) + if err != nil { return fmt.Errorf("substrate delete actor %q: %w", h.ID, err) } + if !done { + return fmt.Errorf("substrate delete actor %q in progress", h.ID) + } return nil } @@ -150,22 +154,9 @@ func ActorHost(actorID string, suffix string) string { } func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { - if ah.Status.Substrate != nil && ah.Status.Substrate.ActorTemplateRef.Name != "" { - ref := ah.Status.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - return ns, ref.Name - } if ah.Spec.Substrate != nil && ah.Spec.Substrate.ActorTemplateRef != nil { - ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - if ref.Name != "" { - return ns, ref.Name + if ref := ah.Spec.Substrate.ActorTemplateRef; ref.Name != "" { + return ah.Namespace, ref.Name } } // Auto-provisioned template in the harness namespace (also when status was not persisted yet). 
@@ -197,18 +188,6 @@ func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { if runtime != v1alpha2.AgentHarnessRuntimeSubstrate { return fmt.Errorf("substrate backend called for runtime %q", runtime) } - if ah.Spec.Substrate == nil { - return fmt.Errorf("spec.substrate is required when runtime is substrate") - } - if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { - return err - } - if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { - return nil - } - if loc := substrateSnapshotsLocation(ah); !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index 836e48e065..156f10aa9a 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -9,14 +9,11 @@ import ( "k8s.io/apimachinery/pkg/types" ) -// Ensure creates or updates Substrate CRs and waits for ActorTemplate Ready. +// Ensure creates or updates Substrate CRs and reports whether ActorTemplate is Ready (controller requeues until true). 
func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { if ah == nil || ah.Spec.Substrate == nil { return EnsureResult{}, fmt.Errorf("spec.substrate is required") } - if err := validateSubstrateProvisionSpec(ah); err != nil { - return EnsureResult{}, err - } if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { return p.ensureAdoptedActorTemplate(ctx, ah) @@ -48,11 +45,7 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En func (p *Provisioner) ensureAdoptedActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: ref.Name} ready, err := p.actorTemplateReady(ctx, tmplKey) if err != nil { return EnsureResult{}, err diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index bd24ca7b1e..cc61eb826a 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -131,7 +131,7 @@ func TestBuildOpenClawActorStartup_WithHarnessGatewayToken(t *testing.T) { { name: "secret token", substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - GatewayTokenSecretRef: &v1alpha2.TypedReference{Name: "openclaw-token"}, + GatewayTokenSecretRef: &v1alpha2.TypedLocalReference{Name: "openclaw-token"}, }, wantToken: "secret-token", }, diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go index c0d4b842e3..87a526b8bd 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -33,7 
+33,7 @@ type ProvisionDefaults struct { // ateActorDeleter removes actors from ate-api during harness teardown. type ateActorDeleter interface { - deleteActorSequenced(ctx context.Context, actorID string) error + AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) } // Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. @@ -53,24 +53,6 @@ type EnsureResult struct { ManagedActorTemplate bool } -func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { - sub := ah.Spec.Substrate - if err := ValidateGatewayTokenSpec(sub); err != nil { - return err - } - if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { - return nil - } - loc := substrateSnapshotsLocation(ah) - if !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { - return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") - } - return nil -} - func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { return atev1alpha1.RunscConfig{ AMD64: &atev1alpha1.RunscPlatformConfig{ diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go index e0e767e458..c08e87f8e0 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -14,7 +14,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" ) -func TestValidateSubstrateProvisionSpec(t *testing.T) { +func TestSubstrateSnapshotsLocationDefault(t *testing.T) { t.Parallel() ah := &v1alpha2.AgentHarness{ ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, @@ -22,41 +22,12 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { Runtime: 
v1alpha2.AgentHarnessRuntimeSubstrate, Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ GatewayToken: "test-token", - SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ - Location: "gs://bucket/prefix/", - }, }, }, } - if err := validateSubstrateProvisionSpec(ah); err != nil { - t.Fatalf("expected valid: %v", err) - } - - ah.Spec.Substrate.SnapshotsConfig = nil - if err := validateSubstrateProvisionSpec(ah); err != nil { - t.Fatalf("expected default snapshots config to be valid: %v", err) - } if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { t.Fatalf("got default snapshots location %q", got) } - - ah.Spec.Substrate.GatewayToken = "" - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error when gateway token is not configured") - } - - ah.Spec.Substrate.GatewayToken = "test-token" - ah.Spec.Substrate.SnapshotsConfig = &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{Location: "s3://nope"} - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error for non-gs location") - } - - ah.Spec.Substrate.SnapshotsConfig.Location = "gs://ok" - ah.Spec.Substrate.WorkerPoolRef = &v1alpha2.TypedReference{Name: "pool"} - ah.Spec.Substrate.WorkerPool = &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 2} - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error for workerPoolRef and workerPool together") - } } func TestEnsureWorkerPoolUsesDefaultAteomImage(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go index 3504c7f651..f715aa1651 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go @@ -16,11 +16,7 @@ import ( func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { sub := ah.Spec.Substrate 
if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { - ns := sub.WorkerPoolRef.Namespace - if ns == "" { - ns = ah.Namespace - } - key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + key := types.NamespacedName{Namespace: ah.Namespace, Name: sub.WorkerPoolRef.Name} var wp atev1alpha1.WorkerPool if err := p.Client.Get(ctx, key, &wp); err != nil { return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 2c2f18ff71..52f814c1aa 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -536,8 +536,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -564,8 +562,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -578,6 +574,7 @@ spec: description: |- Location is the GCS URI prefix for golden and incremental snapshots. 
Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + pattern: ^gs:// type: string required: - location @@ -609,8 +606,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -624,6 +619,8 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) + - message: workerPoolRef and workerPool are mutually exclusive + rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -633,6 +630,10 @@ spec: || (has(c.slack) && ((self.backend == ''hermes'' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == ''openclaw'' || self.backend == ''nemoclaw'') && has(c.slack.openclaw) && !has(c.slack.hermes)))))' + - message: spec.substrate may only be set when runtime is substrate + rule: '!has(self.substrate) || self.runtime == ''substrate''' + - message: spec.substrate is required when runtime is substrate + rule: self.runtime != 'substrate' || has(self.substrate) status: description: AgentHarnessStatus is the observed state of an AgentHarness. properties: @@ -726,42 +727,70 @@ spec: format: int64 type: integer substrate: - description: Substrate records auto-provisioned Substrate CR references. + description: Substrate records observed Substrate provisioning state. properties: - actorTemplateReady: - description: ActorTemplateReady is true when the template phase - is Ready (golden snapshot taken). - type: boolean - actorTemplateRef: - description: ActorTemplateRef is the ActorTemplate used when creating - the actor. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object - workerPoolRef: - description: WorkerPoolRef is the WorkerPool used by the harness - ActorTemplate. 
- properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object + conditions: + description: Conditions describe substrate provisioning progress + (e.g. ActorTemplate golden snapshot). + items: + description: Condition contains details for one aspect of the + current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object type: object From 6b682d5b791ebed891004bc8cba5de65ec6be6f4 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 08:34:54 -0700 Subject: [PATCH 07/32] make linter happy Signed-off-by: Peter Jausovec --- go/api/httpapi/types.go | 14 +++++++------- .../internal/controller/agentharness_controller.go | 8 ++++---- go/core/internal/httpserver/handlers/agents.go | 4 ++-- go/core/pkg/sandboxbackend/openclaw/credentials.go | 10 ---------- go/core/pkg/sandboxbackend/openclaw/types.go | 8 ++++---- .../pkg/sandboxbackend/openshell/ssh_terminal.go | 2 +- .../sandboxbackend/openshell/ssh_terminal_test.go | 2 +- .../sandboxbackend/substrate/delete_provision.go | 1 - go/core/pkg/sandboxbackend/substrate/openclaw.go | 5 +---- .../substrate/provision_openclaw_test.go | 6 +++--- 10 files changed, 23 insertions(+), 37 deletions(-) diff --git a/go/api/httpapi/types.go b/go/api/httpapi/types.go index 0107e5ffe0..d704eb549a 100644 --- a/go/api/httpapi/types.go +++ b/go/api/httpapi/types.go @@ -146,13 +146,13 @@ type OpenshellAgentHarnessListEntry struct { // SubstrateAgentHarnessListEntry is set when runtime is substrate. 
type SubstrateAgentHarnessListEntry struct { - Backend v1alpha2.AgentHarnessBackendType `json:"backend"` - Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` - ActorID string `json:"actorId,omitempty"` + Backend v1alpha2.AgentHarnessBackendType `json:"backend"` + Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` + ActorID string `json:"actorId,omitempty"` GatewayUIPath string `json:"gatewayUIPath,omitempty"` ModelConfigRef string `json:"modelConfigRef,omitempty"` - BackendRefID string `json:"backendRefId,omitempty"` - Endpoint string `json:"endpoint,omitempty"` + BackendRefID string `json:"backendRefId,omitempty"` + Endpoint string `json:"endpoint,omitempty"` } type AgentResponse struct { @@ -167,8 +167,8 @@ type AgentResponse struct { DeploymentReady bool `json:"deploymentReady"` Accepted bool `json:"accepted"` WorkloadMode v1alpha2.WorkloadMode `json:"workloadMode,omitempty"` - OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` - SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` + OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` } // Session types diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index e371dbb0b3..7e771b0ffb 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -53,10 +53,10 @@ const ( // harness VMs are a generic exec/SSH-able environment with no in-cluster // workload owned by kagent. 
type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + Client client.Client + Recorder events.EventRecorder + OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend SubstrateProvisioner *substrate.Provisioner } diff --git a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index 76056660c8..96249cf9d3 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -179,7 +179,7 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, }, }, DeploymentReady: ready, - Accepted: accepted, + Accepted: accepted, } switch runtime { @@ -188,7 +188,7 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, Backend: sb.Spec.Backend, Runtime: runtime, ModelConfigRef: sb.Spec.ModelConfigRef, - GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), + GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), } if sb.Status.BackendRef != nil { subEntry.BackendRefID = sb.Status.BackendRef.ID diff --git a/go/core/pkg/sandboxbackend/openclaw/credentials.go b/go/core/pkg/sandboxbackend/openclaw/credentials.go index b167802c96..f0d56c4bc9 100644 --- a/go/core/pkg/sandboxbackend/openclaw/credentials.go +++ b/go/core/pkg/sandboxbackend/openclaw/credentials.go @@ -81,13 +81,3 @@ func channelCredentialContainerEnv(cred v1alpha2.AgentHarnessChannelCredential, return corev1.EnvVar{}, fmt.Errorf("unknown value source type %q", cred.ValueFrom.Type) } } - -// resolvedChannelSecret returns the plaintext value putChannelCredential stored in env. 
-// OpenShell bootstrap still inlines channel tokens in openclaw.json; Substrate uses OpenClaw env SecretRefs instead. -func resolvedChannelSecret(env map[string]string, envKey string) (string, error) { - v := strings.TrimSpace(env[envKey]) - if v == "" { - return "", fmt.Errorf("credential %s is missing or empty after resolve", envKey) - } - return v, nil -} diff --git a/go/core/pkg/sandboxbackend/openclaw/types.go b/go/core/pkg/sandboxbackend/openclaw/types.go index 2fac8ba330..5d993dd824 100644 --- a/go/core/pkg/sandboxbackend/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openclaw/types.go @@ -95,10 +95,10 @@ type slackAccount struct { Mode string `json:"mode"` BotToken credentialValue `json:"botToken"` AppToken credentialValue `json:"appToken"` - UserTokenReadOnly bool `json:"userTokenReadOnly"` - GroupPolicy string `json:"groupPolicy"` - Capabilities slackCaps `json:"capabilities"` - DM *groupDM `json:"dm,omitempty"` + UserTokenReadOnly bool `json:"userTokenReadOnly"` + GroupPolicy string `json:"groupPolicy"` + Capabilities slackCaps `json:"capabilities"` + DM *groupDM `json:"dm,omitempty"` } type slackCaps struct { diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go index c7e6c2033c..4437ba35b4 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go @@ -4,8 +4,8 @@ import ( "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" ) // ResolveSSHRemoteCommand decides whether to run an interactive shell or a harness CLI. 
diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go index a1f4849711..252b111a7b 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go @@ -3,9 +3,9 @@ package openshell_test import ( "testing" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) func TestResolveSSHRemoteCommand(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 780fcd4b54..a662454793 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -135,4 +135,3 @@ func HarnessNameFromLabels(labels map[string]string) string { } return strings.TrimSpace(labels[HarnessLabelKey]) } - diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 1909a1183b..a3374d01fc 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -117,12 +117,9 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H return nil } -func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { +func (b *ClawBackend) OnAgentHarnessReady(_ context.Context, _ *v1alpha2.AgentHarness, _ sandboxbackend.Handle) error { // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time // (see substrate/provision_openclaw.go — openclaw.BuildSubstrateBootstrapJSON with secretKeyRef env). 
- _ = ctx - _ = ah - _ = h return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index cc61eb826a..95e58211e7 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -77,7 +77,7 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { // Decode embedded JSON from the base64 line in the startup script. var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if !strings.Contains(line, "base64 -d") { continue } @@ -197,7 +197,7 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { require.NoError(t, err) var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if strings.Contains(line, "base64 -d") { start := strings.Index(line, `'`) + 1 end := strings.LastIndex(line, `'`) @@ -217,7 +217,7 @@ func gatewayTokenFromStartup(t *testing.T, script string) string { t.Helper() var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if strings.Contains(line, "base64 -d") { start := strings.Index(line, `'`) + 1 end := strings.LastIndex(line, `'`) From 768a115f35d640a9980c45f0f812d9add11933aa Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 09:06:26 -0700 Subject: [PATCH 08/32] move write ops to writer-role Signed-off-by: Peter Jausovec --- helm/kagent/templates/rbac/getter-role.yaml | 4 ---- helm/kagent/templates/rbac/writer-role.yaml | 10 ++++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/helm/kagent/templates/rbac/getter-role.yaml b/helm/kagent/templates/rbac/getter-role.yaml index cafe9d0f5c..ceab5ec9aa 100644 --- a/helm/kagent/templates/rbac/getter-role.yaml +++ 
b/helm/kagent/templates/rbac/getter-role.yaml @@ -62,10 +62,6 @@ - get - list - watch - - create - - update - - patch - - delete - apiGroups: - ate.dev resources: diff --git a/helm/kagent/templates/rbac/writer-role.yaml b/helm/kagent/templates/rbac/writer-role.yaml index b735e159bd..b9516cae03 100644 --- a/helm/kagent/templates/rbac/writer-role.yaml +++ b/helm/kagent/templates/rbac/writer-role.yaml @@ -75,6 +75,16 @@ - update - patch - delete +- apiGroups: + - ate.dev + resources: + - workerpools + - actortemplates + verbs: + - create + - update + - patch + - delete {{- end -}} {{- include "kagent.rbac.validate" . -}} From cb9a31b65122ea12541205bfc41bc4d4fef123ef Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 09:30:23 -0700 Subject: [PATCH 09/32] commenting out the substrate section in values Signed-off-by: Peter Jausovec --- helm/kagent/values.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index a9a335ef85..0305601676 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -234,16 +234,16 @@ controller: # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. 
- substrate: - enabled: true - ateApiEndpoint: "dns:///api.ate-system.svc:443" - ateApiInsecure: false - pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" - runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" - runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" - runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" - runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" - ateomImage: "localhost:5001/ateom-gvisor:latest" + # substrate: + # enabled: true + # ateApiEndpoint: "dns:///api.ate-system.svc:443" + # ateApiInsecure: false + # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + # ateomImage: "localhost:5001/ateom-gvisor:latest" envFrom: [] From 6cf6d58d1a4cfa420e593bd6e4617480c4b4de86 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 08:19:38 -0700 Subject: [PATCH 10/32] fix failing helm unit tests Signed-off-by: Peter Jausovec --- .../templates/controller-deployment.yaml | 2 +- helm/kagent/values.yaml | 25 +++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index d63ff6bd92..2727ace3a6 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -87,7 +87,7 @@ spec: {{- with .Values.controller.env }} {{- toYaml . 
| nindent 12 }} {{- end }} - {{- if .Values.controller.substrate.enabled }} + {{- if and .Values.controller.substrate .Values.controller.substrate.enabled }} - name: SUBSTRATE_ATE_API_ENDPOINT value: {{ .Values.controller.substrate.ateApiEndpoint | quote }} {{- if .Values.controller.substrate.ateApiInsecure }} diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 0305601676..39e97ba23d 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -234,16 +234,21 @@ controller: # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. - # substrate: - # enabled: true - # ateApiEndpoint: "dns:///api.ate-system.svc:443" - # ateApiInsecure: false - # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" - # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" - # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" - # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" - # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" - # ateomImage: "localhost:5001/ateom-gvisor:latest" + substrate: + enabled: false + ateApiEndpoint: "" + ateApiInsecure: false + defaultActorTemplateNamespace: "" + defaultActorTemplateName: "" + # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + # ateomImage: 
"localhost:5001/ateom-gvisor:latest" + # Example when enabled: + # enabled: true + # ateApiEndpoint: "dns:///api.ate-system.svc:443" envFrom: [] From 40f6209878360c2fd7f510b559f8a24d40a99400 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 09:24:01 -0700 Subject: [PATCH 11/32] fix remaining pr feedback Signed-off-by: Peter Jausovec --- go/api/httpapi/types.go | 1 - go/api/v1alpha2/agentharness_types.go | 7 +- .../handlers/agentharness_gateway.go | 57 +++++++----- .../agentharness_gateway_path_test.go | 4 +- .../handlers/agentharness_gateway_test.go | 34 +++++-- .../internal/httpserver/handlers/agents.go | 6 +- go/core/internal/httpserver/server.go | 2 +- go/core/pkg/sandboxbackend/async.go | 3 +- .../pkg/sandboxbackend/substrate/client.go | 28 +++++- .../sandboxbackend/substrate/client_test.go | 31 +++++++ .../pkg/sandboxbackend/substrate/config.go | 5 +- .../sandboxbackend/substrate/gateway_token.go | 20 ++++- .../substrate/gateway_token_test.go | 82 +++++++++++++++++ .../pkg/sandboxbackend/substrate/openclaw.go | 64 ++++++------- .../sandboxbackend/substrate/openclaw_test.go | 90 ++++++++++++++++--- .../substrate/provision_actortemplate.go | 2 +- .../substrate/provision_openclaw.go | 11 +-- .../substrate/provision_openclaw_test.go | 24 +++++ .../substrate/provision_shared.go | 18 +++- .../agent-form/OpenClawSandboxFields.tsx | 57 ++++++++++++ .../lib/__tests__/openClawSandboxForm.test.ts | 59 ++++++++++++ ui/src/lib/openClawSandboxForm.ts | 57 +++++++++++- ui/src/types/index.ts | 1 - 23 files changed, 557 insertions(+), 106 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/client_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/gateway_token_test.go diff --git a/go/api/httpapi/types.go b/go/api/httpapi/types.go index d704eb549a..9d4e225ff8 100644 --- a/go/api/httpapi/types.go +++ b/go/api/httpapi/types.go @@ -152,7 +152,6 @@ type SubstrateAgentHarnessListEntry struct { GatewayUIPath string 
`json:"gatewayUIPath,omitempty"` ModelConfigRef string `json:"modelConfigRef,omitempty"` BackendRefID string `json:"backendRefId,omitempty"` - Endpoint string `json:"endpoint,omitempty"` } type AgentResponse struct { diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index f0181a569b..699d629386 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -100,7 +100,7 @@ type AgentHarnessSubstrateSpec struct { // +optional ActorTemplateRef *TypedLocalReference `json:"actorTemplateRef,omitempty"` - // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). + // GatewayPort is the port OpenClaw listens on inside the actor. Defaults to 80. // +optional // +kubebuilder:default=80 GatewayPort int32 `json:"gatewayPort,omitempty"` @@ -292,8 +292,9 @@ type AgentHarnessNetwork struct { // AgentHarnessConnection describes how clients reach the provisioned harness VM. type AgentHarnessConnection struct { - // Endpoint is the backend-specific address (gRPC target, SSH host:port, - // ...) clients should use to reach the harness. + // Endpoint is the backend-specific address clients should use to reach the harness. + // OpenShell: gRPC gateway URL with sandbox id (gateway#sandbox). Substrate: kagent + // gateway proxy path (/api/agentharnesses///gateway/). 
// +optional Endpoint string `json:"endpoint,omitempty"` } diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 5215fe82cb..9da0397694 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -12,6 +12,7 @@ import ( "github.com/gorilla/mux" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" @@ -21,12 +22,14 @@ import ( const ( // OpenClaw 2026.3.28+ returns 403 without operator scopes on HTTP/WS when only Bearer token is sent. openclawDefaultOperatorScopes = "operator.admin" - // Origin OpenClaw accepts by default for bind=lan port=80 (localhost/127.0.0.1 on gateway port). - openclawLoopbackOrigin = "http://127.0.0.1:80" ) +func openclawLoopbackOrigin(port int32) string { + return fmt.Sprintf("http://127.0.0.1:%d", port) +} + // AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. -// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). +// Traffic is proxied directly to the actor ateom pod IP on spec.substrate.gatewayPort (default 80). 
type AgentHarnessGatewayConfig struct { AteAPIEndpoint string AteAPIInsecure bool @@ -50,6 +53,16 @@ func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Requ return } + if h.Agents == nil { + http.Error(w, "agents handler is not configured", http.StatusInternalServerError) + return + } + agentRef := types.NamespacedName{Namespace: namespace, Name: name}.String() + if err := Check(h.Agents.Authorizer, r, auth.Resource{Type: "Agent", Name: agentRef}); err != nil { + w.RespondWithError(err) + return + } + var ah v1alpha2.AgentHarness if err := h.KubeClient.Get(r.Context(), types.NamespacedName{Namespace: namespace, Name: name}, &ah); err != nil { if apierrors.IsNotFound(err) { @@ -88,7 +101,7 @@ func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Requ return } - publicPrefix := agentHarnessGatewayPublicPrefix(namespace, name) + publicPrefix := substrate.AgentHarnessGatewayUIPath(namespace, name) _, redirectTo, ok := resolveGatewayUpstreamPath(r.URL.Path, namespace, name, isWebSocketUpgrade(r)) if !ok { @@ -105,7 +118,8 @@ func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Requ return } - proxy := newAgentHarnessGatewayProxy(target, upstreamHost, token, publicPrefix, namespace, name, log) + gwPort := substrate.GatewayPort(&ah) + proxy := newAgentHarnessGatewayProxy(target, upstreamHost, token, publicPrefix, namespace, name, gwPort, log) proxy.ServeHTTP(w, r) } @@ -138,7 +152,8 @@ func (h *Handlers) resolveSubstrateGatewayTarget(ctx context.Context, ah *v1alph if podIP == "" { return nil, "", fmt.Errorf("substrate actor %q has no pod IP (status %s; resume the actor and wait until running)", actorID, actor.GetStatus()) } - target, host, err := substrateGatewayPodTarget(podIP) + port := substrate.GatewayPort(ah) + target, host, err := substrateGatewayPodTarget(podIP, port) if err != nil { return nil, "", fmt.Errorf("substrate actor %q pod IP %q: %w", actorID, podIP, err) } @@ -150,12 +165,15 @@ func 
(h *Handlers) resolveSubstrateGatewayTarget(ctx context.Context, ah *v1alph return target, host, nil } -func substrateGatewayPodTarget(podIP string) (*url.URL, string, error) { +func substrateGatewayPodTarget(podIP string, port int32) (*url.URL, string, error) { ip := strings.TrimSpace(podIP) if ip == "" || net.ParseIP(ip) == nil { return nil, "", fmt.Errorf("invalid actor pod IP %q", podIP) } - target, err := url.Parse("http://" + net.JoinHostPort(ip, "80")) + if port <= 0 { + port = 80 + } + target, err := url.Parse("http://" + net.JoinHostPort(ip, fmt.Sprintf("%d", port))) if err != nil { return nil, "", fmt.Errorf("parse actor pod target: %w", err) } @@ -163,11 +181,7 @@ func substrateGatewayPodTarget(podIP string) (*url.URL, string, error) { } func agentHarnessHarnessBase(namespace, name string) string { - return "/api/agentharnesses/" + namespace + "/" + name -} - -func agentHarnessGatewayPublicPrefix(namespace, name string) string { - return agentHarnessHarnessBase(namespace, name) + "/gateway/" + return substrate.AgentHarnessAPIBase(namespace, name) } // resolveGatewayUpstreamPath maps the public URL to the upstream path on the actor. @@ -181,12 +195,12 @@ func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade b } rel := strings.TrimPrefix(requestPath, base) if rel == "" { - return "", agentHarnessGatewayPublicPrefix(namespace, name), true + return "", substrate.AgentHarnessGatewayUIPath(namespace, name), true } switch { case rel == "/gateway": - upstream := agentHarnessGatewayPublicPrefix(namespace, name) + upstream := substrate.AgentHarnessGatewayUIPath(namespace, name) if wsUpgrade { return upstream, "", true } @@ -199,16 +213,17 @@ func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade b } // normalizeOpenClawBrowserOrigin rewrites Origin/Referer so OpenClaw accepts WS/API from kagent-ui -// (e.g. http://localhost:8001) while the gateway listens on the actor pod :80. 
-func normalizeOpenClawBrowserOrigin(req *http.Request) { +// (e.g. http://localhost:8001) while the gateway listens on the actor pod. +func normalizeOpenClawBrowserOrigin(req *http.Request, gwPort int32) { if req == nil { return } + origin := openclawLoopbackOrigin(gwPort) if req.Header.Get("Origin") != "" { - req.Header.Set("Origin", openclawLoopbackOrigin) + req.Header.Set("Origin", origin) } if req.Header.Get("Referer") != "" { - req.Header.Set("Referer", openclawLoopbackOrigin+"/") + req.Header.Set("Referer", origin+"/") } } @@ -220,7 +235,7 @@ func isWebSocketUpgrade(r *http.Request) bool { strings.Contains(strings.ToLower(r.Header.Get("Connection")), "upgrade") } -func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, log interface { +func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, gwPort int32, log interface { Error(error, string, ...any) }) *httputil.ReverseProxy { proxy := &httputil.ReverseProxy{ @@ -237,7 +252,7 @@ func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPre pr.Out.Header.Set("Authorization", "Bearer "+token) } pr.Out.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) - normalizeOpenClawBrowserOrigin(pr.Out) + normalizeOpenClawBrowserOrigin(pr.Out, gwPort) subPath, _, pathOK := resolveGatewayUpstreamPath(pr.In.URL.Path, namespace, name, isWebSocketUpgrade(pr.In)) if !pathOK { subPath = "/" diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go index 433bcd5205..2b5603829a 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go @@ -3,12 +3,14 @@ package handlers import ( "net/http" "testing" + + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) func 
TestResolveGatewayUpstreamPath(t *testing.T) { t.Parallel() ns, name := "kagent", "my-claw" - public := agentHarnessGatewayPublicPrefix(ns, name) + public := substrate.AgentHarnessGatewayUIPath(ns, name) tests := []struct { name string diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go index 5e9775a4fa..7d161a3991 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go @@ -8,11 +8,13 @@ import ( "net/url" "strings" "testing" + + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) func TestSubstrateGatewayPodTarget(t *testing.T) { t.Parallel() - target, host, err := substrateGatewayPodTarget("10.244.0.29") + target, host, err := substrateGatewayPodTarget("10.244.0.29", 80) if err != nil { t.Fatal(err) } @@ -24,9 +26,23 @@ func TestSubstrateGatewayPodTarget(t *testing.T) { } } +func TestSubstrateGatewayPodTargetCustomPort(t *testing.T) { + t.Parallel() + target, host, err := substrateGatewayPodTarget("10.244.0.29", 8080) + if err != nil { + t.Fatal(err) + } + if host != "10.244.0.29" { + t.Fatalf("host = %q", host) + } + if target.Scheme != "http" || target.Host != "10.244.0.29:8080" { + t.Fatalf("target = %s", target.String()) + } +} + func TestSubstrateGatewayPodTargetRejectsInvalidIP(t *testing.T) { t.Parallel() - _, _, err := substrateGatewayPodTarget("not-an-ip") + _, _, err := substrateGatewayPodTarget("not-an-ip", 80) if err == nil { t.Fatal("expected error for invalid pod IP") } @@ -37,7 +53,7 @@ func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { const podIP = "10.244.0.29" const token = "some-token" ns, name := "kagent", "my-claw" - publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) + publicPrefix := substrate.AgentHarnessGatewayUIPath(ns, name) var gotHost, gotAuth, gotScopes, gotPath string upstream := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -55,7 +71,7 @@ func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { t.Fatal(err) } - proxy := newAgentHarnessGatewayProxy(target, podIP, token, publicPrefix, ns, name, testLog{t}) + proxy := newAgentHarnessGatewayProxy(target, podIP, token, publicPrefix, ns, name, 80, testLog{t}) req := httptest.NewRequest(http.MethodGet, publicPrefix, nil) rec := httptest.NewRecorder() proxy.ServeHTTP(rec, req) @@ -85,13 +101,13 @@ func TestGatewayProxyRewriteTargetsPodIPOnWebSocketPath(t *testing.T) { t.Parallel() const podIP = "10.244.0.29" ns, name := "kagent", "my-claw" - publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) + publicPrefix := substrate.AgentHarnessGatewayUIPath(ns, name) target, err := url.Parse("http://" + podIP + ":80") if err != nil { t.Fatal(err) } - proxy := newAgentHarnessGatewayProxy(target, podIP, "tok", publicPrefix, ns, name, testLog{t}) + proxy := newAgentHarnessGatewayProxy(target, podIP, "tok", publicPrefix, ns, name, 80, testLog{t}) req := httptest.NewRequest(http.MethodGet, strings.TrimSuffix(publicPrefix, "/"), nil) req.Header.Set("Connection", "Upgrade") req.Header.Set("Upgrade", "websocket") @@ -116,10 +132,10 @@ func TestGatewayProxyRewriteTargetsPodIPOnWebSocketPath(t *testing.T) { if outReq.Header.Get("x-openclaw-scopes") != openclawDefaultOperatorScopes { t.Fatalf("missing scopes header") } - if outReq.Header.Get("Origin") != openclawLoopbackOrigin { - t.Fatalf("Origin = %q, want %q", outReq.Header.Get("Origin"), openclawLoopbackOrigin) + if outReq.Header.Get("Origin") != openclawLoopbackOrigin(80) { + t.Fatalf("Origin = %q, want %q", outReq.Header.Get("Origin"), openclawLoopbackOrigin(80)) } - if outReq.Header.Get("Referer") != openclawLoopbackOrigin+"/" { + if outReq.Header.Get("Referer") != openclawLoopbackOrigin(80)+"/" { t.Fatalf("Referer = %q", outReq.Header.Get("Referer")) } } diff --git 
a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index 96249cf9d3..4ac37e848a 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -15,6 +15,7 @@ import ( "github.com/kagent-dev/kagent/go/core/internal/utils" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -188,15 +189,12 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, Backend: sb.Spec.Backend, Runtime: runtime, ModelConfigRef: sb.Spec.ModelConfigRef, - GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), + GatewayUIPath: substrate.AgentHarnessGatewayUIPath(sb.Namespace, sb.Name), } if sb.Status.BackendRef != nil { subEntry.BackendRefID = sb.Status.BackendRef.ID subEntry.ActorID = sb.Status.BackendRef.ID } - if sb.Status.Connection != nil { - subEntry.Endpoint = sb.Status.Connection.Endpoint - } resp.SubstrateAgentHarness = subEntry default: entry := &api.OpenshellAgentHarnessListEntry{ diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index 037393381f..294fa820e9 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -315,7 +315,7 @@ func (s *HTTPServer) setupRoutes() { // OpenShell sandbox PTY (browser WebSocket → gateway CONNECT → SSH). Authenticated like other /api routes. s.router.HandleFunc(APIPathSandboxSSH, adaptHandler(s.handlers.HandleSandboxSSHWebSocket)).Methods(http.MethodGet) - // Substrate OpenClaw gateway proxy (HTTP + WebSocket) to the actor pod IP :80. + // Substrate OpenClaw gateway proxy (HTTP + WebSocket) to the actor pod IP on spec.substrate.gatewayPort. 
s.router.PathPrefix(APIPathAgentHarnessHarness).Handler( adaptHandler(s.handlers.HandleAgentHarnessGateway), ) diff --git a/go/core/pkg/sandboxbackend/async.go b/go/core/pkg/sandboxbackend/async.go index e680259949..61e0d85542 100644 --- a/go/core/pkg/sandboxbackend/async.go +++ b/go/core/pkg/sandboxbackend/async.go @@ -14,7 +14,8 @@ type Handle struct { } // EnsureResult is returned by EnsureAgentHarness. Endpoint (if set) is surfaced -// to users via AgentHarness.Status.Connection. +// to users via AgentHarness.Status.Connection (OpenShell: gateway URL#sandbox id; +// Substrate: kagent gateway proxy path). type EnsureResult struct { Handle Handle Endpoint string diff --git a/go/core/pkg/sandboxbackend/substrate/client.go b/go/core/pkg/sandboxbackend/substrate/client.go index 70291c7bb8..b7987c668b 100644 --- a/go/core/pkg/sandboxbackend/substrate/client.go +++ b/go/core/pkg/sandboxbackend/substrate/client.go @@ -8,6 +8,7 @@ import ( "github.com/agent-substrate/substrate/proto/ateapipb" "google.golang.org/grpc" + "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" ) @@ -35,14 +36,19 @@ func Dial(ctx context.Context, cfg Config) (*Client, error) { if cfg.Insecure { opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) } else { - opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}))) + opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{MinVersion: tls.VersionTLS12}))) } conn, err := grpc.NewClient(cfg.AteAPIEndpoint, opts...) if err != nil { return nil, fmt.Errorf("substrate: dial ate-api %q: %w", cfg.AteAPIEndpoint, err) } - _ = dialCtx + // NewClient stays idle until Connect() or an RPC; waitConnReady enforces DialTimeout. 
+ conn.Connect() + if err := waitConnReady(dialCtx, conn); err != nil { + _ = conn.Close() + return nil, fmt.Errorf("substrate: dial ate-api %q: %w", cfg.AteAPIEndpoint, err) + } return &Client{ ControlClient: ateapipb.NewControlClient(conn), @@ -51,6 +57,24 @@ func Dial(ctx context.Context, cfg Config) (*Client, error) { }, nil } +func waitConnReady(ctx context.Context, conn *grpc.ClientConn) error { + for { + switch s := conn.GetState(); s { + case connectivity.Ready: + return nil + case connectivity.Shutdown: + return fmt.Errorf("connection shut down") + default: + if !conn.WaitForStateChange(ctx, s) { + if err := ctx.Err(); err != nil { + return err + } + return fmt.Errorf("connection closed before ready") + } + } + } +} + func (c *Client) Close() error { if c.conn != nil { return c.conn.Close() diff --git a/go/core/pkg/sandboxbackend/substrate/client_test.go b/go/core/pkg/sandboxbackend/substrate/client_test.go new file mode 100644 index 0000000000..d1c1417db2 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/client_test.go @@ -0,0 +1,31 @@ +package substrate + +import ( + "context" + "net" + "testing" + "time" + + "github.com/stretchr/testify/require" + "google.golang.org/grpc" +) + +func TestDial_tcpReachesReady(t *testing.T) { + lis, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + srv := grpc.NewServer() + go func() { _ = srv.Serve(lis) }() + t.Cleanup(func() { + srv.Stop() + _ = lis.Close() + }) + + cfg := Config{ + AteAPIEndpoint: lis.Addr().String(), + Insecure: true, + DialTimeout: 2 * time.Second, + } + c, err := Dial(context.Background(), cfg) + require.NoError(t, err) + require.NoError(t, c.Close()) +} diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go index 092e68ef92..793a4c98d5 100644 --- a/go/core/pkg/sandboxbackend/substrate/config.go +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -7,8 +7,9 @@ type Config struct { // AteAPIEndpoint is a gRPC target 
(e.g. dns:///api.ate-system.svc:443). AteAPIEndpoint string Insecure bool - DialTimeout time.Duration - CallTimeout time.Duration + // DialTimeout bounds the initial dial. Zero defaults to 10s in Dial. + DialTimeout time.Duration + CallTimeout time.Duration // DefaultActorTemplateNamespace/name is a legacy fallback when status/spec refs are unset. DefaultActorTemplateNamespace string diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token.go b/go/core/pkg/sandboxbackend/substrate/gateway_token.go index fbb5c51634..0244431dca 100644 --- a/go/core/pkg/sandboxbackend/substrate/gateway_token.go +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token.go @@ -21,10 +21,20 @@ func ResolveGatewayToken(ctx context.Context, kube client.Client, ah *v1alpha2.A return "", fmt.Errorf("spec.substrate is required") } sub := ah.Spec.Substrate + var token string + var err error if sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" { - return resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) + token, err = resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) + } else { + token = strings.TrimSpace(sub.GatewayToken) } - return strings.TrimSpace(sub.GatewayToken), nil + if err != nil { + return "", err + } + if token == "" { + return "", fmt.Errorf("gateway token must not be empty") + } + return token, nil } func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, namespace string, ref *v1alpha2.TypedLocalReference) (string, error) { @@ -42,5 +52,9 @@ func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, namespac if !ok { return "", fmt.Errorf("gateway token secret %s/%s missing key %q", namespace, ref.Name, GatewayTokenSecretKey) } - return strings.TrimSpace(string(val)), nil + token := strings.TrimSpace(string(val)) + if token == "" { + return "", fmt.Errorf("gateway token secret %s/%s: key %q must not be empty", namespace, ref.Name, 
GatewayTokenSecretKey) + } + return token, nil } diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token_test.go b/go/core/pkg/sandboxbackend/substrate/gateway_token_test.go new file mode 100644 index 0000000000..9cfb7dc57f --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token_test.go @@ -0,0 +1,82 @@ +package substrate + +import ( + "context" + "testing" + + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestResolveGatewayTokenRejectsEmptySecretValue(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + const ns = "kagent" + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayTokenSecretRef: &v1alpha2.TypedLocalReference{Name: "openclaw-token"}, + }, + }, + } + + for _, tt := range []struct { + name string + value []byte + }{ + {name: "empty", value: []byte{}}, + {name: "whitespace", value: []byte(" \t\n ")}, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openclaw-token", Namespace: ns}, + Data: map[string][]byte{GatewayTokenSecretKey: tt.value}, + } + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret).Build() + + _, err := ResolveGatewayToken(context.Background(), kube, ah) + require.Error(t, err) + require.Contains(t, err.Error(), `key "token" must not be empty`) + }) + } +} + +func TestResolveGatewayTokenAcceptsNonemptySecretValue(t *testing.T) { + t.Parallel() + + scheme 
:= runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + const ns = "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openclaw-token", Namespace: ns}, + Data: map[string][]byte{GatewayTokenSecretKey: []byte(" secret-token ")}, + } + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret).Build() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayTokenSecretRef: &v1alpha2.TypedLocalReference{Name: "openclaw-token"}, + }, + }, + } + + token, err := ResolveGatewayToken(context.Background(), kube, ah) + require.NoError(t, err) + require.Equal(t, "secret-token", token) +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index a3374d01fc..3427000393 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -2,8 +2,9 @@ package substrate import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" - "regexp" "strings" "github.com/agent-substrate/substrate/proto/ateapipb" @@ -17,11 +18,33 @@ import ( const ( defaultActorHostSuffix = "actors.resources.substrate.ate.dev" - defaultSubstrateGWPort = int32(80) actorIDPrefix = "ahr" + actorIDHashHexLen = 16 ) -var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) +// AgentHarnessAPIBase is the kagent REST prefix for an AgentHarness resource. +func AgentHarnessAPIBase(namespace, name string) string { + return fmt.Sprintf("/api/agentharnesses/%s/%s", namespace, name) +} + +// AgentHarnessGatewayUIPath is the same-origin HTTP/WebSocket path clients use for +// the OpenClaw Control UI (proxied by kagent to the actor pod). 
+func AgentHarnessGatewayUIPath(namespace, name string) string { + return AgentHarnessAPIBase(namespace, name) + "/gateway/" +} + +// AgentHarnessGatewayControlUIBasePath is gateway.controlUi.basePath in openclaw.json +// (no trailing slash; OpenClaw expects a path prefix, not a URL). +func AgentHarnessGatewayControlUIBasePath(namespace, name string) string { + return strings.TrimSuffix(AgentHarnessGatewayUIPath(namespace, name), "/") +} + +func connectionEndpoint(ah *v1alpha2.AgentHarness) string { + if ah == nil { + return "" + } + return AgentHarnessGatewayUIPath(ah.Namespace, ah.Name) +} // ClawBackend implements AsyncBackend for OpenClaw/NemoClaw on Agent Substrate. type ClawBackend struct { @@ -73,7 +96,7 @@ func (b *ClawBackend) EnsureAgentHarness(ctx context.Context, ah *v1alpha2.Agent case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: // already active or waking case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: - actor, err = b.client.ResumeActor(ctx, actorID) + _, err = b.client.ResumeActor(ctx, actorID) if err != nil { return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate ResumeActor %q: %w", actorID, err) } @@ -81,7 +104,7 @@ func (b *ClawBackend) EnsureAgentHarness(ctx context.Context, ah *v1alpha2.Agent // suspending — wait for next reconcile } - endpoint := substrateConnectionEndpoint(ah.Namespace, ah.Name, actor) + endpoint := connectionEndpoint(ah) return sandboxbackend.EnsureResult{ Handle: sandboxbackend.Handle{ID: actorID}, @@ -123,23 +146,13 @@ func (b *ClawBackend) OnAgentHarnessReady(_ context.Context, _ *v1alpha2.AgentHa return nil } -// ActorID returns a stable DNS-1123 actor id for this harness. +// ActorID returns a stable DNS-1123 actor id derived from namespace/name (ahr-). 
func ActorID(ah *v1alpha2.AgentHarness) string { - raw := fmt.Sprintf("%s-%s-%s", actorIDPrefix, ah.Namespace, ah.Name) - raw = strings.ToLower(raw) - raw = strings.ReplaceAll(raw, "_", "-") - if len(raw) > 63 { - raw = raw[:63] - raw = strings.TrimRight(raw, "-") - } - if !dns1123Label.MatchString(raw) { - // fallback: hash-like trim - raw = fmt.Sprintf("%s-%s", actorIDPrefix, ah.UID) - if len(raw) > 63 { - raw = raw[:63] - } + if ah == nil { + return "" } - return raw + sum := sha256.Sum256([]byte(ah.Namespace + "/" + ah.Name)) + return fmt.Sprintf("%s-%s", actorIDPrefix, hex.EncodeToString(sum[:])[:actorIDHashHexLen]) } // ActorHost returns the atenet router Host header value for the actor. @@ -166,17 +179,6 @@ func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { return ah.Namespace, actorTemplateName(ah) } -func substrateConnectionEndpoint(namespace, name string, actor *ateapipb.Actor) string { - gw := fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", namespace, name) - if actor == nil { - return "kagent gateway: " + gw - } - if podIP := strings.TrimSpace(actor.GetAteomPodIp()); podIP != "" { - return fmt.Sprintf("http://%s:80 (pod IP; UI via kagent %s)", podIP, gw) - } - return fmt.Sprintf("kagent gateway: %s (actor status %s)", gw, actor.GetStatus()) -} - func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { runtime := ah.Spec.Runtime if runtime == "" { diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go index fa7c6c8d75..12102cbf1e 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go @@ -1,27 +1,95 @@ package substrate import ( + "regexp" "testing" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/kagent-dev/kagent/go/api/v1alpha2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func TestActorID(t *testing.T) { +// Golden output for ActorID(kagent/my-claw); catches 
accidental algorithm changes. +const actorIDGoldenKagentMyClaw = "ahr-a252e34585581de8" + +func TestAgentHarnessGatewayPaths(t *testing.T) { + t.Parallel() + const ns, name = "kagent", "my-claw" + wantPublic := "/api/agentharnesses/kagent/my-claw/gateway/" + wantControlUI := "/api/agentharnesses/kagent/my-claw/gateway" + + if got := AgentHarnessAPIBase(ns, name); got != "/api/agentharnesses/kagent/my-claw" { + t.Fatalf("APIBase = %q", got) + } + if got := AgentHarnessGatewayUIPath(ns, name); got != wantPublic { + t.Fatalf("GatewayUIPath = %q, want %q", got, wantPublic) + } + if got := AgentHarnessGatewayControlUIBasePath(ns, name); got != wantControlUI { + t.Fatalf("ControlUIBasePath = %q, want %q", got, wantControlUI) + } +} + +func TestConnectionEndpoint(t *testing.T) { + t.Parallel() ah := &v1alpha2.AgentHarness{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "kagent", - Name: "my-claw", - UID: "00000000-0000-0000-0000-000000000001", + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "my-claw"}, + } + want := "/api/agentharnesses/kagent/my-claw/gateway/" + + if got := connectionEndpoint(nil); got != "" { + t.Fatalf("nil harness = %q, want empty", got) + } + if got := connectionEndpoint(ah); got != want { + t.Fatalf("connectionEndpoint = %q, want %q", got, want) + } +} + +func TestGatewayPort(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{GatewayPort: 8080}, }, } + if got := GatewayPort(ah); got != 8080 { + t.Fatalf("GatewayPort = %d, want 8080", got) + } + if got := GatewayPort(nil); got != 80 { + t.Fatalf("GatewayPort(nil) = %d, want 80", got) + } + if got := GatewayPort(&v1alpha2.AgentHarness{}); got != 80 { + t.Fatalf("GatewayPort(empty) = %d, want 80", got) + } +} + +func TestActorID(t *testing.T) { + if ActorID(nil) != "" { + t.Fatalf("nil harness: got %q, want empty", ActorID(nil)) + } + + ah := &v1alpha2.AgentHarness{ + ObjectMeta: 
metav1.ObjectMeta{Namespace: "kagent", Name: "my-claw"}, + } id := ActorID(ah) - if !dns1123Label.MatchString(id) { - t.Fatalf("ActorID %q is not DNS-1123", id) + if id != actorIDGoldenKagentMyClaw { + t.Fatalf("ActorID = %q, want golden %q", id, actorIDGoldenKagentMyClaw) + } + if ActorID(ah) != id { + t.Fatal("expected stable id across calls") + } + + other := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "other-claw"}, + } + if ActorID(other) == id { + t.Fatalf("different harnesses should not share actor id %q", id) + } + + const wantLen = len(actorIDPrefix) + 1 + actorIDHashHexLen + if len(id) != wantLen { + t.Fatalf("id length = %d, want %d (%q)", len(id), wantLen, id) } - if id == "" { - t.Fatal("expected non-empty actor id") + if !regexp.MustCompile(`^ahr-[0-9a-f]{16}$`).MatchString(id) { + t.Fatalf("id %q does not match ahr- form", id) } } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go index c1ae943125..dcee1f5c14 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go @@ -42,7 +42,7 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen { Name: defaultOpenClawContainer, Image: workloadImage, - Ports: []corev1.ContainerPort{{ContainerPort: 80}}, + Ports: []corev1.ContainerPort{{ContainerPort: GatewayPort(ah)}}, Command: []string{ "/bin/sh", "-c", diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index 96927611b7..66c810e6c9 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -15,8 +15,6 @@ import ( corev1 "k8s.io/api/core/v1" ) -const defaultSubstrateOpenClawGatewayPort = 80 - //go:embed templates/openclaw_startup.sh.tmpl var 
openClawStartupScriptTmplContent string @@ -41,7 +39,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if err != nil { return "", nil, fmt.Errorf("resolve gateway token: %w", err) } - gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort, openClawControlUIBasePath(ah)) + gw := openclaw.SubstrateGatewayBootstrap(token, int(GatewayPort(ah)), AgentHarnessGatewayControlUIBasePath(ah.Namespace, ah.Name)) var jsonBytes []byte var containerEnv []corev1.EnvVar @@ -74,13 +72,6 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha return script, containerEnv, nil } -func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { - if ah == nil { - return "" - } - return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" -} - func openClawStartupScript(jsonBytes []byte, gwPort int) (string, error) { var buf bytes.Buffer if err := openClawStartupScriptTmpl.Execute(&buf, openClawStartupScriptData{ diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index 95e58211e7..87e1379a24 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -17,6 +17,30 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" ) +func TestBuildOpenClawActorStartup_CustomGatewayPort(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "some-token", + GatewayPort: 8080, + }, + }, + } + + p := &Provisioner{Client: fake.NewClientBuilder().WithScheme(scheme).Build()} + script, _, err := 
p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + require.Contains(t, script, "openclaw gateway run --port 8080") + require.Contains(t, script, "http://127.0.0.1:8080/") +} + func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go index 87a526b8bd..2344b514d4 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -15,11 +15,23 @@ const ( AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" - defaultWorkerPoolReplicas = int32(1) - defaultSnapshotsBucket = "ate-snapshots" - defaultOpenClawContainer = "openclaw" + defaultWorkerPoolReplicas = int32(1) + defaultSnapshotsBucket = "ate-snapshots" + defaultOpenClawContainer = "openclaw" + defaultSubstrateGatewayPort int32 = 80 ) +// GatewayPort returns spec.substrate.gatewayPort, defaulting to 80 when unset. +func GatewayPort(ah *v1alpha2.AgentHarness) int32 { + if ah == nil || ah.Spec.Substrate == nil { + return defaultSubstrateGatewayPort + } + if p := ah.Spec.Substrate.GatewayPort; p > 0 { + return p + } + return defaultSubstrateGatewayPort +} + // ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. type ProvisionDefaults struct { PauseImage string diff --git a/ui/src/components/agent-form/OpenClawSandboxFields.tsx b/ui/src/components/agent-form/OpenClawSandboxFields.tsx index 54e7cc6cf4..c2f60be124 100644 --- a/ui/src/components/agent-form/OpenClawSandboxFields.tsx +++ b/ui/src/components/agent-form/OpenClawSandboxFields.tsx @@ -184,6 +184,63 @@ export function OpenClawSandboxFields({ {value.runtime === "substrate" ? (
+ {section === "substrate" ? validationError?.message : null} + + Gateway token + +

+ Bearer token for the OpenClaw gateway inside the actor. Required for Substrate harnesses. +

+
+ {value.substrateGatewayTokenSource === "inline" ? ( + + Gateway token value + set({ substrateGatewayToken: e.target.value })} + /> + + ) : ( +
+ + Secret name + set({ substrateGatewaySecretName: e.target.value })} + /> + + + Secret key + set({ substrateGatewaySecretKey: e.target.value })} + /> +

Must exist in the harness namespace (default key: token).

+
+
+ )} Snapshot location (GCS) { expect(draft.spec.backend).toBe("openclaw"); }); + it("requires substrate gateway token in validation", () => { + const openClaw = { + ...defaultOpenClawSandboxFormSlice(), + runtime: "substrate" as const, + substrateGatewayToken: "", + }; + const result = validateOpenClawSandboxForm({ + openClaw, + modelRef: "ns/m1", + }); + expect(result?.section).toBe("substrate"); + expect(result?.message).toContain("gateway token"); + }); + + it("writes spec.substrate.gatewayToken for inline token", () => { + const openClaw = { + ...defaultOpenClawSandboxFormSlice(), + runtime: "substrate" as const, + substrateGatewayTokenSource: "inline" as const, + substrateGatewayToken: "gw-secret-token", + }; + const draft = buildSandboxCRDraft({ + name: "h1", + namespace: "kagent", + description: "", + modelRef: "m1", + openClaw, + }); + expect("error" in draft).toBe(false); + if ("error" in draft) return; + expect(draft.spec.runtime).toBe("substrate"); + const substrate = draft.spec.substrate as Record; + expect(substrate.gatewayToken).toBe("gw-secret-token"); + expect(substrate).not.toHaveProperty("gatewayTokenSecretRef"); + expect(substrate.snapshotsConfig).toEqual({ location: "gs://ate-snapshots/kagent/" }); + }); + + it("writes spec.substrate.gatewayTokenSecretRef for secret token", () => { + const openClaw = { + ...defaultOpenClawSandboxFormSlice(), + runtime: "substrate" as const, + substrateGatewayTokenSource: "secret" as const, + substrateGatewaySecretName: "openclaw-token", + substrateGatewaySecretKey: "token", + }; + const draft = buildSandboxCRDraft({ + name: "h1", + namespace: "kagent", + description: "", + modelRef: "m1", + openClaw, + }); + expect("error" in draft).toBe(false); + if ("error" in draft) return; + const substrate = draft.spec.substrate as Record; + expect(substrate.gatewayTokenSecretRef).toEqual({ name: "openclaw-token" }); + expect(substrate).not.toHaveProperty("gatewayToken"); + }); + it("writes Hermes slack allowedUserIDs and 
home channel fields", () => { const row = newOpenClawChannelRow(); row.name = "slack-main"; diff --git a/ui/src/lib/openClawSandboxForm.ts b/ui/src/lib/openClawSandboxForm.ts index 46608384f1..e16283eeb2 100644 --- a/ui/src/lib/openClawSandboxForm.ts +++ b/ui/src/lib/openClawSandboxForm.ts @@ -77,6 +77,12 @@ export interface OpenClawSandboxFormSlice { substrateWorkerPoolReplicas: string; /** GCS snapshot prefix (gs://bucket/path/) — required for auto-provisioned templates. */ substrateSnapshotsLocation: string; + /** OpenClaw gateway Bearer token (inline or Secret in harness namespace). Required when runtime is substrate. */ + substrateGatewayTokenSource: "inline" | "secret"; + substrateGatewayToken: string; + substrateGatewaySecretName: string; + /** Secret data key; controller expects "token" (see GatewayTokenSecretKey). */ + substrateGatewaySecretKey: string; /** Optional override for Sandbox.spec.image (OpenShell VM template image). Empty → controller default. */ image: string; channels: OpenClawChannelRow[]; @@ -97,6 +103,10 @@ export function defaultOpenClawSandboxFormSlice(): OpenClawSandboxFormSlice { substrateWorkerPoolRefName: "", substrateWorkerPoolReplicas: "2", substrateSnapshotsLocation: "gs://ate-snapshots/kagent/", + substrateGatewayTokenSource: "inline", + substrateGatewayToken: "", + substrateGatewaySecretName: "", + substrateGatewaySecretKey: "token", image: "", channels: [], allowedDomains: "", @@ -151,7 +161,7 @@ export function parseAllowedDomainsList(raw: string): string[] { } /** Where to show a harness OpenClaw validation message and which element to focus. 
*/ -export type OpenClawSandboxSectionErrorKind = "allowedDomains" | "channels" | "general"; +export type OpenClawSandboxSectionErrorKind = "allowedDomains" | "channels" | "substrate" | "general"; export interface OpenClawSandboxFormValidationError { message: string; @@ -199,6 +209,22 @@ export function validateOpenClawSandboxForm(args: { return openClawValidationFail("general", "Please select a model config for this sandbox."); } + if (args.openClaw.runtime === "substrate") { + const gw = substrateGatewayTokenForDraft(args.openClaw); + if ("error" in gw) { + return openClawValidationFail("substrate", gw.error); + } + const snapshots = args.openClaw.substrateSnapshotsLocation?.trim(); + if (!snapshots) { + return openClawValidationFail("substrate", "Substrate snapshots location (gs://…) is required."); + } + if (args.openClaw.substrateWorkerPoolMode === "existing") { + if (!args.openClaw.substrateWorkerPoolRefName?.trim()) { + return openClawValidationFail("substrate", "WorkerPool name is required when using an existing pool."); + } + } + } + for (const entry of trimSplitList(args.openClaw.allowedDomains)) { if (!isPlausibleAllowedDomainHost(entry)) { return openClawValidationFail( @@ -277,6 +303,30 @@ export interface SandboxCRDraft { spec: Record; } +/** Maps form fields to exactly one of spec.substrate.gatewayToken | gatewayTokenSecretRef. 
*/ +function substrateGatewayTokenForDraft( + openClaw: OpenClawSandboxFormSlice, +): { gatewayToken: string } | { gatewayTokenSecretRef: { name: string } } | { error: string } { + const secretKey = openClaw.substrateGatewaySecretKey.trim() || "token"; + const cred = credentialFromRow( + openClaw.substrateGatewayTokenSource, + openClaw.substrateGatewayToken, + openClaw.substrateGatewaySecretName, + secretKey, + "OpenClaw gateway token", + ); + if ("error" in cred) { + return { error: cred.error }; + } + if (cred.value !== undefined) { + return { gatewayToken: cred.value }; + } + if (cred.valueFrom?.name) { + return { gatewayTokenSecretRef: { name: cred.valueFrom.name } }; + } + return { error: "OpenClaw gateway token: inline token or secret name and key are required" }; +} + function modelConfigRefForSandbox(agentNamespace: string, modelRef: string): string { const t = modelRef.trim(); if (!t) { @@ -391,8 +441,13 @@ export function buildSandboxCRDraft(args: { if (!snapshots) { return { error: "Substrate snapshots location (gs://…) is required." 
}; } + const gw = substrateGatewayTokenForDraft(args.openClaw); + if ("error" in gw) { + return { error: gw.error }; + } const substrate: Record = { snapshotsConfig: { location: snapshots }, + ...gw, }; if (args.openClaw.substrateWorkerPoolMode === "existing") { const wpName = args.openClaw.substrateWorkerPoolRefName?.trim(); diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index 7f50f04e5b..8bc5dd16a5 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -436,7 +436,6 @@ export interface SubstrateAgentHarnessListEntry { gatewayUIPath?: string; modelConfigRef?: string; backendRefId?: string; - endpoint?: string; } export interface AgentResponse { From a161e7e8e2c9d4cd19d0b4ad0407f419d982e1fd Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 10:37:13 -0700 Subject: [PATCH 12/32] go mod tidy Signed-off-by: Peter Jausovec --- go/go.mod | 2 -- go/go.sum | 4 ---- 2 files changed, 6 deletions(-) diff --git a/go/go.mod b/go/go.mod index b2d0f7fa2b..befbd00285 100644 --- a/go/go.mod +++ b/go/go.mod @@ -64,7 +64,6 @@ require ( github.com/agent-substrate/substrate v0.0.0 github.com/aws/aws-sdk-go-v2 v1.41.7 github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.50.6 - github.com/golang/protobuf v1.5.4 github.com/google/jsonschema-go v0.4.3 github.com/jackc/pgx/v5 v5.9.2 github.com/ollama/ollama v0.24.0 @@ -304,7 +303,6 @@ require ( github.com/moby/moby/api v1.54.1 // indirect github.com/moby/moby/client v0.4.0 // indirect github.com/moby/patternmatcher v0.6.1 // indirect - github.com/moby/spdystream v0.5.1 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect diff --git a/go/go.sum b/go/go.sum index 1deb8cbab3..4b9b3d89c1 100644 --- a/go/go.sum +++ b/go/go.sum @@ -94,8 +94,6 @@ github.com/anthropics/anthropic-sdk-go v1.43.0 h1:ShY3C7lafzHP0ze1dCxL3ZFZzvkGfX github.com/anthropics/anthropic-sdk-go v1.43.0/go.mod 
h1:5cEaslQ6A9ajdL5YUvhNW57LKxEz0OAZ7WEzgZWLD7k= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/ashanbrown/forbidigo/v2 v2.3.1 h1:KAZijvQ7zeIBKbhikT4jCm0TLYXC4u78bTiLh/8JROI= github.com/ashanbrown/forbidigo/v2 v2.3.1/go.mod h1:2QDkLTzU6TV937eFROamXrW92M3paehdae4HCDCOZCM= github.com/ashanbrown/makezero/v2 v2.2.1 h1:A7uU8dgB1PA9aelTxHMfHIQ8Qev8AB3JLxJUBUsejqM= @@ -603,8 +601,6 @@ github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjI github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= -github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= -github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= From 520cd9b77ebd1ed395367eed4c06c8eb73c326e6 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Sat, 23 May 2026 08:33:49 -0700 Subject: [PATCH 13/32] substrate wip Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 77 ++++ .../crd/bases/kagent.dev_agentharnesses.yaml | 141 +++++++ go/api/httpapi/types.go | 14 +- go/api/v1alpha2/agentharness_types.go | 99 +++++ go/api/v1alpha2/zz_generated.deepcopy.go | 93 +++++ 
.../controller/agentharness_controller.go | 117 +++++- .../handlers/agentharness_gateway.go | 365 ++++++++++++++++++ .../handlers/agentharness_gateway_rewrite.go | 235 +++++++++++ .../agentharness_gateway_rewrite_test.go | 165 ++++++++ .../handlers/agentharness_gateway_test.go | 132 +++++++ .../internal/httpserver/handlers/agents.go | 52 ++- .../internal/httpserver/handlers/handlers.go | 17 +- go/core/internal/httpserver/middleware.go | 17 +- go/core/internal/httpserver/server.go | 45 ++- go/core/pkg/app/app.go | 180 ++++++++- .../pkg/sandboxbackend/openshell/openclaw.go | 2 +- .../openshell/openclaw/bootstrap.go | 147 +++++-- .../openclaw/bootstrap_substrate_test.go | 21 + .../openshell/openclaw/bootstrap_test.go | 40 +- .../openshell/openclaw/constants.go | 7 +- .../openshell/openclaw/provider.go | 11 +- .../openshell/openclaw/types.go | 21 +- .../pkg/sandboxbackend/substrate/client.go | 114 ++++++ .../pkg/sandboxbackend/substrate/config.go | 22 ++ .../sandboxbackend/substrate/delete_actor.go | 127 ++++++ .../substrate/delete_actor_test.go | 18 + .../substrate/delete_provision.go | 109 ++++++ .../substrate/delete_provision_test.go | 61 +++ .../pkg/sandboxbackend/substrate/openclaw.go | 231 +++++++++++ .../sandboxbackend/substrate/openclaw_test.go | 53 +++ .../pkg/sandboxbackend/substrate/provision.go | 301 +++++++++++++++ .../substrate/provision_openclaw.go | 88 +++++ .../substrate/provision_openclaw_test.go | 153 ++++++++ .../substrate/provision_test.go | 47 +++ go/go.mod | 38 +- go/go.sum | 88 +++-- .../templates/kagent.dev_agentharnesses.yaml | 141 +++++++ .../templates/controller-deployment.yaml | 26 ++ helm/kagent/templates/rbac/getter-role.yaml | 19 + helm/kagent/values.yaml | 28 +- ui/next.config.ts | 15 + .../app/openshell/OpenshellTerminalPage.tsx | 42 +- ui/src/components/AgentCard.tsx | 37 +- ui/src/components/AgentListView.tsx | 43 ++- .../agent-form/OpenClawSandboxFields.tsx | 91 +++++ ui/src/lib/agentHarness.ts | 23 +- 
ui/src/lib/openClawSandboxForm.ts | 46 +++ ui/src/lib/openshellSandboxAgents.ts | 8 + ui/src/types/index.ts | 13 + 49 files changed, 3779 insertions(+), 201 deletions(-) create mode 100644 examples/substrate-openclaw/README.md create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_test.go create mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/client.go create mode 100644 go/core/pkg/sandboxbackend/substrate/config.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_actor.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_actor_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_provision.go create mode 100644 go/core/pkg/sandboxbackend/substrate/delete_provision_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/openclaw.go create mode 100644 go/core/pkg/sandboxbackend/substrate/openclaw_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_openclaw.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_test.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md new file mode 100644 index 0000000000..1b27550895 --- /dev/null +++ b/examples/substrate-openclaw/README.md @@ -0,0 +1,77 @@ +# OpenClaw on Agent Substrate + +## 1. Install Substrate on your Kind cluster + +Uses cluster `kind` (`KIND_CLUSTER_NAME=kind`; or set `KUBECONFIG` / context accordingly). 
+ +```bash +cd substrate + +./hack/create-kind-cluster.sh +./hack/install-ate-kind.sh --deploy-ate-system +``` + +`--deploy-ate-system` installs the **control plane only** (ate-api, ate-controller, atelet, atenet, …). Your registry catalog will show `ateapi-*`, `atelet-*`, etc., but **not** ateom until you build it. + +Build and push **ateom-gvisor** (required for kagent `workerPool.ateomImage`): + +```bash +# build the ateom-gvisor image from the substrate folder +export KO_DOCKER_REPO=localhost:5001 +export KO_DEFAULTPLATFORMS=linux/$(go env GOARCH) +./hack/ko.sh build -B ./cmd/servers/ateom-gvisor +``` + +## 2. Load nemoclaw image + +The image is a multi-arch manifest list. On Apple Silicon, `kind load docker-image` often fails with `content digest ... not found` because Docker only has the local arch locally while kind imports with `--all-platforms`. Use `docker save` + `ctr import` instead (match `--name` to your cluster, e.g. `agent` for context `kind-agent`): + +```bash +docker pull --platform linux/arm64 ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 +docker save ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 | \ + docker exec -i kind-control-plane ctr --namespace=k8s.io images import - +``` + +On amd64 hosts, use `--platform linux/amd64` in the pull step. + +## kagent AgentHarness with substrate runtime + +kagent **auto-provisions** a per-harness `ActorTemplate` (and optionally a `WorkerPool`). 
+ +Install kagent (Substrate must already be running in the cluster): + +```bash +export KIND_CLUSTER_NAME=kind +make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true" +``` + +Create a harness with only what you must choose: + +- **`snapshotsConfig.location`** — GCS `gs://` prefix (Substrate snapshots are GCS-only today) +- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool` + **`ateomImage`**) +- **`workerPool.ateomImage`** — (`localhost:5001/ateom-gvisor:latest`) + +```yaml +apiVersion: kagent.dev/v1alpha2 +kind: AgentHarness +metadata: + name: peterj-claw + namespace: kagent +spec: + runtime: substrate + backend: openclaw + description: OpenClaw on Agent Substrate + modelConfigRef: default-model-config + substrate: + snapshotsConfig: + location: gs://ate-snapshots/kagent/kagent/my-claw/ + workerPool: + replicas: 1 + ateomImage: localhost:5001/ateom-gvisor:latest + # Optional: adopt existing resources instead of auto-create + # workerPoolRef: + # name: my-pool + # namespace: ate-system +``` + +Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as a gateway token to OpenClaw. diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 308d7ba0f2..f82ff2a1d4 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -19,6 +19,9 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string - jsonPath: .spec.backend name: Backend type: string @@ -511,6 +514,106 @@ spec: type: string type: array type: object + runtime: + default: openshell + description: Runtime selects the harness provisioning stack. Defaults + to openshell when unset. 
+ enum: + - openshell + - substrate + type: string + substrate: + description: Substrate is required when runtime is substrate. + properties: + actorTemplateRef: + description: |- + ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + gatewayPort: + default: 80 + description: GatewayPort is the port OpenClaw listens on inside + the actor (Substrate routes to :80 today). + format: int32 + type: integer + gatewayTokenSecretRef: + description: |- + GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + When unset, the controller falls back to --substrate-gateway-token(-file). + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + snapshotsConfig: + description: SnapshotsConfig is required for auto-provisioned + templates (GCS gs:// location). + properties: + location: + description: |- + Location is the GCS URI prefix for golden and incremental snapshots. + Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + type: string + required: + - location + type: object + workerPool: + description: WorkerPool creates a dedicated WorkerPool in the + harness namespace when workerPoolRef is unset. + properties: + ateomImage: + description: |- + AteomImage is the ateom herder image (pullable registry ref, not ko://). + Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + type: string + replicas: + default: 2 + description: Replicas is the number of ateom worker pods. + Defaults to 2 when unset or zero. 
+ format: int32 + type: integer + type: object + workerPoolRef: + description: |- + WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + Mutually exclusive with workerPool. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workloadImage: + description: WorkloadImage overrides the default nemoclaw/openclaw + sandbox image in the ActorTemplate. + type: string + required: + - snapshotsConfig + type: object required: - backend type: object @@ -612,6 +715,44 @@ spec: observedGeneration: format: int64 type: integer + substrate: + description: Substrate records auto-provisioned Substrate CR references. + properties: + actorTemplateReady: + description: ActorTemplateReady is true when the template phase + is Ready (golden snapshot taken). + type: boolean + actorTemplateRef: + description: ActorTemplateRef is the ActorTemplate used when creating + the actor. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workerPoolRef: + description: WorkerPoolRef is the WorkerPool used by the harness + ActorTemplate. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + type: object type: object type: object served: true diff --git a/go/api/httpapi/types.go b/go/api/httpapi/types.go index ec80d49ea1..0107e5ffe0 100644 --- a/go/api/httpapi/types.go +++ b/go/api/httpapi/types.go @@ -144,6 +144,17 @@ type OpenshellAgentHarnessListEntry struct { Endpoint string `json:"endpoint,omitempty"` } +// SubstrateAgentHarnessListEntry is set when runtime is substrate. 
+type SubstrateAgentHarnessListEntry struct { + Backend v1alpha2.AgentHarnessBackendType `json:"backend"` + Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` + ActorID string `json:"actorId,omitempty"` + GatewayUIPath string `json:"gatewayUIPath,omitempty"` + ModelConfigRef string `json:"modelConfigRef,omitempty"` + BackendRefID string `json:"backendRefId,omitempty"` + Endpoint string `json:"endpoint,omitempty"` +} + type AgentResponse struct { ID string `json:"id"` Agent *AgentResource `json:"agent"` @@ -156,7 +167,8 @@ type AgentResponse struct { DeploymentReady bool `json:"deploymentReady"` Accepted bool `json:"accepted"` WorkloadMode v1alpha2.WorkloadMode `json:"workloadMode,omitempty"` - OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` } // Session types diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index c6a43f6c02..26c0109069 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -37,6 +37,76 @@ func IsKnownAgentHarnessBackend(b AgentHarnessBackendType) bool { } } +// AgentHarnessRuntime selects which control plane provisions the harness VM. +// +kubebuilder:validation:Enum=openshell;substrate +type AgentHarnessRuntime string + +const ( + AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" + AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" +) + +// AgentHarnessSubstrateSnapshotsConfig points at a GCS prefix for actor memory snapshots. +// Substrate currently expects a gs:// location (see Agent Substrate SnapshotsConfig). +type AgentHarnessSubstrateSnapshotsConfig struct { + // Location is the GCS URI prefix for golden and incremental snapshots. 
+ // Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + // +required + Location string `json:"location"` +} + +// AgentHarnessSubstrateWorkerPoolSpec creates a dedicated WorkerPool for this harness. +// Mutually exclusive with workerPoolRef. +type AgentHarnessSubstrateWorkerPoolSpec struct { + // Replicas is the number of ateom worker pods. Defaults to 2 when unset or zero. + // +optional + // +kubebuilder:default=2 + Replicas int32 `json:"replicas,omitempty"` + + // AteomImage is the ateom herder image (pullable registry ref, not ko://). + // Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + // +optional + AteomImage string `json:"ateomImage,omitempty"` +} + +// AgentHarnessSubstrateSpec configures Agent Substrate (WorkerPool + ActorTemplate + Actor). +// +// By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). +// Set actorTemplateRef only to adopt an existing template (advanced / legacy). +type AgentHarnessSubstrateSpec struct { + // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + // Mutually exclusive with workerPool. + // +optional + WorkerPoolRef *TypedReference `json:"workerPoolRef,omitempty"` + + // WorkerPool creates a dedicated WorkerPool in the harness namespace when workerPoolRef is unset. + // +optional + WorkerPool *AgentHarnessSubstrateWorkerPoolSpec `json:"workerPool,omitempty"` + + // SnapshotsConfig is required for auto-provisioned templates (GCS gs:// location). + // +required + SnapshotsConfig AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig"` + + // WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. + // +optional + WorkloadImage string `json:"workloadImage,omitempty"` + + // ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + // When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. 
+ // +optional + ActorTemplateRef *TypedReference `json:"actorTemplateRef,omitempty"` + + // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). + // +optional + // +kubebuilder:default=80 + GatewayPort int32 `json:"gatewayPort,omitempty"` + + // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + // When unset, the controller falls back to --substrate-gateway-token(-file). + // +optional + GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` +} + // AgentHarnessChannelType selects a messenger integration for OpenClaw harness VMs. // +kubebuilder:validation:Enum=telegram;slack type AgentHarnessChannelType string @@ -158,6 +228,15 @@ type AgentHarnessSpec struct { // +required Backend AgentHarnessBackendType `json:"backend"` + // Runtime selects the harness provisioning stack. Defaults to openshell when unset. + // +optional + // +kubebuilder:default=openshell + Runtime AgentHarnessRuntime `json:"runtime,omitempty"` + + // Substrate is required when runtime is substrate. + // +optional + Substrate *AgentHarnessSubstrateSpec `json:"substrate,omitempty"` + // Description is a short human-readable summary shown in the UI (e.g. agents list). // +optional Description string `json:"description,omitempty"` @@ -230,6 +309,25 @@ type AgentHarnessStatus struct { // Connection is populated by the controller when the harness is ready. // +optional Connection *AgentHarnessConnection `json:"connection,omitempty"` + + // Substrate records auto-provisioned Substrate CR references. + // +optional + Substrate *AgentHarnessSubstrateStatus `json:"substrate,omitempty"` +} + +// AgentHarnessSubstrateStatus is observed Substrate control-plane state for this harness. +type AgentHarnessSubstrateStatus struct { + // WorkerPoolRef is the WorkerPool used by the harness ActorTemplate. 
+ // +optional + WorkerPoolRef TypedReference `json:"workerPoolRef,omitempty"` + + // ActorTemplateRef is the ActorTemplate used when creating the actor. + // +optional + ActorTemplateRef TypedReference `json:"actorTemplateRef,omitempty"` + + // ActorTemplateReady is true when the template phase is Ready (golden snapshot taken). + // +optional + ActorTemplateReady bool `json:"actorTemplateReady,omitempty"` } // AgentHarnessConditionType enumerates the condition types an AgentHarness may report. @@ -241,6 +339,7 @@ const ( // +kubebuilder:object:root=true // +kubebuilder:resource:path=agentharnesses,singular=agentharness,shortName=ahr,categories=kagent // +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Runtime",type="string",JSONPath=".spec.runtime" // +kubebuilder:printcolumn:name="Backend",type="string",JSONPath=".spec.backend" // +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status" // +kubebuilder:printcolumn:name="ID",type="string",JSONPath=".status.backendRef.id" diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 52d10ed714..dd1b350ccb 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -295,6 +295,11 @@ func (in *AgentHarnessSlackChannelSpec) DeepCopy() *AgentHarnessSlackChannelSpec // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AgentHarnessSpec) DeepCopyInto(out *AgentHarnessSpec) { *out = *in + if in.Substrate != nil { + in, out := &in.Substrate, &out.Substrate + *out = new(AgentHarnessSubstrateSpec) + (*in).DeepCopyInto(*out) + } if in.Env != nil { in, out := &in.Env, &out.Env *out = make([]v1.EnvVar, len(*in)) @@ -346,6 +351,11 @@ func (in *AgentHarnessStatus) DeepCopyInto(out *AgentHarnessStatus) { *out = new(AgentHarnessConnection) **out = **in } + if in.Substrate != nil { + in, out := &in.Substrate, &out.Substrate + *out = new(AgentHarnessSubstrateStatus) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessStatus. @@ -373,6 +383,89 @@ func (in *AgentHarnessStatusRef) DeepCopy() *AgentHarnessStatusRef { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentHarnessSubstrateSnapshotsConfig) DeepCopyInto(out *AgentHarnessSubstrateSnapshotsConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateSnapshotsConfig. +func (in *AgentHarnessSubstrateSnapshotsConfig) DeepCopy() *AgentHarnessSubstrateSnapshotsConfig { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateSnapshotsConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec) { + *out = *in + if in.WorkerPoolRef != nil { + in, out := &in.WorkerPoolRef, &out.WorkerPoolRef + *out = new(TypedReference) + **out = **in + } + if in.WorkerPool != nil { + in, out := &in.WorkerPool, &out.WorkerPool + *out = new(AgentHarnessSubstrateWorkerPoolSpec) + **out = **in + } + out.SnapshotsConfig = in.SnapshotsConfig + if in.ActorTemplateRef != nil { + in, out := &in.ActorTemplateRef, &out.ActorTemplateRef + *out = new(TypedReference) + **out = **in + } + if in.GatewayTokenSecretRef != nil { + in, out := &in.GatewayTokenSecretRef, &out.GatewayTokenSecretRef + *out = new(TypedReference) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateSpec. +func (in *AgentHarnessSubstrateSpec) DeepCopy() *AgentHarnessSubstrateSpec { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentHarnessSubstrateStatus) DeepCopyInto(out *AgentHarnessSubstrateStatus) { + *out = *in + out.WorkerPoolRef = in.WorkerPoolRef + out.ActorTemplateRef = in.ActorTemplateRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateStatus. +func (in *AgentHarnessSubstrateStatus) DeepCopy() *AgentHarnessSubstrateStatus { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopyInto(out *AgentHarnessSubstrateWorkerPoolSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateWorkerPoolSpec. +func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopy() *AgentHarnessSubstrateWorkerPoolSpec { + if in == nil { + return nil + } + out := new(AgentHarnessSubstrateWorkerPoolSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AgentHarnessTelegramChannelSpec) DeepCopyInto(out *AgentHarnessTelegramChannelSpec) { *out = *in diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index a73d7a7835..94b857d9ef 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -28,6 +28,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) const ( @@ -49,14 +50,38 @@ const ( // harness VMs are a generic exec/SSH-able environment with no in-cluster // workload owned by kagent. 
type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - Backends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + Client client.Client + Recorder events.EventRecorder + OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateProvisioner *substrate.Provisioner +} + +func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + switch runtime { + case v1alpha2.AgentHarnessRuntimeSubstrate: + if r.SubstrateBackends == nil { + return nil + } + return r.SubstrateBackends[ah.Spec.Backend] + default: + if r.OpenshellBackends == nil { + return nil + } + return r.OpenshellBackends[ah.Spec.Backend] + } } // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/finalizers,verbs=update +// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) @@ -80,11 +105,15 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{Requeue: true}, nil } - backend := r.Backends[ah.Spec.Backend] + backend := r.backendFor(&ah) if backend == nil { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = 
v1alpha2.AgentHarnessRuntimeOpenshell + } setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, "BackendUnavailable", - fmt.Sprintf("no backend configured for %q", ah.Spec.Backend)) + fmt.Sprintf("no %s backend configured for %q", runtime, ah.Spec.Backend)) setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "BackendUnavailable", "") if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { @@ -93,6 +122,59 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, nil } + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + provRes, err := r.SubstrateProvisioner.Ensure(ctx, &ah) + if err != nil { + log.Error(err, "substrate provision failed") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "SubstrateProvisionFailed", err.Error()) + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "SubstrateProvisionFailed", "") + if perr := r.patchAgentHarnessStatus(ctx, &ah); perr != nil { + return ctrl.Result{}, perr + } + return ctrl.Result{}, err + } + if ah.Status.Substrate == nil { + ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + } + if provRes.WorkerPoolRef.Name != "" { + ah.Status.Substrate.WorkerPoolRef = v1alpha2.TypedReference{ + Name: provRes.WorkerPoolRef.Name, + Namespace: provRes.WorkerPoolRef.Namespace, + } + } + ah.Status.Substrate.ActorTemplateRef = v1alpha2.TypedReference{ + Name: provRes.ActorTemplateRef.Name, + Namespace: provRes.ActorTemplateRef.Namespace, + } + ah.Status.Substrate.ActorTemplateReady = provRes.ActorTemplateReady + // Persist status before metadata annotation patch (client Patch can refresh ah and drop in-memory status). 
+ if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + if err := r.patchAgentHarnessProvisionAnnotations(ctx, &ah, provRes); err != nil { + return ctrl.Result{}, err + } + if !provRes.ActorTemplateReady { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, + "SubstrateProvisioning", "waiting for ActorTemplate golden snapshot") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "ActorTemplateNotReady", "ActorTemplate is not Ready yet") + if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { + return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate provision: %w", err) + } + } + res, err := backend.EnsureAgentHarness(ctx, &ah) if err != nil { log.Error(err, "EnsureAgentHarness failed") @@ -192,7 +274,7 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } if ah.Status.BackendRef != nil && ah.Status.BackendRef.ID != "" { - del := r.Backends[ah.Status.BackendRef.Backend] + del := r.backendFor(ah) if del != nil { if err := del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: ah.Status.BackendRef.ID}); err != nil { if r.Recorder != nil { @@ -203,6 +285,12 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } } + if r.SubstrateProvisioner != nil { + if err := r.SubstrateProvisioner.Delete(ctx, ah); err != nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("delete substrate resources: %w", err) + } + } + controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) if err := r.Client.Update(ctx, ah); err != nil { return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) @@ -217,6 +305,23 @@ func (r *AgentHarnessController) patchAgentHarnessStatus(ctx 
context.Context, ah return nil } +func (r *AgentHarnessController) patchAgentHarnessProvisionAnnotations(ctx context.Context, ah *v1alpha2.AgentHarness, prov substrate.EnsureResult) error { + base := ah.DeepCopy() + if ah.Annotations == nil { + ah.Annotations = map[string]string{} + } + if prov.ManagedWorkerPool { + ah.Annotations[substrate.AnnotationManagedWorkerPool] = "true" + } + if prov.ManagedActorTemplate { + ah.Annotations[substrate.AnnotationManagedActorTemplate] = "true" + } + if err := r.Client.Patch(ctx, ah, client.MergeFrom(base)); err != nil { + return fmt.Errorf("patch AgentHarness substrate annotations: %w", err) + } + return nil +} + func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { now := metav1.Now() for i := range ah.Status.Conditions { diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go new file mode 100644 index 0000000000..453ec5d907 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -0,0 +1,365 @@ +package handlers + +import ( + "bytes" + "compress/gzip" + "context" + "fmt" + "io" + "net" + "net/http" + "net/http/httputil" + "net/url" + "os" + "strings" + "time" + + "github.com/gorilla/mux" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" +) + +const ( + substrateGatewayTokenSecretKey = "token" + // OpenClaw 2026.3.28+ returns 403 without operator scopes on HTTP/WS when only Bearer token is sent. + openclawDefaultOperatorScopes = "operator.admin" + // Origin OpenClaw accepts by default for bind=lan port=80 (localhost/127.0.0.1 on gateway port). 
+ openclawLoopbackOrigin = "http://127.0.0.1:80" +) + +// AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. +// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). +type AgentHarnessGatewayConfig struct { + GatewayToken string + GatewayTokenFile string + AteAPIEndpoint string + AteAPIInsecure bool + DialTimeout time.Duration + CallTimeout time.Duration +} + +func (c *AgentHarnessGatewayConfig) resolveToken() (string, error) { + if c == nil { + return "", nil + } + if c.GatewayTokenFile != "" { + data, err := os.ReadFile(c.GatewayTokenFile) + if err != nil { + return "", fmt.Errorf("read substrate gateway token file: %w", err) + } + return strings.TrimSpace(string(data)), nil + } + return strings.TrimSpace(c.GatewayToken), nil +} + +// HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway (pod IP when available). +func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Request) { + log := ctrllog.FromContext(r.Context()).WithName("agentharness-gateway") + if h.AgentHarnessGateway == nil { + http.Error(w, "substrate gateway proxy is not configured", http.StatusServiceUnavailable) + return + } + + vars := mux.Vars(r) + namespace := strings.TrimSpace(vars["namespace"]) + name := strings.TrimSpace(vars["name"]) + if namespace == "" || name == "" { + http.Error(w, "namespace and name are required", http.StatusBadRequest) + return + } + + var ah v1alpha2.AgentHarness + if err := h.KubeClient.Get(r.Context(), types.NamespacedName{Namespace: namespace, Name: name}, &ah); err != nil { + if apierrors.IsNotFound(err) { + http.Error(w, "AgentHarness not found", http.StatusNotFound) + return + } + log.Error(err, "get AgentHarness") + http.Error(w, "failed to load AgentHarness", http.StatusInternalServerError) + return + } + + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime != 
v1alpha2.AgentHarnessRuntimeSubstrate { + http.Error(w, "gateway proxy is only available for runtime=substrate", http.StatusBadRequest) + return + } + if ah.Status.BackendRef == nil || ah.Status.BackendRef.ID == "" { + http.Error(w, "harness has no substrate actor yet", http.StatusServiceUnavailable) + return + } + + token, err := h.resolveHarnessGatewayToken(r.Context(), &ah) + if err != nil { + log.Error(err, "resolve gateway token") + http.Error(w, "gateway token not configured", http.StatusInternalServerError) + return + } + + target, upstreamHost, err := h.resolveSubstrateGatewayTarget(r.Context(), &ah) + if err != nil { + log.Info("resolve substrate gateway target failed", "error", err) + http.Error(w, err.Error(), http.StatusServiceUnavailable) + return + } + + publicPrefix := agentHarnessGatewayPublicPrefix(namespace, name) + + _, redirectTo, ok := resolveGatewayUpstreamPath(r.URL.Path, namespace, name, isWebSocketUpgrade(r)) + if !ok { + http.NotFound(w, r) + return + } + // Browsers do not complete WebSocket handshakes through 30x redirects. + if redirectTo != "" && !isWebSocketUpgrade(r) { + dest := redirectTo + if r.URL.RawQuery != "" { + dest += "?" 
+ r.URL.RawQuery + } + http.Redirect(w, r, dest, http.StatusPermanentRedirect) + return + } + + proxy := newAgentHarnessGatewayProxy(target, upstreamHost, token, publicPrefix, namespace, name, log) + proxy.ServeHTTP(w, r) +} + +func (h *Handlers) resolveSubstrateGatewayTarget(ctx context.Context, ah *v1alpha2.AgentHarness) (*url.URL, string, error) { + cfg := h.AgentHarnessGateway + if cfg == nil { + return nil, "", fmt.Errorf("substrate gateway is not configured") + } + if cfg.AteAPIEndpoint == "" { + return nil, "", fmt.Errorf("substrate ate-api is not configured on the controller") + } + + ateClient, err := substrate.Dial(ctx, substrate.Config{ + AteAPIEndpoint: cfg.AteAPIEndpoint, + Insecure: cfg.AteAPIInsecure, + DialTimeout: cfg.DialTimeout, + CallTimeout: cfg.CallTimeout, + }) + if err != nil { + return nil, "", fmt.Errorf("dial ate-api: %w", err) + } + defer ateClient.Close() + + actorID := ah.Status.BackendRef.ID + actor, err := ateClient.GetActor(ctx, actorID) + if err != nil { + return nil, "", fmt.Errorf("get substrate actor %q: %w", actorID, err) + } + podIP := strings.TrimSpace(actor.GetAteomPodIp()) + if podIP == "" { + return nil, "", fmt.Errorf("substrate actor %q has no pod IP (status %s; resume the actor and wait until running)", actorID, actor.GetStatus()) + } + target, host, err := substrateGatewayPodTarget(podIP) + if err != nil { + return nil, "", fmt.Errorf("substrate actor %q pod IP %q: %w", actorID, podIP, err) + } + ctrllog.FromContext(ctx).WithName("agentharness-gateway").Info( + "proxying via actor pod IP", + "actor", actorID, + "podIP", host, + ) + return target, host, nil +} + +func substrateGatewayPodTarget(podIP string) (*url.URL, string, error) { + ip := strings.TrimSpace(podIP) + if ip == "" || net.ParseIP(ip) == nil { + return nil, "", fmt.Errorf("invalid actor pod IP %q", podIP) + } + target, err := url.Parse("http://" + net.JoinHostPort(ip, "80")) + if err != nil { + return nil, "", fmt.Errorf("parse actor pod target: %w", err) 
+ } + return target, ip, nil +} + +func agentHarnessHarnessBase(namespace, name string) string { + return "/api/agentharnesses/" + namespace + "/" + name +} + +func agentHarnessGatewayPublicPrefix(namespace, name string) string { + return agentHarnessHarnessBase(namespace, name) + "/gateway/" +} + +// resolveGatewayUpstreamPath maps the public URL to the upstream path on the actor. +// redirectTo is set when the browser should use a trailing slash under /gateway/. +// HTTP and WebSocket upgrades to the gateway entry both proxy to upstream / (OpenClaw gateway UI). +func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade bool) (upstreamPath, redirectTo string, ok bool) { + base := agentHarnessHarnessBase(namespace, name) + if !strings.HasPrefix(requestPath, base) { + return "", "", false + } + rel := strings.TrimPrefix(requestPath, base) + if rel == "" { + return "", agentHarnessGatewayPublicPrefix(namespace, name), true + } + + switch { + case rel == "/gateway": + _ = wsUpgrade + return "/", agentHarnessGatewayPublicPrefix(namespace, name), true + case strings.HasPrefix(rel, "/gateway/"): + sub := strings.TrimPrefix(rel, "/gateway") + if sub == "" { + sub = "/" + } + return sub, "", true + case isHarnessStaticAssetPath(rel): + return rel, "", true + default: + return "", "", false + } +} + +func isHarnessStaticAssetPath(rel string) bool { + if strings.HasPrefix(rel, "/assets/") { + return true + } + switch rel { + case "/manifest.webmanifest", "/vite.svg", "/favicon.ico": + return true + } + return strings.HasPrefix(rel, "/favicon") +} + +// normalizeOpenClawBrowserOrigin rewrites Origin/Referer so OpenClaw accepts WS/API from kagent-ui +// (e.g. http://localhost:8001) while the gateway listens on the actor pod :80. 
+func normalizeOpenClawBrowserOrigin(req *http.Request) { + if req == nil { + return + } + if req.Header.Get("Origin") != "" { + req.Header.Set("Origin", openclawLoopbackOrigin) + } + if req.Header.Get("Referer") != "" { + req.Header.Set("Referer", openclawLoopbackOrigin+"/") + } +} + +func isWebSocketUpgrade(r *http.Request) bool { + if r == nil { + return false + } + return strings.EqualFold(r.Header.Get("Upgrade"), "websocket") && + strings.Contains(strings.ToLower(r.Header.Get("Connection")), "upgrade") +} + +func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, log interface { + Error(error, string, ...any) +}) *httputil.ReverseProxy { + proxy := httputil.NewSingleHostReverseProxy(target) + proxy.FlushInterval = -1 + proxy.Transport = &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: 0, + IdleConnTimeout: 90 * time.Second, + } + origDirector := proxy.Director + proxy.Director = func(req *http.Request) { + origDirector(req) + req.Host = upstreamHost + req.Header.Set("Host", upstreamHost) + if token != "" { + req.Header.Set("Authorization", "Bearer "+token) + } + req.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) + normalizeOpenClawBrowserOrigin(req) + subPath, _, pathOK := resolveGatewayUpstreamPath(req.URL.Path, namespace, name, isWebSocketUpgrade(req)) + if !pathOK { + subPath = "/" + } + if subPath == "" { + subPath = "/" + } else if !strings.HasPrefix(subPath, "/") { + subPath = "/" + subPath + } + req.URL.Path = subPath + req.URL.RawPath = subPath + } + proxy.ModifyResponse = func(resp *http.Response) error { + // Do not read or rewrite WebSocket upgrade responses (would break 101 handshakes). 
+ if resp.StatusCode == http.StatusSwitchingProtocols { + return nil + } + + resp.Header.Del("Content-Security-Policy") + resp.Header.Del("Content-Security-Policy-Report-Only") + + if loc := resp.Header.Get("Location"); loc != "" { + if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicPrefix) { + resp.Header.Set("Location", strings.TrimSuffix(publicPrefix, "/")+loc) + } + } + + ct := resp.Header.Get("Content-Type") + if !shouldRewriteGatewayBody(ct) { + return nil + } + body, err := readGatewayResponseBody(resp) + if err != nil { + return err + } + rewritten := rewriteGatewayBody(body, ct, publicPrefix) + if strings.Contains(strings.ToLower(ct), "text/html") { + rewritten = injectGatewayClientShim(rewritten, token) + } + resp.Header.Del("Content-Encoding") + resp.Header.Del("Content-Length") + resp.ContentLength = int64(len(rewritten)) + resp.Body = io.NopCloser(bytes.NewReader(rewritten)) + return nil + } + proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, proxyErr error) { + log.Error(proxyErr, "gateway proxy error", "host", upstreamHost) + http.Error(rw, "gateway proxy error", http.StatusBadGateway) + } + return proxy +} + +func readGatewayResponseBody(resp *http.Response) ([]byte, error) { + var reader io.Reader = resp.Body + if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") { + gz, err := gzip.NewReader(resp.Body) + if err != nil { + return nil, err + } + defer gz.Close() + reader = gz + } + defer resp.Body.Close() + return io.ReadAll(reader) +} + +func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { + if ah.Spec.Substrate != nil && ah.Spec.Substrate.GatewayTokenSecretRef != nil { + ref := ah.Spec.Substrate.GatewayTokenSecretRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + var secret corev1.Secret + if err := h.KubeClient.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get 
gateway token secret %s/%s: %w", ns, ref.Name, err) + } + if secret.Data == nil { + return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + } + val, ok := secret.Data[substrateGatewayTokenSecretKey] + if !ok { + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, substrateGatewayTokenSecretKey) + } + return strings.TrimSpace(string(val)), nil + } + return h.AgentHarnessGateway.resolveToken() +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go new file mode 100644 index 0000000000..13818acb39 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go @@ -0,0 +1,235 @@ +package handlers + +import ( + "bytes" + "encoding/json" + "fmt" + "strings" +) + +// shouldRewriteGatewayQuotedPath returns true for root-absolute app paths we proxy, +// not for short tokens like "/g" (RegExp flags) or other non-asset paths. +func shouldRewriteGatewayQuotedPath(path string) bool { + if path == "" || !strings.HasPrefix(path, "/") || strings.HasPrefix(path, "//") { + return false + } + switch { + case strings.HasPrefix(path, "/assets"): + return true + case strings.HasPrefix(path, "/manifest"): + return true + case strings.HasPrefix(path, "/favicon"): + return true + case path == "/vite.svg": + return true + default: + return false + } +} + +// rewriteGatewayRootPaths prefixes root-absolute URLs in HTML/JS/CSS so assets load under +// /api/agentharnesses/{ns}/{name}/gateway/ (OpenClaw CSP blocks ; base-uri 'none'). 
+func rewriteGatewayRootPaths(body []byte, prefix string) []byte { + if len(body) == 0 || prefix == "" { + return body + } + if !strings.HasPrefix(prefix, "/") { + prefix = "/" + prefix + } + if !strings.HasSuffix(prefix, "/") { + prefix += "/" + } + + var out bytes.Buffer + out.Grow(len(body) + len(prefix)*4) + s := string(body) + for i := 0; i < len(s); i++ { + c := s[i] + if (c == '"' || c == '\'') && i+1 < len(s) && s[i+1] == '/' { + if i+2 < len(s) && s[i+2] == '/' { + out.WriteByte(c) + continue + } + quote := c + j := i + 1 + for j < len(s) && s[j] != quote { + j++ + } + path := s[i+1 : j] + out.WriteByte(quote) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + if j < len(s) { + out.WriteByte(quote) + } + i = j + continue + } + if i+4 < len(s) && strings.EqualFold(s[i:i+4], "url(") { + j := i + 4 + for j < len(s) && (s[j] == ' ' || s[j] == '\t') { + j++ + } + if j < len(s) && (s[j] == '"' || s[j] == '\'') { + quote := s[j] + if j+1 < len(s) && s[j+1] == '/' && !(j+2 < len(s) && s[j+2] == '/') { + k := j + 1 + for k < len(s) && s[k] != quote { + k++ + } + path := s[j+1 : k] + out.WriteString(s[i : j+1]) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + if k < len(s) { + out.WriteByte(quote) + } + i = k + continue + } + } else if j < len(s) && s[j] == '/' && !(j+1 < len(s) && s[j+1] == '/') { + k := j + 1 + for k < len(s) && s[k] != ')' && s[k] != ' ' && s[k] != '\t' && s[k] != '"' && s[k] != '\'' { + k++ + } + path := s[j:k] + out.WriteString(s[i:j]) + if shouldRewriteGatewayQuotedPath(path) { + out.WriteString(prefix) + out.WriteString(strings.TrimPrefix(path, "/")) + } else { + out.WriteString(path) + } + i = k - 1 + continue + } + } + out.WriteByte(c) + } + return out.Bytes() +} + +func stripGatewayBaseTag(body []byte) []byte { + lower := 
bytes.ToLower(body) + for { + idx := bytes.Index(lower, []byte("")) + if end < 0 { + break + } + endIdx := idx + end + 1 + body = append(append(body[:idx], body[endIdx:]...)) + lower = bytes.ToLower(body) + } + return body +} + +func stripGatewayCSP(body []byte) []byte { + lower := bytes.ToLower(body) + for _, tag := range []string{ + `")) + if end < 0 { + break + } + endIdx := idx + end + 1 + body = append(append(body[:idx], body[endIdx:]...)) + lower = bytes.ToLower(body) + } + } + return body +} + +func rewriteGatewayBody(body []byte, contentType, prefix string) []byte { + body = stripGatewayCSP(body) + ct := strings.ToLower(contentType) + if strings.Contains(ct, "text/html") { + body = stripGatewayBaseTag(body) + } + if shouldRewriteGatewayBody(contentType) { + body = rewriteGatewayRootPaths(body, prefix) + return rewriteGatewayWebSocketPaths(body, prefix) + } + return body +} + +// injectGatewayClientShim patches WebSocket URLs (trailing slash + ?token= for OpenClaw Control UI). +func injectGatewayClientShim(body []byte, gatewayToken string) []byte { + tokenJSON, _ := json.Marshal(gatewayToken) + shim := fmt.Sprintf(``, tokenJSON) + lower := bytes.ToLower(body) + for _, tag := range []string{"", ""} { + if idx := bytes.Index(lower, []byte(strings.ToLower(tag))); idx >= 0 { + out := make([]byte, 0, len(body)+len(shim)) + out = append(out, body[:idx]...) + out = append(out, shim...) + out = append(out, body[idx:]...) + return out + } + } + return append(bytes.Clone(body), shim...) +} + +// rewriteGatewayWebSocketPaths ensures bundled/runtime WS URLs use .../gateway/ (trailing slash). +// Only rewrites occurrences not already followed by '/' (avoids breaking .../gateway/assets/...). 
+func rewriteGatewayWebSocketPaths(body []byte, prefix string) []byte { + gatewayWithSlash := strings.TrimSuffix(prefix, "/") + "/" + gatewayNoSlash := strings.TrimSuffix(gatewayWithSlash, "/") + if gatewayNoSlash == "" || gatewayNoSlash == gatewayWithSlash { + return body + } + needle := []byte(gatewayNoSlash) + var out bytes.Buffer + out.Grow(len(body) + 16) + for i := 0; i < len(body); { + idx := bytes.Index(body[i:], needle) + if idx < 0 { + out.Write(body[i:]) + break + } + idx += i + out.Write(body[i:idx]) + end := idx + len(needle) + if end < len(body) && body[end] == '/' { + out.Write(needle) + } else { + out.Write([]byte(gatewayWithSlash)) + } + i = end + } + return out.Bytes() +} + +func shouldRewriteGatewayBody(contentType string) bool { + ct := strings.ToLower(contentType) + return strings.Contains(ct, "text/html") || + strings.Contains(ct, "javascript") || + strings.Contains(ct, "text/css") || + strings.Contains(ct, "application/json") +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go new file mode 100644 index 0000000000..eaab469051 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go @@ -0,0 +1,165 @@ +package handlers + +import ( + "net/http" + "strings" + "testing" +) + +func TestResolveGatewayUpstreamPath(t *testing.T) { + t.Parallel() + ns, name := "kagent", "my-claw" + public := agentHarnessGatewayPublicPrefix(ns, name) + + tests := []struct { + name string + path string + wsUpgrade bool + wantUp string + wantRedir string + wantOK bool + }{ + { + name: "harness root redirects", + path: "/api/agentharnesses/kagent/my-claw", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash redirects", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wantUp: "/", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash websocket", + path: 
"/api/agentharnesses/kagent/my-claw/gateway", + wsUpgrade: true, + wantUp: "/", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway index", + path: "/api/agentharnesses/kagent/my-claw/gateway/", + wantUp: "/", + wantOK: true, + }, + { + name: "gateway asset", + path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantUp: "/assets/foo.js", + wantOK: true, + }, + { + name: "mis-resolved asset shim", + path: "/api/agentharnesses/kagent/my-claw/assets/foo.js", + wantUp: "/assets/foo.js", + wantOK: true, + }, + { + name: "manifest shim", + path: "/api/agentharnesses/kagent/my-claw/manifest.webmanifest", + wantUp: "/manifest.webmanifest", + wantOK: true, + }, + { + name: "unknown path", + path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", + wantOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) + if ok != tt.wantOK { + t.Fatalf("ok = %v, want %v", ok, tt.wantOK) + } + if up != tt.wantUp { + t.Fatalf("upstream = %q, want %q", up, tt.wantUp) + } + if redir != tt.wantRedir { + t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) + } + }) + } +} + +func TestRewriteGatewayRootPaths(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `` + out := string(rewriteGatewayRootPaths([]byte(in), prefix)) + if !strings.Contains(out, `src="/api/agentharnesses/kagent/my-claw/gateway/assets/index.js"`) { + t.Fatalf("script src not rewritten: %s", out) + } + if !strings.Contains(out, `href="/api/agentharnesses/kagent/my-claw/gateway/manifest.webmanifest"`) { + t.Fatalf("link href not rewritten: %s", out) + } +} + +func TestIsWebSocketUpgrade(t *testing.T) { + t.Parallel() + req, _ := http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + if !isWebSocketUpgrade(req) { + 
t.Fatal("expected websocket upgrade") + } + req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) + if isWebSocketUpgrade(req2) { + t.Fatal("expected not websocket upgrade") + } +} + +func TestRewriteGatewayWebSocketPaths(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `const u="ws://localhost:8001/api/agentharnesses/kagent/my-claw/gateway"; const v='wss://host/api/agentharnesses/kagent/my-claw/gateway'` + out := string(rewriteGatewayWebSocketPaths([]byte(in), prefix)) + want := "/api/agentharnesses/kagent/my-claw/gateway/" + if !strings.Contains(out, "ws://localhost:8001"+want) { + t.Fatalf("ws URL not rewritten: %s", out) + } + if !strings.Contains(out, "wss://host"+want) { + t.Fatalf("wss URL not rewritten: %s", out) + } +} + +func TestRewriteGatewayBodyStripsBaseAndCSP(t *testing.T) { + t.Parallel() + prefix := "/api/agentharnesses/kagent/my-claw/gateway/" + in := `` + out := string(rewriteGatewayBody([]byte(in), "text/html", prefix)) + if strings.Contains(strings.ToLower(out), "ok")) + })) + defer upstream.Close() + + target, err := url.Parse(upstream.URL) + if err != nil { + t.Fatal(err) + } + + proxy := newAgentHarnessGatewayProxy(target, podIP, token, publicPrefix, ns, name, testLog{t}) + req := httptest.NewRequest(http.MethodGet, publicPrefix, nil) + rec := httptest.NewRecorder() + proxy.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) + } + if gotHost != podIP { + t.Fatalf("upstream Host = %q, want %q", gotHost, podIP) + } + if gotAuth != "Bearer "+token { + t.Fatalf("Authorization = %q", gotAuth) + } + if gotScopes != openclawDefaultOperatorScopes { + t.Fatalf("x-openclaw-scopes = %q", gotScopes) + } + if gotPath != "/" { + t.Fatalf("upstream path = %q, want /", gotPath) + } + body, _ := io.ReadAll(rec.Body) + if !strings.Contains(string(body), "ok") { + t.Fatalf("response body missing upstream content: %s", body) + } 
+} + +func TestGatewayProxyDirectorTargetsPodIPOnWebSocketPath(t *testing.T) { + t.Parallel() + const podIP = "10.244.0.29" + ns, name := "kagent", "my-claw" + publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) + + target, err := url.Parse("http://" + podIP + ":80") + if err != nil { + t.Fatal(err) + } + proxy := newAgentHarnessGatewayProxy(target, podIP, "tok", publicPrefix, ns, name, testLog{t}) + req := httptest.NewRequest(http.MethodGet, strings.TrimSuffix(publicPrefix, "/"), nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + req.Header.Set("Origin", "http://localhost:8001") + req.Header.Set("Referer", "http://localhost:8001/api/agentharnesses/kagent/my-claw/gateway/") + + proxy.Director(req) + + if req.Host != podIP { + t.Fatalf("Host = %q, want pod IP", req.Host) + } + if req.URL.Host != podIP+":80" { + t.Fatalf("URL.Host = %q", req.URL.Host) + } + if req.URL.Path != "/" { + t.Fatalf("URL.Path = %q, want /", req.URL.Path) + } + if req.Header.Get("Authorization") != "Bearer tok" { + t.Fatalf("missing Authorization") + } + if req.Header.Get("x-openclaw-scopes") != openclawDefaultOperatorScopes { + t.Fatalf("missing scopes header") + } + if req.Header.Get("Origin") != openclawLoopbackOrigin { + t.Fatalf("Origin = %q, want %q", req.Header.Get("Origin"), openclawLoopbackOrigin) + } + if req.Header.Get("Referer") != openclawLoopbackOrigin+"/" { + t.Fatalf("Referer = %q", req.Header.Get("Referer")) + } +} + +type testLog struct { + t *testing.T +} + +func (l testLog) Error(err error, msg string, _ ...any) { + l.t.Helper() + l.t.Logf("%s: %v", msg, err) +} diff --git a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index 59c68ce27f..76056660c8 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -160,19 +160,13 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, } } + runtime := 
sb.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + gatewayName := fmt.Sprintf("%s-%s", sb.Namespace, sb.Name) desc := strings.TrimSpace(sb.Spec.Description) - entry := &api.OpenshellAgentHarnessListEntry{ - Backend: sb.Spec.Backend, - GatewaySandboxName: gatewayName, - ModelConfigRef: sb.Spec.ModelConfigRef, - } - if sb.Status.BackendRef != nil { - entry.BackendRefID = sb.Status.BackendRef.ID - } - if sb.Status.Connection != nil { - entry.Endpoint = sb.Status.Connection.Endpoint - } resp := api.AgentResponse{ ID: id, @@ -184,9 +178,39 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, Description: desc, }, }, - DeploymentReady: ready, - Accepted: accepted, - OpenshellAgentHarness: entry, + DeploymentReady: ready, + Accepted: accepted, + } + + switch runtime { + case v1alpha2.AgentHarnessRuntimeSubstrate: + subEntry := &api.SubstrateAgentHarnessListEntry{ + Backend: sb.Spec.Backend, + Runtime: runtime, + ModelConfigRef: sb.Spec.ModelConfigRef, + GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), + } + if sb.Status.BackendRef != nil { + subEntry.BackendRefID = sb.Status.BackendRef.ID + subEntry.ActorID = sb.Status.BackendRef.ID + } + if sb.Status.Connection != nil { + subEntry.Endpoint = sb.Status.Connection.Endpoint + } + resp.SubstrateAgentHarness = subEntry + default: + entry := &api.OpenshellAgentHarnessListEntry{ + Backend: sb.Spec.Backend, + GatewaySandboxName: gatewayName, + ModelConfigRef: sb.Spec.ModelConfigRef, + } + if sb.Status.BackendRef != nil { + entry.BackendRefID = sb.Status.BackendRef.ID + } + if sb.Status.Connection != nil { + entry.Endpoint = sb.Status.Connection.Endpoint + } + resp.OpenshellAgentHarness = entry } mcRef := strings.TrimSpace(sb.Spec.ModelConfigRef) diff --git a/go/core/internal/httpserver/handlers/handlers.go b/go/core/internal/httpserver/handlers/handlers.go index 13a66adeb9..3d854bd134 100644 --- 
a/go/core/internal/httpserver/handlers/handlers.go +++ b/go/core/internal/httpserver/handlers/handlers.go @@ -12,6 +12,9 @@ import ( // Handlers holds all the HTTP handler components type Handlers struct { + KubeClient client.Client + AgentHarnessGateway *AgentHarnessGatewayConfig + Health *HealthHandler ModelConfig *ModelConfigHandler Model *ModelHandler @@ -43,7 +46,17 @@ type Base struct { } // NewHandlers creates a new Handlers instance with all handler components. -func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedName, dbService database.Client, watchedNamespaces []string, authorizer auth.Authorizer, proxyURL string, rcnclr reconciler.KagentReconciler, sandboxBackend sandboxbackend.Backend) *Handlers { +func NewHandlers( + kubeClient client.Client, + defaultModelConfig types.NamespacedName, + dbService database.Client, + watchedNamespaces []string, + authorizer auth.Authorizer, + proxyURL string, + rcnclr reconciler.KagentReconciler, + sandboxBackend sandboxbackend.Backend, + agentHarnessGateway *AgentHarnessGatewayConfig, +) *Handlers { base := &Base{ KubeClient: kubeClient, DefaultModelConfig: defaultModelConfig, @@ -55,6 +68,8 @@ func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedNa } return &Handlers{ + KubeClient: kubeClient, + AgentHarnessGateway: agentHarnessGateway, Health: NewHealthHandler(), ModelConfig: NewModelConfigHandler(base), Model: NewModelHandler(base), diff --git a/go/core/internal/httpserver/middleware.go b/go/core/internal/httpserver/middleware.go index 2f3a329378..fa112a8fa7 100644 --- a/go/core/internal/httpserver/middleware.go +++ b/go/core/internal/httpserver/middleware.go @@ -5,6 +5,7 @@ import ( "fmt" "net" "net/http" + "strings" "time" "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" @@ -77,9 +78,23 @@ func (w *statusResponseWriter) RespondWithError(err error) { } } +func isAgentHarnessGatewayPath(path string) bool { + if !strings.HasPrefix(path, 
"/api/agentharnesses/") { + return false + } + for _, marker := range []string{"/gateway", "/assets/", "/manifest.webmanifest", "/favicon"} { + if strings.Contains(path, marker) { + return true + } + } + return false +} + func contentTypeMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if len(r.URL.Path) >= 4 && r.URL.Path[:4] == "/api" && r.URL.Path != APIPathSandboxSSH { + if len(r.URL.Path) >= 4 && r.URL.Path[:4] == "/api" && + r.URL.Path != APIPathSandboxSSH && + !isAgentHarnessGatewayPath(r.URL.Path) { w.Header().Set("Content-Type", "application/json") } next.ServeHTTP(w, r) diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index aac7e831ab..40e624a3c5 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -50,6 +50,7 @@ const ( APIPathLangGraph = "/api/langgraph" APIPathCrewAI = "/api/crewai" APIPathSandboxSSH = "/api/sandbox/ssh" + APIPathAgentHarnessHarness = "/api/agentharnesses/{namespace}/{name}/" ) var defaultModelConfig = types.NamespacedName{ @@ -70,7 +71,8 @@ type ServerConfig struct { Authorizer auth.Authorizer ProxyURL string Reconciler reconciler.KagentReconciler - SandboxBackend sandboxbackend.Backend + SandboxBackend sandboxbackend.Backend + AgentHarnessGateway *handlers.AgentHarnessGatewayConfig } // HTTPServer is the structure that manages the HTTP server @@ -89,7 +91,17 @@ func NewHTTPServer(config ServerConfig) (*HTTPServer, error) { return &HTTPServer{ config: config, router: config.Router, - handlers: handlers.NewHandlers(config.KubeClient, defaultModelConfig, config.DbClient, config.WatchedNamespaces, config.Authorizer, config.ProxyURL, config.Reconciler, config.SandboxBackend), + handlers: handlers.NewHandlers( + config.KubeClient, + defaultModelConfig, + config.DbClient, + config.WatchedNamespaces, + config.Authorizer, + config.ProxyURL, + config.Reconciler, + config.SandboxBackend, + 
config.AgentHarnessGateway, + ), authenticator: config.Authenticator, }, nil } @@ -303,6 +315,12 @@ func (s *HTTPServer) setupRoutes() { // OpenShell sandbox PTY (browser WebSocket → gateway CONNECT → SSH). Authenticated like other /api routes. s.router.HandleFunc(APIPathSandboxSSH, adaptHandler(s.handlers.HandleSandboxSSHWebSocket)).Methods(http.MethodGet) + // Substrate OpenClaw gateway proxy (HTTP + WebSocket) to the actor pod IP :80. + // Includes /gateway/* and mis-resolved static paths (/assets/, manifest, etc.). + s.router.PathPrefix(APIPathAgentHarnessHarness).Handler( + adaptHandler(s.handlers.HandleAgentHarnessGateway), + ) + // A2A s.router.PathPrefix(APIPathA2A + "/{namespace}/{name}").Handler(s.config.A2AHandler) s.router.PathPrefix(APIPathA2ASandboxes + "/{namespace}/{name}").Handler(s.config.A2AHandler) @@ -313,21 +331,30 @@ func (s *HTTPServer) setupRoutes() { } // Use middleware for common functionality (first registered runs outermost on incoming requests). - s.router.Use(wsSandboxSSHAuthQueryMiddleware) + s.router.Use(wsAuthQueryMiddleware) s.router.Use(auth.AuthnMiddleware(s.authenticator)) s.router.Use(contentTypeMiddleware) s.router.Use(loggingMiddleware) s.router.Use(errorHandlerMiddleware) } -// wsSandboxSSHAuthQueryMiddleware maps access_token query → Authorization for browser WebSocket upgrades +// wsAuthQueryMiddleware maps token query params → Authorization for browser WebSocket upgrades // (fetch can send headers; WebSocket cannot). 
-func wsSandboxSSHAuthQueryMiddleware(next http.Handler) http.Handler { +func wsAuthQueryMiddleware(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path == APIPathSandboxSSH && r.Header.Get("Authorization") == "" { - if t := r.URL.Query().Get("access_token"); t != "" { - r.Header.Set("Authorization", "Bearer "+strings.TrimSpace(t)) - } + if r.Header.Get("Authorization") != "" { + next.ServeHTTP(w, r) + return + } + var token string + switch { + case r.URL.Path == APIPathSandboxSSH || strings.HasSuffix(r.URL.Path, "/ssh"): + token = r.URL.Query().Get("access_token") + case isAgentHarnessGatewayPath(r.URL.Path): + token = r.URL.Query().Get("token") + } + if token != "" { + r.Header.Set("Authorization", "Bearer "+strings.TrimSpace(token)) } next.ServeHTTP(w, r) }) diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index ddad07d546..7885292985 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -54,10 +54,13 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" dbpkg "github.com/kagent-dev/kagent/go/api/database" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -99,6 +102,7 @@ func init() { utilruntime.Must(v1alpha1.AddToScheme(scheme)) utilruntime.Must(v1alpha2.AddToScheme(scheme)) utilruntime.Must(agentsandboxv1.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } @@ -148,6 +152,21 @@ type Config struct 
{ DialTimeout time.Duration CallTimeout time.Duration } + Substrate struct { + AteAPIEndpoint string + Insecure bool + DialTimeout time.Duration + CallTimeout time.Duration + DefaultActorTemplateNamespace string + DefaultActorTemplateName string + GatewayToken string + GatewayTokenFile string + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + } } func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { @@ -207,6 +226,20 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.DurationVar(&cfg.Openshell.DialTimeout, "openshell-dial-timeout", 10*time.Second, "Timeout for the initial dial to the OpenShell gateway.") commandLine.DurationVar(&cfg.Openshell.CallTimeout, "openshell-call-timeout", 30*time.Second, "Per-RPC timeout for OpenShell gateway calls.") + commandLine.StringVar(&cfg.Substrate.AteAPIEndpoint, "substrate-ate-api-endpoint", "", "gRPC target for Agent Substrate ate-api (e.g. dns:///api.ate-system.svc:443). 
Enables substrate AgentHarness runtime when set.") + commandLine.BoolVar(&cfg.Substrate.Insecure, "substrate-ate-api-insecure", false, "Dial ate-api without TLS (local dev only).") + commandLine.DurationVar(&cfg.Substrate.DialTimeout, "substrate-dial-timeout", 10*time.Second, "Timeout for the initial dial to ate-api.") + commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") + commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateNamespace, "substrate-default-actor-template-namespace", "", "Legacy fallback ActorTemplate namespace when adopting an external template (set spec.substrate.actorTemplateRef instead).") + commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateName, "substrate-default-actor-template-name", "", "Legacy fallback ActorTemplate name when adopting an external template (set spec.substrate.actorTemplateRef instead).") + commandLine.StringVar(&cfg.Substrate.GatewayToken, "substrate-gateway-token", "", "OpenClaw gateway Bearer token for substrate proxy. 
Prefer --substrate-gateway-token-file.") + commandLine.StringVar(&cfg.Substrate.GatewayTokenFile, "substrate-gateway-token-file", "", "File containing OpenClaw gateway Bearer token for substrate harness proxy.") + commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for auto-provisioned ActorTemplates.") + commandLine.StringVar(&cfg.Substrate.RunscAMD64URL, "substrate-runsc-amd64-url", "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc", "gVisor runsc URL for amd64.") + commandLine.StringVar(&cfg.Substrate.RunscAMD64SHA256, "substrate-runsc-amd64-sha256", "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63", "gVisor runsc sha256 for amd64.") + commandLine.StringVar(&cfg.Substrate.RunscARM64URL, "substrate-runsc-arm64-url", "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc", "gVisor runsc URL for arm64.") + commandLine.StringVar(&cfg.Substrate.RunscARM64SHA256, "substrate-runsc-arm64-sha256", "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9", "gVisor runsc sha256 for arm64.") + commandLine.StringVar(&agent_translator.DefaultServiceAccountName, "default-service-account-name", "", "Global default ServiceAccount name for agent pods. When set, agents without an explicit serviceAccountName will use this instead of creating a per-agent ServiceAccount.") commandLine.Var(&MapValue{Target: &agent_translator.DefaultAgentPodLabels}, "default-agent-pod-labels", "Comma-separated key=value pairs of labels to apply to all agent pod templates (e.g. 'team=platform,env=prod'). 
Per-agent labels take precedence.") @@ -563,23 +596,43 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne os.Exit(1) } + kubeClient := mgr.GetClient() + var openshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + var substrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend if cfg.Openshell.GatewayURL != "" { - kubeClient := mgr.GetClient() - openshellBackends, err := buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) + var err error + openshellBackends, err = buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) if err != nil { setupLog.Error(err, "unable to build openshell sandbox backends") os.Exit(1) } + } + var substrateAteClient *substrate.Client + if cfg.Substrate.AteAPIEndpoint != "" { + var err error + substrateBackends, substrateAteClient, err = buildSubstrateSandboxBackends(ctx, &cfg) + if err != nil { + setupLog.Error(err, "unable to build substrate sandbox backends") + os.Exit(1) + } + } + if len(openshellBackends) > 0 || len(substrateBackends) > 0 { + var substrateProvisioner *substrate.Provisioner + if len(substrateBackends) > 0 { + substrateProvisioner = substrateProvisionerFromConfig(kubeClient, &cfg, substrateAteClient) + } if err := (&controller.AgentHarnessController{ - Client: kubeClient, - Recorder: mgr.GetEventRecorder("agentharness-controller"), - Backends: openshellBackends, + Client: kubeClient, + Recorder: mgr.GetEventRecorder("agentharness-controller"), + OpenshellBackends: openshellBackends, + SubstrateBackends: substrateBackends, + SubstrateProvisioner: substrateProvisioner, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "AgentHarness") os.Exit(1) } } else { - setupLog.Info("AgentHarness controller disabled: --openshell-gateway-url not set") + setupLog.Info("AgentHarness controller disabled: set --openshell-gateway-url and/or --substrate-ate-api-endpoint") } if err = 
(&controller.ModelConfigController{ @@ -677,19 +730,40 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne os.Exit(1) } + var agentHarnessGateway *handlers.AgentHarnessGatewayConfig + if cfg.Substrate.AteAPIEndpoint != "" { + gwToken := cfg.Substrate.GatewayToken + if cfg.Substrate.GatewayTokenFile != "" { + data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) + if err != nil { + setupLog.Error(err, "unable to read substrate gateway token file") + os.Exit(1) + } + gwToken = strings.TrimSpace(string(data)) + } + agentHarnessGateway = &handlers.AgentHarnessGatewayConfig{ + GatewayToken: gwToken, + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + AteAPIInsecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, + } + } + httpServer, err := httpserver.NewHTTPServer(httpserver.ServerConfig{ - Router: router, - BindAddr: cfg.HttpServerAddr, - KubeClient: mgr.GetClient(), - A2AHandler: a2aHandler, - MCPHandler: mcpHandler, - WatchedNamespaces: watchNamespacesList, - DbClient: dbClient, - Authorizer: extensionCfg.Authorizer, - Authenticator: extensionCfg.Authenticator, - ProxyURL: cfg.Proxy.URL, - Reconciler: rcnclr, - SandboxBackend: extensionCfg.SandboxBackend, + Router: router, + BindAddr: cfg.HttpServerAddr, + KubeClient: mgr.GetClient(), + A2AHandler: a2aHandler, + MCPHandler: mcpHandler, + WatchedNamespaces: watchNamespacesList, + DbClient: dbClient, + Authorizer: extensionCfg.Authorizer, + Authenticator: extensionCfg.Authenticator, + ProxyURL: cfg.Proxy.URL, + Reconciler: rcnclr, + SandboxBackend: extensionCfg.SandboxBackend, + AgentHarnessGateway: agentHarnessGateway, }) if err != nil { setupLog.Error(err, "unable to create HTTP server") @@ -747,12 +821,80 @@ func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient ocl := openshell.NewOpenClawBackend(kubeClient, clients, oc, nil) hermesBackend := openshell.NewHermesBackend(kubeClient, clients, oc, 
nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ocl, v1alpha2.AgentHarnessBackendHermes: hermesBackend, }, nil } +func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, *substrate.Client, error) { + sc, _, err := substrateAppConfig(cfg) + if err != nil { + return nil, nil, err + } + client, err := substrate.Dial(ctx, sc) + if err != nil { + return nil, nil, err + } + + ocl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendOpenClaw, nil) + ncl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendNemoClaw, nil) + return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendNemoClaw: ncl, + }, client, nil +} + +func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { + gwToken := cfg.Substrate.GatewayToken + if cfg.Substrate.GatewayTokenFile != "" { + data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) + if err != nil { + return substrate.Config{}, "", fmt.Errorf("read substrate gateway token file: %w", err) + } + gwToken = strings.TrimSpace(string(data)) + } + sc := substrate.Config{ + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + Insecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, + DefaultActorTemplateNamespace: cfg.Substrate.DefaultActorTemplateNamespace, + DefaultActorTemplateName: cfg.Substrate.DefaultActorTemplateName, + GatewayToken: gwToken, + ProvisionDefaults: substrate.ProvisionDefaults{ + PauseImage: cfg.Substrate.PauseImage, + RunscAMD64URL: cfg.Substrate.RunscAMD64URL, + RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, + RunscARM64URL: cfg.Substrate.RunscARM64URL, + RunscARM64SHA256: 
cfg.Substrate.RunscARM64SHA256, + DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + GatewayToken: gwToken, + }, + } + return sc, gwToken, nil +} + +func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Provisioner { + _, gwToken, err := substrateAppConfig(cfg) + if err != nil { + gwToken = cfg.Substrate.GatewayToken + } + return &substrate.Provisioner{ + Client: kubeClient, + Ate: ate, + Defaults: substrate.ProvisionDefaults{ + PauseImage: cfg.Substrate.PauseImage, + RunscAMD64URL: cfg.Substrate.RunscAMD64URL, + RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, + RunscARM64URL: cfg.Substrate.RunscARM64URL, + RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + GatewayToken: gwToken, + }, + } +} + // configureNamespaceWatching sets up the controller manager to watch specific namespaces // based on the provided configuration. It returns the list of namespaces being watched, // or nil if watching all namespaces. 
diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw.go b/go/core/pkg/sandboxbackend/openshell/openclaw.go index 9f95a407a5..f8032b4235 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw.go @@ -87,7 +87,7 @@ func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.Agen gwPort := defaultOpenclawGatewayPort token := b.cfg.Token - jsonBytes, env, err := openclaw.BuildBootstrapJSON(ctx, b.kubeClient, ah.Namespace, ah, mc, gwPort) + jsonBytes, env, err := openclaw.BuildBootstrapJSON(ctx, b.kubeClient, ah.Namespace, ah, mc, openclaw.OpenshellGatewayBootstrap(gwPort), openclaw.DefaultInferenceBaseURL) if err != nil { return fmt.Errorf("build openclaw config: %w", err) } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go index db2fdb373e..23d61fdbfb 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go @@ -11,9 +11,46 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// GatewayBootstrapConfig describes the gateway section of openclaw.json for a harness runtime. +type GatewayBootstrapConfig struct { + Port int + Bind string // loopback | lan + AuthMode string // none | token + Token string // required when AuthMode is token + ControlUI *ControlUIBootstrapConfig +} + +// ControlUIBootstrapConfig maps to gateway.controlUi in openclaw.json. +type ControlUIBootstrapConfig struct { + AllowedOrigins []string + DangerouslyDisableDeviceAuth bool +} + +// OpenshellGatewayBootstrap is the default gateway profile for OpenShell sandboxes. +func OpenshellGatewayBootstrap(port int) GatewayBootstrapConfig { + return GatewayBootstrapConfig{Port: port, Bind: "loopback", AuthMode: "none"} +} + +// SubstrateGatewayBootstrap is the gateway profile for Agent Substrate actors (port 80, token auth, proxied Control UI). 
+func SubstrateGatewayBootstrap(token string, port int) GatewayBootstrapConfig { + return GatewayBootstrapConfig{ + Port: port, + Bind: "lan", + AuthMode: "token", + Token: strings.TrimSpace(token), + ControlUI: &ControlUIBootstrapConfig{ + AllowedOrigins: []string{"*"}, + DangerouslyDisableDeviceAuth: true, + }, + } +} + // BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when // OpenClaw resolves openshell:resolve:env: (API key + channel tokens). -func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gwPort int) ([]byte, map[string]string, error) { +// +// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. +// OpenShell callers should pass DefaultInferenceBaseURL; Substrate should pass SubstrateBootstrapDefaultBaseURL. +func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { if mc == nil { return nil, nil, fmt.Errorf("ModelConfig is required") } @@ -37,7 +74,7 @@ func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace strin } providerRecord := GatewayProviderRecordName(mc.Spec.Provider) - doc := buildCoreBootstrapDocument(mc, gwPort, apiKeyEnv, providerRecord, modelID, apiAdapter) + doc := buildCoreBootstrapDocument(mc, gw, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Backend, sbx.Spec.Channels, env) if err != nil { @@ -54,29 +91,19 @@ func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace strin return raw, env, nil } -func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gwPort int, apiKeyEnv, providerRecord, modelID, apiAdapter string) bootstrapDocument { - baseURL := 
bootstrapProviderBaseURL(mc) - return bootstrapDocument{ - Gateway: gatewaySection{ - Mode: "local", - Bind: "loopback", - Auth: gatewayAuth{Mode: "none"}, - Port: gwPort, - }, - Models: modelsSection{ - Mode: "merge", - Providers: map[string]providerSettings{ - providerRecord: { - BaseURL: baseURL, - APIKey: openshellResolveEnv(apiKeyEnv), - Auth: providerAuth(mc), - API: apiAdapter, - Models: []modelSlot{ - {ID: modelID, Name: modelID}, - }, - }, - }, - }, +// BuildGatewayOnlyBootstrapJSON returns a minimal openclaw.json with gateway settings only (no models/channels). +func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { + doc := bootstrapDocument{Gateway: buildGatewaySection(gw)} + raw, err := json.Marshal(doc) + if err != nil { + return nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, nil +} + +func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { + doc := bootstrapDocument{ + Gateway: buildGatewaySection(gw), Agents: agentsSection{ Defaults: agentDefaults{ Model: defaultModelPick{ @@ -85,6 +112,76 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gwPort int, apiKeyEnv, }, }, } + + // Substrate: do not emit models.providers without baseUrl (OpenClaw rejects undefined baseUrl). + // Rely on agents.defaults + API key env unless the user set an explicit URL on ModelConfig. 
+ if defaultBaseURLWhenUnset == SubstrateBootstrapDefaultBaseURL { + if explicit := modelConfigExplicitBaseURL(mc); explicit != "" { + doc.Models = &modelsSection{ + Mode: "merge", + Providers: map[string]providerSettings{ + providerRecord: { + BaseURL: explicit, + APIKey: openshellResolveEnv(apiKeyEnv), + Auth: providerAuth(mc), + API: apiAdapter, + Models: []modelSlot{ + {ID: modelID, Name: modelID}, + }, + }, + }, + } + } + return doc + } + + baseURL := bootstrapProviderBaseURL(mc, defaultBaseURLWhenUnset) + doc.Models = &modelsSection{ + Mode: "merge", + Providers: map[string]providerSettings{ + providerRecord: { + BaseURL: baseURL, + APIKey: openshellResolveEnv(apiKeyEnv), + Auth: providerAuth(mc), + API: apiAdapter, + Models: []modelSlot{ + {ID: modelID, Name: modelID}, + }, + }, + }, + } + return doc +} + +func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { + port := gw.Port + if port <= 0 { + port = 18800 + } + bind := strings.TrimSpace(gw.Bind) + if bind == "" { + bind = "loopback" + } + authMode := strings.TrimSpace(gw.AuthMode) + if authMode == "" { + authMode = "none" + } + section := gatewaySection{ + Mode: "local", + Bind: bind, + Auth: gatewayAuth{Mode: authMode}, + Port: port, + } + if authMode == "token" { + section.Auth.Token = gw.Token + } + if gw.ControlUI != nil { + section.ControlUi = &controlUiSection{ + AllowedOrigins: gw.ControlUI.AllowedOrigins, + DangerouslyDisableDeviceAuth: gw.ControlUI.DangerouslyDisableDeviceAuth, + } + } + return section } func applySecretsAllowlist(doc *bootstrapDocument, env map[string]string) { diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go new file mode 100644 index 0000000000..4fd9ff2e72 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go @@ -0,0 +1,21 @@ +package openclaw_test + +import ( + "encoding/json" + "testing" + + 
"github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/stretchr/testify/require" +) + +func TestSubstrateGatewayBootstrap(t *testing.T) { + t.Parallel() + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80)) + require.NoError(t, err) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + cui := gw["controlUi"].(map[string]any) + require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go index 4dfa1a2633..3ffbd7c9ca 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go @@ -39,7 +39,7 @@ func TestBuildBootstrapJSON_OpenAIDefaultBaseURLInferenceLocal(t *testing.T) { sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, 18800) + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.OpenshellGatewayBootstrap(18800), openclaw.DefaultInferenceBaseURL) require.NoError(t, err) var root map[string]any @@ -56,6 +56,42 @@ func TestBuildBootstrapJSON_OpenAIDefaultBaseURLInferenceLocal(t *testing.T) { require.Contains(t, kagent["allowlist"], "OPENAI_API_KEY") } +func TestBuildBootstrapJSON_SubstrateOmitsModelsWhenNoExplicitBaseURL(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: 
map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80), openclaw.SubstrateBootstrapDefaultBaseURL) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + _, hasModels := root["models"] + require.False(t, hasModels) + agents := root["agents"].(map[string]any) + defaults := agents["defaults"].(map[string]any) + model := defaults["model"].(map[string]any) + require.Equal(t, "openai/gpt-4o", model["primary"]) +} + func TestBuildBootstrapJSON_OpenAIAndTelegram(t *testing.T) { scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -92,7 +128,7 @@ func TestBuildBootstrapJSON_OpenAIAndTelegram(t *testing.T) { } kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, env, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, 18800) + raw, env, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.OpenshellGatewayBootstrap(18800), openclaw.DefaultInferenceBaseURL) require.NoError(t, err) require.Equal(t, "sk-test", env["OPENAI_API_KEY"]) require.Equal(t, "telegram-bot-token", env["TELEGRAM_BOT_TOKEN_TG1"]) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go b/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go index dd0f98cdc8..e94d0789f1 100644 --- 
a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go @@ -7,6 +7,11 @@ const ( // bootstrapSecretProviderID is the secrets.providers key written into openclaw.json. bootstrapSecretProviderID = "kagent" - // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream. + // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream (OpenShell). DefaultInferenceBaseURL = "https://inference.local/v1" + + // SubstrateBootstrapDefaultBaseURL is passed to BuildBootstrapJSON for Substrate harnesses. + // When ModelConfig has no explicit provider URL, the models section is omitted entirely so + // OpenClaw is not given a partial providers.* block (baseUrl is required when present). + SubstrateBootstrapDefaultBaseURL = "" ) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go b/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go index 70a075a272..8c4183e0e9 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go @@ -7,7 +7,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" ) -func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig) string { +func modelConfigExplicitBaseURL(mc *v1alpha2.ModelConfig) string { switch mc.Spec.Provider { case v1alpha2.ModelProviderOpenAI: if mc.Spec.OpenAI != nil && strings.TrimSpace(mc.Spec.OpenAI.BaseURL) != "" { @@ -30,7 +30,14 @@ func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig) string { return strings.TrimSpace(mc.Spec.SAPAICore.BaseURL) } } - return DefaultInferenceBaseURL + return "" +} + +func bootstrapProviderBaseURL(mc *v1alpha2.ModelConfig, defaultWhenUnset string) string { + if u := modelConfigExplicitBaseURL(mc); u != "" { + return u + } + return defaultWhenUnset } func providerAuth(mc *v1alpha2.ModelConfig) string { diff --git 
a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go index da73347668..bf6dd73760 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go @@ -5,21 +5,28 @@ package openclaw type bootstrapDocument struct { Gateway gatewaySection `json:"gateway"` - Models modelsSection `json:"models"` + Models *modelsSection `json:"models,omitempty"` Agents agentsSection `json:"agents"` Channels *channelsConfig `json:"channels,omitempty"` Secrets secretsSection `json:"secrets"` } type gatewaySection struct { - Mode string `json:"mode"` - Bind string `json:"bind"` - Auth gatewayAuth `json:"auth"` - Port int `json:"port"` + Mode string `json:"mode"` + Bind string `json:"bind"` + Auth gatewayAuth `json:"auth"` + Port int `json:"port"` + ControlUi *controlUiSection `json:"controlUi,omitempty"` } type gatewayAuth struct { - Mode string `json:"mode"` + Mode string `json:"mode"` + Token string `json:"token,omitempty"` +} + +type controlUiSection struct { + AllowedOrigins []string `json:"allowedOrigins,omitempty"` + DangerouslyDisableDeviceAuth bool `json:"dangerouslyDisableDeviceAuth,omitempty"` } type modelsSection struct { @@ -28,7 +35,7 @@ type modelsSection struct { } type providerSettings struct { - BaseURL string `json:"baseUrl"` + BaseURL string `json:"baseUrl,omitempty"` APIKey string `json:"apiKey"` Auth string `json:"auth"` API string `json:"api"` diff --git a/go/core/pkg/sandboxbackend/substrate/client.go b/go/core/pkg/sandboxbackend/substrate/client.go new file mode 100644 index 0000000000..70291c7bb8 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/client.go @@ -0,0 +1,114 @@ +package substrate + +import ( + "context" + "crypto/tls" + "fmt" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "google.golang.org/grpc/credentials/insecure" +) + +// Client 
wraps ate-api Control gRPC. +type Client struct { + ateapipb.ControlClient + conn *grpc.ClientConn + cfg Config +} + +// Dial connects to the ate-api server. +func Dial(ctx context.Context, cfg Config) (*Client, error) { + if cfg.AteAPIEndpoint == "" { + return nil, fmt.Errorf("substrate: ate-api endpoint is required") + } + dialTimeout := cfg.DialTimeout + if dialTimeout <= 0 { + dialTimeout = 10 * time.Second + } + dialCtx, cancel := context.WithTimeout(ctx, dialTimeout) + defer cancel() + + var opts []grpc.DialOption + if cfg.Insecure { + opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) + } else { + opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}))) + } + + conn, err := grpc.NewClient(cfg.AteAPIEndpoint, opts...) + if err != nil { + return nil, fmt.Errorf("substrate: dial ate-api %q: %w", cfg.AteAPIEndpoint, err) + } + _ = dialCtx + + return &Client{ + ControlClient: ateapipb.NewControlClient(conn), + conn: conn, + cfg: cfg, + }, nil +} + +func (c *Client) Close() error { + if c.conn != nil { + return c.conn.Close() + } + return nil +} + +func (c *Client) callCtx(ctx context.Context) (context.Context, context.CancelFunc) { + if c.cfg.CallTimeout <= 0 { + return ctx, func() {} + } + return context.WithTimeout(ctx, c.cfg.CallTimeout) +} + +func (c *Client) GetActor(ctx context.Context, actorID string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.GetActor(ctx, &ateapipb.GetActorRequest{ActorId: actorID}) + if err != nil { + return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) CreateActor(ctx context.Context, actorID, tmplNS, tmplName string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.CreateActor(ctx, &ateapipb.CreateActorRequest{ + ActorId: actorID, + ActorTemplateNamespace: tmplNS, + ActorTemplateName: tmplName, + }) + if err != nil { 
+ return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) ResumeActor(ctx context.Context, actorID string) (*ateapipb.Actor, error) { + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.ResumeActor(ctx, &ateapipb.ResumeActorRequest{ActorId: actorID}) + if err != nil { + return nil, err + } + return resp.GetActor(), nil +} + +func (c *Client) SuspendActor(ctx context.Context, actorID string) error { + ctx, cancel := c.callCtx(ctx) + defer cancel() + _, err := c.ControlClient.SuspendActor(ctx, &ateapipb.SuspendActorRequest{ActorId: actorID}) + return err +} + +func (c *Client) DeleteActor(ctx context.Context, actorID string) error { + ctx, cancel := c.callCtx(ctx) + defer cancel() + _, err := c.ControlClient.DeleteActor(ctx, &ateapipb.DeleteActorRequest{ActorId: actorID}) + return err +} diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go new file mode 100644 index 0000000000..45b5cb2b48 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -0,0 +1,22 @@ +package substrate + +import "time" + +// Config holds connection settings for Agent Substrate ate-api. +type Config struct { + // AteAPIEndpoint is a gRPC target (e.g. dns:///api.ate-system.svc:443). + AteAPIEndpoint string + Insecure bool + DialTimeout time.Duration + CallTimeout time.Duration + + // DefaultActorTemplateNamespace/name is a legacy fallback when status/spec refs are unset. + DefaultActorTemplateNamespace string + DefaultActorTemplateName string + + // ProvisionDefaults configures auto-created WorkerPool/ActorTemplate resources. + ProvisionDefaults ProvisionDefaults + + // GatewayToken is the OpenClaw gateway Bearer token injected by the HTTP proxy. 
+ GatewayToken string +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor.go b/go/core/pkg/sandboxbackend/substrate/delete_actor.go new file mode 100644 index 0000000000..c7a36e8409 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor.go @@ -0,0 +1,127 @@ +package substrate + +import ( + "context" + "fmt" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +const ( + actorDeletePollInterval = 2 * time.Second + actorDeleteTimeout = 5 * time.Minute +) + +// deleteActorSequenced suspends the actor, waits until suspended, deletes it, and waits until gone. +func (c *Client) deleteActorSequenced(ctx context.Context, actorID string) error { + if actorID == "" { + return nil + } + deadline := time.Now().Add(actorDeleteTimeout) + + actor, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + + if err := c.ensureActorSuspended(ctx, actorID, actor.GetStatus(), deadline); err != nil { + return err + } + + if err := c.DeleteActor(ctx, actorID); err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + if status.Code(err) == codes.FailedPrecondition { + // ate-api requires STATUS_SUSPENDED; re-check and surface current status. 
+ actor, getErr := c.GetActor(ctx, actorID) + if getErr == nil { + return fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) + } + } + return fmt.Errorf("delete actor %q: %w", actorID, err) + } + + return c.waitForActorDeleted(ctx, actorID, deadline) +} + +func (c *Client) ensureActorSuspended(ctx context.Context, actorID string, st ateapipb.Actor_Status, deadline time.Time) error { + switch st { + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + return nil + case ateapipb.Actor_STATUS_SUSPENDING: + // Retry suspend periodically; stuck checkpoint may need manual worker pod deletion. + _ = c.SuspendActor(ctx, actorID) + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: + if err := c.SuspendActor(ctx, actorID); err != nil && status.Code(err) != codes.NotFound { + return fmt.Errorf("suspend actor %q: %w", actorID, err) + } + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + default: + // Best-effort suspend for unknown/intermediate states before delete. 
+ _ = c.SuspendActor(ctx, actorID) + return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + } +} + +func (c *Client) waitForActorStatus(ctx context.Context, actorID string, want ateapipb.Actor_Status, deadline time.Time) error { + for time.Now().Before(deadline) { + actor, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + if want == ateapipb.Actor_STATUS_UNSPECIFIED { + return nil + } + return fmt.Errorf("actor %q not found while waiting for %s", actorID, want) + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + if actor.GetStatus() == want { + return nil + } + if want == ateapipb.Actor_STATUS_SUSPENDED && actor.GetStatus() == ateapipb.Actor_STATUS_SUSPENDING { + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + continue + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for actor %q status %s", actorID, want) +} + +func (c *Client) waitForActorDeleted(ctx context.Context, actorID string, deadline time.Time) error { + for time.Now().Before(deadline) { + _, err := c.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) == codes.NotFound { + return nil + } + return fmt.Errorf("get actor %q: %w", actorID, err) + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for actor %q deletion", actorID) +} + +func sleepOrDone(ctx context.Context, d time.Duration) error { + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return ctx.Err() + case <-t.C: + return nil + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go new file mode 100644 index 0000000000..38b9bdae39 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go @@ -0,0 +1,18 @@ +package substrate + +import 
( + "testing" + "time" + + "github.com/agent-substrate/substrate/proto/ateapipb" +) + +func TestEnsureActorSuspendedAlreadySuspended(t *testing.T) { + t.Parallel() + c := &Client{} + deadline := time.Now().Add(time.Minute) + err := c.ensureActorSuspended(t.Context(), "ahr-test", ateapipb.Actor_STATUS_SUSPENDED, deadline) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go new file mode 100644 index 0000000000..0b74d786f7 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -0,0 +1,109 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + "time" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" +) + +const workerPoolDrainTimeout = 3 * time.Minute + +// Delete removes kagent-managed Substrate CRs after the harness actor has been removed. +// Order: golden snapshot actor (from ActorTemplate status), ActorTemplate, WorkerPool. 
+func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) error { + if ah == nil || ah.Annotations == nil { + return nil + } + if ah.Annotations[annotationManagedActorTemplate] == "true" { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + if err := p.deleteGoldenActor(ctx, key); err != nil { + return err + } + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err == nil { + if err := p.Client.Delete(ctx, &tmpl); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete ActorTemplate %s: %w", key, err) + } + } else if !apierrors.IsNotFound(err) { + return err + } + } + if ah.Annotations[annotationManagedWorkerPool] == "true" { + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err == nil { + if err := p.Client.Delete(ctx, &wp); err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("delete WorkerPool %s: %w", key, err) + } + } else if !apierrors.IsNotFound(err) { + return err + } + if err := p.waitForWorkerPoolDeploymentGone(ctx, key); err != nil { + return err + } + } + return nil +} + +func (p *Provisioner) deleteGoldenActor(ctx context.Context, tmplKey types.NamespacedName) error { + if p.Ate == nil || p.Client == nil { + return nil + } + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) + } + goldenID := strings.TrimSpace(tmpl.Status.GoldenActorID) + if goldenID == "" { + return nil + } + if err := p.Ate.deleteActorSequenced(ctx, goldenID); err != nil { + return fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) + } + return nil +} + +func workerPoolDeploymentName(wpName string) string { + return wpName + "-deployment" +} + +func (p 
*Provisioner) waitForWorkerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) error { + if p.Client == nil { + return nil + } + deployKey := types.NamespacedName{Namespace: wpKey.Namespace, Name: workerPoolDeploymentName(wpKey.Name)} + deadline := time.Now().Add(workerPoolDrainTimeout) + for time.Now().Before(deadline) { + var deploy appsv1.Deployment + err := p.Client.Get(ctx, deployKey, &deploy) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) + } + if deploy.DeletionTimestamp != nil { + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + continue + } + if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { + return nil + } + if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { + return err + } + } + return fmt.Errorf("timeout waiting for WorkerPool deployment %s to drain", deployKey) +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go new file mode 100644 index 0000000000..dc4b8e338d --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -0,0 +1,61 @@ +package substrate + +import ( + "context" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +type recordingActorDeleter struct { + deleted []string +} + +func (r *recordingActorDeleter) deleteActorSequenced(_ context.Context, actorID string) error { + r.deleted = append(r.deleted, actorID) + return nil +} + +func 
TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + ns := "kagent" + tmpl := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + Status: atev1alpha1.ActorTemplateStatus{ + GoldenActorID: "golden-actor-uuid", + Phase: atev1alpha1.PhaseReady, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Name: "peterj-claw", + Namespace: ns, + Annotations: map[string]string{ + annotationManagedActorTemplate: "true", + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmpl).Build() + rec := &recordingActorDeleter{} + p := &Provisioner{Client: kube, Ate: rec} + + require.NoError(t, p.Delete(context.Background(), ah)) + require.Equal(t, []string{"golden-actor-uuid"}, rec.deleted) + + var got atev1alpha1.ActorTemplate + require.Error(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go new file mode 100644 index 0000000000..08d5b7d7a3 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -0,0 +1,231 @@ +package substrate + +import ( + "context" + "fmt" + "regexp" + "strings" + + "github.com/agent-substrate/substrate/proto/ateapipb" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" +) + +const ( + defaultActorHostSuffix = "actors.resources.substrate.ate.dev" + defaultSubstrateGWPort = int32(80) + actorIDPrefix = "ahr" +) + +var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) + +// ClawBackend 
implements AsyncBackend for OpenClaw/NemoClaw on Agent Substrate. +type ClawBackend struct { + client *Client + cfg Config + backend v1alpha2.AgentHarnessBackendType + recorder record.EventRecorder +} + +var _ sandboxbackend.AsyncBackend = (*ClawBackend)(nil) + +// NewOpenClawBackend returns a substrate backend for openclaw/nemoclaw harness types. +func NewOpenClawBackend(client *Client, cfg Config, backend v1alpha2.AgentHarnessBackendType, recorder record.EventRecorder) *ClawBackend { + return &ClawBackend{ + client: client, + cfg: cfg, + backend: backend, + recorder: recorder, + } +} + +func (b *ClawBackend) Name() v1alpha2.AgentHarnessBackendType { + return b.backend +} + +func (b *ClawBackend) EnsureAgentHarness(ctx context.Context, ah *v1alpha2.AgentHarness) (sandboxbackend.EnsureResult, error) { + if ah == nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("AgentHarness is required") + } + if err := validateSubstrateSpec(ah); err != nil { + return sandboxbackend.EnsureResult{}, err + } + + actorID := ActorID(ah) + tmplNS, tmplName := actorTemplateRef(ah, b.cfg) + + actor, err := b.client.GetActor(ctx, actorID) + if err != nil { + if status.Code(err) != codes.NotFound { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate GetActor %q: %w", actorID, err) + } + actor, err = b.client.CreateActor(ctx, actorID, tmplNS, tmplName) + if err != nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate CreateActor %q: %w", actorID, err) + } + } + + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: + // already active or waking + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + actor, err = b.client.ResumeActor(ctx, actorID) + if err != nil { + return sandboxbackend.EnsureResult{}, fmt.Errorf("substrate ResumeActor %q: %w", actorID, err) + } + default: + // suspending — wait for next reconcile + } + + endpoint := substrateConnectionEndpoint(ah.Namespace, ah.Name, actor) + + 
return sandboxbackend.EnsureResult{ + Handle: sandboxbackend.Handle{ID: actorID}, + Endpoint: endpoint, + }, nil +} + +func (b *ClawBackend) GetStatus(ctx context.Context, h sandboxbackend.Handle) (metav1.ConditionStatus, string, string) { + if h.ID == "" { + return metav1.ConditionUnknown, "ActorHandleMissing", "no substrate actor id recorded yet" + } + actor, err := b.client.GetActor(ctx, h.ID) + if err != nil { + if status.Code(err) == codes.NotFound { + return metav1.ConditionUnknown, "ActorNotFound", fmt.Sprintf("substrate actor %q not found", h.ID) + } + return metav1.ConditionUnknown, "ActorGetFailed", err.Error() + } + return actorStatusToCondition(actor) +} + +func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) error { + if h.ID == "" { + return nil + } + if err := b.client.deleteActorSequenced(ctx, h.ID); err != nil { + return fmt.Errorf("substrate delete actor %q: %w", h.ID, err) + } + return nil +} + +func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { + // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time + // (see substrate.Provisioner.buildOpenClawActorStartup — same openclaw.BuildBootstrapJSON as OpenShell). + _ = ctx + _ = ah + _ = h + return nil +} + +// ActorID returns a stable DNS-1123 actor id for this harness. +func ActorID(ah *v1alpha2.AgentHarness) string { + raw := fmt.Sprintf("%s-%s-%s", actorIDPrefix, ah.Namespace, ah.Name) + raw = strings.ToLower(raw) + raw = strings.ReplaceAll(raw, "_", "-") + if len(raw) > 63 { + raw = raw[:63] + raw = strings.TrimRight(raw, "-") + } + if !dns1123Label.MatchString(raw) { + // fallback: hash-like trim + raw = fmt.Sprintf("%s-%s", actorIDPrefix, ah.UID) + if len(raw) > 63 { + raw = raw[:63] + } + } + return raw +} + +// ActorHost returns the atenet router Host header value for the actor. 
+func ActorHost(actorID string, suffix string) string { + if suffix == "" { + suffix = defaultActorHostSuffix + } + return actorID + "." + suffix +} + +func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { + if ah.Status.Substrate != nil && ah.Status.Substrate.ActorTemplateRef.Name != "" { + ref := ah.Status.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + return ns, ref.Name + } + if ah.Spec.Substrate != nil && ah.Spec.Substrate.ActorTemplateRef != nil { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + if ref.Name != "" { + return ns, ref.Name + } + } + // Auto-provisioned template in the harness namespace (also when status was not persisted yet). + if ah.Annotations != nil && ah.Annotations[AnnotationManagedActorTemplate] == "true" { + return ah.Namespace, actorTemplateName(ah) + } + if cfg.DefaultActorTemplateNamespace != "" && cfg.DefaultActorTemplateName != "" { + return cfg.DefaultActorTemplateNamespace, cfg.DefaultActorTemplateName + } + return ah.Namespace, actorTemplateName(ah) +} + +func substrateConnectionEndpoint(namespace, name string, actor *ateapipb.Actor) string { + gw := fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", namespace, name) + if actor == nil { + return "kagent gateway: " + gw + } + if podIP := strings.TrimSpace(actor.GetAteomPodIp()); podIP != "" { + return fmt.Sprintf("http://%s:80 (pod IP; UI via kagent %s)", podIP, gw) + } + return fmt.Sprintf("kagent gateway: %s (actor status %s)", gw, actor.GetStatus()) +} + +func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { + runtime := ah.Spec.Runtime + if runtime == "" { + runtime = v1alpha2.AgentHarnessRuntimeOpenshell + } + if runtime != v1alpha2.AgentHarnessRuntimeSubstrate { + return fmt.Errorf("substrate backend called for runtime %q", runtime) + } + if ah.Spec.Substrate == nil { + return fmt.Errorf("spec.substrate is required when runtime is 
substrate") + } + if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { + return nil + } + if strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location) == "" { + return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + } + return nil +} + +func actorStatusToCondition(actor *ateapipb.Actor) (metav1.ConditionStatus, string, string) { + if actor == nil { + return metav1.ConditionUnknown, "ActorMissing", "empty actor response" + } + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_RUNNING: + if ip := actor.GetAteomPodIp(); ip != "" { + return metav1.ConditionTrue, "ActorRunning", fmt.Sprintf("actor running on %s", ip) + } + return metav1.ConditionTrue, "ActorRunning", "actor is running" + case ateapipb.Actor_STATUS_RESUMING: + return metav1.ConditionFalse, "ActorResuming", "actor is resuming" + case ateapipb.Actor_STATUS_SUSPENDING: + return metav1.ConditionFalse, "ActorSuspending", "actor is suspending" + case ateapipb.Actor_STATUS_SUSPENDED: + return metav1.ConditionFalse, "ActorSuspended", "actor is suspended" + default: + return metav1.ConditionUnknown, "ActorStatusUnknown", actor.GetStatus().String() + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go new file mode 100644 index 0000000000..fa7c6c8d75 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go @@ -0,0 +1,53 @@ +package substrate + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestActorID(t *testing.T) { + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "kagent", + Name: "my-claw", + UID: "00000000-0000-0000-0000-000000000001", + }, + } + id := ActorID(ah) + if !dns1123Label.MatchString(id) { + t.Fatalf("ActorID %q is not DNS-1123", id) + } + if id == "" { + t.Fatal("expected 
non-empty actor id") + } +} + +func TestActorHost(t *testing.T) { + got := ActorHost("ahr-kagent-my-claw", "") + if got != "ahr-kagent-my-claw.actors.resources.substrate.ate.dev" { + t.Fatalf("ActorHost = %q", got) + } +} + +func TestActorTemplateRefManagedProvisioner(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "kagent", + Name: "peterj-claw", + Annotations: map[string]string{ + AnnotationManagedActorTemplate: "true", + }, + }, + } + ns, name := actorTemplateRef(ah, Config{ + DefaultActorTemplateNamespace: "ate-demo-openclaw", + DefaultActorTemplateName: "openclaw", + }) + if ns != "kagent" || name != "peterj-claw" { + t.Fatalf("got %s/%s, want kagent/peterj-claw", ns, name) + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go new file mode 100644 index 0000000000..d8a63de188 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -0,0 +1,301 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +const ( + AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" + AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" + + annotationManagedWorkerPool = AnnotationManagedWorkerPool + annotationManagedActorTemplate = AnnotationManagedActorTemplate + + defaultWorkerPoolReplicas = int32(2) + defaultOpenClawContainer = "openclaw" +) + +// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. 
+type ProvisionDefaults struct { + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + DefaultWorkloadImage string + GatewayToken string +} + +// ateActorDeleter removes actors from ate-api during harness teardown. +type ateActorDeleter interface { + deleteActorSequenced(ctx context.Context, actorID string) error +} + +// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. +type Provisioner struct { + Client client.Client + Defaults ProvisionDefaults + // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. + Ate ateActorDeleter +} + +// EnsureResult describes provisioned Substrate resources. +type EnsureResult struct { + WorkerPoolRef types.NamespacedName + ActorTemplateRef types.NamespacedName + ActorTemplateReady bool + ManagedWorkerPool bool + ManagedActorTemplate bool +} + +// Ensure creates or updates Substrate CRs and waits for ActorTemplate Ready. +func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { + if ah == nil || ah.Spec.Substrate == nil { + return EnsureResult{}, fmt.Errorf("spec.substrate is required") + } + if err := validateSubstrateProvisionSpec(ah); err != nil { + return EnsureResult{}, err + } + + // Legacy / advanced: user supplied an existing template. 
+ if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace + } + tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + ready, err := p.actorTemplateReady(ctx, tmplKey) + if err != nil { + return EnsureResult{}, err + } + return EnsureResult{ + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedActorTemplate: false, + }, nil + } + + wpKey, managedWP, err := p.ensureWorkerPool(ctx, ah) + if err != nil { + return EnsureResult{}, err + } + + tmplKey, err := p.ensureActorTemplate(ctx, ah, wpKey) + if err != nil { + return EnsureResult{}, err + } + + ready, err := p.actorTemplateReady(ctx, tmplKey) + if err != nil { + return EnsureResult{}, err + } + + _ = managedWP + return EnsureResult{ + WorkerPoolRef: wpKey, + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedWorkerPool: managedWP, + ManagedActorTemplate: true, + }, nil +} + +func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { + sub := ah.Spec.Substrate + if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { + return nil + } + loc := strings.TrimSpace(sub.SnapshotsConfig.Location) + if loc == "" { + return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + } + if !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") + } + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { + return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") + } + return nil +} + +func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { + sub := ah.Spec.Substrate + if sub.WorkerPoolRef != nil && 
strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { + ns := sub.WorkerPoolRef.Namespace + if ns == "" { + ns = ah.Namespace + } + key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) + } + return key, false, nil + } + + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + replicas := defaultWorkerPoolReplicas + ateomImage := "" + if sub.WorkerPool != nil { + if sub.WorkerPool.Replicas > 0 { + replicas = sub.WorkerPool.Replicas + } + ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) + } + if ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set spec.substrate.workerPool.ateomImage)") + } + + desired := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: replicas, + AteomImage: ateomImage, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) + } + + var existing atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) + } + return key, true, nil + } else if err != nil { + return types.NamespacedName{}, false, err + } + existing.Spec.Replicas = desired.Spec.Replicas + existing.Spec.AteomImage = desired.Spec.AteomImage + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) + } + return key, true, nil +} + +func (p *Provisioner) ensureActorTemplate(ctx 
context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) + if workloadImage == "" { + workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) + } + if workloadImage == "" { + workloadImage = openshell.NemoclawSandboxBaseImage + } + startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + if err != nil { + return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) + } + + desired := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.ActorTemplateSpec{ + PauseImage: p.Defaults.PauseImage, + Runsc: defaultRunscConfig(p.Defaults), + Containers: []atev1alpha1.Container{ + { + Name: defaultOpenClawContainer, + Image: workloadImage, + Ports: []corev1.ContainerPort{{ContainerPort: 80}}, + Command: []string{ + "/bin/sh", + "-c", + startupScript, + }, + Env: containerEnv, + }, + }, + WorkerPoolRef: corev1.ObjectReference{ + Name: wpKey.Name, + Namespace: wpKey.Namespace, + }, + SnapshotsConfig: atev1alpha1.SnapshotsConfig{ + Location: strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location), + }, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) + } + + var existing atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) + } + return key, nil + } else if err != nil { + return types.NamespacedName{}, err + } + existing.Spec = desired.Spec + if err := p.Client.Update(ctx, &existing); err != nil 
{ + return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) + } + return key, nil +} + +func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err != nil { + return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) + } + return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil +} + +func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { + return atev1alpha1.RunscConfig{ + AMD64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscAMD64URL, + SHA256Hash: d.RunscAMD64SHA256, + }, + ARM64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscARM64URL, + SHA256Hash: d.RunscARM64SHA256, + }, + } +} + +func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { + return map[string]string{ + "app.kubernetes.io/managed-by": "kagent", + "kagent.dev/agent-harness": ah.Name, + } +} + +func workerPoolName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name + "-wp") +} + +func actorTemplateName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name) +} + +func truncateDNS1123(s string) string { + s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) + if len(s) > 63 { + s = strings.TrimRight(s[:63], "-") + } + return s +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go new file mode 100644 index 0000000000..b2d53e405a --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -0,0 +1,88 @@ +package substrate + +import ( + "context" + "encoding/base64" + "fmt" + "sort" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/internal/utils" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + corev1 "k8s.io/api/core/v1" +) + +const defaultSubstrateOpenClawGatewayPort = 80 + +// 
buildOpenClawActorStartup returns the ateom workload startup script and container env for OpenClaw on Substrate. +// When spec.modelConfigRef is set, openclaw.json includes models/agents/channels like the OpenShell bootstrap path. +func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { + if ah == nil { + return "", nil, fmt.Errorf("AgentHarness is required") + } + if p.Client == nil { + return "", nil, fmt.Errorf("substrate provisioner kubernetes client is required") + } + + token := strings.TrimSpace(p.Defaults.GatewayToken) + gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) + + var jsonBytes []byte + var envMap map[string]string + + ref := strings.TrimSpace(ah.Spec.ModelConfigRef) + if ref != "" { + mcRef, parseErr := utils.ParseRefString(ref, ah.Namespace) + if parseErr != nil { + return "", nil, fmt.Errorf("parse modelConfigRef %q: %w", ref, parseErr) + } + mc := &v1alpha2.ModelConfig{} + if getErr := p.Client.Get(ctx, mcRef, mc); getErr != nil { + return "", nil, fmt.Errorf("get ModelConfig %s: %w", mcRef, getErr) + } + jsonBytes, envMap, err = openclaw.BuildBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw, openclaw.SubstrateBootstrapDefaultBaseURL) + if err != nil { + return "", nil, fmt.Errorf("build openclaw bootstrap json: %w", err) + } + } else { + jsonBytes, err = openclaw.BuildGatewayOnlyBootstrapJSON(gw) + if err != nil { + return "", nil, fmt.Errorf("build gateway-only openclaw json: %w", err) + } + envMap = map[string]string{} + } + + containerEnv := openClawEnvVars(envMap) + script = openClawStartupScript(jsonBytes, gw.Port) + return script, containerEnv, nil +} + +func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { + keys := make([]string, 0, len(envMap)) + for k := range envMap { + keys = append(keys, k) + } + sort.Strings(keys) + out := make([]corev1.EnvVar, 0, len(keys)+1) + for _, k := range keys { + 
out = append(out, corev1.EnvVar{Name: k, Value: envMap[k]}) + } + out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) + return out +} + +func openClawStartupScript(jsonBytes []byte, gwPort int) string { + b64 := base64.StdEncoding.EncodeToString(jsonBytes) + return strings.Join([]string{ + "set -e", + `mkdir -p "${HOME}/.openclaw"`, + fmt.Sprintf(`echo '%s' | base64 -d > "${HOME}/.openclaw/openclaw.json"`, b64), + fmt.Sprintf("openclaw gateway run --port %d --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 &", gwPort), + `for i in $(seq 1 60); do`, + ` curl -sf http://127.0.0.1:80/ >/dev/null 2>&1 && echo "gateway up" && break`, + " sleep 1", + "done", + "tail -f /tmp/openclaw-gateway.log /dev/null", + }, "\n") +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go new file mode 100644 index 0000000000..f4ac28d3f9 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -0,0 +1,153 @@ +package substrate + +import ( + "context" + "encoding/base64" + "encoding/json" + "strings" + "testing" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: 
"default-model-config", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + ModelConfigRef: "default-model-config", + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + p := &Provisioner{ + Client: kube, + Defaults: ProvisionDefaults{GatewayToken: "some-token"}, + } + + script, env, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + require.Contains(t, script, "base64 -d") + require.Contains(t, script, "openclaw gateway run --port 80") + + var foundKey bool + for _, e := range env { + if e.Name == "OPENAI_API_KEY" && e.Value == "sk-test" { + foundKey = true + } + } + require.True(t, foundKey, "expected OPENAI_API_KEY in container env") + + // Decode embedded JSON from the base64 line in the startup script. 
+ var payload string + for _, line := range strings.Split(script, "\n") { + if !strings.Contains(line, "base64 -d") { + continue + } + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + require.Greater(t, end, start) + payload = line[start:end] + break + } + require.NotEmpty(t, payload) + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + require.Equal(t, float64(80), gw["port"]) + auth := gw["auth"].(map[string]any) + require.Equal(t, "token", auth["mode"]) + require.Equal(t, "some-token", auth["token"]) + _, hasModels := root["models"] + require.False(t, hasModels, "substrate bootstrap should omit models unless ModelConfig sets an explicit baseUrl") + require.Contains(t, root, "agents") +} + +func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openai-key", Namespace: ns}, + Data: map[string][]byte{"OPENAI_API_KEY": []byte("sk-test")}, + } + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{BaseURL: "https://api.example/v1"}, + }, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + ModelConfigRef: "mc", + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + + kube := 
fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() + p := &Provisioner{Client: kube, Defaults: ProvisionDefaults{}} + script, _, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + + var payload string + for _, line := range strings.Split(script, "\n") { + if strings.Contains(line, "base64 -d") { + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + payload = line[start:end] + break + } + } + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + openai := root["models"].(map[string]any)["providers"].(map[string]any)["openai"].(map[string]any) + require.Equal(t, "https://api.example/v1", openai["baseUrl"]) +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go new file mode 100644 index 0000000000..4878d40a99 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -0,0 +1,47 @@ +package substrate + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestValidateSubstrateProvisionSpec(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + Location: "gs://bucket/prefix/", + }, + }, + }, + } + if err := validateSubstrateProvisionSpec(ah); err != nil { + t.Fatalf("expected valid: %v", err) + } + + ah.Spec.Substrate.SnapshotsConfig.Location = "s3://nope" + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error for non-gs location") + } + + ah.Spec.Substrate.SnapshotsConfig.Location = "gs://ok" + 
ah.Spec.Substrate.WorkerPoolRef = &v1alpha2.TypedReference{Name: "pool"} + ah.Spec.Substrate.WorkerPool = &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 2} + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error for workerPoolRef and workerPool together") + } +} + +func TestActorTemplateName(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} + if got := actorTemplateName(ah); got != "my-claw" { + t.Fatalf("got %q", got) + } +} diff --git a/go/go.mod b/go/go.mod index 94f2dd970e..5ff82a4418 100644 --- a/go/go.mod +++ b/go/go.mod @@ -61,6 +61,7 @@ require ( ) require ( + github.com/agent-substrate/substrate v0.0.0 github.com/aws/aws-sdk-go-v2 v1.41.7 github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.50.6 github.com/golang/protobuf v1.5.4 @@ -86,7 +87,7 @@ require ( cel.dev/expr v0.25.1 // indirect charm.land/lipgloss/v2 v2.0.3 // indirect cloud.google.com/go v0.123.0 // indirect - cloud.google.com/go/auth v0.18.2 // indirect + cloud.google.com/go/auth v0.19.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect codeberg.org/chavacava/garif v0.2.0 // indirect @@ -200,20 +201,20 @@ require ( github.com/go-critic/go-critic v0.14.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect - github.com/go-openapi/jsonpointer v0.22.1 // indirect - github.com/go-openapi/jsonreference v0.21.2 // indirect - github.com/go-openapi/swag v0.25.1 // indirect - github.com/go-openapi/swag/cmdutils v0.25.1 // indirect - github.com/go-openapi/swag/conv v0.25.1 // indirect - github.com/go-openapi/swag/fileutils v0.25.1 // indirect - github.com/go-openapi/swag/jsonname v0.25.1 // indirect - github.com/go-openapi/swag/jsonutils v0.25.1 // indirect - github.com/go-openapi/swag/loading v0.25.1 // indirect - github.com/go-openapi/swag/mangling v0.25.1 // indirect - 
github.com/go-openapi/swag/netutils v0.25.1 // indirect - github.com/go-openapi/swag/stringutils v0.25.1 // indirect - github.com/go-openapi/swag/typeutils v0.25.1 // indirect - github.com/go-openapi/swag/yamlutils v0.25.1 // indirect + github.com/go-openapi/jsonpointer v0.22.4 // indirect + github.com/go-openapi/jsonreference v0.21.4 // indirect + github.com/go-openapi/swag v0.25.4 // indirect + github.com/go-openapi/swag/cmdutils v0.25.4 // indirect + github.com/go-openapi/swag/conv v0.25.4 // indirect + github.com/go-openapi/swag/fileutils v0.25.4 // indirect + github.com/go-openapi/swag/jsonname v0.25.4 // indirect + github.com/go-openapi/swag/jsonutils v0.25.4 // indirect + github.com/go-openapi/swag/loading v0.25.4 // indirect + github.com/go-openapi/swag/mangling v0.25.4 // indirect + github.com/go-openapi/swag/netutils v0.25.4 // indirect + github.com/go-openapi/swag/stringutils v0.25.4 // indirect + github.com/go-openapi/swag/typeutils v0.25.4 // indirect + github.com/go-openapi/swag/yamlutils v0.25.4 // indirect github.com/go-toolsmith/astcast v1.1.0 // indirect github.com/go-toolsmith/astcopy v1.1.0 // indirect github.com/go-toolsmith/astequal v1.2.0 // indirect @@ -245,7 +246,7 @@ require ( github.com/google/s2a-go v0.1.9 // indirect github.com/google/safehtml v0.1.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect - github.com/googleapis/gax-go/v2 v2.18.0 // indirect + github.com/googleapis/gax-go/v2 v2.21.0 // indirect github.com/gordonklaus/ineffassign v0.2.0 // indirect github.com/gostaticanalysis/analysisutil v0.7.1 // indirect github.com/gostaticanalysis/comment v1.5.0 // indirect @@ -308,6 +309,7 @@ require ( github.com/moby/moby/api v1.54.1 // indirect github.com/moby/moby/client v0.4.0 // indirect github.com/moby/patternmatcher v0.6.1 // indirect + github.com/moby/spdystream v0.5.1 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect 
github.com/moby/sys/userns v0.1.0 // indirect @@ -397,7 +399,7 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/bridges/prometheus v0.68.0 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.42.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect @@ -423,7 +425,7 @@ require ( golang.org/x/time v0.15.0 // indirect golang.org/x/tools v0.45.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/api v0.272.0 // indirect + google.golang.org/api v0.274.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect diff --git a/go/go.sum b/go/go.sum index 03a15884dc..63b6d6f3e2 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ charm.land/lipgloss/v2 v2.0.3 h1:yM2zJ4Cf5Y51b7RHIwioil4ApI/aypFXXVHSwlM6RzU= charm.land/lipgloss/v2 v2.0.3/go.mod h1:7myLU9iG/3xluAWzpY/fSxYYHCgoKTie7laxk6ATwXA= cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE= cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU= -cloud.google.com/go/auth v0.18.2 h1:+Nbt5Ev0xEqxlNjd6c+yYUeosQ5TtEUaNcN/3FozlaM= -cloud.google.com/go/auth v0.18.2/go.mod h1:xD+oY7gcahcu7G2SG2DsBerfFxgPAJz17zz2joOFF3M= +cloud.google.com/go/auth v0.19.0 h1:DGYwtbcsGsT1ywuxsIoWi1u/vlks0moIblQHgSDgQkQ= +cloud.google.com/go/auth v0.19.0/go.mod h1:2Aph7BT2KnaSFOM0JDPyiYgNh6PL9vGMiP8CUIXZ+IY= cloud.google.com/go/auth/oauth2adapt v0.2.8 
h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= @@ -70,6 +70,8 @@ github.com/abiosoft/ishell/v2 v2.0.2 h1:5qVfGiQISaYM8TkbBl7RFO6MddABoXpATrsFbVI+ github.com/abiosoft/ishell/v2 v2.0.2/go.mod h1:E4oTCXfo6QjoCart0QYa5m9w4S+deXs/P/9jA77A9Bs= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db h1:CjPUSXOiYptLbTdr1RceuZgSFDQ7U15ITERUGrUORx8= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530= +github.com/agent-substrate/substrate v0.0.0 h1:XEX4QAjzaIcv4amBqBvPE/f40WV5WHRWo7u04xvqv/g= +github.com/agent-substrate/substrate v0.0.0/go.mod h1:8Z4SJqPWDMPBa76JgIdpiX0jTY1JXcfLTXEAtkUv7go= github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8vS6K3D0= github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k= github.com/alecthomas/chroma/v2 v2.24.1 h1:m5ffpfZbIb++k8AqFEKy9uVgY12xIQtBsQlc6DfZJQM= @@ -92,6 +94,8 @@ github.com/anthropics/anthropic-sdk-go v1.43.0 h1:ShY3C7lafzHP0ze1dCxL3ZFZzvkGfX github.com/anthropics/anthropic-sdk-go v1.43.0/go.mod h1:5cEaslQ6A9ajdL5YUvhNW57LKxEz0OAZ7WEzgZWLD7k= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= +github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/ashanbrown/forbidigo/v2 v2.3.1 h1:KAZijvQ7zeIBKbhikT4jCm0TLYXC4u78bTiLh/8JROI= github.com/ashanbrown/forbidigo/v2 v2.3.1/go.mod h1:2QDkLTzU6TV937eFROamXrW92M3paehdae4HCDCOZCM= github.com/ashanbrown/makezero/v2 v2.2.1 
h1:A7uU8dgB1PA9aelTxHMfHIQ8Qev8AB3JLxJUBUsejqM= @@ -308,36 +312,40 @@ github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= -github.com/go-openapi/jsonpointer v0.22.1 h1:sHYI1He3b9NqJ4wXLoJDKmUmHkWy/L7rtEo92JUxBNk= -github.com/go-openapi/jsonpointer v0.22.1/go.mod h1:pQT9OsLkfz1yWoMgYFy4x3U5GY5nUlsOn1qSBH5MkCM= -github.com/go-openapi/jsonreference v0.21.2 h1:Wxjda4M/BBQllegefXrY/9aq1fxBA8sI5M/lFU6tSWU= -github.com/go-openapi/jsonreference v0.21.2/go.mod h1:pp3PEjIsJ9CZDGCNOyXIQxsNuroxm8FAJ/+quA0yKzQ= -github.com/go-openapi/swag v0.25.1 h1:6uwVsx+/OuvFVPqfQmOOPsqTcm5/GkBhNwLqIR916n8= -github.com/go-openapi/swag v0.25.1/go.mod h1:bzONdGlT0fkStgGPd3bhZf1MnuPkf2YAys6h+jZipOo= -github.com/go-openapi/swag/cmdutils v0.25.1 h1:nDke3nAFDArAa631aitksFGj2omusks88GF1VwdYqPY= -github.com/go-openapi/swag/cmdutils v0.25.1/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= -github.com/go-openapi/swag/conv v0.25.1 h1:+9o8YUg6QuqqBM5X6rYL/p1dpWeZRhoIt9x7CCP+he0= -github.com/go-openapi/swag/conv v0.25.1/go.mod h1:Z1mFEGPfyIKPu0806khI3zF+/EUXde+fdeksUl2NiDs= -github.com/go-openapi/swag/fileutils v0.25.1 h1:rSRXapjQequt7kqalKXdcpIegIShhTPXx7yw0kek2uU= -github.com/go-openapi/swag/fileutils v0.25.1/go.mod h1:+NXtt5xNZZqmpIpjqcujqojGFek9/w55b3ecmOdtg8M= -github.com/go-openapi/swag/jsonname v0.25.1 h1:Sgx+qbwa4ej6AomWC6pEfXrA6uP2RkaNjA9BR8a1RJU= -github.com/go-openapi/swag/jsonname v0.25.1/go.mod h1:71Tekow6UOLBD3wS7XhdT98g5J5GR13NOTQ9/6Q11Zo= -github.com/go-openapi/swag/jsonutils v0.25.1 h1:AihLHaD0brrkJoMqEZOBNzTLnk81Kg9cWr+SPtxtgl8= -github.com/go-openapi/swag/jsonutils v0.25.1/go.mod h1:JpEkAjxQXpiaHmRO04N1zE4qbUEg3b7Udll7AMGTNOo= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1 
h1:DSQGcdB6G0N9c/KhtpYc71PzzGEIc/fZ1no35x4/XBY= -github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.1/go.mod h1:kjmweouyPwRUEYMSrbAidoLMGeJ5p6zdHi9BgZiqmsg= -github.com/go-openapi/swag/loading v0.25.1 h1:6OruqzjWoJyanZOim58iG2vj934TysYVptyaoXS24kw= -github.com/go-openapi/swag/loading v0.25.1/go.mod h1:xoIe2EG32NOYYbqxvXgPzne989bWvSNoWoyQVWEZicc= -github.com/go-openapi/swag/mangling v0.25.1 h1:XzILnLzhZPZNtmxKaz/2xIGPQsBsvmCjrJOWGNz/ync= -github.com/go-openapi/swag/mangling v0.25.1/go.mod h1:CdiMQ6pnfAgyQGSOIYnZkXvqhnnwOn997uXZMAd/7mQ= -github.com/go-openapi/swag/netutils v0.25.1 h1:2wFLYahe40tDUHfKT1GRC4rfa5T1B4GWZ+msEFA4Fl4= -github.com/go-openapi/swag/netutils v0.25.1/go.mod h1:CAkkvqnUJX8NV96tNhEQvKz8SQo2KF0f7LleiJwIeRE= -github.com/go-openapi/swag/stringutils v0.25.1 h1:Xasqgjvk30eUe8VKdmyzKtjkVjeiXx1Iz0zDfMNpPbw= -github.com/go-openapi/swag/stringutils v0.25.1/go.mod h1:JLdSAq5169HaiDUbTvArA2yQxmgn4D6h4A+4HqVvAYg= -github.com/go-openapi/swag/typeutils v0.25.1 h1:rD/9HsEQieewNt6/k+JBwkxuAHktFtH3I3ysiFZqukA= -github.com/go-openapi/swag/typeutils v0.25.1/go.mod h1:9McMC/oCdS4BKwk2shEB7x17P6HmMmA6dQRtAkSnNb8= -github.com/go-openapi/swag/yamlutils v0.25.1 h1:mry5ez8joJwzvMbaTGLhw8pXUnhDK91oSJLDPF1bmGk= -github.com/go-openapi/swag/yamlutils v0.25.1/go.mod h1:cm9ywbzncy3y6uPm/97ysW8+wZ09qsks+9RS8fLWKqg= +github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4= +github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80= +github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8= +github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4= +github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU= +github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ= +github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4= 
+github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0= +github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4= +github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU= +github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y= +github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk= +github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI= +github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag= +github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA= +github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo= +github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM= +github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s= +github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE= +github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48= +github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg= +github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0= +github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg= +github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8= +github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0= +github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw= +github.com/go-openapi/swag/typeutils 
v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE= +github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw= +github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4= +github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg= +github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls= +github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54= github.com/go-pg/pg/v10 v10.11.0 h1:CMKJqLgTrfpE/aOVeLdybezR2om071Vh38OLZjsyMI0= github.com/go-pg/pg/v10 v10.11.0/go.mod h1:4BpHRoxE61y4Onpof3x1a2SQvi9c+q1dJnrNdMjsroA= github.com/go-pg/zerochecker v0.2.0 h1:pp7f72c3DobMWOb2ErtZsnrPaSvHd2W4o9//8HtF4mU= @@ -434,8 +442,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8= github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg= -github.com/googleapis/gax-go/v2 v2.18.0 h1:jxP5Uuo3bxm3M6gGtV94P4lliVetoCB4Wk2x8QA86LI= -github.com/googleapis/gax-go/v2 v2.18.0/go.mod h1:uSzZN4a356eRG985CzJ3WfbFSpqkLTjsnhWGJR6EwrE= +github.com/googleapis/gax-go/v2 v2.21.0 h1:h45NjjzEO3faG9Lg/cFrBh2PgegVVgzqKzuZl/wMbiI= +github.com/googleapis/gax-go/v2 v2.21.0/go.mod h1:But/NJU6TnZsrLai/xBAQLLz+Hc7fHZJt/hsCz3Fih4= github.com/gordonklaus/ineffassign v0.2.0 h1:Uths4KnmwxNJNzq87fwQQDDnbNb7De00VOk9Nu0TySs= github.com/gordonklaus/ineffassign v0.2.0/go.mod h1:TIpymnagPSexySzs7F9FnO1XFTy8IT3a59vmZp5Y9Lw= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= @@ -603,6 +611,8 @@ github.com/moby/moby/client v0.4.0 
h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjI github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= +github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= +github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= @@ -879,8 +889,8 @@ go.opentelemetry.io/contrib/detectors/gcp v1.42.0 h1:kpt2PEJuOuqYkPcktfJqWWDjTEd go.opentelemetry.io/contrib/detectors/gcp v1.42.0/go.mod h1:W9zQ439utxymRrXsUOzZbFX4JhLxXU4+ZnCt8GG7yA8= go.opentelemetry.io/contrib/exporters/autoexport v0.68.0 h1:0D3GFvELGIwQGfC6agLsbrEYSGWZTRTxIXxcQUqrOuk= go.opentelemetry.io/contrib/exporters/autoexport v0.68.0/go.mod h1:DM2NV7Zb8CcGeVPt6glouY0FAiwZQ/iqgcWExhgWeN8= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0 h1:XmiuHzgJt067+a6kwyAzkhXooYVv3/TOw9cM2VfJgUM= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.65.0/go.mod h1:KDgtbWKTQs4bM+VPUr6WlL9m/WXcmkCcBlIzqxPGzmI= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0 h1:0Qx7VGBacMm9ZENQ7TnNObTYI4ShC+lHI16seduaxZo= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.68.0/go.mod h1:Sje3i3MjSPKTSPvVWCaL8ugBzJwik3u4smCjUeuupqg= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= go.opentelemetry.io/otel v1.43.0 
h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= @@ -1037,10 +1047,12 @@ gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= google.golang.org/adk v1.2.0 h1:MfQD1/GqPfIsFNBcozNykkjdqNIdCrPH/SNqKPZF/yM= google.golang.org/adk v1.2.0/go.mod h1:6QY5jQI7awU4WYtJqvyIkJQheCvqsGWweU6BX63USEc= -google.golang.org/api v0.272.0 h1:eLUQZGnAS3OHn31URRf9sAmRk3w2JjMx37d2k8AjJmA= -google.golang.org/api v0.272.0/go.mod h1:wKjowi5LNJc5qarNvDCvNQBn3rVK8nSy6jg2SwRwzIA= +google.golang.org/api v0.274.0 h1:aYhycS5QQCwxHLwfEHRRLf9yNsfvp1JadKKWBE54RFA= +google.golang.org/api v0.274.0/go.mod h1:JbAt7mF+XVmWu6xNP8/+CTiGH30ofmCmk9nM8d8fHew= google.golang.org/genai v1.57.0 h1:qTyG2ynz5dQy2jF4CvZdLHHVslhR0heMue+zM1a4GNM= google.golang.org/genai v1.57.0/go.mod h1:A3kkl0nyBjyFlNjgxIwKq70julKbIxpSxqKO5gw/gmk= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0= +google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:L43LFes82YgSonw6iTXTxXUX1OlULt4AQtkik4ULL/I= google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d h1:/aDRtSZJjyLQzm75d+a1wOJaqyKBMvIAfeQmoa3ORiI= google.golang.org/genproto/googleapis/api v0.0.0-20260406210006-6f92a3bedf2d/go.mod h1:etfGUgejTiadZAUaEP14NP97xi1RGeawqkjDARA/UOs= google.golang.org/genproto/googleapis/rpc v0.0.0-20260406210006-6f92a3bedf2d h1:wT2n40TBqFY6wiwazVK9/iTWbsQrgk5ZfCSVFLO9LQA= diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 308d7ba0f2..f82ff2a1d4 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -19,6 +19,9 @@ spec: scope: Namespaced versions: - additionalPrinterColumns: + - jsonPath: .spec.runtime + name: Runtime + type: string - jsonPath: .spec.backend name: Backend type: string @@ -511,6 
+514,106 @@ spec: type: string type: array type: object + runtime: + default: openshell + description: Runtime selects the harness provisioning stack. Defaults + to openshell when unset. + enum: + - openshell + - substrate + type: string + substrate: + description: Substrate is required when runtime is substrate. + properties: + actorTemplateRef: + description: |- + ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. + When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + gatewayPort: + default: 80 + description: GatewayPort is the port OpenClaw listens on inside + the actor (Substrate routes to :80 today). + format: int32 + type: integer + gatewayTokenSecretRef: + description: |- + GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. + When unset, the controller falls back to --substrate-gateway-token(-file). + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + snapshotsConfig: + description: SnapshotsConfig is required for auto-provisioned + templates (GCS gs:// location). + properties: + location: + description: |- + Location is the GCS URI prefix for golden and incremental snapshots. + Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + type: string + required: + - location + type: object + workerPool: + description: WorkerPool creates a dedicated WorkerPool in the + harness namespace when workerPoolRef is unset. + properties: + ateomImage: + description: |- + AteomImage is the ateom herder image (pullable registry ref, not ko://). + Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). 
+ type: string + replicas: + default: 2 + description: Replicas is the number of ateom worker pods. + Defaults to 2 when unset or zero. + format: int32 + type: integer + type: object + workerPoolRef: + description: |- + WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). + Mutually exclusive with workerPool. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workloadImage: + description: WorkloadImage overrides the default nemoclaw/openclaw + sandbox image in the ActorTemplate. + type: string + required: + - snapshotsConfig + type: object required: - backend type: object @@ -612,6 +715,44 @@ spec: observedGeneration: format: int64 type: integer + substrate: + description: Substrate records auto-provisioned Substrate CR references. + properties: + actorTemplateReady: + description: ActorTemplateReady is true when the template phase + is Ready (golden snapshot taken). + type: boolean + actorTemplateRef: + description: ActorTemplateRef is the ActorTemplate used when creating + the actor. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + workerPoolRef: + description: WorkerPoolRef is the WorkerPool used by the harness + ActorTemplate. + properties: + apiGroup: + type: string + kind: + type: string + name: + type: string + namespace: + type: string + required: + - name + type: object + type: object type: object type: object served: true diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index 9d85f1066e..4ffc8998c8 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -87,6 +87,32 @@ spec: {{- with .Values.controller.env }} {{- toYaml . 
| nindent 12 }} {{- end }} + {{- if .Values.controller.substrate.enabled }} + - name: SUBSTRATE_ATE_API_ENDPOINT + value: {{ .Values.controller.substrate.ateApiEndpoint | quote }} + {{- if .Values.controller.substrate.ateApiInsecure }} + - name: SUBSTRATE_ATE_API_INSECURE + value: "true" + {{- end }} + - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAMESPACE + value: {{ .Values.controller.substrate.defaultActorTemplateNamespace | quote }} + - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAME + value: {{ .Values.controller.substrate.defaultActorTemplateName | quote }} + {{- if .Values.controller.substrate.gatewayTokenSecretName }} + - name: SUBSTRATE_GATEWAY_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.controller.substrate.gatewayTokenSecretName | quote }} + key: token + {{- else if .Values.controller.substrate.gatewayToken }} + - name: SUBSTRATE_GATEWAY_TOKEN + value: {{ .Values.controller.substrate.gatewayToken | quote }} + {{- end }} + {{- with .Values.controller.substrate.pauseImage }} + - name: SUBSTRATE_PAUSE_IMAGE + value: {{ . | quote }} + {{- end }} + {{- end }} envFrom: - configMapRef: name: {{ include "kagent.fullname" . 
}}-controller diff --git a/helm/kagent/templates/rbac/getter-role.yaml b/helm/kagent/templates/rbac/getter-role.yaml index f0ed9614fe..cafe9d0f5c 100644 --- a/helm/kagent/templates/rbac/getter-role.yaml +++ b/helm/kagent/templates/rbac/getter-role.yaml @@ -53,6 +53,25 @@ - get - list - watch +- apiGroups: + - ate.dev + resources: + - workerpools + - actortemplates + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - ate.dev + resources: + - actortemplates/status + verbs: + - get - apiGroups: - "apps" resources: diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index ebd3f6987c..8014e967f0 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -223,12 +223,26 @@ controller: ports: port: 8083 targetPort: 8083 - # TODO: NEED TO MAKE SURE THESE GET RENDERED IN controller-deployment.yaml - # Extra controller env. Examples — OpenShell: - # env: - # - name: OPENSHELL_GRPC_ADDR - # value: "openshell.my-namespace.svc.cluster.local:8080" - env: [] + # Extra controller env (mapped to flags via SUBSTRATE_* / OPENSHELL_* env names). + # OpenShell AgentHarness: set OPENSHELL_GATEWAY_URL (or leave defaults below). + env: + # - name: OPENSHELL_GATEWAY_URL + # value: openshell.openshell.svc.cluster.local:8080 + # - name: OPENSHELL_INSECURE + # value: "true" + + # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. + # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool); users set + # spec.substrate.snapshotsConfig.location (gs://) and worker pool ref or create spec. + substrate: + enabled: false + ateApiEndpoint: "dns:///api.ate-system.svc:443" + ateApiInsecure: false + gatewayToken: "test-token" + gatewayTokenSecretName: "" + gatewayTokenSecretNamespace: "" + pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + envFrom: [] # Additional volumes on the output Deployment definition. 
@@ -657,7 +671,7 @@ oauth2-proxy: # Skip authentication for kagent's branded login page, health checks, and static assets # This allows unauthenticated users to see the landing page and k8s probes to work skip-auth-route: "^/(health|login)$" - skip-auth-regex: "^/(login|_next/static|_next/image|login-bg\\.(jpg|png|webp)|logo-.*\\.png|favicon\\.ico).*$" + skip-auth-regex: "^/(login|_next/static|_next/image|login-bg\\.(jpg|png|webp)|logo-.*\\.png|favicon\\.ico|api/agentharnesses/.*/gateway).*$" # Use custom templates that redirect to kagent's branded /login page custom-templates-dir: "/templates" diff --git a/ui/next.config.ts b/ui/next.config.ts index e816991448..cc63e385f6 100644 --- a/ui/next.config.ts +++ b/ui/next.config.ts @@ -1,7 +1,22 @@ import type { NextConfig } from "next"; +const controllerDevURL = + process.env.KAGENT_DEV_CONTROLLER_URL ?? "http://127.0.0.1:8083"; + const nextConfig: NextConfig = { output: "standalone", + // Proxy /api to the controller in local dev (next dev :8001 → controller :8083). + async rewrites() { + if (process.env.NODE_ENV === "production") { + return []; + } + return [ + { + source: "/api/:path*", + destination: `${controllerDevURL}/api/:path*`, + }, + ]; + }, logging: { fetches: { fullUrl: true, diff --git a/ui/src/app/openshell/OpenshellTerminalPage.tsx b/ui/src/app/openshell/OpenshellTerminalPage.tsx index ab7a77a618..e937186f38 100644 --- a/ui/src/app/openshell/OpenshellTerminalPage.tsx +++ b/ui/src/app/openshell/OpenshellTerminalPage.tsx @@ -50,9 +50,9 @@ export function OpenshellTerminalPage() { : undefined; const clawHarnessSession = searchParams.get("clawHarness") === "1"; const harnessTerminalSession = clawHarnessSession || harnessBackend === "hermes"; - const autoConnect = Boolean(gatewaySandboxName); const namespace = searchParams.get("ns")?.trim() ?? ""; const crName = searchParams.get("name")?.trim() ?? 
""; + const autoConnect = Boolean(gatewaySandboxName); const modelConfigRef = searchParams.get("modelConfigRef")?.trim() ?? ""; const [plainShellOnly, setPlainShellOnly] = useState(() => searchParams.get("plainShell") === "1"); /** Plain-shell mode the active SSH session was opened with (null when disconnected). */ @@ -63,7 +63,7 @@ export function OpenshellTerminalPage() { const [termError, setTermError] = useState(null); const [sessionActive, setSessionActive] = useState(false); - const [connecting, setConnecting] = useState(() => Boolean(autoConnect && gatewaySandboxName)); + const [connecting, setConnecting] = useState(() => Boolean(autoConnect)); const termHostRef = useRef(null); const termRef = useRef(null); @@ -118,25 +118,28 @@ export function OpenshellTerminalPage() { wsRef.current?.close(); }, []); - const connectTerminal = useCallback( - (gatewayName: string) => { + const connectTerminal = useCallback(() => { const term = termRef.current; if (!term) { setConnecting(false); return; } - const name = gatewayName.trim(); - if (!name) { - setTermError("Missing gateway sandbox name."); - return; - } setTermError(null); setConnecting(true); setSessionActive(false); wsRef.current?.close(); - const url = sandboxSshWebSocketURL(terminalApiBase()); + const name = gatewaySandboxName.trim(); + if (!name) { + setConnecting(false); + setTermError("Missing gateway sandbox name."); + return; + } + + const apiBase = terminalApiBase(); + const url = sandboxSshWebSocketURL(apiBase); + let ws: WebSocket; try { ws = new WebSocket(url); @@ -210,24 +213,22 @@ export function OpenshellTerminalPage() { } }; }, - [plainShellOnly, harnessBackend], + [plainShellOnly, harnessBackend, gatewaySandboxName], ); const restartSession = useCallback(() => { - const name = gatewaySandboxName.trim(); - if (!name) return; wsRef.current?.close(); - window.setTimeout(() => connectTerminal(name), 120); - }, [gatewaySandboxName, connectTerminal]); + window.setTimeout(() => connectTerminal(), 120); 
+ }, [connectTerminal]); useEffect(() => { - if (!autoConnect || !gatewaySandboxName) return; + if (!autoConnect) return; const t = window.setTimeout(() => { if (!termRef.current) return; - connectTerminal(gatewaySandboxName); + connectTerminal(); }, 400); return () => window.clearTimeout(t); - }, [autoConnect, gatewaySandboxName, connectTerminal]); + }, [autoConnect, connectTerminal]); const showReconnect = Boolean(gatewaySandboxName) && !sessionActive && !connecting; const plainShellPendingRestart = @@ -284,7 +285,7 @@ export function OpenshellTerminalPage() { ) : null}
{showReconnect ? ( - ) : null} @@ -304,8 +305,7 @@ export function OpenshellTerminalPage() { {!gatewaySandboxName ? (

- Open an OpenShell sandbox from the Agents list to start a terminal - session. + Open a harness from the Agents list to start a terminal session.

) : null} diff --git a/ui/src/components/AgentCard.tsx b/ui/src/components/AgentCard.tsx index 42289faf0d..7a17f89573 100644 --- a/ui/src/components/AgentCard.tsx +++ b/ui/src/components/AgentCard.tsx @@ -20,11 +20,17 @@ import { Brain, MoreHorizontal, Pencil, Terminal, Trash2 } from "lucide-react"; import { k8sRefUtils } from "@/lib/k8sUtils"; import { agentHarnessIcon, + agentHarnessRuntimeLabel, agentHarnessTypeLabel, getAgentHarnessBackend, + getAgentHarnessRuntime, isAgentHarness, } from "@/lib/agentHarness"; -import { isOpenshellSandboxRow, openshellTerminalHref } from "@/lib/openshellSandboxAgents"; +import { + isOpenshellSandboxRow, + isSubstrateHarnessRow, + openshellTerminalHref, +} from "@/lib/openshellSandboxAgents"; import { cn } from "@/lib/utils"; interface AgentCardProps { @@ -39,8 +45,10 @@ export function AgentCard({ agentResponse, onAgentsChanged }: AgentCardProps) { const [deleteOpen, setDeleteOpen] = useState(false); const sshSandbox = isOpenshellSandboxRow(agentResponse); + const substrateHarness = isSubstrateHarnessRow(agentResponse); const agentHarness = isAgentHarness(agentResponse); const harnessBackend = getAgentHarnessBackend(agentResponse); + const harnessRuntime = getAgentHarnessRuntime(agentResponse); const agentRef = k8sRefUtils.toRef( agent.metadata.namespace || '', @@ -89,7 +97,11 @@ export function AgentCard({ agentResponse, onAgentsChanged }: AgentCardProps) { {harnessBackend ? agentHarnessIcon(harnessBackend) : "🦞"} @@ -173,16 +185,19 @@ export function AgentCard({ agentResponse, onAgentsChanged }: AgentCardProps) { ); + const substrateGatewayPath = agentResponse.substrateAgentHarness?.gatewayUIPath; const chatHref = - sshSandbox && agentResponse.openshellAgentHarness - ? 
openshellTerminalHref({ - gatewaySandboxName: agentResponse.openshellAgentHarness.gatewaySandboxName, - namespace: agent.metadata.namespace, - crName: agent.metadata.name, - modelConfigRef: agentResponse.modelConfigRef, - harnessBackend: harnessBackend, - }) - : `/agents/${agent.metadata.namespace}/${agent.metadata.name}/chat`; + substrateHarness && substrateGatewayPath + ? substrateGatewayPath + : sshSandbox && agentResponse.openshellAgentHarness + ? openshellTerminalHref({ + gatewaySandboxName: agentResponse.openshellAgentHarness.gatewaySandboxName, + namespace: agent.metadata.namespace, + crName: agent.metadata.name, + modelConfigRef: agentResponse.modelConfigRef, + harnessBackend, + }) + : `/agents/${agent.metadata.namespace}/${agent.metadata.name}/chat`; return ( <> diff --git a/ui/src/components/AgentListView.tsx b/ui/src/components/AgentListView.tsx index db7d47a60d..0072d7cfc6 100644 --- a/ui/src/components/AgentListView.tsx +++ b/ui/src/components/AgentListView.tsx @@ -24,7 +24,11 @@ import { getAgentHarnessBackend, isAgentHarness, } from "@/lib/agentHarness"; -import { isOpenshellSandboxRow, openshellTerminalHref } from "@/lib/openshellSandboxAgents"; +import { + isOpenshellSandboxRow, + isSubstrateHarnessRow, + openshellTerminalHref, +} from "@/lib/openshellSandboxAgents"; interface AgentListViewProps { agentResponse: AgentResponse[]; @@ -222,6 +226,7 @@ function AgentListRow({ item, onAgentsChanged }: { item: AgentResponse; onAgents const [deleteOpen, setDeleteOpen] = useState(false); const sshSandbox = isOpenshellSandboxRow(item); + const substrateHarness = isSubstrateHarnessRow(item); const agentHarness = isAgentHarness(item); const harnessBackend = getAgentHarnessBackend(item); @@ -233,26 +238,38 @@ function AgentListRow({ item, onAgentsChanged }: { item: AgentResponse; onAgents const nTools = countAgentToolBindings(item); const nSkills = countSkills(agent); + const substrateGatewayPath = item.substrateAgentHarness?.gatewayUIPath; const 
gatewaySandboxName = item.openshellAgentHarness?.gatewaySandboxName; const chatPath = useMemo( () => - sshSandbox && gatewaySandboxName - ? openshellTerminalHref({ - gatewaySandboxName, - namespace, - crName: name, - modelConfigRef: item.modelConfigRef, - harnessBackend, - }) - : `/agents/${encodeURIComponent(namespace)}/${encodeURIComponent(name)}/chat`, - [sshSandbox, gatewaySandboxName, namespace, name, item.modelConfigRef, harnessBackend], + substrateHarness && substrateGatewayPath + ? substrateGatewayPath + : sshSandbox && gatewaySandboxName + ? openshellTerminalHref({ + gatewaySandboxName, + namespace, + crName: name, + modelConfigRef: item.modelConfigRef, + harnessBackend, + }) + : `/agents/${encodeURIComponent(namespace)}/${encodeURIComponent(name)}/chat`, + [ + substrateHarness, + substrateGatewayPath, + sshSandbox, + gatewaySandboxName, + namespace, + name, + item.modelConfigRef, + harnessBackend, + ], ); - const goChat = () => { + const goChat = useCallback(() => { if (isReady) { router.push(chatPath); } - }; + }, [isReady, router, chatPath]); const handleEdit = (e: React.MouseEvent) => { e.preventDefault(); diff --git a/ui/src/components/agent-form/OpenClawSandboxFields.tsx b/ui/src/components/agent-form/OpenClawSandboxFields.tsx index 8af459e329..54e7cc6cf4 100644 --- a/ui/src/components/agent-form/OpenClawSandboxFields.tsx +++ b/ui/src/components/agent-form/OpenClawSandboxFields.tsx @@ -162,6 +162,97 @@ export function OpenClawSandboxFields({ {section === "general" ? validationError?.message : null} + + + Control plane + + + {value.runtime === "substrate" ? ( +
+ + Snapshot location (GCS) + set({ substrateSnapshotsLocation: e.target.value })} + /> +

+ Substrate stores golden and incremental snapshots at this gs:// prefix (GCS only today). +

+
+ + Worker pool + + + {value.substrateWorkerPoolMode === "existing" ? ( +
+ + WorkerPool namespace + set({ substrateWorkerPoolRefNamespace: e.target.value })} + /> + + + WorkerPool name + set({ substrateWorkerPoolRefName: e.target.value })} + /> + +
+ ) : ( + + Worker replicas + set({ substrateWorkerPoolReplicas: e.target.value })} + /> + + )} +
+ ) : null} +
+ b === value); } +export function getAgentHarnessRuntime(item: AgentResponse): "openshell" | "substrate" | undefined { + if (!isHarnessListRow(item)) { + return undefined; + } + if (isSubstrateHarnessRow(item)) { + return "substrate"; + } + return "openshell"; +} + /** * When this agent row represents an agent harness, returns the AgentHarness CR backend discriminator (e.g. openclaw vs nemoclaw). * Use {@link isAgentHarness} for a simple boolean check. */ export function getAgentHarnessBackend(item: AgentResponse): AgentHarnessBackend | undefined { - if (!isOpenshellSandboxRow(item)) { + if (!isHarnessListRow(item)) { return undefined; } - const backend = item.openshellAgentHarness?.backend; + const backend = + item.substrateAgentHarness?.backend ?? item.openshellAgentHarness?.backend; return isAgentHarnessBackend(backend) ? backend : undefined; } -/** True when the agents-list row is an agent harness (OpenShell sandbox whose backend is a known harness runtime). */ +/** True when the agents-list row is an agent harness. */ export function isAgentHarness(item: AgentResponse): boolean { return getAgentHarnessBackend(item) !== undefined; } @@ -80,3 +91,7 @@ export function agentHarnessTypeLabel(backend: AgentHarnessBackend): string { } } } + +export function agentHarnessRuntimeLabel(runtime: "openshell" | "substrate"): string { + return runtime === "substrate" ? "Substrate" : "OpenShell"; +} diff --git a/ui/src/lib/openClawSandboxForm.ts b/ui/src/lib/openClawSandboxForm.ts index 50b0c83cac..46608384f1 100644 --- a/ui/src/lib/openClawSandboxForm.ts +++ b/ui/src/lib/openClawSandboxForm.ts @@ -65,7 +65,18 @@ export function isClawHarnessBackend(backend: AgentHarnessSandboxBackend | undef return backend === "openclaw" || backend === "nemoclaw"; } +export type HarnessRuntimeForm = "openshell" | "substrate"; + export interface OpenClawSandboxFormSlice { + /** Harness control plane: OpenShell (default) or Agent Substrate. 
*/ + runtime: HarnessRuntimeForm; + /** Use an existing Substrate WorkerPool or let kagent create one per harness. */ + substrateWorkerPoolMode: "create" | "existing"; + substrateWorkerPoolRefNamespace: string; + substrateWorkerPoolRefName: string; + substrateWorkerPoolReplicas: string; + /** GCS snapshot prefix (gs://bucket/path/) — required for auto-provisioned templates. */ + substrateSnapshotsLocation: string; /** Optional override for Sandbox.spec.image (OpenShell VM template image). Empty → controller default. */ image: string; channels: OpenClawChannelRow[]; @@ -80,6 +91,12 @@ export interface OpenClawSandboxFormSlice { export function defaultOpenClawSandboxFormSlice(): OpenClawSandboxFormSlice { return { + runtime: "openshell", + substrateWorkerPoolMode: "create", + substrateWorkerPoolRefNamespace: "", + substrateWorkerPoolRefName: "", + substrateWorkerPoolReplicas: "2", + substrateSnapshotsLocation: "gs://ate-snapshots/kagent/", image: "", channels: [], allowedDomains: "", @@ -361,11 +378,40 @@ export function buildSandboxCRDraft(args: { } const backend = resolveSandboxBackend(args.backend); + const runtime = args.openClaw.runtime?.trim() || "openshell"; + const spec: Record = { backend, + runtime, modelConfigRef, }; + if (runtime === "substrate") { + const snapshots = args.openClaw.substrateSnapshotsLocation?.trim(); + if (!snapshots) { + return { error: "Substrate snapshots location (gs://…) is required." }; + } + const substrate: Record = { + snapshotsConfig: { location: snapshots }, + }; + if (args.openClaw.substrateWorkerPoolMode === "existing") { + const wpName = args.openClaw.substrateWorkerPoolRefName?.trim(); + if (!wpName) { + return { error: "WorkerPool name is required when using an existing pool." 
}; + } + substrate.workerPoolRef = { + name: wpName, + namespace: args.openClaw.substrateWorkerPoolRefNamespace?.trim() || args.namespace.trim(), + }; + } else { + const replicas = Number.parseInt(args.openClaw.substrateWorkerPoolReplicas?.trim() || "2", 10); + substrate.workerPool = { + replicas: Number.isFinite(replicas) && replicas > 0 ? replicas : 2, + }; + } + spec.substrate = substrate; + } + const desc = args.description.trim(); if (desc) { spec.description = desc; diff --git a/ui/src/lib/openshellSandboxAgents.ts b/ui/src/lib/openshellSandboxAgents.ts index 64c7e45d9d..64f79770c2 100644 --- a/ui/src/lib/openshellSandboxAgents.ts +++ b/ui/src/lib/openshellSandboxAgents.ts @@ -5,6 +5,14 @@ export function isOpenshellSandboxRow(item: AgentResponse): boolean { return Boolean(item.openshellAgentHarness?.gatewaySandboxName); } +export function isSubstrateHarnessRow(item: AgentResponse): boolean { + return Boolean(item.substrateAgentHarness?.gatewayUIPath); +} + +export function isHarnessListRow(item: AgentResponse): boolean { + return isOpenshellSandboxRow(item) || isSubstrateHarnessRow(item); +} + export type OpenshellTerminalLinkParams = { gatewaySandboxName: string; namespace?: string; diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index b4a441ce8e..7f50f04e5b 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -427,6 +427,18 @@ export interface OpenshellAgentHarnessListEntry { endpoint?: string; } +/** Merged into GET /api/agents when AgentHarness.spec.runtime is substrate. */ +export interface SubstrateAgentHarnessListEntry { + backend: string; + runtime: "substrate"; + actorId?: string; + /** Same-origin path for OpenClaw UI (HTTP + WebSocket via kagent proxy to actor pod IP). 
*/ + gatewayUIPath?: string; + modelConfigRef?: string; + backendRefId?: string; + endpoint?: string; +} + export interface AgentResponse { id: number | string; agent: Agent; @@ -438,6 +450,7 @@ export interface AgentResponse { accepted: boolean; workloadMode?: "deployment" | "sandbox"; openshellAgentHarness?: OpenshellAgentHarnessListEntry; + substrateAgentHarness?: SubstrateAgentHarnessListEntry; } export interface RemoteMCPServer { From 8fcd1f034039119d9b498d9f792a384680aa5a04 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 26 May 2026 10:58:54 -0700 Subject: [PATCH 14/32] fix up the optional/non-optional types in the crd/values file Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 92 +++++++++++++++++-- .../crd/bases/kagent.dev_agentharnesses.yaml | 26 ++++-- go/api/v1alpha2/agentharness_types.go | 26 ++++-- go/api/v1alpha2/zz_generated.deepcopy.go | 6 +- .../handlers/agentharness_gateway.go | 48 +--------- go/core/pkg/app/app.go | 56 +++-------- .../pkg/sandboxbackend/substrate/config.go | 3 - .../sandboxbackend/substrate/gateway_token.go | 65 +++++++++++++ .../pkg/sandboxbackend/substrate/openclaw.go | 11 ++- .../pkg/sandboxbackend/substrate/provision.go | 36 ++++++-- .../substrate/provision_openclaw.go | 5 +- .../substrate/provision_openclaw_test.go | 85 ++++++++++++++++- .../substrate/provision_test.go | 84 ++++++++++++++++- .../templates/kagent.dev_agentharnesses.yaml | 26 ++++-- .../templates/controller-deployment.yaml | 30 ++++-- helm/kagent/values.yaml | 14 +-- 16 files changed, 454 insertions(+), 159 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/gateway_token.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index 1b27550895..eebeb17e29 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -42,14 +42,23 @@ Install kagent (Substrate must already be running in the cluster): ```bash export KIND_CLUSTER_NAME=kind 
-make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true" +make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" ``` -Create a harness with only what you must choose: +The generated `ActorTemplate` uses `controller.substrate.pauseImage`, +`controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, +`controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` +from the Helm values. Override them with `--set` or a values file when you need +to pin a different gVisor build. -- **`snapshotsConfig.location`** — GCS `gs://` prefix (Substrate snapshots are GCS-only today) -- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool` + **`ateomImage`**) -- **`workerPool.ateomImage`** — (`localhost:5001/ateom-gvisor:latest`) +Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to +`gs://ate-snapshots/<namespace>/<name>`. If Helm sets +`controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can +be omitted unless you want to override it. 
+ +- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool`) +- **`workerPool.ateomImage`** — optional override for the Helm/controller default (`localhost:5001/ateom-gvisor:latest`) +- **Gateway token** — required per harness with either `gatewayToken` or `gatewayTokenSecretRef` ```yaml apiVersion: kagent.dev/v1alpha2 @@ -63,15 +72,78 @@ spec: description: OpenClaw on Agent Substrate modelConfigRef: default-model-config substrate: - snapshotsConfig: - location: gs://ate-snapshots/kagent/kagent/my-claw/ - workerPool: - replicas: 1 - ateomImage: localhost:5001/ateom-gvisor:latest + # Optional: defaults to gs://ate-snapshots/kagent/peterj-claw + # snapshotsConfig: + # location: gs://ate-snapshots/kagent/peterj-claw + + # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. + # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. + # workerPool: + # replicas: 1 + # ateomImage: localhost:5001/ateom-gvisor:latest + + # Required: configure the OpenClaw gateway token for this harness. + # Use either gatewayToken or gatewayTokenSecretRef. The Secret must contain key "token". + gatewayToken: test-token + # gatewayTokenSecretRef: + # name: openclaw-gateway-token + # namespace: kagent + + # Optional: override the sandbox image used in the ActorTemplate. 
+ # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + # Optional: adopt existing resources instead of auto-create # workerPoolRef: # name: my-pool # namespace: ate-system + # actorTemplateRef: + # name: my-template + # namespace: ate-system +``` + +When `actorTemplateRef` is not set, kagent creates an `ActorTemplate` that looks roughly like this: + +```yaml +apiVersion: ate.dev/v1alpha1 +kind: ActorTemplate +metadata: + name: peterj-claw + namespace: kagent + labels: + app.kubernetes.io/managed-by: kagent + kagent.dev/agent-harness: peterj-claw +spec: + pauseImage: gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da + runsc: + amd64: + url: gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc + sha256Hash: a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63 + arm64: + url: gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc + sha256Hash: 1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9 + workerPoolRef: + name: peterj-claw-wp + namespace: kagent + snapshotsConfig: + location: gs://ate-snapshots/kagent/peterj-claw + containers: + - name: openclaw + image: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + ports: + - containerPort: 80 + command: + - /bin/sh + - -c + - | + # Generated by kagent: + # 1. writes ~/.openclaw/openclaw.json from modelConfigRef/channels/gateway token + # 2. starts `openclaw gateway run --port 80 --allow-unconfigured` + # 3. waits for the gateway and tails the log + env: + - name: HOME + value: /root ``` +The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. 
+ Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as a gateway token to OpenClaw. diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index f82ff2a1d4..2c2f18ff71 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -547,10 +547,16 @@ spec: the actor (Substrate routes to :80 today). format: int32 type: integer + gatewayToken: + description: |- + GatewayToken is the OpenClaw gateway Bearer token for this harness. + Prefer gatewayTokenSecretRef for production secrets. + minLength: 1 + type: string gatewayTokenSecretRef: description: |- GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - When unset, the controller falls back to --substrate-gateway-token(-file). + The Secret must contain a "token" key. properties: apiGroup: type: string @@ -564,8 +570,9 @@ spec: - name type: object snapshotsConfig: - description: SnapshotsConfig is required for auto-provisioned - templates (GCS gs:// location). + description: |- + SnapshotsConfig configures actor memory snapshots. Defaults to + gs://ate-snapshots// when unset. properties: location: description: |- @@ -582,12 +589,12 @@ spec: ateomImage: description: |- AteomImage is the ateom herder image (pullable registry ref, not ko://). - Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + Overrides the controller-wide substrate ateom image default for this WorkerPool. type: string replicas: - default: 2 + default: 1 description: Replicas is the number of ateom worker pods. - Defaults to 2 when unset or zero. + Defaults to 1 when unset or zero. format: int32 type: integer type: object @@ -611,9 +618,12 @@ spec: description: WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. 
type: string - required: - - snapshotsConfig type: object + x-kubernetes-validations: + - message: Exactly one of gatewayToken or gatewayTokenSecretRef must + be specified + rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) + || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) required: - backend type: object diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index 26c0109069..44902e1ffb 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -42,8 +42,8 @@ func IsKnownAgentHarnessBackend(b AgentHarnessBackendType) bool { type AgentHarnessRuntime string const ( - AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" - AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" + AgentHarnessRuntimeOpenshell AgentHarnessRuntime = "openshell" + AgentHarnessRuntimeSubstrate AgentHarnessRuntime = "substrate" ) // AgentHarnessSubstrateSnapshotsConfig points at a GCS prefix for actor memory snapshots. @@ -58,13 +58,13 @@ type AgentHarnessSubstrateSnapshotsConfig struct { // AgentHarnessSubstrateWorkerPoolSpec creates a dedicated WorkerPool for this harness. // Mutually exclusive with workerPoolRef. type AgentHarnessSubstrateWorkerPoolSpec struct { - // Replicas is the number of ateom worker pods. Defaults to 2 when unset or zero. + // Replicas is the number of ateom worker pods. Defaults to 1 when unset or zero. // +optional - // +kubebuilder:default=2 + // +kubebuilder:default=1 Replicas int32 `json:"replicas,omitempty"` // AteomImage is the ateom herder image (pullable registry ref, not ko://). - // Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + // Overrides the controller-wide substrate ateom image default for this WorkerPool. 
// +optional AteomImage string `json:"ateomImage,omitempty"` } @@ -73,6 +73,7 @@ type AgentHarnessSubstrateWorkerPoolSpec struct { // // By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). // Set actorTemplateRef only to adopt an existing template (advanced / legacy). +// +kubebuilder:validation:XValidation:rule="(has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef))",message="Exactly one of gatewayToken or gatewayTokenSecretRef must be specified" type AgentHarnessSubstrateSpec struct { // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). // Mutually exclusive with workerPool. @@ -83,9 +84,10 @@ type AgentHarnessSubstrateSpec struct { // +optional WorkerPool *AgentHarnessSubstrateWorkerPoolSpec `json:"workerPool,omitempty"` - // SnapshotsConfig is required for auto-provisioned templates (GCS gs:// location). - // +required - SnapshotsConfig AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig"` + // SnapshotsConfig configures actor memory snapshots. Defaults to + // gs://ate-snapshots// when unset. + // +optional + SnapshotsConfig *AgentHarnessSubstrateSnapshotsConfig `json:"snapshotsConfig,omitempty"` // WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. // +optional @@ -101,8 +103,14 @@ type AgentHarnessSubstrateSpec struct { // +kubebuilder:default=80 GatewayPort int32 `json:"gatewayPort,omitempty"` + // GatewayToken is the OpenClaw gateway Bearer token for this harness. + // Prefer gatewayTokenSecretRef for production secrets. + // +optional + // +kubebuilder:validation:MinLength=1 + GatewayToken string `json:"gatewayToken,omitempty"` + // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - // When unset, the controller falls back to --substrate-gateway-token(-file). + // The Secret must contain a "token" key. 
// +optional GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` } diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index dd1b350ccb..6acf8938f6 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -411,7 +411,11 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec *out = new(AgentHarnessSubstrateWorkerPoolSpec) **out = **in } - out.SnapshotsConfig = in.SnapshotsConfig + if in.SnapshotsConfig != nil { + in, out := &in.SnapshotsConfig, &out.SnapshotsConfig + *out = new(AgentHarnessSubstrateSnapshotsConfig) + **out = **in + } if in.ActorTemplateRef != nil { in, out := &in.ActorTemplateRef, &out.ActorTemplateRef *out = new(TypedReference) diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 453ec5d907..551a8b8981 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -10,21 +10,18 @@ import ( "net/http" "net/http/httputil" "net/url" - "os" "strings" "time" "github.com/gorilla/mux" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" ) const ( - substrateGatewayTokenSecretKey = "token" // OpenClaw 2026.3.28+ returns 403 without operator scopes on HTTP/WS when only Bearer token is sent. openclawDefaultOperatorScopes = "operator.admin" // Origin OpenClaw accepts by default for bind=lan port=80 (localhost/127.0.0.1 on gateway port). @@ -34,26 +31,10 @@ const ( // AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. 
// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). type AgentHarnessGatewayConfig struct { - GatewayToken string - GatewayTokenFile string - AteAPIEndpoint string - AteAPIInsecure bool - DialTimeout time.Duration - CallTimeout time.Duration -} - -func (c *AgentHarnessGatewayConfig) resolveToken() (string, error) { - if c == nil { - return "", nil - } - if c.GatewayTokenFile != "" { - data, err := os.ReadFile(c.GatewayTokenFile) - if err != nil { - return "", fmt.Errorf("read substrate gateway token file: %w", err) - } - return strings.TrimSpace(string(data)), nil - } - return strings.TrimSpace(c.GatewayToken), nil + AteAPIEndpoint string + AteAPIInsecure bool + DialTimeout time.Duration + CallTimeout time.Duration } // HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway (pod IP when available). @@ -342,24 +323,5 @@ func readGatewayResponseBody(resp *http.Response) ([]byte, error) { } func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { - if ah.Spec.Substrate != nil && ah.Spec.Substrate.GatewayTokenSecretRef != nil { - ref := ah.Spec.Substrate.GatewayTokenSecretRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - var secret corev1.Secret - if err := h.KubeClient.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { - return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) - } - if secret.Data == nil { - return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) - } - val, ok := secret.Data[substrateGatewayTokenSecretKey] - if !ok { - return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, substrateGatewayTokenSecretKey) - } - return strings.TrimSpace(string(val)), nil - } - return h.AgentHarnessGateway.resolveToken() + return substrate.ResolveGatewayToken(ctx, h.KubeClient, ah) } diff --git a/go/core/pkg/app/app.go 
b/go/core/pkg/app/app.go index 7885292985..2389660463 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -53,13 +53,13 @@ import ( // to ensure that exec-entrypoint and run can make use of them. _ "k8s.io/client-go/plugin/pkg/client/auth" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" dbpkg "github.com/kagent-dev/kagent/go/api/database" "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" "k8s.io/apimachinery/pkg/runtime" @@ -159,13 +159,12 @@ type Config struct { CallTimeout time.Duration DefaultActorTemplateNamespace string DefaultActorTemplateName string - GatewayToken string - GatewayTokenFile string PauseImage string RunscAMD64URL string RunscAMD64SHA256 string RunscARM64URL string RunscARM64SHA256 string + AteomImage string } } @@ -232,13 +231,12 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateNamespace, "substrate-default-actor-template-namespace", "", "Legacy fallback ActorTemplate namespace when adopting an external template (set spec.substrate.actorTemplateRef instead).") commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateName, "substrate-default-actor-template-name", "", "Legacy fallback ActorTemplate name when adopting an external template (set spec.substrate.actorTemplateRef instead).") - commandLine.StringVar(&cfg.Substrate.GatewayToken, "substrate-gateway-token", "", "OpenClaw gateway 
Bearer token for substrate proxy. Prefer --substrate-gateway-token-file.") - commandLine.StringVar(&cfg.Substrate.GatewayTokenFile, "substrate-gateway-token-file", "", "File containing OpenClaw gateway Bearer token for substrate harness proxy.") commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for auto-provisioned ActorTemplates.") commandLine.StringVar(&cfg.Substrate.RunscAMD64URL, "substrate-runsc-amd64-url", "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc", "gVisor runsc URL for amd64.") commandLine.StringVar(&cfg.Substrate.RunscAMD64SHA256, "substrate-runsc-amd64-sha256", "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63", "gVisor runsc sha256 for amd64.") commandLine.StringVar(&cfg.Substrate.RunscARM64URL, "substrate-runsc-arm64-url", "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc", "gVisor runsc URL for arm64.") commandLine.StringVar(&cfg.Substrate.RunscARM64SHA256, "substrate-runsc-arm64-sha256", "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9", "gVisor runsc sha256 for arm64.") + commandLine.StringVar(&cfg.Substrate.AteomImage, "substrate-ateom-image", "", "Default ateom herder image for auto-provisioned Substrate WorkerPools. Per-harness spec.substrate.workerPool.ateomImage overrides this.") commandLine.StringVar(&agent_translator.DefaultServiceAccountName, "default-service-account-name", "", "Global default ServiceAccount name for agent pods. 
When set, agents without an explicit serviceAccountName will use this instead of creating a per-agent ServiceAccount.") @@ -732,21 +730,11 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne var agentHarnessGateway *handlers.AgentHarnessGatewayConfig if cfg.Substrate.AteAPIEndpoint != "" { - gwToken := cfg.Substrate.GatewayToken - if cfg.Substrate.GatewayTokenFile != "" { - data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) - if err != nil { - setupLog.Error(err, "unable to read substrate gateway token file") - os.Exit(1) - } - gwToken = strings.TrimSpace(string(data)) - } agentHarnessGateway = &handlers.AgentHarnessGatewayConfig{ - GatewayToken: gwToken, - AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, - AteAPIInsecure: cfg.Substrate.Insecure, - DialTimeout: cfg.Substrate.DialTimeout, - CallTimeout: cfg.Substrate.CallTimeout, + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + AteAPIInsecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, } } @@ -821,17 +809,14 @@ func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient ocl := openshell.NewOpenClawBackend(kubeClient, clients, oc, nil) hermesBackend := openshell.NewHermesBackend(kubeClient, clients, oc, nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ocl, v1alpha2.AgentHarnessBackendHermes: hermesBackend, }, nil } func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, *substrate.Client, error) { - sc, _, err := substrateAppConfig(cfg) - if err != nil { - return nil, nil, err - } + sc := substrateAppConfig(cfg) client, err := substrate.Dial(ctx, sc) if err != nil { return nil, nil, err @@ -840,20 +825,12 @@ func buildSubstrateSandboxBackends(ctx context.Context, 
cfg *Config) (map[v1alph ocl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendOpenClaw, nil) ncl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendNemoClaw, nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, + v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ncl, }, client, nil } -func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { - gwToken := cfg.Substrate.GatewayToken - if cfg.Substrate.GatewayTokenFile != "" { - data, err := os.ReadFile(cfg.Substrate.GatewayTokenFile) - if err != nil { - return substrate.Config{}, "", fmt.Errorf("read substrate gateway token file: %w", err) - } - gwToken = strings.TrimSpace(string(data)) - } +func substrateAppConfig(cfg *Config) substrate.Config { sc := substrate.Config{ AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, Insecure: cfg.Substrate.Insecure, @@ -861,25 +838,20 @@ func substrateAppConfig(cfg *Config) (substrate.Config, string, error) { CallTimeout: cfg.Substrate.CallTimeout, DefaultActorTemplateNamespace: cfg.Substrate.DefaultActorTemplateNamespace, DefaultActorTemplateName: cfg.Substrate.DefaultActorTemplateName, - GatewayToken: gwToken, ProvisionDefaults: substrate.ProvisionDefaults{ PauseImage: cfg.Substrate.PauseImage, RunscAMD64URL: cfg.Substrate.RunscAMD64URL, RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultAteomImage: cfg.Substrate.AteomImage, DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, - GatewayToken: gwToken, }, } - return sc, gwToken, nil + return sc } func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Provisioner { - _, gwToken, err := substrateAppConfig(cfg) - if err != nil { - gwToken = cfg.Substrate.GatewayToken - } return &substrate.Provisioner{ Client: kubeClient, 
Ate: ate, @@ -889,8 +861,8 @@ func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate * RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultAteomImage: cfg.Substrate.AteomImage, DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, - GatewayToken: gwToken, }, } } diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go index 45b5cb2b48..092e68ef92 100644 --- a/go/core/pkg/sandboxbackend/substrate/config.go +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -16,7 +16,4 @@ type Config struct { // ProvisionDefaults configures auto-created WorkerPool/ActorTemplate resources. ProvisionDefaults ProvisionDefaults - - // GatewayToken is the OpenClaw gateway Bearer token injected by the HTTP proxy. - GatewayToken string } diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token.go b/go/core/pkg/sandboxbackend/substrate/gateway_token.go new file mode 100644 index 0000000000..abe4b0ba53 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token.go @@ -0,0 +1,65 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// GatewayTokenSecretKey is the Secret data key used for per-harness OpenClaw gateway tokens. +const GatewayTokenSecretKey = "token" + +// ValidateGatewayTokenSpec requires exactly one per-harness OpenClaw gateway token source. 
+func ValidateGatewayTokenSpec(sub *v1alpha2.AgentHarnessSubstrateSpec) error { + if sub == nil { + return fmt.Errorf("spec.substrate is required") + } + hasToken := strings.TrimSpace(sub.GatewayToken) != "" + hasSecretRef := sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" + if hasToken == hasSecretRef { + return fmt.Errorf("exactly one of spec.substrate.gatewayToken or gatewayTokenSecretRef must be specified") + } + return nil +} + +// ResolveGatewayToken returns the per-harness gateway token. +func ResolveGatewayToken(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness) (string, error) { + if ah == nil || ah.Spec.Substrate == nil { + return "", fmt.Errorf("spec.substrate is required") + } + if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { + return "", err + } + sub := ah.Spec.Substrate + if sub.GatewayTokenSecretRef != nil { + return resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) + } + return strings.TrimSpace(sub.GatewayToken), nil +} + +func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, defaultNamespace string, ref *v1alpha2.TypedReference) (string, error) { + if kube == nil { + return "", fmt.Errorf("kubernetes client is required to resolve gateway token secret") + } + ns := ref.Namespace + if ns == "" { + ns = defaultNamespace + } + var secret corev1.Secret + if err := kube.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) + } + if secret.Data == nil { + return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + } + val, ok := secret.Data[GatewayTokenSecretKey] + if !ok { + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, GatewayTokenSecretKey) + } + return strings.TrimSpace(string(val)), nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go 
b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 08d5b7d7a3..0d269a45d8 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -17,8 +17,8 @@ import ( const ( defaultActorHostSuffix = "actors.resources.substrate.ate.dev" - defaultSubstrateGWPort = int32(80) - actorIDPrefix = "ahr" + defaultSubstrateGWPort = int32(80) + actorIDPrefix = "ahr" ) var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) @@ -200,11 +200,14 @@ func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { if ah.Spec.Substrate == nil { return fmt.Errorf("spec.substrate is required when runtime is substrate") } + if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { + return err + } if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { return nil } - if strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location) == "" { - return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") + if loc := substrateSnapshotsLocation(ah); !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") } return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index d8a63de188..d704baa927 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -23,7 +23,8 @@ const ( annotationManagedWorkerPool = AnnotationManagedWorkerPool annotationManagedActorTemplate = AnnotationManagedActorTemplate - defaultWorkerPoolReplicas = int32(2) + defaultWorkerPoolReplicas = int32(1) + defaultSnapshotsBucket = "ate-snapshots" defaultOpenClawContainer = "openclaw" ) @@ -34,8 +35,8 @@ type ProvisionDefaults struct { RunscAMD64SHA256 string RunscARM64URL string RunscARM64SHA256 string + 
DefaultAteomImage string DefaultWorkloadImage string - GatewayToken string } // ateActorDeleter removes actors from ate-api during harness teardown. @@ -115,13 +116,13 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { sub := ah.Spec.Substrate + if err := ValidateGatewayTokenSpec(sub); err != nil { + return err + } if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { return nil } - loc := strings.TrimSpace(sub.SnapshotsConfig.Location) - if loc == "" { - return fmt.Errorf("spec.substrate.snapshotsConfig.location is required when not using actorTemplateRef") - } + loc := substrateSnapshotsLocation(ah) if !strings.HasPrefix(loc, "gs://") { return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") } @@ -156,7 +157,10 @@ func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHa ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) } if ateomImage == "" { - return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set spec.substrate.workerPool.ateomImage)") + ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) + } + if ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") } desired := &atev1alpha1.WorkerPool{ @@ -232,7 +236,7 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen Namespace: wpKey.Namespace, }, SnapshotsConfig: atev1alpha1.SnapshotsConfig{ - Location: strings.TrimSpace(ah.Spec.Substrate.SnapshotsConfig.Location), + Location: substrateSnapshotsLocation(ah), }, }, } @@ -277,6 +281,22 @@ func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { } } +func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { + if ah == nil { + 
return defaultSubstrateSnapshotsLocation("", "") + } + if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { + if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { + return loc + } + } + return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) +} + +func defaultSubstrateSnapshotsLocation(namespace, name string) string { + return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) +} + func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { return map[string]string{ "app.kubernetes.io/managed-by": "kagent", diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index b2d53e405a..b5e9903e30 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -25,7 +25,10 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha return "", nil, fmt.Errorf("substrate provisioner kubernetes client is required") } - token := strings.TrimSpace(p.Defaults.GatewayToken) + token, err := ResolveGatewayToken(ctx, p.Client, ah) + if err != nil { + return "", nil, fmt.Errorf("resolve gateway token: %w", err) + } gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) var jsonBytes []byte diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index f4ac28d3f9..9c16ece575 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -43,7 +43,8 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ ModelConfigRef: "default-model-config", Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "some-token", + 
SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -52,8 +53,7 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() p := &Provisioner{ - Client: kube, - Defaults: ProvisionDefaults{GatewayToken: "some-token"}, + Client: kube, } script, env, err := p.buildOpenClawActorStartup(context.Background(), ah) @@ -97,6 +97,58 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { require.Contains(t, root, "agents") } +func TestBuildOpenClawActorStartup_WithHarnessGatewayToken(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "kagent" + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "openclaw-token", Namespace: ns}, + Data: map[string][]byte{GatewayTokenSecretKey: []byte("secret-token")}, + } + for _, tt := range []struct { + name string + substrate *v1alpha2.AgentHarnessSubstrateSpec + wantToken string + }{ + { + name: "inline token", + substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "inline-token", + }, + wantToken: "inline-token", + }, + { + name: "secret token", + substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayTokenSecretRef: &v1alpha2.TypedReference{Name: "openclaw-token"}, + }, + wantToken: "secret-token", + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret.DeepCopy()).Build() + p := &Provisioner{ + Client: kube, + } + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "claw", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Substrate: tt.substrate, + }, + } + + script, _, err := p.buildOpenClawActorStartup(context.Background(), ah) + require.NoError(t, err) + require.Equal(t, tt.wantToken, gatewayTokenFromStartup(t, script)) + }) 
+ } +} + func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() @@ -123,7 +175,8 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ ModelConfigRef: "mc", Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "some-token", + SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -151,3 +204,27 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { openai := root["models"].(map[string]any)["providers"].(map[string]any)["openai"].(map[string]any) require.Equal(t, "https://api.example/v1", openai["baseUrl"]) } + +func gatewayTokenFromStartup(t *testing.T, script string) string { + t.Helper() + + var payload string + for _, line := range strings.Split(script, "\n") { + if strings.Contains(line, "base64 -d") { + start := strings.Index(line, `'`) + 1 + end := strings.LastIndex(line, `'`) + require.Greater(t, end, start) + payload = line[start:end] + break + } + } + require.NotEmpty(t, payload) + raw, decErr := base64.StdEncoding.DecodeString(payload) + require.NoError(t, decErr) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + auth := gw["auth"].(map[string]any) + token, _ := auth["token"].(string) + return token +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go index 4878d40a99..e0e767e458 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -1,9 +1,15 @@ package substrate import ( + "context" "testing" + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + 
utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/kagent-dev/kagent/go/api/v1alpha2" ) @@ -15,7 +21,8 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { Spec: v1alpha2.AgentHarnessSpec{ Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - SnapshotsConfig: v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ + GatewayToken: "test-token", + SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ Location: "gs://bucket/prefix/", }, }, @@ -25,7 +32,21 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { t.Fatalf("expected valid: %v", err) } - ah.Spec.Substrate.SnapshotsConfig.Location = "s3://nope" + ah.Spec.Substrate.SnapshotsConfig = nil + if err := validateSubstrateProvisionSpec(ah); err != nil { + t.Fatalf("expected default snapshots config to be valid: %v", err) + } + if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { + t.Fatalf("got default snapshots location %q", got) + } + + ah.Spec.Substrate.GatewayToken = "" + if err := validateSubstrateProvisionSpec(ah); err == nil { + t.Fatal("expected error when gateway token is not configured") + } + + ah.Spec.Substrate.GatewayToken = "test-token" + ah.Spec.Substrate.SnapshotsConfig = &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{Location: "s3://nope"} if err := validateSubstrateProvisionSpec(ah); err == nil { t.Fatal("expected error for non-gs location") } @@ -38,6 +59,65 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { } } +func TestEnsureWorkerPoolUsesDefaultAteomImage(t *testing.T) { + t.Parallel() + + for _, tt := range []struct { + name string + defaultImg string + workerPool *v1alpha2.AgentHarnessSubstrateWorkerPoolSpec + wantImage string + wantReplica int32 + }{ + { + name: "defaults omitted replicas", + defaultImg: "registry.example/ateom:default", + workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{}, + wantImage: 
"registry.example/ateom:default", + wantReplica: 1, + }, + { + name: "workerpool override", + defaultImg: "registry.example/ateom:default", + workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 3, AteomImage: "registry.example/ateom:override"}, + wantImage: "registry.example/ateom:override", + wantReplica: 3, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + ah := &v1alpha2.AgentHarness{ + TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + WorkerPool: tt.workerPool, + }, + }, + } + p := &Provisioner{ + Client: fake.NewClientBuilder().WithScheme(scheme).Build(), + Defaults: ProvisionDefaults{DefaultAteomImage: tt.defaultImg}, + } + + key, managed, err := p.ensureWorkerPool(context.Background(), ah) + require.NoError(t, err) + require.True(t, managed) + + var wp atev1alpha1.WorkerPool + require.NoError(t, p.Client.Get(context.Background(), key, &wp)) + require.Equal(t, tt.wantImage, wp.Spec.AteomImage) + require.Equal(t, tt.wantReplica, wp.Spec.Replicas) + }) + } +} + func TestActorTemplateName(t *testing.T) { t.Parallel() ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index f82ff2a1d4..2c2f18ff71 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -547,10 +547,16 @@ spec: the actor (Substrate routes to :80 today). 
format: int32 type: integer + gatewayToken: + description: |- + GatewayToken is the OpenClaw gateway Bearer token for this harness. + Prefer gatewayTokenSecretRef for production secrets. + minLength: 1 + type: string gatewayTokenSecretRef: description: |- GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. - When unset, the controller falls back to --substrate-gateway-token(-file). + The Secret must contain a "token" key. properties: apiGroup: type: string @@ -564,8 +570,9 @@ spec: - name type: object snapshotsConfig: - description: SnapshotsConfig is required for auto-provisioned - templates (GCS gs:// location). + description: |- + SnapshotsConfig configures actor memory snapshots. Defaults to + gs://ate-snapshots// when unset. properties: location: description: |- @@ -582,12 +589,12 @@ spec: ateomImage: description: |- AteomImage is the ateom herder image (pullable registry ref, not ko://). - Required when kagent auto-provisions a WorkerPool (spec.workerPool without workerPoolRef). + Overrides the controller-wide substrate ateom image default for this WorkerPool. type: string replicas: - default: 2 + default: 1 description: Replicas is the number of ateom worker pods. - Defaults to 2 when unset or zero. + Defaults to 1 when unset or zero. format: int32 type: integer type: object @@ -611,9 +618,12 @@ spec: description: WorkloadImage overrides the default nemoclaw/openclaw sandbox image in the ActorTemplate. 
type: string - required: - - snapshotsConfig type: object + x-kubernetes-validations: + - message: Exactly one of gatewayToken or gatewayTokenSecretRef must + be specified + rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) + || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) required: - backend type: object diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index 4ffc8998c8..d63ff6bd92 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -98,20 +98,30 @@ spec: value: {{ .Values.controller.substrate.defaultActorTemplateNamespace | quote }} - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAME value: {{ .Values.controller.substrate.defaultActorTemplateName | quote }} - {{- if .Values.controller.substrate.gatewayTokenSecretName }} - - name: SUBSTRATE_GATEWAY_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Values.controller.substrate.gatewayTokenSecretName | quote }} - key: token - {{- else if .Values.controller.substrate.gatewayToken }} - - name: SUBSTRATE_GATEWAY_TOKEN - value: {{ .Values.controller.substrate.gatewayToken | quote }} - {{- end }} {{- with .Values.controller.substrate.pauseImage }} - name: SUBSTRATE_PAUSE_IMAGE value: {{ . | quote }} {{- end }} + {{- with .Values.controller.substrate.runscAMD64URL }} + - name: SUBSTRATE_RUNSC_AMD64_URL + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscAMD64SHA256 }} + - name: SUBSTRATE_RUNSC_AMD64_SHA256 + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscARM64URL }} + - name: SUBSTRATE_RUNSC_ARM64_URL + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.runscARM64SHA256 }} + - name: SUBSTRATE_RUNSC_ARM64_SHA256 + value: {{ . | quote }} + {{- end }} + {{- with .Values.controller.substrate.ateomImage }} + - name: SUBSTRATE_ATEOM_IMAGE + value: {{ . 
| quote }} + {{- end }} {{- end }} envFrom: - configMapRef: diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 8014e967f0..a9a335ef85 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -232,16 +232,18 @@ controller: # value: "true" # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. - # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool); users set - # spec.substrate.snapshotsConfig.location (gs://) and worker pool ref or create spec. + # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness + # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. substrate: - enabled: false + enabled: true ateApiEndpoint: "dns:///api.ate-system.svc:443" ateApiInsecure: false - gatewayToken: "test-token" - gatewayTokenSecretName: "" - gatewayTokenSecretNamespace: "" pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + ateomImage: "localhost:5001/ateom-gvisor:latest" envFrom: [] From 69087dfab5201414bcbc26fac0789d28e95aca7e Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 26 May 2026 15:05:38 -0700 Subject: [PATCH 15/32] clean up ui/gateway stuff (use base path from openclaw) Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 35 +-- .../handlers/agentharness_gateway.go | 127 +++------- .../agentharness_gateway_path_test.go | 89 +++++++ .../handlers/agentharness_gateway_rewrite.go | 235 ------------------ .../agentharness_gateway_rewrite_test.go | 165 ------------ .../handlers/agentharness_gateway_test.go | 34 +-- 
go/core/internal/httpserver/middleware.go | 7 +- go/core/internal/httpserver/server.go | 29 ++- .../openshell/openclaw/bootstrap.go | 24 +- .../openclaw/bootstrap_substrate_test.go | 3 +- .../openshell/openclaw/bootstrap_test.go | 2 +- .../openshell/openclaw/types.go | 11 +- .../substrate/provision_openclaw.go | 9 +- .../substrate/provision_openclaw_test.go | 2 + 14 files changed, 218 insertions(+), 554 deletions(-) create mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go delete mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go delete mode 100644 go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index eebeb17e29..f2587b17a6 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -45,16 +45,9 @@ export KIND_CLUSTER_NAME=kind make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" ``` -The generated `ActorTemplate` uses `controller.substrate.pauseImage`, -`controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, -`controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` -from the Helm values. Override them with `--set` or a values file when you need -to pin a different gVisor build. +The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values. Override them with `--set` or a values file when you need to pin a different gVisor build. -Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to -`gs://ate-snapshots//`.
If Helm sets -`controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can -be omitted unless you want to override it. +Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to `gs://ate-snapshots//`. If Helm sets `controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can be omitted unless you want to override it. - **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool`) - **`workerPool.ateomImage`** — optional override for the Helm/controller default (`localhost:5001/ateom-gvisor:latest`) @@ -78,8 +71,8 @@ spec: # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. - # workerPool: - # replicas: 1 + workerPool: + replicas: 2 # ateomImage: localhost:5001/ateom-gvisor:latest # Required: configure the OpenClaw gateway token for this harness. @@ -137,13 +130,25 @@ spec: - | # Generated by kagent: # 1. writes ~/.openclaw/openclaw.json from modelConfigRef/channels/gateway token - # 2. starts `openclaw gateway run --port 80 --allow-unconfigured` - # 3. waits for the gateway and tails the log + # 2. configures gateway.controlUi.basePath for the kagent proxy path + # 3. starts `openclaw gateway run --port 80 --allow-unconfigured` + # 4. waits for the gateway and tails the log env: - name: HOME value: /root ``` -The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. +The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. 
`pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. kagent also sets `gateway.controlUi.basePath` to `/api/agentharnesses/<namespace>/<name>/gateway` so OpenClaw serves the Control UI under the same path kagent proxies. -Port-forward the UI (`kubectl port-forward -n kagent svc/kagent-ui 8001:8080`) and navigate to the deployed agent harness. Use `test-token` as a gateway token to OpenClaw. +Port-forward the UI: + +```bash +kubectl port-forward -n kagent svc/kagent-ui 8001:8080 +``` + +Navigate to the deployed agent harness. If the OpenClaw Control UI asks for a gateway connection, use: + +- Gateway URL: `http://localhost:8001/api/agentharnesses/kagent/peterj-claw/gateway/` +- Gateway token: `test-token` + +The gateway URL must include the trailing slash. The token is the value configured in `spec.substrate.gatewayToken`, or the Secret value referenced by `spec.substrate.gatewayTokenSecretRef`; enter it in the token/credentials field rather than relying on a `token` query parameter. diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 551a8b8981..5215fe82cb 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -1,11 +1,8 @@ package handlers import ( - "bytes" - "compress/gzip" "context" "fmt" - "io" "net" "net/http" "net/http/httputil" @@ -175,7 +172,8 @@ func agentHarnessGatewayPublicPrefix(namespace, name string) string { // resolveGatewayUpstreamPath maps the public URL to the upstream path on the actor. // redirectTo is set when the browser should use a trailing slash under /gateway/. -// HTTP and WebSocket upgrades to the gateway entry both proxy to upstream / (OpenClaw gateway UI).
+// OpenClaw is configured with the same controlUi.basePath, so the proxy preserves +// the public gateway base path when forwarding to the actor. func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade bool) (upstreamPath, redirectTo string, ok bool) { base := agentHarnessHarnessBase(namespace, name) if !strings.HasPrefix(requestPath, base) { @@ -188,32 +186,18 @@ func resolveGatewayUpstreamPath(requestPath, namespace, name string, wsUpgrade b switch { case rel == "/gateway": - _ = wsUpgrade - return "/", agentHarnessGatewayPublicPrefix(namespace, name), true - case strings.HasPrefix(rel, "/gateway/"): - sub := strings.TrimPrefix(rel, "/gateway") - if sub == "" { - sub = "/" + upstream := agentHarnessGatewayPublicPrefix(namespace, name) + if wsUpgrade { + return upstream, "", true } - return sub, "", true - case isHarnessStaticAssetPath(rel): - return rel, "", true + return upstream, upstream, true + case strings.HasPrefix(rel, "/gateway/"): + return requestPath, "", true default: return "", "", false } } -func isHarnessStaticAssetPath(rel string) bool { - if strings.HasPrefix(rel, "/assets/") { - return true - } - switch rel { - case "/manifest.webmanifest", "/vite.svg", "/favicon.ico": - return true - } - return strings.HasPrefix(rel, "/favicon") -} - // normalizeOpenClawBrowserOrigin rewrites Origin/Referer so OpenClaw accepts WS/API from kagent-ui // (e.g. http://localhost:8001) while the gateway listens on the actor pod :80. 
func normalizeOpenClawBrowserOrigin(req *http.Request) { @@ -239,66 +223,45 @@ func isWebSocketUpgrade(r *http.Request) bool { func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPrefix, namespace, name string, log interface { Error(error, string, ...any) }) *httputil.ReverseProxy { - proxy := httputil.NewSingleHostReverseProxy(target) - proxy.FlushInterval = -1 - proxy.Transport = &http.Transport{ - Proxy: http.ProxyFromEnvironment, - ResponseHeaderTimeout: 0, - IdleConnTimeout: 90 * time.Second, - } - origDirector := proxy.Director - proxy.Director = func(req *http.Request) { - origDirector(req) - req.Host = upstreamHost - req.Header.Set("Host", upstreamHost) - if token != "" { - req.Header.Set("Authorization", "Bearer "+token) - } - req.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) - normalizeOpenClawBrowserOrigin(req) - subPath, _, pathOK := resolveGatewayUpstreamPath(req.URL.Path, namespace, name, isWebSocketUpgrade(req)) - if !pathOK { - subPath = "/" - } - if subPath == "" { - subPath = "/" - } else if !strings.HasPrefix(subPath, "/") { - subPath = "/" + subPath - } - req.URL.Path = subPath - req.URL.RawPath = subPath + proxy := &httputil.ReverseProxy{ + FlushInterval: -1, + Transport: &http.Transport{ + Proxy: http.ProxyFromEnvironment, + ResponseHeaderTimeout: 0, + IdleConnTimeout: 90 * time.Second, + }, + Rewrite: func(pr *httputil.ProxyRequest) { + pr.SetURL(target) + pr.Out.Host = upstreamHost + if token != "" { + pr.Out.Header.Set("Authorization", "Bearer "+token) + } + pr.Out.Header.Set("x-openclaw-scopes", openclawDefaultOperatorScopes) + normalizeOpenClawBrowserOrigin(pr.Out) + subPath, _, pathOK := resolveGatewayUpstreamPath(pr.In.URL.Path, namespace, name, isWebSocketUpgrade(pr.In)) + if !pathOK { + subPath = "/" + } + if subPath == "" { + subPath = "/" + } else if !strings.HasPrefix(subPath, "/") { + subPath = "/" + subPath + } + pr.Out.URL.Path = subPath + pr.Out.URL.RawPath = subPath + }, } 
proxy.ModifyResponse = func(resp *http.Response) error { - // Do not read or rewrite WebSocket upgrade responses (would break 101 handshakes). if resp.StatusCode == http.StatusSwitchingProtocols { return nil } - resp.Header.Del("Content-Security-Policy") - resp.Header.Del("Content-Security-Policy-Report-Only") - if loc := resp.Header.Get("Location"); loc != "" { - if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicPrefix) { - resp.Header.Set("Location", strings.TrimSuffix(publicPrefix, "/")+loc) + publicBase := strings.TrimSuffix(publicPrefix, "/") + if strings.HasPrefix(loc, "/") && !strings.HasPrefix(loc, publicBase) { + resp.Header.Set("Location", publicBase+loc) } } - - ct := resp.Header.Get("Content-Type") - if !shouldRewriteGatewayBody(ct) { - return nil - } - body, err := readGatewayResponseBody(resp) - if err != nil { - return err - } - rewritten := rewriteGatewayBody(body, ct, publicPrefix) - if strings.Contains(strings.ToLower(ct), "text/html") { - rewritten = injectGatewayClientShim(rewritten, token) - } - resp.Header.Del("Content-Encoding") - resp.Header.Del("Content-Length") - resp.ContentLength = int64(len(rewritten)) - resp.Body = io.NopCloser(bytes.NewReader(rewritten)) return nil } proxy.ErrorHandler = func(rw http.ResponseWriter, req *http.Request, proxyErr error) { @@ -308,20 +271,6 @@ func newAgentHarnessGatewayProxy(target *url.URL, upstreamHost, token, publicPre return proxy } -func readGatewayResponseBody(resp *http.Response) ([]byte, error) { - var reader io.Reader = resp.Body - if strings.EqualFold(resp.Header.Get("Content-Encoding"), "gzip") { - gz, err := gzip.NewReader(resp.Body) - if err != nil { - return nil, err - } - defer gz.Close() - reader = gz - } - defer resp.Body.Close() - return io.ReadAll(reader) -} - func (h *Handlers) resolveHarnessGatewayToken(ctx context.Context, ah *v1alpha2.AgentHarness) (string, error) { return substrate.ResolveGatewayToken(ctx, h.KubeClient, ah) } diff --git 
a/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go new file mode 100644 index 0000000000..433bcd5205 --- /dev/null +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_path_test.go @@ -0,0 +1,89 @@ +package handlers + +import ( + "net/http" + "testing" +) + +func TestResolveGatewayUpstreamPath(t *testing.T) { + t.Parallel() + ns, name := "kagent", "my-claw" + public := agentHarnessGatewayPublicPrefix(ns, name) + + tests := []struct { + name string + path string + wsUpgrade bool + wantUp string + wantRedir string + wantOK bool + }{ + { + name: "harness root redirects", + path: "/api/agentharnesses/kagent/my-claw", + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash redirects", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wantUp: public, + wantRedir: public, + wantOK: true, + }, + { + name: "gateway without slash websocket", + path: "/api/agentharnesses/kagent/my-claw/gateway", + wsUpgrade: true, + wantUp: public, + wantOK: true, + }, + { + name: "gateway index", + path: "/api/agentharnesses/kagent/my-claw/gateway/", + wantUp: public, + wantOK: true, + }, + { + name: "gateway asset", + path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantUp: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", + wantOK: true, + }, + { + name: "unknown path", + path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", + wantOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) + if ok != tt.wantOK { + t.Fatalf("ok = %v, want %v", ok, tt.wantOK) + } + if up != tt.wantUp { + t.Fatalf("upstream = %q, want %q", up, tt.wantUp) + } + if redir != tt.wantRedir { + t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) + } + }) + } +} + +func TestIsWebSocketUpgrade(t *testing.T) { + t.Parallel() + req, _ 
:= http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Upgrade", "websocket") + if !isWebSocketUpgrade(req) { + t.Fatal("expected websocket upgrade") + } + req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) + if isWebSocketUpgrade(req2) { + t.Fatal("expected not websocket upgrade") + } +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go deleted file mode 100644 index 13818acb39..0000000000 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite.go +++ /dev/null @@ -1,235 +0,0 @@ -package handlers - -import ( - "bytes" - "encoding/json" - "fmt" - "strings" -) - -// shouldRewriteGatewayQuotedPath returns true for root-absolute app paths we proxy, -// not for short tokens like "/g" (RegExp flags) or other non-asset paths. -func shouldRewriteGatewayQuotedPath(path string) bool { - if path == "" || !strings.HasPrefix(path, "/") || strings.HasPrefix(path, "//") { - return false - } - switch { - case strings.HasPrefix(path, "/assets"): - return true - case strings.HasPrefix(path, "/manifest"): - return true - case strings.HasPrefix(path, "/favicon"): - return true - case path == "/vite.svg": - return true - default: - return false - } -} - -// rewriteGatewayRootPaths prefixes root-absolute URLs in HTML/JS/CSS so assets load under -// /api/agentharnesses/{ns}/{name}/gateway/ (OpenClaw CSP blocks ; base-uri 'none'). 
-func rewriteGatewayRootPaths(body []byte, prefix string) []byte { - if len(body) == 0 || prefix == "" { - return body - } - if !strings.HasPrefix(prefix, "/") { - prefix = "/" + prefix - } - if !strings.HasSuffix(prefix, "/") { - prefix += "/" - } - - var out bytes.Buffer - out.Grow(len(body) + len(prefix)*4) - s := string(body) - for i := 0; i < len(s); i++ { - c := s[i] - if (c == '"' || c == '\'') && i+1 < len(s) && s[i+1] == '/' { - if i+2 < len(s) && s[i+2] == '/' { - out.WriteByte(c) - continue - } - quote := c - j := i + 1 - for j < len(s) && s[j] != quote { - j++ - } - path := s[i+1 : j] - out.WriteByte(quote) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - if j < len(s) { - out.WriteByte(quote) - } - i = j - continue - } - if i+4 < len(s) && strings.EqualFold(s[i:i+4], "url(") { - j := i + 4 - for j < len(s) && (s[j] == ' ' || s[j] == '\t') { - j++ - } - if j < len(s) && (s[j] == '"' || s[j] == '\'') { - quote := s[j] - if j+1 < len(s) && s[j+1] == '/' && !(j+2 < len(s) && s[j+2] == '/') { - k := j + 1 - for k < len(s) && s[k] != quote { - k++ - } - path := s[j+1 : k] - out.WriteString(s[i : j+1]) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - if k < len(s) { - out.WriteByte(quote) - } - i = k - continue - } - } else if j < len(s) && s[j] == '/' && !(j+1 < len(s) && s[j+1] == '/') { - k := j + 1 - for k < len(s) && s[k] != ')' && s[k] != ' ' && s[k] != '\t' && s[k] != '"' && s[k] != '\'' { - k++ - } - path := s[j:k] - out.WriteString(s[i:j]) - if shouldRewriteGatewayQuotedPath(path) { - out.WriteString(prefix) - out.WriteString(strings.TrimPrefix(path, "/")) - } else { - out.WriteString(path) - } - i = k - 1 - continue - } - } - out.WriteByte(c) - } - return out.Bytes() -} - -func stripGatewayBaseTag(body []byte) []byte { - lower := 
bytes.ToLower(body) - for { - idx := bytes.Index(lower, []byte("")) - if end < 0 { - break - } - endIdx := idx + end + 1 - body = append(append(body[:idx], body[endIdx:]...)) - lower = bytes.ToLower(body) - } - return body -} - -func stripGatewayCSP(body []byte) []byte { - lower := bytes.ToLower(body) - for _, tag := range []string{ - `")) - if end < 0 { - break - } - endIdx := idx + end + 1 - body = append(append(body[:idx], body[endIdx:]...)) - lower = bytes.ToLower(body) - } - } - return body -} - -func rewriteGatewayBody(body []byte, contentType, prefix string) []byte { - body = stripGatewayCSP(body) - ct := strings.ToLower(contentType) - if strings.Contains(ct, "text/html") { - body = stripGatewayBaseTag(body) - } - if shouldRewriteGatewayBody(contentType) { - body = rewriteGatewayRootPaths(body, prefix) - return rewriteGatewayWebSocketPaths(body, prefix) - } - return body -} - -// injectGatewayClientShim patches WebSocket URLs (trailing slash + ?token= for OpenClaw Control UI). -func injectGatewayClientShim(body []byte, gatewayToken string) []byte { - tokenJSON, _ := json.Marshal(gatewayToken) - shim := fmt.Sprintf(``, tokenJSON) - lower := bytes.ToLower(body) - for _, tag := range []string{"", ""} { - if idx := bytes.Index(lower, []byte(strings.ToLower(tag))); idx >= 0 { - out := make([]byte, 0, len(body)+len(shim)) - out = append(out, body[:idx]...) - out = append(out, shim...) - out = append(out, body[idx:]...) - return out - } - } - return append(bytes.Clone(body), shim...) -} - -// rewriteGatewayWebSocketPaths ensures bundled/runtime WS URLs use .../gateway/ (trailing slash). -// Only rewrites occurrences not already followed by '/' (avoids breaking .../gateway/assets/...). 
-func rewriteGatewayWebSocketPaths(body []byte, prefix string) []byte { - gatewayWithSlash := strings.TrimSuffix(prefix, "/") + "/" - gatewayNoSlash := strings.TrimSuffix(gatewayWithSlash, "/") - if gatewayNoSlash == "" || gatewayNoSlash == gatewayWithSlash { - return body - } - needle := []byte(gatewayNoSlash) - var out bytes.Buffer - out.Grow(len(body) + 16) - for i := 0; i < len(body); { - idx := bytes.Index(body[i:], needle) - if idx < 0 { - out.Write(body[i:]) - break - } - idx += i - out.Write(body[i:idx]) - end := idx + len(needle) - if end < len(body) && body[end] == '/' { - out.Write(needle) - } else { - out.Write([]byte(gatewayWithSlash)) - } - i = end - } - return out.Bytes() -} - -func shouldRewriteGatewayBody(contentType string) bool { - ct := strings.ToLower(contentType) - return strings.Contains(ct, "text/html") || - strings.Contains(ct, "javascript") || - strings.Contains(ct, "text/css") || - strings.Contains(ct, "application/json") -} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go deleted file mode 100644 index eaab469051..0000000000 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_rewrite_test.go +++ /dev/null @@ -1,165 +0,0 @@ -package handlers - -import ( - "net/http" - "strings" - "testing" -) - -func TestResolveGatewayUpstreamPath(t *testing.T) { - t.Parallel() - ns, name := "kagent", "my-claw" - public := agentHarnessGatewayPublicPrefix(ns, name) - - tests := []struct { - name string - path string - wsUpgrade bool - wantUp string - wantRedir string - wantOK bool - }{ - { - name: "harness root redirects", - path: "/api/agentharnesses/kagent/my-claw", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway without slash redirects", - path: "/api/agentharnesses/kagent/my-claw/gateway", - wantUp: "/", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway without slash websocket", - path: 
"/api/agentharnesses/kagent/my-claw/gateway", - wsUpgrade: true, - wantUp: "/", - wantRedir: public, - wantOK: true, - }, - { - name: "gateway index", - path: "/api/agentharnesses/kagent/my-claw/gateway/", - wantUp: "/", - wantOK: true, - }, - { - name: "gateway asset", - path: "/api/agentharnesses/kagent/my-claw/gateway/assets/foo.js", - wantUp: "/assets/foo.js", - wantOK: true, - }, - { - name: "mis-resolved asset shim", - path: "/api/agentharnesses/kagent/my-claw/assets/foo.js", - wantUp: "/assets/foo.js", - wantOK: true, - }, - { - name: "manifest shim", - path: "/api/agentharnesses/kagent/my-claw/manifest.webmanifest", - wantUp: "/manifest.webmanifest", - wantOK: true, - }, - { - name: "unknown path", - path: "/api/agentharnesses/kagent/my-claw/api/v1/foo", - wantOK: false, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - up, redir, ok := resolveGatewayUpstreamPath(tt.path, ns, name, tt.wsUpgrade) - if ok != tt.wantOK { - t.Fatalf("ok = %v, want %v", ok, tt.wantOK) - } - if up != tt.wantUp { - t.Fatalf("upstream = %q, want %q", up, tt.wantUp) - } - if redir != tt.wantRedir { - t.Fatalf("redirect = %q, want %q", redir, tt.wantRedir) - } - }) - } -} - -func TestRewriteGatewayRootPaths(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `` - out := string(rewriteGatewayRootPaths([]byte(in), prefix)) - if !strings.Contains(out, `src="/api/agentharnesses/kagent/my-claw/gateway/assets/index.js"`) { - t.Fatalf("script src not rewritten: %s", out) - } - if !strings.Contains(out, `href="/api/agentharnesses/kagent/my-claw/gateway/manifest.webmanifest"`) { - t.Fatalf("link href not rewritten: %s", out) - } -} - -func TestIsWebSocketUpgrade(t *testing.T) { - t.Parallel() - req, _ := http.NewRequest(http.MethodGet, "http://example.com/api/x/gateway", nil) - req.Header.Set("Connection", "Upgrade") - req.Header.Set("Upgrade", "websocket") - if !isWebSocketUpgrade(req) { - 
t.Fatal("expected websocket upgrade") - } - req2, _ := http.NewRequest(http.MethodGet, "http://example.com/", nil) - if isWebSocketUpgrade(req2) { - t.Fatal("expected not websocket upgrade") - } -} - -func TestRewriteGatewayWebSocketPaths(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `const u="ws://localhost:8001/api/agentharnesses/kagent/my-claw/gateway"; const v='wss://host/api/agentharnesses/kagent/my-claw/gateway'` - out := string(rewriteGatewayWebSocketPaths([]byte(in), prefix)) - want := "/api/agentharnesses/kagent/my-claw/gateway/" - if !strings.Contains(out, "ws://localhost:8001"+want) { - t.Fatalf("ws URL not rewritten: %s", out) - } - if !strings.Contains(out, "wss://host"+want) { - t.Fatalf("wss URL not rewritten: %s", out) - } -} - -func TestRewriteGatewayBodyStripsBaseAndCSP(t *testing.T) { - t.Parallel() - prefix := "/api/agentharnesses/kagent/my-claw/gateway/" - in := `` - out := string(rewriteGatewayBody([]byte(in), "text/html", prefix)) - if strings.Contains(strings.ToLower(out), " (API key + channel tokens). 
// @@ -177,6 +190,7 @@ func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { } if gw.ControlUI != nil { section.ControlUi = &controlUiSection{ + BasePath: normalizeControlUIBasePath(gw.ControlUI.BasePath), AllowedOrigins: gw.ControlUI.AllowedOrigins, DangerouslyDisableDeviceAuth: gw.ControlUI.DangerouslyDisableDeviceAuth, } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go index 4fd9ff2e72..e30f64daf8 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go @@ -10,12 +10,13 @@ import ( func TestSubstrateGatewayBootstrap(t *testing.T) { t.Parallel() - raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80)) + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) require.NoError(t, err) var root map[string]any require.NoError(t, json.Unmarshal(raw, &root)) gw := root["gateway"].(map[string]any) require.Equal(t, "lan", gw["bind"]) cui := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go index 3ffbd7c9ca..424e850219 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go @@ -79,7 +79,7 @@ func TestBuildBootstrapJSON_SubstrateOmitsModelsWhenNoExplicitBaseURL(t *testing sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - raw, _, err := 
openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80), openclaw.SubstrateBootstrapDefaultBaseURL) + raw, _, err := openclaw.BuildBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/default/s1/gateway"), openclaw.SubstrateBootstrapDefaultBaseURL) require.NoError(t, err) var root map[string]any diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go index bf6dd73760..40b3e75b4c 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw/types.go @@ -12,11 +12,11 @@ type bootstrapDocument struct { } type gatewaySection struct { - Mode string `json:"mode"` - Bind string `json:"bind"` - Auth gatewayAuth `json:"auth"` - Port int `json:"port"` - ControlUi *controlUiSection `json:"controlUi,omitempty"` + Mode string `json:"mode"` + Bind string `json:"bind"` + Auth gatewayAuth `json:"auth"` + Port int `json:"port"` + ControlUi *controlUiSection `json:"controlUi,omitempty"` } type gatewayAuth struct { @@ -25,6 +25,7 @@ type gatewayAuth struct { } type controlUiSection struct { + BasePath string `json:"basePath,omitempty"` AllowedOrigins []string `json:"allowedOrigins,omitempty"` DangerouslyDisableDeviceAuth bool `json:"dangerouslyDisableDeviceAuth,omitempty"` } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index b5e9903e30..95c561d48b 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -29,7 +29,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if err != nil { return "", nil, fmt.Errorf("resolve gateway token: %w", err) } - gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort) + gw 
:= openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort, openClawControlUIBasePath(ah)) var jsonBytes []byte var envMap map[string]string @@ -61,6 +61,13 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha return script, containerEnv, nil } +func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { + if ah == nil { + return "" + } + return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" +} + func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { keys := make([]string, 0, len(envMap)) for k := range envMap { diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index 9c16ece575..1de21a3685 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -92,6 +92,8 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { auth := gw["auth"].(map[string]any) require.Equal(t, "token", auth["mode"]) require.Equal(t, "some-token", auth["token"]) + controlUI := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/peterj-claw/gateway", controlUI["basePath"]) _, hasModels := root["models"] require.False(t, hasModels, "substrate bootstrap should omit models unless ModelConfig sets an explicit baseUrl") require.Contains(t, root, "agents") From fd338bdf3c014471129a31a06df2c1517ed6bd19 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 10:14:35 -0700 Subject: [PATCH 16/32] split substrate and openshell, use the secrets in substrate fork for modelconfig/channels Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 7 +- go/core/pkg/app/app.go | 5 +- .../openclaw/bootstrap_openshell.go | 83 +++++ .../bootstrap_openshell_test.go} | 2 +- .../bootstrap_shared.go} | 73 +---- .../openclaw/bootstrap_substrate.go | 87 ++++++ 
.../openclaw/bootstrap_substrate_test.go | 145 +++++++++ .../openclaw/channels_openshell.go | 123 ++++++++ .../openclaw/channels_shared.go | 105 +++++++ .../openclaw/channels_substrate.go | 114 +++++++ .../{openshell => }/openclaw/constants.go | 9 +- .../sandboxbackend/openclaw/credentials.go | 93 ++++++ .../{openshell => }/openclaw/defaults.go | 0 .../{openshell => }/openclaw/modelconfig.go | 25 ++ .../{openshell => }/openclaw/policy.go | 0 .../{openshell => }/openclaw/provider.go | 0 .../pkg/sandboxbackend/openclaw/secrets.go | 33 ++ .../{openshell => }/openclaw/ssh_test.go | 2 +- .../{openshell => }/openclaw/types.go | 37 ++- .../pkg/sandboxbackend/openshell/openclaw.go | 2 +- .../openclaw/bootstrap_substrate_test.go | 22 -- .../openshell/openclaw/channels.go | 97 ------ .../openshell/openshell_test.go | 2 +- .../pkg/sandboxbackend/openshell/policy.go | 2 +- .../sandboxbackend/openshell/ssh_terminal.go | 2 +- .../openshell/ssh_terminal_test.go | 2 +- .../pkg/sandboxbackend/openshell/translate.go | 2 +- .../openshell/translate_test.go | 2 +- .../substrate/delete_provision.go | 4 +- .../substrate/delete_provision_test.go | 2 +- .../pkg/sandboxbackend/substrate/openclaw.go | 2 +- .../pkg/sandboxbackend/substrate/provision.go | 284 +----------------- .../substrate/provision_actortemplate.go | 89 ++++++ .../substrate/provision_openclaw.go | 25 +- .../substrate/provision_openclaw_test.go | 12 +- .../substrate/provision_shared.go | 124 ++++++++ .../substrate/provision_workerpool.go | 77 +++++ 37 files changed, 1181 insertions(+), 514 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go rename go/core/pkg/sandboxbackend/{openshell/openclaw/bootstrap_test.go => openclaw/bootstrap_openshell_test.go} (98%) rename go/core/pkg/sandboxbackend/{openshell/openclaw/bootstrap.go => openclaw/bootstrap_shared.go} (64%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go create mode 100644 
go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_openshell.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_shared.go create mode 100644 go/core/pkg/sandboxbackend/openclaw/channels_substrate.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/constants.go (61%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/credentials.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/defaults.go (100%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/modelconfig.go (58%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/policy.go (100%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/provider.go (100%) create mode 100644 go/core/pkg/sandboxbackend/openclaw/secrets.go rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/ssh_test.go (72%) rename go/core/pkg/sandboxbackend/{openshell => }/openclaw/types.go (77%) delete mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go delete mode 100644 go/core/pkg/sandboxbackend/openshell/openclaw/channels.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_shared.go create mode 100644 go/core/pkg/sandboxbackend/substrate/provision_workerpool.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index f2587b17a6..bb5964f663 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -71,13 +71,14 @@ spec: # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. 
+ # NOTE: use single worker for now due to https://github.com/agent-substrate/substrate/issues/50 + gatewayToken: test-token workerPool: - replicas: 2 + replicas: 1 # ateomImage: localhost:5001/ateom-gvisor:latest # Required: configure the OpenClaw gateway token for this harness. # Use either gatewayToken or gatewayTokenSecretRef. The Secret must contain key "token". - gatewayToken: test-token # gatewayTokenSecretRef: # name: openclaw-gateway-token # namespace: kagent @@ -140,6 +141,8 @@ spec: The generated `command` contains a base64-encoded `openclaw.json`, so the live object will be more verbose than the abbreviated example above. `pauseImage`, runsc URLs and hashes, and the default workload image come from controller/Helm configuration unless overridden on the `AgentHarness`; the gateway token comes from `spec.substrate.gatewayToken` or `gatewayTokenSecretRef`. kagent also sets `gateway.controlUi.basePath` to `/api/agentharnesses/<namespace>/<name>/gateway` so OpenClaw serves the Control UI under the same path kagent proxies. +When `modelConfigRef` or `spec.channels` are set, credentials are **not** copied into the ActorTemplate or `openclaw.json` as plaintext. kagent writes `valueFrom.secretKeyRef` (or inline `value` for harness inline tokens) on the ActorTemplate container env; Substrate `ate-api` resolves those refs at actor resume. In `openclaw.json`, kagent uses OpenClaw [env SecretRefs](https://docs.openclaw.ai/gateway/secrets) (`{source:"env",provider:"default",id:"<ENV_VAR_NAME>"}`) for `models.providers.*.apiKey`, `channels.telegram.accounts.*.botToken`, and `channels.slack.accounts.*.botToken` / `appToken`. Rotate a Secret and recreate the ActorTemplate golden snapshot when keys change. 
+ Port-forward the UI: ```bash diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 2389660463..017c1ce7ee 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -59,6 +59,7 @@ import ( "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/migrations" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "github.com/kagent-dev/kagent/go/core/pkg/translator" @@ -845,7 +846,7 @@ func substrateAppConfig(cfg *Config) substrate.Config { RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, }, } return sc @@ -862,7 +863,7 @@ func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate * RunscARM64URL: cfg.Substrate.RunscARM64URL, RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openshell.NemoclawSandboxBaseImage, + DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, }, } } diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go new file mode 100644 index 0000000000..ede55d15b8 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell.go @@ -0,0 +1,83 @@ +package openclaw + +import ( + "context" + "encoding/json" + "fmt" + "slices" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when +// OpenClaw resolves openshell:resolve:env: (API key + channel tokens). 
+// +// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. +// OpenShell callers should pass DefaultInferenceBaseURL. +func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { + if mc == nil { + return nil, nil, fmt.Errorf("ModelConfig is required") + } + apiKey, err := ResolveModelConfigAPIKey(ctx, kube, mc) + if err != nil { + return nil, nil, fmt.Errorf("resolve model API key: %w", err) + } + apiAdapter, err := providerAPI(mc) + if err != nil { + return nil, nil, err + } + + apiKeyEnv := DefaultAPIKeyEnvVar(mc.Spec.Provider) + env := map[string]string{ + apiKeyEnv: apiKey, + } + + modelID, err := requiredModelID(mc) + if err != nil { + return nil, nil, err + } + + providerRecord := GatewayProviderRecordName(mc.Spec.Provider) + doc := buildCoreBootstrapDocument(mc, gw, credentialValue{literal: openshellResolveEnv(apiKeyEnv)}, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) + + chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Channels, env) + if err != nil { + return nil, nil, err + } + doc.Channels = chState.channelsJSON() + + applyOpenshellSecretsAllowlist(&doc, env) + + raw, err := json.Marshal(doc) + if err != nil { + return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, env, nil +} + +func applyOpenshellSecretsAllowlist(doc *bootstrapDocument, env map[string]string, extraEnvNames ...string) { + seen := make(map[string]struct{}, len(env)+len(extraEnvNames)) + secretAllow := make([]string, 0, len(env)+len(extraEnvNames)) + for k := range env { + if _, ok := seen[k]; !ok { + seen[k] = struct{}{} + secretAllow = append(secretAllow, k) + } + } + for _, k := range extraEnvNames { + if _, ok := seen[k]; !ok { + seen[k] = struct{}{} + secretAllow = append(secretAllow, k) + } + } + 
slices.Sort(secretAllow) + doc.Secrets = secretsSection{ + Providers: map[string]secretProvider{ + openshellSecretProviderID: { + Source: "env", + Allowlist: secretAllow, + }, + }, + } +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go similarity index 98% rename from go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go rename to go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go index 424e850219..18f7b1ce42 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_test.go +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_openshell_test.go @@ -6,7 +6,7 @@ import ( "testing" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go similarity index 64% rename from go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go rename to go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go index 9e72db95c3..b547c83f37 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap.go +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_shared.go @@ -1,14 +1,11 @@ package openclaw import ( - "context" "encoding/json" "fmt" - "slices" "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "sigs.k8s.io/controller-runtime/pkg/client" ) // GatewayBootstrapConfig describes the gateway section of openclaw.json for a harness runtime. 
@@ -58,52 +55,6 @@ func normalizeControlUIBasePath(path string) string { return strings.TrimRight(path, "/") } -// BuildBootstrapJSON builds ~/.openclaw/openclaw.json contents plus environment variables that must be present when -// OpenClaw resolves openshell:resolve:env: (API key + channel tokens). -// -// defaultBaseURLWhenUnset is used when ModelConfig has no explicit provider base URL. -// OpenShell callers should pass DefaultInferenceBaseURL; Substrate should pass SubstrateBootstrapDefaultBaseURL. -func BuildBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, defaultBaseURLWhenUnset string) ([]byte, map[string]string, error) { - if mc == nil { - return nil, nil, fmt.Errorf("ModelConfig is required") - } - apiKey, err := ResolveModelConfigAPIKey(ctx, kube, mc) - if err != nil { - return nil, nil, fmt.Errorf("resolve model API key: %w", err) - } - apiAdapter, err := providerAPI(mc) - if err != nil { - return nil, nil, err - } - - apiKeyEnv := DefaultAPIKeyEnvVar(mc.Spec.Provider) - env := map[string]string{ - apiKeyEnv: apiKey, - } - - modelID := strings.TrimSpace(mc.Spec.Model) - if modelID == "" { - return nil, nil, fmt.Errorf("ModelConfig.spec.model is required for OpenClaw bootstrap JSON") - } - - providerRecord := GatewayProviderRecordName(mc.Spec.Provider) - doc := buildCoreBootstrapDocument(mc, gw, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset) - - chState, err := accumulateHarnessChannels(ctx, kube, namespace, sbx.Spec.Backend, sbx.Spec.Channels, env) - if err != nil { - return nil, nil, err - } - doc.Channels = chState.channelsJSON() - - applySecretsAllowlist(&doc, env) - - raw, err := json.Marshal(doc) - if err != nil { - return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) - } - return raw, env, nil -} - // BuildGatewayOnlyBootstrapJSON returns a minimal openclaw.json with gateway settings only (no models/channels). 
func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { doc := bootstrapDocument{Gateway: buildGatewaySection(gw)} @@ -114,7 +65,7 @@ func BuildGatewayOnlyBootstrapJSON(gw GatewayBootstrapConfig) ([]byte, error) { return raw, nil } -func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKeyEnv, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { +func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig, apiKey credentialValue, providerRecord, modelID, apiAdapter, defaultBaseURLWhenUnset string) bootstrapDocument { doc := bootstrapDocument{ Gateway: buildGatewaySection(gw), Agents: agentsSection{ @@ -135,7 +86,7 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapCon Providers: map[string]providerSettings{ providerRecord: { BaseURL: explicit, - APIKey: openshellResolveEnv(apiKeyEnv), + APIKey: apiKey, Auth: providerAuth(mc), API: apiAdapter, Models: []modelSlot{ @@ -154,7 +105,7 @@ func buildCoreBootstrapDocument(mc *v1alpha2.ModelConfig, gw GatewayBootstrapCon Providers: map[string]providerSettings{ providerRecord: { BaseURL: baseURL, - APIKey: openshellResolveEnv(apiKeyEnv), + APIKey: apiKey, Auth: providerAuth(mc), API: apiAdapter, Models: []modelSlot{ @@ -198,18 +149,10 @@ func buildGatewaySection(gw GatewayBootstrapConfig) gatewaySection { return section } -func applySecretsAllowlist(doc *bootstrapDocument, env map[string]string) { - secretAllow := make([]string, 0, len(env)) - for k := range env { - secretAllow = append(secretAllow, k) - } - slices.Sort(secretAllow) - doc.Secrets = secretsSection{ - Providers: map[string]secretProvider{ - bootstrapSecretProviderID: { - Source: "env", - Allowlist: secretAllow, - }, - }, +func requiredModelID(mc *v1alpha2.ModelConfig) (string, error) { + modelID := strings.TrimSpace(mc.Spec.Model) + if modelID == "" { + return "", fmt.Errorf("ModelConfig.spec.model is required for 
OpenClaw bootstrap JSON") } + return modelID, nil } diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go new file mode 100644 index 0000000000..f20e1294b6 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate.go @@ -0,0 +1,87 @@ +package openclaw + +import ( + "context" + "encoding/json" + "fmt" + "slices" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// BuildSubstrateBootstrapJSON builds openclaw.json and ActorTemplate container env for Agent Substrate. +// Model and channel credentials use OpenClaw env SecretRefs in openclaw.json ({source:"env",provider:"default",id:"..."}) +// and ActorTemplate container env (literal value or valueFrom secretKeyRef/configMapKeyRef, resolved by ate-api at resume). +func BuildSubstrateBootstrapJSON(ctx context.Context, kube client.Client, namespace string, sbx *v1alpha2.AgentHarness, mc *v1alpha2.ModelConfig, gw GatewayBootstrapConfig) ([]byte, []corev1.EnvVar, error) { + if mc == nil { + return nil, nil, fmt.Errorf("ModelConfig is required") + } + apiKeyEnvVar, err := ModelConfigAPIKeyEnvVar(mc) + if err != nil { + return nil, nil, err + } + apiAdapter, err := providerAPI(mc) + if err != nil { + return nil, nil, err + } + + modelID, err := requiredModelID(mc) + if err != nil { + return nil, nil, err + } + + apiKeyEnv := apiKeyEnvVar.Name + providerRecord := GatewayProviderRecordName(mc.Spec.Provider) + apiKeyRef := openclawEnvSecretRef(apiKeyEnv) + doc := buildCoreBootstrapDocument(mc, gw, credentialValue{envSecret: &apiKeyRef}, providerRecord, modelID, apiAdapter, SubstrateBootstrapDefaultBaseURL) + + chState, channelEnv, err := accumulateSubstrateHarnessChannels(ctx, kube, namespace, sbx.Spec.Channels) + if err != nil { + return nil, nil, err + } + doc.Channels = chState.channelsJSON() + + applySubstrateSecretsAllowlist(&doc, apiKeyEnv, 
channelEnv) + + raw, err := json.Marshal(doc) + if err != nil { + return nil, nil, fmt.Errorf("marshal openclaw json: %w", err) + } + return raw, substrateContainerEnv(apiKeyEnvVar, channelEnv), nil +} + +func substrateContainerEnv(apiKey corev1.EnvVar, extra []corev1.EnvVar) []corev1.EnvVar { + out := make([]corev1.EnvVar, 0, len(extra)+2) + out = append(out, apiKey) + out = append(out, extra...) + out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) + return out +} + +func applySubstrateSecretsAllowlist(doc *bootstrapDocument, apiKeyEnv string, channelEnv []corev1.EnvVar) { + seen := make(map[string]struct{}, len(channelEnv)+1) + secretAllow := make([]string, 0, len(channelEnv)+1) + add := func(name string) { + if _, ok := seen[name]; ok { + return + } + seen[name] = struct{}{} + secretAllow = append(secretAllow, name) + } + add(apiKeyEnv) + for _, env := range channelEnv { + add(env.Name) + } + slices.Sort(secretAllow) + doc.Secrets = secretsSection{ + Providers: map[string]secretProvider{ + substrateSecretProviderID: { + Source: "env", + Allowlist: secretAllow, + }, + }, + Defaults: &secretsDefaults{Env: substrateSecretProviderID}, + } +} diff --git a/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go new file mode 100644 index 0000000000..a5136bc81f --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/bootstrap_substrate_test.go @@ -0,0 +1,145 @@ +package openclaw_test + +import ( + "context" + "encoding/json" + "testing" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func 
TestSubstrateGatewayBootstrap(t *testing.T) { + t.Parallel() + raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) + require.NoError(t, err) + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + gw := root["gateway"].(map[string]any) + require.Equal(t, "lan", gw["bind"]) + cui := gw["controlUi"].(map[string]any) + require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) + require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) +} + +func TestBuildSubstrateBootstrapJSON_ModelConfigAPIKeyUsesSecretRef(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + sbx := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}} + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(mc).Build() + raw, env, err := openclaw.BuildSubstrateBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/gw/")) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + secRoot := root["secrets"].(map[string]any) + secProvs := secRoot["providers"].(map[string]any) + defaultProv := secProvs["default"].(map[string]any) + require.Contains(t, defaultProv["allowlist"], "OPENAI_API_KEY") + defaults := secRoot["defaults"].(map[string]any) + require.Equal(t, "default", defaults["env"]) + + var apiKeyEnv *corev1.EnvVar + for i := range env { + if env[i].Name == "OPENAI_API_KEY" { + apiKeyEnv = &env[i] + break + } + } + 
require.NotNil(t, apiKeyEnv) + require.NotNil(t, apiKeyEnv.ValueFrom) + require.NotNil(t, apiKeyEnv.ValueFrom.SecretKeyRef) + require.Equal(t, "openai-key", apiKeyEnv.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "OPENAI_API_KEY", apiKeyEnv.ValueFrom.SecretKeyRef.Key) + require.Empty(t, apiKeyEnv.Value) +} + +func TestBuildSubstrateBootstrapJSON_TelegramUsesEnvSecretRef(t *testing.T) { + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + + ns := "default" + mc := &v1alpha2.ModelConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "mc1", Namespace: ns}, + Spec: v1alpha2.ModelConfigSpec{ + Model: "gpt-4o", + Provider: v1alpha2.ModelProviderOpenAI, + APIKeySecret: "openai-key", + APIKeySecretKey: "OPENAI_API_KEY", + OpenAI: &v1alpha2.OpenAIConfig{}, + }, + } + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "tg-token", Namespace: ns}, + Data: map[string][]byte{"token": []byte("telegram-bot-token")}, + } + sbx := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Name: "s1", Namespace: ns}, + Spec: v1alpha2.AgentHarnessSpec{ + Channels: []v1alpha2.AgentHarnessChannel{ + { + Name: "tg1", + Type: v1alpha2.AgentHarnessChannelTypeTelegram, + Telegram: &v1alpha2.AgentHarnessTelegramChannelSpec{ + BotToken: v1alpha2.AgentHarnessChannelCredential{ + ValueFrom: &v1alpha2.ValueSource{ + Type: v1alpha2.SecretValueSource, + Name: "tg-token", + Key: "token", + }, + }, + }, + }, + }, + }, + } + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(mc, secret).Build() + raw, env, err := openclaw.BuildSubstrateBootstrapJSON(context.Background(), kube, ns, sbx, mc, openclaw.SubstrateGatewayBootstrap("tok", 80, "/gw/")) + require.NoError(t, err) + + var root map[string]any + require.NoError(t, json.Unmarshal(raw, &root)) + tg := root["channels"].(map[string]any)["telegram"].(map[string]any) + tg1 := tg["accounts"].(map[string]any)["tg1"].(map[string]any) + botToken := 
tg1["botToken"].(map[string]any) + require.Equal(t, "env", botToken["source"]) + require.Equal(t, "default", botToken["provider"]) + require.Equal(t, "KAGENT_SB_CH_TG1_TELEGRAM_BOT", botToken["id"]) + require.NotEqual(t, "telegram-bot-token", tg1["botToken"]) + + var tgEnv *corev1.EnvVar + for i := range env { + if env[i].Name == "KAGENT_SB_CH_TG1_TELEGRAM_BOT" { + tgEnv = &env[i] + break + } + } + require.NotNil(t, tgEnv) + require.NotNil(t, tgEnv.ValueFrom) + require.NotNil(t, tgEnv.ValueFrom.SecretKeyRef) + require.Equal(t, "tg-token", tgEnv.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "token", tgEnv.ValueFrom.SecretKeyRef.Key) +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go b/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go new file mode 100644 index 0000000000..a4000aa0b7 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_openshell.go @@ -0,0 +1,123 @@ +package openclaw + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/channels" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func accumulateHarnessChannels(ctx context.Context, kube client.Client, namespace string, specChannels []v1alpha2.AgentHarnessChannel, env map[string]string) (*harnessChannels, error) { + a := newHarnessChannels() + for _, ch := range specChannels { + switch ch.Type { + case v1alpha2.AgentHarnessChannelTypeTelegram: + if err := a.addTelegram(ctx, kube, namespace, ch, env); err != nil { + return nil, err + } + case v1alpha2.AgentHarnessChannelTypeSlack: + if err := a.addSlack(ctx, kube, namespace, ch, env); err != nil { + return nil, err + } + default: + return nil, unsupportedChannelType(ch.Name, ch.Type) + } + } + return a, nil +} + +func (a *harnessChannels) addTelegram(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel, env map[string]string) error { + spec := ch.Telegram + if 
spec == nil { + return fmt.Errorf("channel %q: telegram spec is required", ch.Name) + } + botEnv := channels.TelegramBotTokenEnvKey(ch.Name) + if err := putChannelCredential(ctx, kube, namespace, spec.BotToken, botEnv, env); err != nil { + return fmt.Errorf("channel %q telegram bot token: %w", ch.Name, err) + } + allowFrom, err := telegramAllowFrom(ctx, kube, namespace, spec) + if err != nil { + return fmt.Errorf("channel %q telegram allowlist: %w", ch.Name, err) + } + acc := telegramAccount{ + Name: ch.Name, + Enabled: true, + BotToken: credentialValue{literal: openshellResolveEnv(botEnv)}, + } + if len(allowFrom) > 0 { + acc.DMPolicy = "allowlist" + acc.AllowFrom = allowFrom + } else { + acc.DMPolicy = "pairing" + } + a.telegram[ch.Name] = acc + if a.tgDef == "" { + a.tgDef = ch.Name + } + return nil +} + +func (a *harnessChannels) addSlack(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel, env map[string]string) error { + spec := ch.Slack + if spec == nil { + return fmt.Errorf("channel %q: slack spec is required", ch.Name) + } + botEnv := channels.SlackBotTokenEnvKey(ch.Name) + appEnv := channels.SlackAppTokenEnvKey(ch.Name) + if err := putChannelCredential(ctx, kube, namespace, spec.BotToken, botEnv, env); err != nil { + return fmt.Errorf("channel %q slack bot token: %w", ch.Name, err) + } + if err := putChannelCredential(ctx, kube, namespace, spec.AppToken, appEnv, env); err != nil { + return fmt.Errorf("channel %q slack app token: %w", ch.Name, err) + } + opts := openClawSlackOptions(spec) + access := openClawSlackChannelAccess(opts) + acc := slackAccount{ + Name: ch.Name, + Enabled: true, + Mode: "socket", + BotToken: credentialValue{literal: channels.SlackBotTokenPlaceholder(botEnv)}, + AppToken: credentialValue{literal: channels.SlackAppTokenPlaceholder(appEnv)}, + UserTokenReadOnly: true, + GroupPolicy: string(access), + Capabilities: slackCaps{ + InteractiveReplies: slackInteractiveReplies(opts), + }, + } + if 
chans := trimNonEmptyStrings(opts.AllowlistChannels); len(chans) > 0 { + acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} + } + a.slack[ch.Name] = acc + if a.slDef == "" { + a.slDef = ch.Name + } + if !a.slackSeen { + a.slackRootPolicy = access + a.slackSeen = true + } + return nil +} + +func telegramAllowFrom(ctx context.Context, kube client.Client, namespace string, spec *v1alpha2.AgentHarnessTelegramChannelSpec) ([]string, error) { + if len(spec.AllowedUserIDs) > 0 { + out := make([]string, 0, len(spec.AllowedUserIDs)) + for _, id := range spec.AllowedUserIDs { + s := strings.TrimSpace(id) + if s != "" { + out = append(out, s) + } + } + return out, nil + } + if spec.AllowedUserIDsFrom != nil { + raw, err := spec.AllowedUserIDsFrom.Resolve(ctx, kube, namespace) + if err != nil { + return nil, fmt.Errorf("resolve allowedUserIDsFrom: %w", err) + } + return splitAllowedList(raw), nil + } + return nil, nil +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_shared.go b/go/core/pkg/sandboxbackend/openclaw/channels_shared.go new file mode 100644 index 0000000000..75aa66872c --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_shared.go @@ -0,0 +1,105 @@ +package openclaw + +import ( + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +type harnessChannels struct { + telegram map[string]telegramAccount + tgDef string + + slack map[string]slackAccount + slDef string + + slackRootPolicy v1alpha2.AgentHarnessChannelAccess + slackSeen bool +} + +func newHarnessChannels() *harnessChannels { + return &harnessChannels{ + telegram: make(map[string]telegramAccount), + slack: make(map[string]slackAccount), + } +} + +func (a *harnessChannels) channelsJSON() *channelsConfig { + if len(a.telegram) == 0 && len(a.slack) == 0 { + return nil + } + out := &channelsConfig{} + if len(a.telegram) > 0 { + out.Telegram = &telegramBundle{ + Enabled: true, + Accounts: a.telegram, + DefaultAccount: a.tgDef, + } + } + if len(a.slack) > 0 { + 
out.Slack = &slackBundle{ + Enabled: true, + Mode: "socket", + WebhookPath: "/slack/events", + UserTokenReadOnly: true, + GroupPolicy: string(a.slackRootPolicy), + Accounts: a.slack, + DefaultAccount: a.slDef, + } + } + return out +} + +func openClawSlackOptions(spec *v1alpha2.AgentHarnessSlackChannelSpec) *v1alpha2.AgentHarnessOpenClawSlackOptions { + if spec == nil || spec.OpenClaw == nil { + return &v1alpha2.AgentHarnessOpenClawSlackOptions{} + } + return spec.OpenClaw +} + +func slackInteractiveReplies(opts *v1alpha2.AgentHarnessOpenClawSlackOptions) bool { + if opts == nil || opts.InteractiveReplies == nil { + return true + } + return *opts.InteractiveReplies +} + +func openClawSlackChannelAccess(opts *v1alpha2.AgentHarnessOpenClawSlackOptions) v1alpha2.AgentHarnessChannelAccess { + if opts == nil || opts.ChannelAccess == "" { + return v1alpha2.AgentHarnessChannelAccessOpen + } + return opts.ChannelAccess +} + +func splitAllowedList(raw string) []string { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil + } + var out []string + for _, part := range strings.FieldsFunc(raw, func(r rune) bool { + return r == ',' || r == '\n' || r == ';' + }) { + s := strings.TrimSpace(part) + if s != "" { + out = append(out, s) + } + } + return out +} + +func trimNonEmptyStrings(ss []string) []string { + out := make([]string, 0, len(ss)) + for _, s := range ss { + s = strings.TrimSpace(s) + if s != "" { + out = append(out, s) + } + } + return out +} + +func unsupportedChannelType(name string, typ v1alpha2.AgentHarnessChannelType) error { + return fmt.Errorf("channel %q: unsupported type %q", name, typ) +} diff --git a/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go b/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go new file mode 100644 index 0000000000..594661fb59 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/channels_substrate.go @@ -0,0 +1,114 @@ +package openclaw + +import ( + "context" + "fmt" + + 
"github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// accumulateSubstrateHarnessChannels configures channels with OpenClaw env SecretRefs in openclaw.json +// and returns container env vars (inline value or Kubernetes valueFrom refs) for the ActorTemplate. +func accumulateSubstrateHarnessChannels(ctx context.Context, kube client.Client, namespace string, channels []v1alpha2.AgentHarnessChannel) (*harnessChannels, []corev1.EnvVar, error) { + a := newHarnessChannels() + var containerEnv []corev1.EnvVar + for _, ch := range channels { + switch ch.Type { + case v1alpha2.AgentHarnessChannelTypeTelegram: + env, err := a.addSubstrateTelegram(ctx, kube, namespace, ch) + if err != nil { + return nil, nil, err + } + containerEnv = append(containerEnv, env...) + case v1alpha2.AgentHarnessChannelTypeSlack: + env, err := a.addSubstrateSlack(ctx, kube, namespace, ch) + if err != nil { + return nil, nil, err + } + containerEnv = append(containerEnv, env...) 
+ default: + return nil, nil, unsupportedChannelType(ch.Name, ch.Type) + } + } + return a, containerEnv, nil +} + +func (a *harnessChannels) addSubstrateTelegram(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel) ([]corev1.EnvVar, error) { + spec := ch.Telegram + if spec == nil { + return nil, fmt.Errorf("channel %q: telegram spec is required", ch.Name) + } + botEnv := channelSecretEnvVar(ch.Name, "TELEGRAM_BOT") + botEnvVar, err := channelCredentialContainerEnv(spec.BotToken, botEnv) + if err != nil { + return nil, fmt.Errorf("channel %q telegram bot token: %w", ch.Name, err) + } + allowFrom, err := telegramAllowFrom(ctx, kube, namespace, spec) + if err != nil { + return nil, fmt.Errorf("channel %q telegram allowlist: %w", ch.Name, err) + } + ref := openclawEnvSecretRef(botEnv) + acc := telegramAccount{ + Name: ch.Name, + Enabled: true, + BotToken: credentialValue{envSecret: &ref}, + } + if len(allowFrom) > 0 { + acc.DMPolicy = "allowlist" + acc.AllowFrom = allowFrom + } else { + acc.DMPolicy = "pairing" + } + a.telegram[ch.Name] = acc + if a.tgDef == "" { + a.tgDef = ch.Name + } + return []corev1.EnvVar{botEnvVar}, nil +} + +func (a *harnessChannels) addSubstrateSlack(ctx context.Context, kube client.Client, namespace string, ch v1alpha2.AgentHarnessChannel) ([]corev1.EnvVar, error) { + spec := ch.Slack + if spec == nil { + return nil, fmt.Errorf("channel %q: slack spec is required", ch.Name) + } + botEnv := channelSecretEnvVar(ch.Name, "SLACK_BOT") + appEnv := channelSecretEnvVar(ch.Name, "SLACK_APP") + botEnvVar, err := channelCredentialContainerEnv(spec.BotToken, botEnv) + if err != nil { + return nil, fmt.Errorf("channel %q slack bot token: %w", ch.Name, err) + } + appEnvVar, err := channelCredentialContainerEnv(spec.AppToken, appEnv) + if err != nil { + return nil, fmt.Errorf("channel %q slack app token: %w", ch.Name, err) + } + botRef := openclawEnvSecretRef(botEnv) + appRef := openclawEnvSecretRef(appEnv) + opts := 
openClawSlackOptions(spec) + access := openClawSlackChannelAccess(opts) + acc := slackAccount{ + Name: ch.Name, + Enabled: true, + Mode: "socket", + BotToken: credentialValue{envSecret: &botRef}, + AppToken: credentialValue{envSecret: &appRef}, + UserTokenReadOnly: true, + GroupPolicy: string(access), + Capabilities: slackCaps{ + InteractiveReplies: slackInteractiveReplies(opts), + }, + } + if chans := trimNonEmptyStrings(opts.AllowlistChannels); len(chans) > 0 { + acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} + } + a.slack[ch.Name] = acc + if a.slDef == "" { + a.slDef = ch.Name + } + if !a.slackSeen { + a.slackRootPolicy = access + a.slackSeen = true + } + return []corev1.EnvVar{botEnvVar, appEnvVar}, nil +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go b/go/core/pkg/sandboxbackend/openclaw/constants.go similarity index 61% rename from go/core/pkg/sandboxbackend/openshell/openclaw/constants.go rename to go/core/pkg/sandboxbackend/openclaw/constants.go index e94d0789f1..bf696bd59d 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openclaw/constants.go @@ -4,13 +4,16 @@ const ( // NemoclawSandboxBaseImage is the default OpenShell VM image for OpenClaw/NemoClaw harnesses. NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4" - // bootstrapSecretProviderID is the secrets.providers key written into openclaw.json. - bootstrapSecretProviderID = "kagent" + // openshellSecretProviderID is the secrets.providers key written into openclaw.json for OpenShell sandboxes. + openshellSecretProviderID = "kagent" + + // substrateSecretProviderID is the env SecretRef provider id for native OpenClaw on Substrate. + substrateSecretProviderID = "default" // DefaultInferenceBaseURL is the Model provider baseUrl when ModelConfig does not set an explicit upstream (OpenShell). 
DefaultInferenceBaseURL = "https://inference.local/v1" - // SubstrateBootstrapDefaultBaseURL is passed to BuildBootstrapJSON for Substrate harnesses. + // SubstrateBootstrapDefaultBaseURL is passed when building openclaw.json for Substrate harnesses. // When ModelConfig has no explicit provider URL, the models section is omitted entirely so // OpenClaw is not given a partial providers.* block (baseUrl is required when present). SubstrateBootstrapDefaultBaseURL = "" diff --git a/go/core/pkg/sandboxbackend/openclaw/credentials.go b/go/core/pkg/sandboxbackend/openclaw/credentials.go new file mode 100644 index 0000000000..b167802c96 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/credentials.go @@ -0,0 +1,93 @@ +package openclaw + +import ( + "context" + "fmt" + "strings" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func sandboxChannelEnvSuffix(name string) string { + var b strings.Builder + for _, r := range strings.ToUpper(strings.TrimSpace(name)) { + switch { + case r >= 'A' && r <= 'Z', r >= '0' && r <= '9': + b.WriteRune(r) + default: + b.WriteByte('_') + } + } + s := strings.Trim(b.String(), "_") + if s == "" { + return "CH" + } + return s +} + +func channelSecretEnvVar(channelName, tokenRole string) string { + return fmt.Sprintf("KAGENT_SB_CH_%s_%s", sandboxChannelEnvSuffix(channelName), tokenRole) +} + +func putChannelCredential(ctx context.Context, kube client.Client, namespace string, cred v1alpha2.AgentHarnessChannelCredential, envKey string, env map[string]string) error { + if strings.TrimSpace(cred.Value) != "" { + env[envKey] = strings.TrimSpace(cred.Value) + return nil + } + if cred.ValueFrom == nil { + return fmt.Errorf("channel credential requires value or valueFrom") + } + v, err := cred.ValueFrom.Resolve(ctx, kube, namespace) + if err != nil { + return fmt.Errorf("resolve credential %s: %w", envKey, err) + } + env[envKey] = v + return nil +} + +// 
channelCredentialContainerEnv maps a harness channel credential to an ActorTemplate env var. +// Inline values use env.Value; Secret/ConfigMap sources use valueFrom refs resolved by Substrate ate-api at resume. +func channelCredentialContainerEnv(cred v1alpha2.AgentHarnessChannelCredential, envKey string) (corev1.EnvVar, error) { + if v := strings.TrimSpace(cred.Value); v != "" { + return corev1.EnvVar{Name: envKey, Value: v}, nil + } + if cred.ValueFrom == nil { + return corev1.EnvVar{}, fmt.Errorf("channel credential requires value or valueFrom") + } + switch cred.ValueFrom.Type { + case v1alpha2.SecretValueSource: + return corev1.EnvVar{ + Name: envKey, + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: cred.ValueFrom.Name}, + Key: cred.ValueFrom.Key, + }, + }, + }, nil + case v1alpha2.ConfigMapValueSource: + return corev1.EnvVar{ + Name: envKey, + ValueFrom: &corev1.EnvVarSource{ + ConfigMapKeyRef: &corev1.ConfigMapKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{Name: cred.ValueFrom.Name}, + Key: cred.ValueFrom.Key, + }, + }, + }, nil + default: + return corev1.EnvVar{}, fmt.Errorf("unknown value source type %q", cred.ValueFrom.Type) + } +} + +// resolvedChannelSecret returns the plaintext value putChannelCredential stored in env. +// OpenShell bootstrap still inlines channel tokens in openclaw.json; Substrate uses OpenClaw env SecretRefs instead. 
+func resolvedChannelSecret(env map[string]string, envKey string) (string, error) { + v := strings.TrimSpace(env[envKey]) + if v == "" { + return "", fmt.Errorf("credential %s is missing or empty after resolve", envKey) + } + return v, nil +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/defaults.go b/go/core/pkg/sandboxbackend/openclaw/defaults.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/defaults.go rename to go/core/pkg/sandboxbackend/openclaw/defaults.go diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go b/go/core/pkg/sandboxbackend/openclaw/modelconfig.go similarity index 58% rename from go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go rename to go/core/pkg/sandboxbackend/openclaw/modelconfig.go index 3bb29e88fd..a83e5871b6 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/modelconfig.go +++ b/go/core/pkg/sandboxbackend/openclaw/modelconfig.go @@ -16,6 +16,31 @@ func GatewayProviderRecordName(provider v1alpha2.ModelProvider) string { return strings.ToLower(string(provider)) } +// ModelConfigAPIKeyEnvVar returns a container env var that references the ModelConfig API key Secret. +// Substrate ate-api resolves secretKeyRef when resuming an actor (see workload_spec in substrate ate-api). 
+func ModelConfigAPIKeyEnvVar(mc *v1alpha2.ModelConfig) (corev1.EnvVar, error) { + if mc == nil { + return corev1.EnvVar{}, fmt.Errorf("ModelConfig is required") + } + if mc.Spec.APIKeyPassthrough { + return corev1.EnvVar{}, fmt.Errorf("APIKeyPassthrough is not supported for Substrate OpenClaw provisioning from ModelConfig") + } + if mc.Spec.APIKeySecret == "" || mc.Spec.APIKeySecretKey == "" { + return corev1.EnvVar{}, fmt.Errorf("modelConfig %s/%s requires apiKeySecret and apiKeySecretKey", mc.Namespace, mc.Name) + } + return corev1.EnvVar{ + Name: DefaultAPIKeyEnvVar(mc.Spec.Provider), + ValueFrom: &corev1.EnvVarSource{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: mc.Spec.APIKeySecret, + }, + Key: mc.Spec.APIKeySecretKey, + }, + }, + }, nil +} + // ResolveModelConfigAPIKey reads the API key from the Secret referenced by ModelConfig. func ResolveModelConfigAPIKey(ctx context.Context, kube client.Client, mc *v1alpha2.ModelConfig) (string, error) { if mc.Spec.APIKeyPassthrough { diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/policy.go b/go/core/pkg/sandboxbackend/openclaw/policy.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/policy.go rename to go/core/pkg/sandboxbackend/openclaw/policy.go diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/provider.go b/go/core/pkg/sandboxbackend/openclaw/provider.go similarity index 100% rename from go/core/pkg/sandboxbackend/openshell/openclaw/provider.go rename to go/core/pkg/sandboxbackend/openclaw/provider.go diff --git a/go/core/pkg/sandboxbackend/openclaw/secrets.go b/go/core/pkg/sandboxbackend/openclaw/secrets.go new file mode 100644 index 0000000000..82bf3b8e97 --- /dev/null +++ b/go/core/pkg/sandboxbackend/openclaw/secrets.go @@ -0,0 +1,33 @@ +package openclaw + +import ( + "encoding/json" +) + +// envSecretRef is OpenClaw's structured env SecretRef (see https://docs.openclaw.ai/gateway/secrets). 
+type envSecretRef struct { + Source string `json:"source"` + Provider string `json:"provider"` + ID string `json:"id"` +} + +func openclawEnvSecretRef(envVar string) envSecretRef { + return envSecretRef{ + Source: "env", + Provider: substrateSecretProviderID, + ID: envVar, + } +} + +// credentialValue marshals as either a plaintext string (OpenShell) or an OpenClaw env SecretRef (Substrate). +type credentialValue struct { + literal string + envSecret *envSecretRef +} + +func (c credentialValue) MarshalJSON() ([]byte, error) { + if c.envSecret != nil { + return json.Marshal(c.envSecret) + } + return json.Marshal(c.literal) +} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go b/go/core/pkg/sandboxbackend/openclaw/ssh_test.go similarity index 72% rename from go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go rename to go/core/pkg/sandboxbackend/openclaw/ssh_test.go index a2f5d32aef..0d2773a544 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/ssh_test.go +++ b/go/core/pkg/sandboxbackend/openclaw/ssh_test.go @@ -3,7 +3,7 @@ package openclaw_test import ( "testing" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" ) diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go b/go/core/pkg/sandboxbackend/openclaw/types.go similarity index 77% rename from go/core/pkg/sandboxbackend/openshell/openclaw/types.go rename to go/core/pkg/sandboxbackend/openclaw/types.go index 40b3e75b4c..2fac8ba330 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openclaw/types.go @@ -36,11 +36,11 @@ type modelsSection struct { } type providerSettings struct { - BaseURL string `json:"baseUrl,omitempty"` - APIKey string `json:"apiKey"` - Auth string `json:"auth"` - API string `json:"api"` - Models []modelSlot `json:"models"` + BaseURL string 
`json:"baseUrl,omitempty"` + APIKey credentialValue `json:"apiKey"` + Auth string `json:"auth"` + API string `json:"api"` + Models []modelSlot `json:"models"` } type modelSlot struct { @@ -72,11 +72,11 @@ type telegramBundle struct { } type telegramAccount struct { - Name string `json:"name"` - Enabled bool `json:"enabled"` - BotToken string `json:"botToken"` - DMPolicy string `json:"dmPolicy"` - AllowFrom []string `json:"allowFrom,omitempty"` + Name string `json:"name"` + Enabled bool `json:"enabled"` + BotToken credentialValue `json:"botToken"` + DMPolicy string `json:"dmPolicy"` + AllowFrom []string `json:"allowFrom,omitempty"` } type slackBundle struct { @@ -90,11 +90,11 @@ type slackBundle struct { } type slackAccount struct { - Name string `json:"name"` - Enabled bool `json:"enabled"` - Mode string `json:"mode"` - BotToken string `json:"botToken"` - AppToken string `json:"appToken"` + Name string `json:"name"` + Enabled bool `json:"enabled"` + Mode string `json:"mode"` + BotToken credentialValue `json:"botToken"` + AppToken credentialValue `json:"appToken"` UserTokenReadOnly bool `json:"userTokenReadOnly"` GroupPolicy string `json:"groupPolicy"` Capabilities slackCaps `json:"capabilities"` @@ -112,9 +112,14 @@ type groupDM struct { type secretsSection struct { Providers map[string]secretProvider `json:"providers"` + Defaults *secretsDefaults `json:"defaults,omitempty"` } type secretProvider struct { Source string `json:"source"` - Allowlist []string `json:"allowlist"` + Allowlist []string `json:"allowlist,omitempty"` +} + +type secretsDefaults struct { + Env string `json:"env"` } diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw.go b/go/core/pkg/sandboxbackend/openshell/openclaw.go index f8032b4235..44a3508e10 100644 --- a/go/core/pkg/sandboxbackend/openshell/openclaw.go +++ b/go/core/pkg/sandboxbackend/openshell/openclaw.go @@ -10,7 +10,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" 
"github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go b/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go deleted file mode 100644 index e30f64daf8..0000000000 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/bootstrap_substrate_test.go +++ /dev/null @@ -1,22 +0,0 @@ -package openclaw_test - -import ( - "encoding/json" - "testing" - - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" - "github.com/stretchr/testify/require" -) - -func TestSubstrateGatewayBootstrap(t *testing.T) { - t.Parallel() - raw, err := openclaw.BuildGatewayOnlyBootstrapJSON(openclaw.SubstrateGatewayBootstrap("tok", 80, "/api/agentharnesses/kagent/claw/gateway/")) - require.NoError(t, err) - var root map[string]any - require.NoError(t, json.Unmarshal(raw, &root)) - gw := root["gateway"].(map[string]any) - require.Equal(t, "lan", gw["bind"]) - cui := gw["controlUi"].(map[string]any) - require.Equal(t, "/api/agentharnesses/kagent/claw/gateway", cui["basePath"]) - require.Equal(t, true, cui["dangerouslyDisableDeviceAuth"]) -} diff --git a/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go b/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go deleted file mode 100644 index 1223a30f2f..0000000000 --- a/go/core/pkg/sandboxbackend/openshell/openclaw/channels.go +++ /dev/null @@ -1,97 +0,0 @@ -package openclaw - -import ( - "context" - "maps" - - "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/channels" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type harnessChannels struct { - resolved *channels.Resolved 
-} - -func accumulateHarnessChannels(ctx context.Context, kube client.Client, namespace string, backend v1alpha2.AgentHarnessBackendType, specChannels []v1alpha2.AgentHarnessChannel, env map[string]string) (*harnessChannels, error) { - resolved, err := channels.Resolve(ctx, kube, namespace, backend, specChannels) - if err != nil { - return nil, err - } - maps.Copy(env, resolved.Secrets) - return &harnessChannels{resolved: resolved}, nil -} - -func (a *harnessChannels) channelsJSON() *channelsConfig { - if a == nil || a.resolved == nil { - return nil - } - r := a.resolved - if len(r.Telegram) == 0 && len(r.Slack) == 0 { - return nil - } - out := &channelsConfig{} - if len(r.Telegram) > 0 { - accounts := make(map[string]telegramAccount, len(r.Telegram)) - var def string - for _, tg := range r.Telegram { - acc := telegramAccount{ - Name: tg.Name, - Enabled: true, - BotToken: openshellResolveEnv(channels.TelegramBotTokenEnvKey(tg.Name)), - } - if len(tg.AllowFrom) > 0 { - acc.DMPolicy = "allowlist" - acc.AllowFrom = tg.AllowFrom - } else { - acc.DMPolicy = "pairing" - } - accounts[tg.Name] = acc - if def == "" { - def = tg.Name - } - } - out.Telegram = &telegramBundle{ - Enabled: true, - Accounts: accounts, - DefaultAccount: def, - } - } - if len(r.Slack) > 0 { - accounts := make(map[string]slackAccount, len(r.Slack)) - var def string - for _, sl := range r.Slack { - botKey := channels.SlackBotTokenEnvKey(sl.Name) - appKey := channels.SlackAppTokenEnvKey(sl.Name) - acc := slackAccount{ - Name: sl.Name, - Enabled: true, - Mode: "socket", - BotToken: channels.SlackBotTokenPlaceholder(botKey), - AppToken: channels.SlackAppTokenPlaceholder(appKey), - UserTokenReadOnly: true, - GroupPolicy: string(sl.ChannelAccess), - Capabilities: slackCaps{ - InteractiveReplies: sl.InteractiveReplies, - }, - } - if chans := sl.AllowlistChannels; len(chans) > 0 { - acc.DM = &groupDM{GroupEnabled: true, GroupChannels: chans} - } - accounts[sl.Name] = acc - if def == "" { - def = sl.Name - } 
- } - out.Slack = &slackBundle{ - Enabled: true, - Mode: "socket", - WebhookPath: "/slack/events", - UserTokenReadOnly: true, - GroupPolicy: string(r.SlackRootGroupPolicy()), - Accounts: accounts, - DefaultAccount: def, - } - } - return out -} diff --git a/go/core/pkg/sandboxbackend/openshell/openshell_test.go b/go/core/pkg/sandboxbackend/openshell/openshell_test.go index 1f21c0d161..7c55ea45ab 100644 --- a/go/core/pkg/sandboxbackend/openshell/openshell_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openshell_test.go @@ -13,7 +13,7 @@ import ( openshellv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/openshellv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/codes" diff --git a/go/core/pkg/sandboxbackend/openshell/policy.go b/go/core/pkg/sandboxbackend/openshell/policy.go index b01b3a20e5..5f82cac698 100644 --- a/go/core/pkg/sandboxbackend/openshell/policy.go +++ b/go/core/pkg/sandboxbackend/openshell/policy.go @@ -6,8 +6,8 @@ import ( sandboxv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/sandboxv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" "google.golang.org/protobuf/proto" ) diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go index 2299a76dc8..c7e6c2033c 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go @@ -5,7 +5,7 @@ import ( 
"github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) // ResolveSSHRemoteCommand decides whether to run an interactive shell or a harness CLI. diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go index d9d1451dea..a1f4849711 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go @@ -5,7 +5,7 @@ import ( "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) func TestResolveSSHRemoteCommand(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/openshell/translate.go b/go/core/pkg/sandboxbackend/openshell/translate.go index c6ae5a0d0d..d29c1fd8b1 100644 --- a/go/core/pkg/sandboxbackend/openshell/translate.go +++ b/go/core/pkg/sandboxbackend/openshell/translate.go @@ -9,7 +9,7 @@ import ( openshellv1 "github.com/kagent-dev/kagent/go/api/openshell/gen/openshellv1" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" ctrllog "sigs.k8s.io/controller-runtime/pkg/log" diff --git a/go/core/pkg/sandboxbackend/openshell/translate_test.go b/go/core/pkg/sandboxbackend/openshell/translate_test.go index 463ad5aa14..88f7124f42 100644 --- a/go/core/pkg/sandboxbackend/openshell/translate_test.go 
+++ b/go/core/pkg/sandboxbackend/openshell/translate_test.go @@ -4,8 +4,8 @@ import ( "testing" "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 0b74d786f7..47d641a2cf 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -21,7 +21,7 @@ func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) err if ah == nil || ah.Annotations == nil { return nil } - if ah.Annotations[annotationManagedActorTemplate] == "true" { + if ah.Annotations[AnnotationManagedActorTemplate] == "true" { key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} if err := p.deleteGoldenActor(ctx, key); err != nil { return err @@ -35,7 +35,7 @@ func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) err return err } } - if ah.Annotations[annotationManagedWorkerPool] == "true" { + if ah.Annotations[AnnotationManagedWorkerPool] == "true" { key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} var wp atev1alpha1.WorkerPool if err := p.Client.Get(ctx, key, &wp); err == nil { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go index dc4b8e338d..cfa632157e 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -44,7 +44,7 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { Name: "peterj-claw", Namespace: ns, 
Annotations: map[string]string{ - annotationManagedActorTemplate: "true", + AnnotationManagedActorTemplate: "true", }, }, } diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 0d269a45d8..7022085dea 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -115,7 +115,7 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time - // (see substrate.Provisioner.buildOpenClawActorStartup — same openclaw.BuildBootstrapJSON as OpenShell). + // (see substrate/provision_openclaw.go — openclaw.BuildSubstrateBootstrapJSON with secretKeyRef env). _ = ctx _ = ah _ = h diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index d704baa927..836e48e065 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -5,62 +5,10 @@ import ( "fmt" "strings" - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" - corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -const ( - AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" - AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" - - annotationManagedWorkerPool = AnnotationManagedWorkerPool - annotationManagedActorTemplate = AnnotationManagedActorTemplate - - 
defaultWorkerPoolReplicas = int32(1) - defaultSnapshotsBucket = "ate-snapshots" - defaultOpenClawContainer = "openclaw" -) - -// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. -type ProvisionDefaults struct { - PauseImage string - RunscAMD64URL string - RunscAMD64SHA256 string - RunscARM64URL string - RunscARM64SHA256 string - DefaultAteomImage string - DefaultWorkloadImage string -} - -// ateActorDeleter removes actors from ate-api during harness teardown. -type ateActorDeleter interface { - deleteActorSequenced(ctx context.Context, actorID string) error -} - -// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. -type Provisioner struct { - Client client.Client - Defaults ProvisionDefaults - // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. - Ate ateActorDeleter -} - -// EnsureResult describes provisioned Substrate resources. -type EnsureResult struct { - WorkerPoolRef types.NamespacedName - ActorTemplateRef types.NamespacedName - ActorTemplateReady bool - ManagedWorkerPool bool - ManagedActorTemplate bool -} - // Ensure creates or updates Substrate CRs and waits for ActorTemplate Ready. func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { if ah == nil || ah.Spec.Substrate == nil { @@ -70,23 +18,8 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En return EnsureResult{}, err } - // Legacy / advanced: user supplied an existing template. 
if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { - ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} - ready, err := p.actorTemplateReady(ctx, tmplKey) - if err != nil { - return EnsureResult{}, err - } - return EnsureResult{ - ActorTemplateRef: tmplKey, - ActorTemplateReady: ready, - ManagedActorTemplate: false, - }, nil + return p.ensureAdoptedActorTemplate(ctx, ah) } wpKey, managedWP, err := p.ensureWorkerPool(ctx, ah) @@ -104,7 +37,6 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En return EnsureResult{}, err } - _ = managedWP return EnsureResult{ WorkerPoolRef: wpKey, ActorTemplateRef: tmplKey, @@ -114,208 +46,20 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En }, nil } -func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { - sub := ah.Spec.Substrate - if err := ValidateGatewayTokenSpec(sub); err != nil { - return err - } - if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { - return nil +func (p *Provisioner) ensureAdoptedActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { + ref := ah.Spec.Substrate.ActorTemplateRef + ns := ref.Namespace + if ns == "" { + ns = ah.Namespace } - loc := substrateSnapshotsLocation(ah) - if !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { - return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") - } - return nil -} - -func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { - sub 
:= ah.Spec.Substrate - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { - ns := sub.WorkerPoolRef.Namespace - if ns == "" { - ns = ah.Namespace - } - key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} - var wp atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &wp); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) - } - return key, false, nil - } - - key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} - replicas := defaultWorkerPoolReplicas - ateomImage := "" - if sub.WorkerPool != nil { - if sub.WorkerPool.Replicas > 0 { - replicas = sub.WorkerPool.Replicas - } - ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) - } - if ateomImage == "" { - ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) - } - if ateomImage == "" { - return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") - } - - desired := &atev1alpha1.WorkerPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, - Labels: provisionLabels(ah), - }, - Spec: atev1alpha1.WorkerPoolSpec{ - Replicas: replicas, - AteomImage: ateomImage, - }, - } - if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) - } - - var existing atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) - } - return key, true, nil - } else if err != nil { - return types.NamespacedName{}, false, err - } - existing.Spec.Replicas = desired.Spec.Replicas - existing.Spec.AteomImage = desired.Spec.AteomImage - if err := p.Client.Update(ctx, &existing); err 
!= nil { - return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) - } - return key, true, nil -} - -func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { - key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} - workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) - if workloadImage == "" { - workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) - } - if workloadImage == "" { - workloadImage = openshell.NemoclawSandboxBaseImage - } - startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + ready, err := p.actorTemplateReady(ctx, tmplKey) if err != nil { - return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) - } - - desired := &atev1alpha1.ActorTemplate{ - ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, - Labels: provisionLabels(ah), - }, - Spec: atev1alpha1.ActorTemplateSpec{ - PauseImage: p.Defaults.PauseImage, - Runsc: defaultRunscConfig(p.Defaults), - Containers: []atev1alpha1.Container{ - { - Name: defaultOpenClawContainer, - Image: workloadImage, - Ports: []corev1.ContainerPort{{ContainerPort: 80}}, - Command: []string{ - "/bin/sh", - "-c", - startupScript, - }, - Env: containerEnv, - }, - }, - WorkerPoolRef: corev1.ObjectReference{ - Name: wpKey.Name, - Namespace: wpKey.Namespace, - }, - SnapshotsConfig: atev1alpha1.SnapshotsConfig{ - Location: substrateSnapshotsLocation(ah), - }, - }, - } - if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) - } - - var existing atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil 
{ - return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) - } - return key, nil - } else if err != nil { - return types.NamespacedName{}, err - } - existing.Spec = desired.Spec - if err := p.Client.Update(ctx, &existing); err != nil { - return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) - } - return key, nil -} - -func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { - var tmpl atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &tmpl); err != nil { - return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) - } - return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil -} - -func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { - return atev1alpha1.RunscConfig{ - AMD64: &atev1alpha1.RunscPlatformConfig{ - URL: d.RunscAMD64URL, - SHA256Hash: d.RunscAMD64SHA256, - }, - ARM64: &atev1alpha1.RunscPlatformConfig{ - URL: d.RunscARM64URL, - SHA256Hash: d.RunscARM64SHA256, - }, - } -} - -func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { - if ah == nil { - return defaultSubstrateSnapshotsLocation("", "") - } - if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { - if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { - return loc - } - } - return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) -} - -func defaultSubstrateSnapshotsLocation(namespace, name string) string { - return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) -} - -func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { - return map[string]string{ - "app.kubernetes.io/managed-by": "kagent", - "kagent.dev/agent-harness": ah.Name, - } -} - -func workerPoolName(ah *v1alpha2.AgentHarness) string { - return truncateDNS1123(ah.Name + "-wp") -} - -func actorTemplateName(ah *v1alpha2.AgentHarness) string { - return truncateDNS1123(ah.Name) -} - -func truncateDNS1123(s string) 
string { - s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) - if len(s) > 63 { - s = strings.TrimRight(s[:63], "-") + return EnsureResult{}, err } - return s + return EnsureResult{ + ActorTemplateRef: tmplKey, + ActorTemplateReady: ready, + ManagedActorTemplate: false, + }, nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go new file mode 100644 index 0000000000..c1ae943125 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go @@ -0,0 +1,89 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) + if workloadImage == "" { + workloadImage = strings.TrimSpace(p.Defaults.DefaultWorkloadImage) + } + if workloadImage == "" { + workloadImage = openclaw.NemoclawSandboxBaseImage + } + startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) + if err != nil { + return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) + } + + desired := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.ActorTemplateSpec{ + PauseImage: p.Defaults.PauseImage, + Runsc: 
defaultRunscConfig(p.Defaults), + Containers: []atev1alpha1.Container{ + { + Name: defaultOpenClawContainer, + Image: workloadImage, + Ports: []corev1.ContainerPort{{ContainerPort: 80}}, + Command: []string{ + "/bin/sh", + "-c", + startupScript, + }, + Env: containerEnv, + }, + }, + WorkerPoolRef: corev1.ObjectReference{ + Name: wpKey.Name, + Namespace: wpKey.Namespace, + }, + SnapshotsConfig: atev1alpha1.SnapshotsConfig{ + Location: substrateSnapshotsLocation(ah), + }, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) + } + + var existing atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) + } + return key, nil + } else if err != nil { + return types.NamespacedName{}, err + } + existing.Spec = desired.Spec + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) + } + return key, nil +} + +func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, key, &tmpl); err != nil { + return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) + } + return tmpl.Status.Phase == atev1alpha1.PhaseReady, nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index 95c561d48b..6892389127 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -4,12 +4,11 @@ import ( "context" "encoding/base64" "fmt" - "sort" "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" 
"github.com/kagent-dev/kagent/go/core/internal/utils" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" corev1 "k8s.io/api/core/v1" ) @@ -32,7 +31,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha gw := openclaw.SubstrateGatewayBootstrap(token, defaultSubstrateOpenClawGatewayPort, openClawControlUIBasePath(ah)) var jsonBytes []byte - var envMap map[string]string + var containerEnv []corev1.EnvVar ref := strings.TrimSpace(ah.Spec.ModelConfigRef) if ref != "" { @@ -44,7 +43,7 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if getErr := p.Client.Get(ctx, mcRef, mc); getErr != nil { return "", nil, fmt.Errorf("get ModelConfig %s: %w", mcRef, getErr) } - jsonBytes, envMap, err = openclaw.BuildBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw, openclaw.SubstrateBootstrapDefaultBaseURL) + jsonBytes, containerEnv, err = openclaw.BuildSubstrateBootstrapJSON(ctx, p.Client, ah.Namespace, ah, mc, gw) if err != nil { return "", nil, fmt.Errorf("build openclaw bootstrap json: %w", err) } @@ -53,10 +52,8 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha if err != nil { return "", nil, fmt.Errorf("build gateway-only openclaw json: %w", err) } - envMap = map[string]string{} + containerEnv = []corev1.EnvVar{{Name: "HOME", Value: "/root"}} } - - containerEnv := openClawEnvVars(envMap) script = openClawStartupScript(jsonBytes, gw.Port) return script, containerEnv, nil } @@ -68,20 +65,6 @@ func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" } -func openClawEnvVars(envMap map[string]string) []corev1.EnvVar { - keys := make([]string, 0, len(envMap)) - for k := range envMap { - keys = append(keys, k) - } - sort.Strings(keys) - out := make([]corev1.EnvVar, 0, len(keys)+1) - for _, k := range keys { 
- out = append(out, corev1.EnvVar{Name: k, Value: envMap[k]}) - } - out = append(out, corev1.EnvVar{Name: "HOME", Value: "/root"}) - return out -} - func openClawStartupScript(jsonBytes []byte, gwPort int) string { b64 := base64.StdEncoding.EncodeToString(jsonBytes) return strings.Join([]string{ diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index 1de21a3685..bd24ca7b1e 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -63,11 +63,17 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { var foundKey bool for _, e := range env { - if e.Name == "OPENAI_API_KEY" && e.Value == "sk-test" { - foundKey = true + if e.Name != "OPENAI_API_KEY" { + continue } + require.NotNil(t, e.ValueFrom) + require.NotNil(t, e.ValueFrom.SecretKeyRef) + require.Equal(t, "openai-key", e.ValueFrom.SecretKeyRef.Name) + require.Equal(t, "OPENAI_API_KEY", e.ValueFrom.SecretKeyRef.Key) + require.Empty(t, e.Value, "API key must not be inlined in ActorTemplate env") + foundKey = true } - require.True(t, foundKey, "expected OPENAI_API_KEY in container env") + require.True(t, foundKey, "expected OPENAI_API_KEY secretKeyRef in container env") // Decode embedded JSON from the base64 line in the startup script. 
var payload string diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go new file mode 100644 index 0000000000..c0d4b842e3 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -0,0 +1,124 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" + AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" + + defaultWorkerPoolReplicas = int32(1) + defaultSnapshotsBucket = "ate-snapshots" + defaultOpenClawContainer = "openclaw" +) + +// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. +type ProvisionDefaults struct { + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string + DefaultAteomImage string + DefaultWorkloadImage string +} + +// ateActorDeleter removes actors from ate-api during harness teardown. +type ateActorDeleter interface { + deleteActorSequenced(ctx context.Context, actorID string) error +} + +// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. +type Provisioner struct { + Client client.Client + Defaults ProvisionDefaults + // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. + Ate ateActorDeleter +} + +// EnsureResult describes provisioned Substrate resources. 
+type EnsureResult struct { + WorkerPoolRef types.NamespacedName + ActorTemplateRef types.NamespacedName + ActorTemplateReady bool + ManagedWorkerPool bool + ManagedActorTemplate bool +} + +func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { + sub := ah.Spec.Substrate + if err := ValidateGatewayTokenSpec(sub); err != nil { + return err + } + if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { + return nil + } + loc := substrateSnapshotsLocation(ah) + if !strings.HasPrefix(loc, "gs://") { + return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") + } + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { + return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") + } + return nil +} + +func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { + return atev1alpha1.RunscConfig{ + AMD64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscAMD64URL, + SHA256Hash: d.RunscAMD64SHA256, + }, + ARM64: &atev1alpha1.RunscPlatformConfig{ + URL: d.RunscARM64URL, + SHA256Hash: d.RunscARM64SHA256, + }, + } +} + +func substrateSnapshotsLocation(ah *v1alpha2.AgentHarness) string { + if ah == nil { + return defaultSubstrateSnapshotsLocation("", "") + } + if sub := ah.Spec.Substrate; sub != nil && sub.SnapshotsConfig != nil { + if loc := strings.TrimSpace(sub.SnapshotsConfig.Location); loc != "" { + return loc + } + } + return defaultSubstrateSnapshotsLocation(ah.Namespace, ah.Name) +} + +func defaultSubstrateSnapshotsLocation(namespace, name string) string { + return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) +} + +func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { + return map[string]string{ + "app.kubernetes.io/managed-by": "kagent", + "kagent.dev/agent-harness": ah.Name, + } +} + +func workerPoolName(ah *v1alpha2.AgentHarness) 
string { + return truncateDNS1123(ah.Name + "-wp") +} + +func actorTemplateName(ah *v1alpha2.AgentHarness) string { + return truncateDNS1123(ah.Name) +} + +func truncateDNS1123(s string) string { + s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) + if len(s) > 63 { + s = strings.TrimRight(s[:63], "-") + } + return s +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go new file mode 100644 index 0000000000..3504c7f651 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go @@ -0,0 +1,77 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" +) + +func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { + sub := ah.Spec.Substrate + if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { + ns := sub.WorkerPoolRef.Namespace + if ns == "" { + ns = ah.Namespace + } + key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) + } + return key, false, nil + } + + key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + replicas := defaultWorkerPoolReplicas + ateomImage := "" + if sub.WorkerPool != nil { + if sub.WorkerPool.Replicas > 0 { + replicas = sub.WorkerPool.Replicas + } + ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) + } + if ateomImage == "" { + ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) + } + if 
ateomImage == "" { + return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") + } + + desired := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: provisionLabels(ah), + }, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: replicas, + AteomImage: ateomImage, + }, + } + if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) + } + + var existing atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { + if err := p.Client.Create(ctx, desired); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) + } + return key, true, nil + } else if err != nil { + return types.NamespacedName{}, false, err + } + existing.Spec.Replicas = desired.Spec.Replicas + existing.Spec.AteomImage = desired.Spec.AteomImage + if err := p.Client.Update(ctx, &existing); err != nil { + return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) + } + return key, true, nil +} From 959972a1a05e2ed34c4cc53681fcdb35b8b0eae0 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 10:27:34 -0700 Subject: [PATCH 17/32] move startup script to a template Signed-off-by: Peter Jausovec --- .../substrate/provision_openclaw.go | 40 ++++++++++++------- .../templates/openclaw_startup.sh.tmpl | 9 +++++ 2 files changed, 35 insertions(+), 14 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go index 6892389127..96927611b7 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ 
b/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go @@ -1,10 +1,13 @@ package substrate import ( + "bytes" "context" + _ "embed" "encoding/base64" "fmt" "strings" + "text/template" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/internal/utils" @@ -14,6 +17,16 @@ import ( const defaultSubstrateOpenClawGatewayPort = 80 +//go:embed templates/openclaw_startup.sh.tmpl +var openClawStartupScriptTmplContent string + +var openClawStartupScriptTmpl = template.Must(template.New("openclaw_startup").Parse(openClawStartupScriptTmplContent)) + +type openClawStartupScriptData struct { + OpenClawJSONBase64 string + GatewayPort int +} + // buildOpenClawActorStartup returns the ateom workload startup script and container env for OpenClaw on Substrate. // When spec.modelConfigRef is set, openclaw.json includes models/agents/channels like the OpenShell bootstrap path. func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { @@ -54,7 +67,10 @@ func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha } containerEnv = []corev1.EnvVar{{Name: "HOME", Value: "/root"}} } - script = openClawStartupScript(jsonBytes, gw.Port) + script, err = openClawStartupScript(jsonBytes, gw.Port) + if err != nil { + return "", nil, err + } return script, containerEnv, nil } @@ -65,17 +81,13 @@ func openClawControlUIBasePath(ah *v1alpha2.AgentHarness) string { return "/api/agentharnesses/" + ah.Namespace + "/" + ah.Name + "/gateway" } -func openClawStartupScript(jsonBytes []byte, gwPort int) string { - b64 := base64.StdEncoding.EncodeToString(jsonBytes) - return strings.Join([]string{ - "set -e", - `mkdir -p "${HOME}/.openclaw"`, - fmt.Sprintf(`echo '%s' | base64 -d > "${HOME}/.openclaw/openclaw.json"`, b64), - fmt.Sprintf("openclaw gateway run --port %d --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 &", gwPort), - `for i in $(seq 1 60); do`, - 
` curl -sf http://127.0.0.1:80/ >/dev/null 2>&1 && echo "gateway up" && break`, - " sleep 1", - "done", - "tail -f /tmp/openclaw-gateway.log /dev/null", - }, "\n") +func openClawStartupScript(jsonBytes []byte, gwPort int) (string, error) { + var buf bytes.Buffer + if err := openClawStartupScriptTmpl.Execute(&buf, openClawStartupScriptData{ + OpenClawJSONBase64: base64.StdEncoding.EncodeToString(jsonBytes), + GatewayPort: gwPort, + }); err != nil { + return "", fmt.Errorf("render openclaw startup script: %w", err) + } + return strings.TrimRight(buf.String(), "\n"), nil } diff --git a/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl new file mode 100644 index 0000000000..184ad91c74 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl @@ -0,0 +1,9 @@ +set -e +mkdir -p "${HOME}/.openclaw" +echo '{{.OpenClawJSONBase64}}' | base64 -d > "${HOME}/.openclaw/openclaw.json" +openclaw gateway run --port {{.GatewayPort}} --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 & +for i in $(seq 1 60); do + curl -sf http://127.0.0.1:{{.GatewayPort}}/ >/dev/null 2>&1 && echo "gateway up" && break + sleep 1 +done +tail -f /tmp/openclaw-gateway.log /dev/null From 80ddbaf2fa24fbab78d318bd4b428759a978670e Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Wed, 27 May 2026 14:32:51 -0700 Subject: [PATCH 18/32] pr feedback Signed-off-by: Peter Jausovec --- .../crd/bases/kagent.dev_agentharnesses.yaml | 111 ++++++++----- go/api/v1alpha2/agentharness_types.go | 33 ++-- go/api/v1alpha2/zz_generated.deepcopy.go | 17 +- .../controller/agentharness_controller.go | 157 +++++++++++++----- .../agentharness_substrate_watches.go | 97 +++++++++++ .../sandboxbackend/substrate/delete_actor.go | 116 +++---------- .../substrate/delete_actor_test.go | 15 +- .../substrate/delete_provision.go | 157 +++++++++++------- .../substrate/delete_provision_test.go | 33 +++- 
.../sandboxbackend/substrate/gateway_token.go | 33 +--- .../pkg/sandboxbackend/substrate/openclaw.go | 35 +--- .../pkg/sandboxbackend/substrate/provision.go | 11 +- .../substrate/provision_openclaw_test.go | 2 +- .../substrate/provision_shared.go | 20 +-- .../substrate/provision_test.go | 31 +--- .../substrate/provision_workerpool.go | 6 +- .../templates/kagent.dev_agentharnesses.yaml | 111 ++++++++----- 17 files changed, 554 insertions(+), 431 deletions(-) create mode 100644 go/core/internal/controller/agentharness_substrate_watches.go diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 2c2f18ff71..52f814c1aa 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -536,8 +536,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -564,8 +562,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -578,6 +574,7 @@ spec: description: |- Location is the GCS URI prefix for golden and incremental snapshots. 
Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + pattern: ^gs:// type: string required: - location @@ -609,8 +606,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -624,6 +619,8 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) + - message: workerPoolRef and workerPool are mutually exclusive + rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -633,6 +630,10 @@ spec: || (has(c.slack) && ((self.backend == ''hermes'' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == ''openclaw'' || self.backend == ''nemoclaw'') && has(c.slack.openclaw) && !has(c.slack.hermes)))))' + - message: spec.substrate may only be set when runtime is substrate + rule: '!has(self.substrate) || self.runtime == ''substrate''' + - message: spec.substrate is required when runtime is substrate + rule: self.runtime != 'substrate' || has(self.substrate) status: description: AgentHarnessStatus is the observed state of an AgentHarness. properties: @@ -726,42 +727,70 @@ spec: format: int64 type: integer substrate: - description: Substrate records auto-provisioned Substrate CR references. + description: Substrate records observed Substrate provisioning state. properties: - actorTemplateReady: - description: ActorTemplateReady is true when the template phase - is Ready (golden snapshot taken). - type: boolean - actorTemplateRef: - description: ActorTemplateRef is the ActorTemplate used when creating - the actor. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object - workerPoolRef: - description: WorkerPoolRef is the WorkerPool used by the harness - ActorTemplate. 
- properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object + conditions: + description: Conditions describe substrate provisioning progress + (e.g. ActorTemplate golden snapshot). + items: + description: Condition contains details for one aspect of the + current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object type: object diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index 44902e1ffb..f0181a569b 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -52,6 +52,7 @@ type AgentHarnessSubstrateSnapshotsConfig struct { // Location is the GCS URI prefix for golden and incremental snapshots. // Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ // +required + // +kubebuilder:validation:Pattern=`^gs://` Location string `json:"location"` } @@ -74,11 +75,12 @@ type AgentHarnessSubstrateWorkerPoolSpec struct { // By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). // Set actorTemplateRef only to adopt an existing template (advanced / legacy). // +kubebuilder:validation:XValidation:rule="(has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef))",message="Exactly one of gatewayToken or gatewayTokenSecretRef must be specified" +// +kubebuilder:validation:XValidation:rule="!(has(self.workerPoolRef) && has(self.workerPool))",message="workerPoolRef and workerPool are mutually exclusive" type AgentHarnessSubstrateSpec struct { // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). // Mutually exclusive with workerPool. // +optional - WorkerPoolRef *TypedReference `json:"workerPoolRef,omitempty"` + WorkerPoolRef *TypedLocalReference `json:"workerPoolRef,omitempty"` // WorkerPool creates a dedicated WorkerPool in the harness namespace when workerPoolRef is unset. 
// +optional @@ -96,7 +98,7 @@ type AgentHarnessSubstrateSpec struct { // ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. // When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. // +optional - ActorTemplateRef *TypedReference `json:"actorTemplateRef,omitempty"` + ActorTemplateRef *TypedLocalReference `json:"actorTemplateRef,omitempty"` // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). // +optional @@ -112,7 +114,7 @@ type AgentHarnessSubstrateSpec struct { // GatewayTokenSecretRef references a Secret key holding the OpenClaw gateway Bearer token. // The Secret must contain a "token" key. // +optional - GatewayTokenSecretRef *TypedReference `json:"gatewayTokenSecretRef,omitempty"` + GatewayTokenSecretRef *TypedLocalReference `json:"gatewayTokenSecretRef,omitempty"` } // AgentHarnessChannelType selects a messenger integration for OpenClaw harness VMs. @@ -231,6 +233,8 @@ type AgentHarnessChannel struct { // in. The backend is responsible for provisioning an environment that stays // ready to accept incoming commands. // +kubebuilder:validation:XValidation:rule="!has(self.channels) || self.channels.all(c, c.type != 'slack' || (has(c.slack) && ((self.backend == 'hermes' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == 'openclaw' || self.backend == 'nemoclaw') && has(c.slack.openclaw) && !has(c.slack.hermes)))))",message="slack backend-specific settings must match spec.backend" +// +kubebuilder:validation:XValidation:rule="!has(self.substrate) || self.runtime == 'substrate'",message="spec.substrate may only be set when runtime is substrate" +// +kubebuilder:validation:XValidation:rule="self.runtime != 'substrate' || has(self.substrate)",message="spec.substrate is required when runtime is substrate" type AgentHarnessSpec struct { // Backend selects the control plane to use. Required. 
// +required @@ -318,26 +322,27 @@ type AgentHarnessStatus struct { // +optional Connection *AgentHarnessConnection `json:"connection,omitempty"` - // Substrate records auto-provisioned Substrate CR references. + // Substrate records observed Substrate provisioning state. // +optional Substrate *AgentHarnessSubstrateStatus `json:"substrate,omitempty"` } // AgentHarnessSubstrateStatus is observed Substrate control-plane state for this harness. type AgentHarnessSubstrateStatus struct { - // WorkerPoolRef is the WorkerPool used by the harness ActorTemplate. + // Conditions describe substrate provisioning progress (e.g. ActorTemplate golden snapshot). // +optional - WorkerPoolRef TypedReference `json:"workerPoolRef,omitempty"` - - // ActorTemplateRef is the ActorTemplate used when creating the actor. - // +optional - ActorTemplateRef TypedReference `json:"actorTemplateRef,omitempty"` - - // ActorTemplateReady is true when the template phase is Ready (golden snapshot taken). - // +optional - ActorTemplateReady bool `json:"actorTemplateReady,omitempty"` + // +listType=map + // +listMapKey=type + Conditions []metav1.Condition `json:"conditions,omitempty"` } +// AgentHarnessSubstrateConditionType enumerates substrate-specific condition types. +const ( + AgentHarnessSubstrateConditionTypeActorTemplateReady = "ActorTemplateReady" + // AgentHarnessSubstrateConditionTypeResourcesCleaned is True when managed Substrate CRs are gone during delete. + AgentHarnessSubstrateConditionTypeResourcesCleaned = "ResourcesCleaned" +) + // AgentHarnessConditionType enumerates the condition types an AgentHarness may report. 
const ( AgentHarnessConditionTypeReady = "Ready" diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 6acf8938f6..9694e72608 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -354,7 +354,7 @@ func (in *AgentHarnessStatus) DeepCopyInto(out *AgentHarnessStatus) { if in.Substrate != nil { in, out := &in.Substrate, &out.Substrate *out = new(AgentHarnessSubstrateStatus) - **out = **in + (*in).DeepCopyInto(*out) } } @@ -403,7 +403,7 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec *out = *in if in.WorkerPoolRef != nil { in, out := &in.WorkerPoolRef, &out.WorkerPoolRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } if in.WorkerPool != nil { @@ -418,12 +418,12 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec } if in.ActorTemplateRef != nil { in, out := &in.ActorTemplateRef, &out.ActorTemplateRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } if in.GatewayTokenSecretRef != nil { in, out := &in.GatewayTokenSecretRef, &out.GatewayTokenSecretRef - *out = new(TypedReference) + *out = new(TypedLocalReference) **out = **in } } @@ -441,8 +441,13 @@ func (in *AgentHarnessSubstrateSpec) DeepCopy() *AgentHarnessSubstrateSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AgentHarnessSubstrateStatus) DeepCopyInto(out *AgentHarnessSubstrateStatus) { *out = *in - out.WorkerPoolRef = in.WorkerPoolRef - out.ActorTemplateRef = in.ActorTemplateRef + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateStatus. 
diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index 94b857d9ef..e371dbb0b3 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -40,6 +40,9 @@ const ( // status while the sandbox is still provisioning. agentHarnessNotReadyRequeue = 10 * time.Second + // substrateDeleteTimeout is the maximum time to wait for substrate cleanup during delete. + substrateDeleteTimeout = 5 * time.Minute + // annotationAgentHarnessBootstrapGeneration records the AgentHarness metadata.generation for which // post-ready bootstrap (backend OnAgentHarnessReady, e.g. exec hooks) already completed. annotationAgentHarnessBootstrapGeneration = "kagent.dev/agent-harness-bootstrap-generation" @@ -82,6 +85,7 @@ func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxba // +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) @@ -122,11 +126,20 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, nil } - runtime := ah.Spec.Runtime - if runtime == "" { - runtime = v1alpha2.AgentHarnessRuntimeOpenshell - } - if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + runtime := effectiveAgentHarnessRuntime(&ah) + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { + if r.SubstrateProvisioner == nil { + log.Error(nil, "substrate provisioner not configured") + 
setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "SubstrateProvisionerUnavailable", + "substrate runtime requires a configured substrate provisioner (set --substrate-ate-api-endpoint)") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "SubstrateProvisionerUnavailable", "") + if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } provRes, err := r.SubstrateProvisioner.Ensure(ctx, &ah) if err != nil { log.Error(err, "substrate provision failed") @@ -139,20 +152,13 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request } return ctrl.Result{}, err } - if ah.Status.Substrate == nil { - ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + if provRes.ActorTemplateReady { + setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + metav1.ConditionTrue, "Ready", "ActorTemplate golden snapshot is ready") + } else { + setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + metav1.ConditionFalse, "NotReady", "waiting for ActorTemplate golden snapshot") } - if provRes.WorkerPoolRef.Name != "" { - ah.Status.Substrate.WorkerPoolRef = v1alpha2.TypedReference{ - Name: provRes.WorkerPoolRef.Name, - Namespace: provRes.WorkerPoolRef.Namespace, - } - } - ah.Status.Substrate.ActorTemplateRef = v1alpha2.TypedReference{ - Name: provRes.ActorTemplateRef.Name, - Namespace: provRes.ActorTemplateRef.Namespace, - } - ah.Status.Substrate.ActorTemplateReady = provRes.ActorTemplateReady // Persist status before metadata annotation patch (client Patch can refresh ah and drop in-memory status). 
if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { return ctrl.Result{}, err @@ -273,22 +279,74 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph return ctrl.Result{}, nil } - if ah.Status.BackendRef != nil && ah.Status.BackendRef.ID != "" { - del := r.backendFor(ah) - if del != nil { - if err := del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: ah.Status.BackendRef.ID}); err != nil { - if r.Recorder != nil { - r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + if substrateDeleteTimedOut(ah) { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "DeleteTimeout", "substrate cleanup exceeded timeout") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, fmt.Errorf("substrate cleanup timed out for AgentHarness %s", ah.Name) + } + + runtime := effectiveAgentHarnessRuntime(ah) + actorID := "" + if ah.Status.BackendRef != nil { + actorID = ah.Status.BackendRef.ID + } + + if actorID != "" { + var actorDone bool + var err error + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { + actorDone, err = r.SubstrateProvisioner.AdvanceActorDelete(ctx, actorID) + } else if del := r.backendFor(ah); del != nil { + err = del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) + actorDone = err == nil + } else { + actorDone = true + } + if err != nil { + if r.Recorder != nil { + r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } + if !actorDone { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for substrate actor 
%q deletion", actorID)) + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + ah.Status.BackendRef = nil + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err } } - if r.SubstrateProvisioner != nil { - if err := r.SubstrateProvisioner.Delete(ctx, ah); err != nil { + if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { + if r.SubstrateProvisioner == nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, + fmt.Errorf("substrate provisioner is not configured") + } + complete, err := r.SubstrateProvisioner.AdvanceDelete(ctx, ah) + if err != nil { return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("delete substrate resources: %w", err) } + if !complete { + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionFalse, "CleanupInProgress", "waiting for managed Substrate resources to be removed") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + metav1.ConditionTrue, "Cleaned", "managed Substrate resources removed") + if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { + return ctrl.Result{}, err + } } controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) @@ -298,6 +356,13 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph return ctrl.Result{}, nil } +func substrateDeleteTimedOut(ah *v1alpha2.AgentHarness) bool { + if ah == nil || ah.DeletionTimestamp.IsZero() { + return false + } + return time.Since(ah.DeletionTimestamp.Time) > substrateDeleteTimeout +} + func (r *AgentHarnessController) patchAgentHarnessStatus(ctx context.Context, ah *v1alpha2.AgentHarness) error { if err := 
r.Client.Status().Update(ctx, ah); err != nil { return fmt.Errorf("update AgentHarness status: %w", err) @@ -322,10 +387,28 @@ func (r *AgentHarnessController) patchAgentHarnessProvisionAnnotations(ctx conte return nil } +func effectiveAgentHarnessRuntime(ah *v1alpha2.AgentHarness) v1alpha2.AgentHarnessRuntime { + if ah.Spec.Runtime == "" { + return v1alpha2.AgentHarnessRuntimeOpenshell + } + return ah.Spec.Runtime +} + func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { + setConditions(&ah.Status.Conditions, ah.Generation, t, s, reason, msg) +} + +func setSubstrateCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { + if ah.Status.Substrate == nil { + ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} + } + setConditions(&ah.Status.Substrate.Conditions, ah.Generation, t, s, reason, msg) +} + +func setConditions(conditions *[]metav1.Condition, generation int64, t string, s metav1.ConditionStatus, reason, msg string) { now := metav1.Now() - for i := range ah.Status.Conditions { - c := &ah.Status.Conditions[i] + for i := range *conditions { + c := &(*conditions)[i] if c.Type != t { continue } @@ -335,27 +418,27 @@ func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.Cond c.Status = s c.Reason = reason c.Message = msg - c.ObservedGeneration = ah.Generation + c.ObservedGeneration = generation return } - ah.Status.Conditions = append(ah.Status.Conditions, metav1.Condition{ + *conditions = append(*conditions, metav1.Condition{ Type: t, Status: s, Reason: reason, Message: msg, LastTransitionTime: now, - ObservedGeneration: ah.Generation, + ObservedGeneration: generation, }) } // SetupWithManager registers the controller with the manager. func (r *AgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). + b := ctrl.NewControllerManagedBy(mgr). 
WithOptions(controller.Options{NeedLeaderElection: new(true)}). For(&v1alpha2.AgentHarness{}, builder.WithPredicates(predicate.Or( predicate.GenerationChangedPredicate{}, predicate.LabelChangedPredicate{}, - ))). - Named("agentharness"). - Complete(r) + ))) + b = r.substrateWatches(b) + return b.Named("agentharness").Complete(r) } diff --git a/go/core/internal/controller/agentharness_substrate_watches.go b/go/core/internal/controller/agentharness_substrate_watches.go new file mode 100644 index 0000000000..14dbf9b59e --- /dev/null +++ b/go/core/internal/controller/agentharness_substrate_watches.go @@ -0,0 +1,97 @@ +package controller + +import ( + "context" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" +) + +func (r *AgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx context.Context, obj client.Object) []reconcile.Request { + harnessName := substrate.HarnessNameFromLabels(obj.GetLabels()) + if harnessName == "" { + return nil + } + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{ + Namespace: obj.GetNamespace(), + Name: harnessName, + }, + }} +} + +func (r *AgentHarnessController) enqueueAgentHarnessForWorkerPoolDeployment(ctx context.Context, obj client.Object) []reconcile.Request { + deploy, ok := obj.(*appsv1.Deployment) + if !ok { + return nil + } + harnessName := substrate.HarnessNameFromLabels(deploy.GetLabels()) + if harnessName == "" { + harnessName = r.harnessNameFromWorkerPoolDeployment(ctx, deploy) + } + if harnessName == "" { + return nil + } + return []reconcile.Request{{ + NamespacedName: types.NamespacedName{ + 
Namespace: deploy.Namespace, + Name: harnessName, + }, + }} +} + +// harnessNameFromWorkerPoolDeployment resolves the harness via the owning WorkerPool's labels. +// Substrate names deployments "{workerPool}-deployment" and does not copy harness labels onto them. +func (r *AgentHarnessController) harnessNameFromWorkerPoolDeployment(ctx context.Context, deploy *appsv1.Deployment) string { + if r == nil || r.Client == nil || deploy == nil { + return "" + } + for _, ref := range deploy.GetOwnerReferences() { + if ref.Kind != "WorkerPool" || ref.Controller == nil || !*ref.Controller { + continue + } + if !strings.Contains(ref.APIVersion, "ate.dev") { + continue + } + var wp atev1alpha1.WorkerPool + key := types.NamespacedName{Namespace: deploy.Namespace, Name: ref.Name} + if err := r.Client.Get(ctx, key, &wp); err != nil { + if apierrors.IsNotFound(err) { + continue + } + return "" + } + if name := substrate.HarnessNameFromLabels(wp.GetLabels()); name != "" { + return name + } + } + return "" +} + +func (r *AgentHarnessController) substrateWatches(b *builder.Builder) *builder.Builder { + if r == nil || r.SubstrateProvisioner == nil { + return b + } + return b. + Watches( + &atev1alpha1.WorkerPool{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), + ). + Watches( + &atev1alpha1.ActorTemplate{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), + ). 
+ Watches( + &appsv1.Deployment{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForWorkerPoolDeployment), + ) +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor.go b/go/core/pkg/sandboxbackend/substrate/delete_actor.go index c7a36e8409..462a57de26 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor.go @@ -3,125 +3,49 @@ package substrate import ( "context" "fmt" - "time" "github.com/agent-substrate/substrate/proto/ateapipb" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) -const ( - actorDeletePollInterval = 2 * time.Second - actorDeleteTimeout = 5 * time.Minute -) - -// deleteActorSequenced suspends the actor, waits until suspended, deletes it, and waits until gone. -func (c *Client) deleteActorSequenced(ctx context.Context, actorID string) error { +// AdvanceActorDelete performs at most one mutating ate-api step per call. +// Returns true when the actor no longer exists. Callers should requeue until true. +func (c *Client) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { if actorID == "" { - return nil + return true, nil } - deadline := time.Now().Add(actorDeleteTimeout) actor, err := c.GetActor(ctx, actorID) if err != nil { if status.Code(err) == codes.NotFound { - return nil + return true, nil } - return fmt.Errorf("get actor %q: %w", actorID, err) + return false, fmt.Errorf("get actor %q: %w", actorID, err) } - if err := c.ensureActorSuspended(ctx, actorID, actor.GetStatus(), deadline); err != nil { - return err - } - - if err := c.DeleteActor(ctx, actorID); err != nil { - if status.Code(err) == codes.NotFound { - return nil - } - if status.Code(err) == codes.FailedPrecondition { - // ate-api requires STATUS_SUSPENDED; re-check and surface current status. 
- actor, getErr := c.GetActor(ctx, actorID) - if getErr == nil { - return fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) + switch actor.GetStatus() { + case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: + if err := c.DeleteActor(ctx, actorID); err != nil { + if status.Code(err) == codes.NotFound { + return true, nil + } + if status.Code(err) == codes.FailedPrecondition { + return false, fmt.Errorf("delete actor %q: not suspended (status %s)", actorID, actor.GetStatus()) } + return false, fmt.Errorf("delete actor %q: %w", actorID, err) } - return fmt.Errorf("delete actor %q: %w", actorID, err) - } - - return c.waitForActorDeleted(ctx, actorID, deadline) -} - -func (c *Client) ensureActorSuspended(ctx context.Context, actorID string, st ateapipb.Actor_Status, deadline time.Time) error { - switch st { - case ateapipb.Actor_STATUS_SUSPENDED, ateapipb.Actor_STATUS_UNSPECIFIED: - return nil + return false, nil case ateapipb.Actor_STATUS_SUSPENDING: - // Retry suspend periodically; stuck checkpoint may need manual worker pod deletion. _ = c.SuspendActor(ctx, actorID) - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + return false, nil case ateapipb.Actor_STATUS_RUNNING, ateapipb.Actor_STATUS_RESUMING: if err := c.SuspendActor(ctx, actorID); err != nil && status.Code(err) != codes.NotFound { - return fmt.Errorf("suspend actor %q: %w", actorID, err) + return false, fmt.Errorf("suspend actor %q: %w", actorID, err) } - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) + return false, nil default: - // Best-effort suspend for unknown/intermediate states before delete. 
_ = c.SuspendActor(ctx, actorID) - return c.waitForActorStatus(ctx, actorID, ateapipb.Actor_STATUS_SUSPENDED, deadline) - } -} - -func (c *Client) waitForActorStatus(ctx context.Context, actorID string, want ateapipb.Actor_Status, deadline time.Time) error { - for time.Now().Before(deadline) { - actor, err := c.GetActor(ctx, actorID) - if err != nil { - if status.Code(err) == codes.NotFound { - if want == ateapipb.Actor_STATUS_UNSPECIFIED { - return nil - } - return fmt.Errorf("actor %q not found while waiting for %s", actorID, want) - } - return fmt.Errorf("get actor %q: %w", actorID, err) - } - if actor.GetStatus() == want { - return nil - } - if want == ateapipb.Actor_STATUS_SUSPENDED && actor.GetStatus() == ateapipb.Actor_STATUS_SUSPENDING { - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - continue - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - } - return fmt.Errorf("timeout waiting for actor %q status %s", actorID, want) -} - -func (c *Client) waitForActorDeleted(ctx context.Context, actorID string, deadline time.Time) error { - for time.Now().Before(deadline) { - _, err := c.GetActor(ctx, actorID) - if err != nil { - if status.Code(err) == codes.NotFound { - return nil - } - return fmt.Errorf("get actor %q: %w", actorID, err) - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - } - return fmt.Errorf("timeout waiting for actor %q deletion", actorID) -} - -func sleepOrDone(ctx context.Context, d time.Duration) error { - t := time.NewTimer(d) - defer t.Stop() - select { - case <-ctx.Done(): - return ctx.Err() - case <-t.C: - return nil + return false, nil } } diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go index 38b9bdae39..9453005fc9 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go +++ 
b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go @@ -1,18 +1,15 @@ package substrate -import ( - "testing" - "time" +import "testing" - "github.com/agent-substrate/substrate/proto/ateapipb" -) - -func TestEnsureActorSuspendedAlreadySuspended(t *testing.T) { +func TestAdvanceActorDeleteEmptyID(t *testing.T) { t.Parallel() c := &Client{} - deadline := time.Now().Add(time.Minute) - err := c.ensureActorSuspended(t.Context(), "ahr-test", ateapipb.Actor_STATUS_SUSPENDED, deadline) + done, err := c.AdvanceActorDelete(t.Context(), "") if err != nil { t.Fatalf("unexpected error: %v", err) } + if !done { + t.Fatal("expected done for empty actor id") + } } diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 47d641a2cf..780fcd4b54 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -4,106 +4,135 @@ import ( "context" "fmt" "strings" - "time" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" "github.com/kagent-dev/kagent/go/api/v1alpha2" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" ) -const workerPoolDrainTimeout = 3 * time.Minute +// AdvanceActorDelete deletes a harness actor via ate-api (one RPC step per call). +func (p *Provisioner) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { + if p == nil || p.Ate == nil || strings.TrimSpace(actorID) == "" { + return true, nil + } + return p.Ate.AdvanceActorDelete(ctx, actorID) +} -// Delete removes kagent-managed Substrate CRs after the harness actor has been removed. -// Order: golden snapshot actor (from ActorTemplate status), ActorTemplate, WorkerPool. 
-func (p *Provisioner) Delete(ctx context.Context, ah *v1alpha2.AgentHarness) error { +// AdvanceDelete issues delete requests and observes substrate cleanup progress without blocking. +// Returns true when all kagent-managed Substrate resources for this harness are gone. +func (p *Provisioner) AdvanceDelete(ctx context.Context, ah *v1alpha2.AgentHarness) (bool, error) { if ah == nil || ah.Annotations == nil { - return nil + return true, nil } + if p.Client == nil { + return true, nil + } + if ah.Annotations[AnnotationManagedActorTemplate] == "true" { - key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} - if err := p.deleteGoldenActor(ctx, key); err != nil { - return err + tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + goldenID, err := p.goldenActorID(ctx, tmplKey) + if err != nil { + return false, err } - var tmpl atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &tmpl); err == nil { - if err := p.Client.Delete(ctx, &tmpl); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("delete ActorTemplate %s: %w", key, err) + if goldenID != "" { + if p.Ate == nil { + return false, fmt.Errorf("substrate ate-api client is required to delete golden actor %q", goldenID) + } + done, err := p.Ate.AdvanceActorDelete(ctx, goldenID) + if err != nil { + return false, fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) } - } else if !apierrors.IsNotFound(err) { - return err + if !done { + return false, nil + } + } + var tmpl atev1alpha1.ActorTemplate + if done, err := p.advanceDeleteCR(ctx, tmplKey, &tmpl); err != nil || !done { + return false, err } } + if ah.Annotations[AnnotationManagedWorkerPool] == "true" { - key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} + wpKey := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} var wp atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &wp); err == nil { - 
if err := p.Client.Delete(ctx, &wp); err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("delete WorkerPool %s: %w", key, err) - } - } else if !apierrors.IsNotFound(err) { - return err + if done, err := p.advanceDeleteCR(ctx, wpKey, &wp); err != nil || !done { + return false, err + } + gone, err := p.workerPoolDeploymentGone(ctx, wpKey) + if err != nil { + return false, err } - if err := p.waitForWorkerPoolDeploymentGone(ctx, key); err != nil { - return err + if !gone { + return false, nil } } - return nil + + return true, nil } -func (p *Provisioner) deleteGoldenActor(ctx context.Context, tmplKey types.NamespacedName) error { - if p.Ate == nil || p.Client == nil { - return nil - } +func (p *Provisioner) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { var tmpl atev1alpha1.ActorTemplate if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { if apierrors.IsNotFound(err) { - return nil + return "", nil } - return fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) + return "", fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) } - goldenID := strings.TrimSpace(tmpl.Status.GoldenActorID) - if goldenID == "" { - return nil + return strings.TrimSpace(tmpl.Status.GoldenActorID), nil +} + +// advanceDeleteCR deletes obj when present; returns true when the object is gone. 
+func (p *Provisioner) advanceDeleteCR(ctx context.Context, key types.NamespacedName, obj client.Object) (bool, error) { + if err := p.Client.Get(ctx, key, obj); err != nil { + if apierrors.IsNotFound(err) { + return true, nil + } + return false, err } - if err := p.Ate.deleteActorSequenced(ctx, goldenID); err != nil { - return fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) + if obj.GetDeletionTimestamp().IsZero() { + if err := p.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete %s: %w", key, err) + } + return false, nil } - return nil + return false, nil } func workerPoolDeploymentName(wpName string) string { return wpName + "-deployment" } -func (p *Provisioner) waitForWorkerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) error { - if p.Client == nil { - return nil - } +// workerPoolDeploymentGone reports whether the substrate WorkerPool deployment is absent or fully drained. 
+func (p *Provisioner) workerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) (bool, error) { deployKey := types.NamespacedName{Namespace: wpKey.Namespace, Name: workerPoolDeploymentName(wpKey.Name)} - deadline := time.Now().Add(workerPoolDrainTimeout) - for time.Now().Before(deadline) { - var deploy appsv1.Deployment - err := p.Client.Get(ctx, deployKey, &deploy) - if apierrors.IsNotFound(err) { - return nil - } - if err != nil { - return fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) - } - if deploy.DeletionTimestamp != nil { - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } - continue - } - if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { - return nil - } - if err := sleepOrDone(ctx, actorDeletePollInterval); err != nil { - return err - } + var deploy appsv1.Deployment + err := p.Client.Get(ctx, deployKey, &deploy) + if apierrors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) + } + if !deploy.DeletionTimestamp.IsZero() { + return false, nil + } + if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { + return true, nil } - return fmt.Errorf("timeout waiting for WorkerPool deployment %s to drain", deployKey) + return false, nil } + +// HarnessLabelKey labels substrate resources managed for an AgentHarness. +const HarnessLabelKey = "kagent.dev/agent-harness" + +// HarnessNameFromLabels returns the AgentHarness name from provision labels. 
+func HarnessNameFromLabels(labels map[string]string) string { + if labels == nil { + return "" + } + return strings.TrimSpace(labels[HarnessLabelKey]) +} + diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go index cfa632157e..ae316c43be 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go @@ -9,6 +9,7 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" @@ -19,12 +20,12 @@ type recordingActorDeleter struct { deleted []string } -func (r *recordingActorDeleter) deleteActorSequenced(_ context.Context, actorID string) error { +func (r *recordingActorDeleter) AdvanceActorDelete(_ context.Context, actorID string) (bool, error) { r.deleted = append(r.deleted, actorID) - return nil + return true, nil } -func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { +func TestProvisionerAdvanceDelete_DeletesGoldenActor(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -33,7 +34,9 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { ns := "kagent" tmpl := &atev1alpha1.ActorTemplate{ - ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns}, + ObjectMeta: metav1.ObjectMeta{Name: "peterj-claw", Namespace: ns, Labels: map[string]string{ + HarnessLabelKey: "peterj-claw", + }}, Status: atev1alpha1.ActorTemplateStatus{ GoldenActorID: "golden-actor-uuid", Phase: atev1alpha1.PhaseReady, @@ -53,9 +56,29 @@ func TestProvisionerDelete_DeletesGoldenActor(t *testing.T) { rec := &recordingActorDeleter{} p := &Provisioner{Client: kube, Ate: rec} - require.NoError(t, 
p.Delete(context.Background(), ah)) + var complete bool + var err error + for range 5 { + complete, err = p.AdvanceDelete(context.Background(), ah) + require.NoError(t, err) + if complete { + break + } + } + require.True(t, complete, "AdvanceDelete should finish within a few reconcile passes") require.Equal(t, []string{"golden-actor-uuid"}, rec.deleted) var got atev1alpha1.ActorTemplate require.Error(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) } + +func TestWorkerPoolDeploymentGoneNotFound(t *testing.T) { + t.Parallel() + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + kube := fake.NewClientBuilder().WithScheme(scheme).Build() + p := &Provisioner{Client: kube} + gone, err := p.workerPoolDeploymentGone(context.Background(), types.NamespacedName{Namespace: "kagent", Name: "claw-wp"}) + require.NoError(t, err) + require.True(t, gone) +} diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_token.go b/go/core/pkg/sandboxbackend/substrate/gateway_token.go index abe4b0ba53..fbb5c51634 100644 --- a/go/core/pkg/sandboxbackend/substrate/gateway_token.go +++ b/go/core/pkg/sandboxbackend/substrate/gateway_token.go @@ -14,52 +14,33 @@ import ( // GatewayTokenSecretKey is the Secret data key used for per-harness OpenClaw gateway tokens. const GatewayTokenSecretKey = "token" -// ValidateGatewayTokenSpec requires exactly one per-harness OpenClaw gateway token source. -func ValidateGatewayTokenSpec(sub *v1alpha2.AgentHarnessSubstrateSpec) error { - if sub == nil { - return fmt.Errorf("spec.substrate is required") - } - hasToken := strings.TrimSpace(sub.GatewayToken) != "" - hasSecretRef := sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" - if hasToken == hasSecretRef { - return fmt.Errorf("exactly one of spec.substrate.gatewayToken or gatewayTokenSecretRef must be specified") - } - return nil -} - // ResolveGatewayToken returns the per-harness gateway token. 
+// Token source is validated at admission via AgentHarnessSubstrateSpec CEL rules. func ResolveGatewayToken(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness) (string, error) { if ah == nil || ah.Spec.Substrate == nil { return "", fmt.Errorf("spec.substrate is required") } - if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { - return "", err - } sub := ah.Spec.Substrate - if sub.GatewayTokenSecretRef != nil { + if sub.GatewayTokenSecretRef != nil && strings.TrimSpace(sub.GatewayTokenSecretRef.Name) != "" { return resolveGatewayTokenSecret(ctx, kube, ah.Namespace, sub.GatewayTokenSecretRef) } return strings.TrimSpace(sub.GatewayToken), nil } -func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, defaultNamespace string, ref *v1alpha2.TypedReference) (string, error) { +func resolveGatewayTokenSecret(ctx context.Context, kube client.Client, namespace string, ref *v1alpha2.TypedLocalReference) (string, error) { if kube == nil { return "", fmt.Errorf("kubernetes client is required to resolve gateway token secret") } - ns := ref.Namespace - if ns == "" { - ns = defaultNamespace - } var secret corev1.Secret - if err := kube.Get(ctx, types.NamespacedName{Namespace: ns, Name: ref.Name}, &secret); err != nil { - return "", fmt.Errorf("get gateway token secret %s/%s: %w", ns, ref.Name, err) + if err := kube.Get(ctx, types.NamespacedName{Namespace: namespace, Name: ref.Name}, &secret); err != nil { + return "", fmt.Errorf("get gateway token secret %s/%s: %w", namespace, ref.Name, err) } if secret.Data == nil { - return "", fmt.Errorf("gateway token secret %s/%s is empty", ns, ref.Name) + return "", fmt.Errorf("gateway token secret %s/%s is empty", namespace, ref.Name) } val, ok := secret.Data[GatewayTokenSecretKey] if !ok { - return "", fmt.Errorf("gateway token secret %s/%s missing key %q", ns, ref.Name, GatewayTokenSecretKey) + return "", fmt.Errorf("gateway token secret %s/%s missing key %q", namespace, ref.Name, 
GatewayTokenSecretKey) } return strings.TrimSpace(string(val)), nil } diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 7022085dea..1909a1183b 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -107,9 +107,13 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H if h.ID == "" { return nil } - if err := b.client.deleteActorSequenced(ctx, h.ID); err != nil { + done, err := b.client.AdvanceActorDelete(ctx, h.ID) + if err != nil { return fmt.Errorf("substrate delete actor %q: %w", h.ID, err) } + if !done { + return fmt.Errorf("substrate delete actor %q in progress", h.ID) + } return nil } @@ -150,22 +154,9 @@ func ActorHost(actorID string, suffix string) string { } func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { - if ah.Status.Substrate != nil && ah.Status.Substrate.ActorTemplateRef.Name != "" { - ref := ah.Status.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - return ns, ref.Name - } if ah.Spec.Substrate != nil && ah.Spec.Substrate.ActorTemplateRef != nil { - ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - if ref.Name != "" { - return ns, ref.Name + if ref := ah.Spec.Substrate.ActorTemplateRef; ref.Name != "" { + return ah.Namespace, ref.Name } } // Auto-provisioned template in the harness namespace (also when status was not persisted yet). 
@@ -197,18 +188,6 @@ func validateSubstrateSpec(ah *v1alpha2.AgentHarness) error { if runtime != v1alpha2.AgentHarnessRuntimeSubstrate { return fmt.Errorf("substrate backend called for runtime %q", runtime) } - if ah.Spec.Substrate == nil { - return fmt.Errorf("spec.substrate is required when runtime is substrate") - } - if err := ValidateGatewayTokenSpec(ah.Spec.Substrate); err != nil { - return err - } - if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { - return nil - } - if loc := substrateSnapshotsLocation(ah); !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go index 836e48e065..156f10aa9a 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ b/go/core/pkg/sandboxbackend/substrate/provision.go @@ -9,14 +9,11 @@ import ( "k8s.io/apimachinery/pkg/types" ) -// Ensure creates or updates Substrate CRs and waits for ActorTemplate Ready. +// Ensure creates or updates Substrate CRs and reports whether ActorTemplate is Ready (controller requeues until true). 
func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { if ah == nil || ah.Spec.Substrate == nil { return EnsureResult{}, fmt.Errorf("spec.substrate is required") } - if err := validateSubstrateProvisionSpec(ah); err != nil { - return EnsureResult{}, err - } if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { return p.ensureAdoptedActorTemplate(ctx, ah) @@ -48,11 +45,7 @@ func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (En func (p *Provisioner) ensureAdoptedActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { ref := ah.Spec.Substrate.ActorTemplateRef - ns := ref.Namespace - if ns == "" { - ns = ah.Namespace - } - tmplKey := types.NamespacedName{Namespace: ns, Name: ref.Name} + tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: ref.Name} ready, err := p.actorTemplateReady(ctx, tmplKey) if err != nil { return EnsureResult{}, err diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index bd24ca7b1e..cc61eb826a 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -131,7 +131,7 @@ func TestBuildOpenClawActorStartup_WithHarnessGatewayToken(t *testing.T) { { name: "secret token", substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - GatewayTokenSecretRef: &v1alpha2.TypedReference{Name: "openclaw-token"}, + GatewayTokenSecretRef: &v1alpha2.TypedLocalReference{Name: "openclaw-token"}, }, wantToken: "secret-token", }, diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go index c0d4b842e3..87a526b8bd 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -33,7 
+33,7 @@ type ProvisionDefaults struct { // ateActorDeleter removes actors from ate-api during harness teardown. type ateActorDeleter interface { - deleteActorSequenced(ctx context.Context, actorID string) error + AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) } // Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. @@ -53,24 +53,6 @@ type EnsureResult struct { ManagedActorTemplate bool } -func validateSubstrateProvisionSpec(ah *v1alpha2.AgentHarness) error { - sub := ah.Spec.Substrate - if err := ValidateGatewayTokenSpec(sub); err != nil { - return err - } - if sub.ActorTemplateRef != nil && strings.TrimSpace(sub.ActorTemplateRef.Name) != "" { - return nil - } - loc := substrateSnapshotsLocation(ah) - if !strings.HasPrefix(loc, "gs://") { - return fmt.Errorf("spec.substrate.snapshotsConfig.location must be a gs:// URI (Substrate snapshots are GCS-only today)") - } - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" && sub.WorkerPool != nil { - return fmt.Errorf("spec.substrate.workerPoolRef and workerPool are mutually exclusive") - } - return nil -} - func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { return atev1alpha1.RunscConfig{ AMD64: &atev1alpha1.RunscPlatformConfig{ diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go index e0e767e458..c08e87f8e0 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_test.go @@ -14,7 +14,7 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" ) -func TestValidateSubstrateProvisionSpec(t *testing.T) { +func TestSubstrateSnapshotsLocationDefault(t *testing.T) { t.Parallel() ah := &v1alpha2.AgentHarness{ ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, @@ -22,41 +22,12 @@ func TestValidateSubstrateProvisionSpec(t *testing.T) { Runtime: 
v1alpha2.AgentHarnessRuntimeSubstrate, Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ GatewayToken: "test-token", - SnapshotsConfig: &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{ - Location: "gs://bucket/prefix/", - }, }, }, } - if err := validateSubstrateProvisionSpec(ah); err != nil { - t.Fatalf("expected valid: %v", err) - } - - ah.Spec.Substrate.SnapshotsConfig = nil - if err := validateSubstrateProvisionSpec(ah); err != nil { - t.Fatalf("expected default snapshots config to be valid: %v", err) - } if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { t.Fatalf("got default snapshots location %q", got) } - - ah.Spec.Substrate.GatewayToken = "" - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error when gateway token is not configured") - } - - ah.Spec.Substrate.GatewayToken = "test-token" - ah.Spec.Substrate.SnapshotsConfig = &v1alpha2.AgentHarnessSubstrateSnapshotsConfig{Location: "s3://nope"} - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error for non-gs location") - } - - ah.Spec.Substrate.SnapshotsConfig.Location = "gs://ok" - ah.Spec.Substrate.WorkerPoolRef = &v1alpha2.TypedReference{Name: "pool"} - ah.Spec.Substrate.WorkerPool = &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 2} - if err := validateSubstrateProvisionSpec(ah); err == nil { - t.Fatal("expected error for workerPoolRef and workerPool together") - } } func TestEnsureWorkerPoolUsesDefaultAteomImage(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go index 3504c7f651..f715aa1651 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go @@ -16,11 +16,7 @@ import ( func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { sub := ah.Spec.Substrate 
if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { - ns := sub.WorkerPoolRef.Namespace - if ns == "" { - ns = ah.Namespace - } - key := types.NamespacedName{Namespace: ns, Name: sub.WorkerPoolRef.Name} + key := types.NamespacedName{Namespace: ah.Namespace, Name: sub.WorkerPoolRef.Name} var wp atev1alpha1.WorkerPool if err := p.Client.Get(ctx, key, &wp); err != nil { return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 2c2f18ff71..52f814c1aa 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -536,8 +536,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -564,8 +562,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -578,6 +574,7 @@ spec: description: |- Location is the GCS URI prefix for golden and incremental snapshots. 
Example: gs://ate-snapshots/kagent/my-namespace/my-harness/ + pattern: ^gs:// type: string required: - location @@ -609,8 +606,6 @@ spec: type: string name: type: string - namespace: - type: string required: - name type: object @@ -624,6 +619,8 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) + - message: workerPoolRef and workerPool are mutually exclusive + rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -633,6 +630,10 @@ spec: || (has(c.slack) && ((self.backend == ''hermes'' && has(c.slack.hermes) && !has(c.slack.openclaw)) || ((self.backend == ''openclaw'' || self.backend == ''nemoclaw'') && has(c.slack.openclaw) && !has(c.slack.hermes)))))' + - message: spec.substrate may only be set when runtime is substrate + rule: '!has(self.substrate) || self.runtime == ''substrate''' + - message: spec.substrate is required when runtime is substrate + rule: self.runtime != 'substrate' || has(self.substrate) status: description: AgentHarnessStatus is the observed state of an AgentHarness. properties: @@ -726,42 +727,70 @@ spec: format: int64 type: integer substrate: - description: Substrate records auto-provisioned Substrate CR references. + description: Substrate records observed Substrate provisioning state. properties: - actorTemplateReady: - description: ActorTemplateReady is true when the template phase - is Ready (golden snapshot taken). - type: boolean - actorTemplateRef: - description: ActorTemplateRef is the ActorTemplate used when creating - the actor. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object - workerPoolRef: - description: WorkerPoolRef is the WorkerPool used by the harness - ActorTemplate. 
- properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - namespace: - type: string - required: - - name - type: object + conditions: + description: Conditions describe substrate provisioning progress + (e.g. ActorTemplate golden snapshot). + items: + description: Condition contains details for one aspect of the + current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object type: object From 198505b29ecd21db2510d6cacb6c09176891a77d Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 08:34:54 -0700 Subject: [PATCH 19/32] make linter happy Signed-off-by: Peter Jausovec --- go/api/httpapi/types.go | 14 +++++++------- .../internal/controller/agentharness_controller.go | 8 ++++---- go/core/internal/httpserver/handlers/agents.go | 4 ++-- go/core/pkg/sandboxbackend/openclaw/credentials.go | 10 ---------- go/core/pkg/sandboxbackend/openclaw/types.go | 8 ++++---- .../pkg/sandboxbackend/openshell/ssh_terminal.go | 2 +- .../sandboxbackend/openshell/ssh_terminal_test.go | 2 +- .../sandboxbackend/substrate/delete_provision.go | 1 - go/core/pkg/sandboxbackend/substrate/openclaw.go | 5 +---- .../substrate/provision_openclaw_test.go | 6 +++--- 10 files changed, 23 insertions(+), 37 deletions(-) diff --git a/go/api/httpapi/types.go b/go/api/httpapi/types.go index 0107e5ffe0..d704eb549a 100644 --- a/go/api/httpapi/types.go +++ b/go/api/httpapi/types.go @@ -146,13 +146,13 @@ type OpenshellAgentHarnessListEntry struct { // SubstrateAgentHarnessListEntry is set when runtime is substrate. 
type SubstrateAgentHarnessListEntry struct { - Backend v1alpha2.AgentHarnessBackendType `json:"backend"` - Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` - ActorID string `json:"actorId,omitempty"` + Backend v1alpha2.AgentHarnessBackendType `json:"backend"` + Runtime v1alpha2.AgentHarnessRuntime `json:"runtime"` + ActorID string `json:"actorId,omitempty"` GatewayUIPath string `json:"gatewayUIPath,omitempty"` ModelConfigRef string `json:"modelConfigRef,omitempty"` - BackendRefID string `json:"backendRefId,omitempty"` - Endpoint string `json:"endpoint,omitempty"` + BackendRefID string `json:"backendRefId,omitempty"` + Endpoint string `json:"endpoint,omitempty"` } type AgentResponse struct { @@ -167,8 +167,8 @@ type AgentResponse struct { DeploymentReady bool `json:"deploymentReady"` Accepted bool `json:"accepted"` WorkloadMode v1alpha2.WorkloadMode `json:"workloadMode,omitempty"` - OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` - SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` + OpenshellAgentHarness *OpenshellAgentHarnessListEntry `json:"openshellAgentHarness,omitempty"` + SubstrateAgentHarness *SubstrateAgentHarnessListEntry `json:"substrateAgentHarness,omitempty"` } // Session types diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index e371dbb0b3..7e771b0ffb 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -53,10 +53,10 @@ const ( // harness VMs are a generic exec/SSH-able environment with no in-cluster // workload owned by kagent. 
type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + Client client.Client + Recorder events.EventRecorder + OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend SubstrateProvisioner *substrate.Provisioner } diff --git a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index 76056660c8..96249cf9d3 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -179,7 +179,7 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, }, }, DeploymentReady: ready, - Accepted: accepted, + Accepted: accepted, } switch runtime { @@ -188,7 +188,7 @@ func (h *AgentsHandler) openshellAgentHarnessAgentResponse(ctx context.Context, Backend: sb.Spec.Backend, Runtime: runtime, ModelConfigRef: sb.Spec.ModelConfigRef, - GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), + GatewayUIPath: fmt.Sprintf("/api/agentharnesses/%s/%s/gateway/", sb.Namespace, sb.Name), } if sb.Status.BackendRef != nil { subEntry.BackendRefID = sb.Status.BackendRef.ID diff --git a/go/core/pkg/sandboxbackend/openclaw/credentials.go b/go/core/pkg/sandboxbackend/openclaw/credentials.go index b167802c96..f0d56c4bc9 100644 --- a/go/core/pkg/sandboxbackend/openclaw/credentials.go +++ b/go/core/pkg/sandboxbackend/openclaw/credentials.go @@ -81,13 +81,3 @@ func channelCredentialContainerEnv(cred v1alpha2.AgentHarnessChannelCredential, return corev1.EnvVar{}, fmt.Errorf("unknown value source type %q", cred.ValueFrom.Type) } } - -// resolvedChannelSecret returns the plaintext value putChannelCredential stored in env. 
-// OpenShell bootstrap still inlines channel tokens in openclaw.json; Substrate uses OpenClaw env SecretRefs instead. -func resolvedChannelSecret(env map[string]string, envKey string) (string, error) { - v := strings.TrimSpace(env[envKey]) - if v == "" { - return "", fmt.Errorf("credential %s is missing or empty after resolve", envKey) - } - return v, nil -} diff --git a/go/core/pkg/sandboxbackend/openclaw/types.go b/go/core/pkg/sandboxbackend/openclaw/types.go index 2fac8ba330..5d993dd824 100644 --- a/go/core/pkg/sandboxbackend/openclaw/types.go +++ b/go/core/pkg/sandboxbackend/openclaw/types.go @@ -95,10 +95,10 @@ type slackAccount struct { Mode string `json:"mode"` BotToken credentialValue `json:"botToken"` AppToken credentialValue `json:"appToken"` - UserTokenReadOnly bool `json:"userTokenReadOnly"` - GroupPolicy string `json:"groupPolicy"` - Capabilities slackCaps `json:"capabilities"` - DM *groupDM `json:"dm,omitempty"` + UserTokenReadOnly bool `json:"userTokenReadOnly"` + GroupPolicy string `json:"groupPolicy"` + Capabilities slackCaps `json:"capabilities"` + DM *groupDM `json:"dm,omitempty"` } type slackCaps struct { diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go index c7e6c2033c..4437ba35b4 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal.go @@ -4,8 +4,8 @@ import ( "strings" "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" ) // ResolveSSHRemoteCommand decides whether to run an interactive shell or a harness CLI. 
diff --git a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go index a1f4849711..252b111a7b 100644 --- a/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go +++ b/go/core/pkg/sandboxbackend/openshell/ssh_terminal_test.go @@ -3,9 +3,9 @@ package openshell_test import ( "testing" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openshell/hermes" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" ) func TestResolveSSHRemoteCommand(t *testing.T) { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go index 780fcd4b54..a662454793 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_provision.go @@ -135,4 +135,3 @@ func HarnessNameFromLabels(labels map[string]string) string { } return strings.TrimSpace(labels[HarnessLabelKey]) } - diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 1909a1183b..a3374d01fc 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -117,12 +117,9 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H return nil } -func (b *ClawBackend) OnAgentHarnessReady(ctx context.Context, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle) error { +func (b *ClawBackend) OnAgentHarnessReady(_ context.Context, _ *v1alpha2.AgentHarness, _ sandboxbackend.Handle) error { // OpenClaw config is baked into the ActorTemplate golden snapshot at provision time // (see substrate/provision_openclaw.go — openclaw.BuildSubstrateBootstrapJSON with secretKeyRef env). 
- _ = ctx - _ = ah - _ = h return nil } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go index cc61eb826a..95e58211e7 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go @@ -77,7 +77,7 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { // Decode embedded JSON from the base64 line in the startup script. var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if !strings.Contains(line, "base64 -d") { continue } @@ -197,7 +197,7 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { require.NoError(t, err) var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if strings.Contains(line, "base64 -d") { start := strings.Index(line, `'`) + 1 end := strings.LastIndex(line, `'`) @@ -217,7 +217,7 @@ func gatewayTokenFromStartup(t *testing.T, script string) string { t.Helper() var payload string - for _, line := range strings.Split(script, "\n") { + for line := range strings.SplitSeq(script, "\n") { if strings.Contains(line, "base64 -d") { start := strings.Index(line, `'`) + 1 end := strings.LastIndex(line, `'`) From a23e899a7a66c429758783c4ae32c4449c91575f Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 09:06:26 -0700 Subject: [PATCH 20/32] move write ops to writer-role Signed-off-by: Peter Jausovec --- helm/kagent/templates/rbac/getter-role.yaml | 4 ---- helm/kagent/templates/rbac/writer-role.yaml | 10 ++++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/helm/kagent/templates/rbac/getter-role.yaml b/helm/kagent/templates/rbac/getter-role.yaml index cafe9d0f5c..ceab5ec9aa 100644 --- a/helm/kagent/templates/rbac/getter-role.yaml +++ 
b/helm/kagent/templates/rbac/getter-role.yaml @@ -62,10 +62,6 @@ - get - list - watch - - create - - update - - patch - - delete - apiGroups: - ate.dev resources: diff --git a/helm/kagent/templates/rbac/writer-role.yaml b/helm/kagent/templates/rbac/writer-role.yaml index b735e159bd..b9516cae03 100644 --- a/helm/kagent/templates/rbac/writer-role.yaml +++ b/helm/kagent/templates/rbac/writer-role.yaml @@ -75,6 +75,16 @@ - update - patch - delete +- apiGroups: + - ate.dev + resources: + - workerpools + - actortemplates + verbs: + - create + - update + - patch + - delete {{- end -}} {{- include "kagent.rbac.validate" . -}} From 459a04d67d36429d9abf574b8568c92d6fa01de0 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Fri, 29 May 2026 09:30:23 -0700 Subject: [PATCH 21/32] commenting out the substrate section in values Signed-off-by: Peter Jausovec --- helm/kagent/values.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index a9a335ef85..0305601676 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -234,16 +234,16 @@ controller: # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. 
- substrate: - enabled: true - ateApiEndpoint: "dns:///api.ate-system.svc:443" - ateApiInsecure: false - pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" - runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" - runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" - runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" - runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" - ateomImage: "localhost:5001/ateom-gvisor:latest" + # substrate: + # enabled: true + # ateApiEndpoint: "dns:///api.ate-system.svc:443" + # ateApiInsecure: false + # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + # ateomImage: "localhost:5001/ateom-gvisor:latest" envFrom: [] From c5ccb682d6f6b04003fabe1c5559e7309ae58620 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 08:19:38 -0700 Subject: [PATCH 22/32] fix failing helm unit tests Signed-off-by: Peter Jausovec --- .../templates/controller-deployment.yaml | 2 +- helm/kagent/values.yaml | 25 +++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index d63ff6bd92..2727ace3a6 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -87,7 +87,7 @@ spec: {{- with .Values.controller.env }} {{- toYaml . 
| nindent 12 }} {{- end }} - {{- if .Values.controller.substrate.enabled }} + {{- if and .Values.controller.substrate .Values.controller.substrate.enabled }} - name: SUBSTRATE_ATE_API_ENDPOINT value: {{ .Values.controller.substrate.ateApiEndpoint | quote }} {{- if .Values.controller.substrate.ateApiInsecure }} diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 0305601676..39e97ba23d 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -234,16 +234,21 @@ controller: # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). Per-harness # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. - # substrate: - # enabled: true - # ateApiEndpoint: "dns:///api.ate-system.svc:443" - # ateApiInsecure: false - # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" - # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" - # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" - # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" - # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" - # ateomImage: "localhost:5001/ateom-gvisor:latest" + substrate: + enabled: false + ateApiEndpoint: "" + ateApiInsecure: false + defaultActorTemplateNamespace: "" + defaultActorTemplateName: "" + # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" + # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" + # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" + # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + # ateomImage: 
"localhost:5001/ateom-gvisor:latest" + # Example when enabled: + # enabled: true + # ateApiEndpoint: "dns:///api.ate-system.svc:443" envFrom: [] From 6a82057fcf7b8eec17f8bdb99f74b181caf39981 Mon Sep 17 00:00:00 2001 From: Eitan Yarmush Date: Mon, 1 Jun 2026 19:02:49 +0000 Subject: [PATCH 23/32] Refine substrate AgentHarness lifecycle Signed-off-by: Eitan Yarmush --- docs/substrate-agentharness-lifecycle.md | 60 ++++ examples/substrate-openclaw/README.md | 34 +- .../crd/bases/kagent.dev_agentharnesses.yaml | 108 +------ go/api/v1alpha2/agentharness_types.go | 64 +--- go/api/v1alpha2/zz_generated.deepcopy.go | 52 --- .../controller/agentharness_controller.go | 166 +++++----- .../agentharness_controller_test.go | 296 ++++++++++++++++++ .../agentharness_substrate_watches.go | 61 +--- go/core/pkg/app/app.go | 96 +++--- go/core/pkg/sandboxbackend/async.go | 8 +- .../agentharness_openshell_client.go | 12 +- .../pkg/sandboxbackend/openshell/openshell.go | 2 +- .../openshell/openshell_test.go | 12 +- .../pkg/sandboxbackend/substrate/config.go | 7 - .../substrate/delete_provision.go | 137 -------- .../pkg/sandboxbackend/substrate/lifecycle.go | 61 ++++ ...template.go => lifecycle_actortemplate.go} | 59 ++-- .../substrate/lifecycle_delete.go | 67 ++++ ...ision_test.go => lifecycle_delete_test.go} | 31 +- ...sion_openclaw.go => lifecycle_openclaw.go} | 4 +- ...law_test.go => lifecycle_openclaw_test.go} | 6 +- ...rovision_shared.go => lifecycle_shared.go} | 66 ++-- .../substrate/lifecycle_test.go | 146 +++++++++ .../pkg/sandboxbackend/substrate/openclaw.go | 35 +-- .../sandboxbackend/substrate/openclaw_test.go | 10 +- .../pkg/sandboxbackend/substrate/provision.go | 58 ---- .../substrate/provision_test.go | 98 ------ .../substrate/provision_workerpool.go | 73 ----- .../templates/kagent.dev_agentharnesses.yaml | 108 +------ .../templates/controller-deployment.yaml | 12 +- helm/kagent/templates/rbac/writer-role.yaml | 3 +- .../templates/substrate-workerpool.yaml | 15 + 
helm/kagent/values.yaml | 19 +- .../agent-form/OpenClawSandboxFields.tsx | 74 ++--- .../lib/__tests__/openClawSandboxForm.test.ts | 54 +++- ui/src/lib/openClawSandboxForm.ts | 32 +- 36 files changed, 1006 insertions(+), 1140 deletions(-) create mode 100644 docs/substrate-agentharness-lifecycle.md create mode 100644 go/core/internal/controller/agentharness_controller_test.go delete mode 100644 go/core/pkg/sandboxbackend/substrate/delete_provision.go create mode 100644 go/core/pkg/sandboxbackend/substrate/lifecycle.go rename go/core/pkg/sandboxbackend/substrate/{provision_actortemplate.go => lifecycle_actortemplate.go} (56%) create mode 100644 go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go rename go/core/pkg/sandboxbackend/substrate/{delete_provision_test.go => lifecycle_delete_test.go} (58%) rename go/core/pkg/sandboxbackend/substrate/{provision_openclaw.go => lifecycle_openclaw.go} (92%) rename go/core/pkg/sandboxbackend/substrate/{provision_openclaw_test.go => lifecycle_openclaw_test.go} (98%) rename go/core/pkg/sandboxbackend/substrate/{provision_shared.go => lifecycle_shared.go} (51%) create mode 100644 go/core/pkg/sandboxbackend/substrate/lifecycle_test.go delete mode 100644 go/core/pkg/sandboxbackend/substrate/provision.go delete mode 100644 go/core/pkg/sandboxbackend/substrate/provision_test.go delete mode 100644 go/core/pkg/sandboxbackend/substrate/provision_workerpool.go create mode 100644 helm/kagent/templates/substrate-workerpool.yaml diff --git a/docs/substrate-agentharness-lifecycle.md b/docs/substrate-agentharness-lifecycle.md new file mode 100644 index 0000000000..f01a18fc31 --- /dev/null +++ b/docs/substrate-agentharness-lifecycle.md @@ -0,0 +1,60 @@ +# Substrate AgentHarness Lifecycle + +This branch should use a single ownership model for `runtime: substrate` harnesses. + +## Ownership + +- Platform/Helm owns `WorkerPool` capacity. +- kagent owns the generated per-harness `ActorTemplate`. 
+- kagent owns the per-harness actor lifecycle through `ate-api`. +- Substrate owns the `WorkerPool` deployment and the `ActorTemplate` golden snapshot process. + +kagent should not create or delete `WorkerPool` resources from the `AgentHarness` reconciler. A chart may optionally install a default `WorkerPool`, and the controller may use that default when `spec.substrate.workerPoolRef` is unset. + +## Spec Shape + +`AgentHarness.spec.substrate` should contain only harness-level inputs: + +- `workerPoolRef`, optional; falls back to the configured controller default. +- `snapshotsConfig`, optional; defaults to `gs://ate-snapshots/<namespace>/<name>`. +- `workloadImage`, optional. +- exactly one of `gatewayToken` or `gatewayTokenSecretRef`. + +There is no `actorTemplateRef`. kagent always generates the `ActorTemplate`, so adopting an external template is not part of the workflow. + +## Status + +Use top-level Kubernetes conditions for progress: + +- `Accepted` +- `ActorTemplateReady` +- `ActorReady` +- `Ready` + +`Ready` is the aggregate condition. Specific blockers should be reflected in `reason` and `message`. + +Do not store ownership booleans or cleanup markers in annotations or status. Ownership is deterministic: + +- `WorkerPool` is external. +- generated `ActorTemplate` is owned by the `AgentHarness` through an owner reference. + +## Reconcile + +The substrate reconcile path should: + +1. Resolve `workerPoolRef` from spec or controller default. +2. Verify the `WorkerPool` exists. +3. Create or update the generated `ActorTemplate` with an owner reference to the `AgentHarness`. +4. Wait for `ActorTemplate.status.phase == Ready`. +5. Create or resume the actor through `ate-api`. +6. Mark `ActorReady` and aggregate `Ready`. + +## Delete + +The finalizer should: + +1. Delete the harness actor recorded in `status.backendRef.id`. +2. Read the generated `ActorTemplate` and delete `status.goldenActorID`, if present. +3. Remove the finalizer.
+ +Kubernetes garbage collection deletes the generated `ActorTemplate` through the owner reference. kagent does not delete `WorkerPool`. diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index bb5964f663..6b234a39db 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -13,7 +13,7 @@ cd substrate `--deploy-ate-system` installs the **control plane only** (ate-api, ate-controller, atelet, atenet, …). Your registry catalog will show `ateapi-*`, `atelet-*`, etc., but **not** ateom until you build it. -Build and push **ateom-gvisor** (required for kagent `workerPool.ateomImage`): +Build and push **ateom-gvisor** (required for the WorkerPool `ateomImage`): ```bash # build the ateom-gvisor image from the substrate folder @@ -36,21 +36,20 @@ On amd64 hosts, use `--platform linux/amd64` in the pull step. ## kagent AgentHarness with substrate runtime -kagent **auto-provisions** a per-harness `ActorTemplate` (and optionally a `WorkerPool`). +kagent generates a per-harness `ActorTemplate` and uses an existing `WorkerPool`. Install kagent (Substrate must already be running in the cluster): ```bash export KIND_CLUSTER_NAME=kind -make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" +make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set substrateWorkerPool.create=true --set substrateWorkerPool.ateomImage=localhost:5001/ateom-gvisor:latest" ``` The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values Override them with `--set` or a values file when you need to pin a different gVisor build. -Create a harness. 
If `snapshotsConfig` is omitted, kagent defaults it to `gs://ate-snapshots/<namespace>/<name>`. If Helm sets `controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can be omitted unless you want to override it. +Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to `gs://ate-snapshots/<namespace>/<name>`. -- **Worker pool** — reference an existing pool (`workerPoolRef`) **or** let kagent create one (`workerPool`) -- **`workerPool.ateomImage`** — optional override for the Helm/controller default (`localhost:5001/ateom-gvisor:latest`) +- **Worker pool** — reference an existing pool (`workerPoolRef`) or configure a controller default WorkerPool - **Gateway token** — required per harness with either `gatewayToken` or `gatewayTokenSecretRef` ```yaml @@ -69,33 +68,22 @@ spec: # snapshotsConfig: # location: gs://ate-snapshots/kagent/peterj-claw - # Optional: kagent auto-creates a WorkerPool when workerPoolRef is unset. - # Replicas default to 1 and ateomImage defaults to controller.substrate.ateomImage. - # NOTE: use single worker for now due to https://github.com/agent-substrate/substrate/issues/50 - gatewayToken: test-token - workerPool: - replicas: 1 - # ateomImage: localhost:5001/ateom-gvisor:latest + # Required unless the controller has a default WorkerPool configured. + workerPoolRef: + name: kagent-default # Required: configure the OpenClaw gateway token for this harness. # Use either gatewayToken or gatewayTokenSecretRef. The Secret must contain key "token". + gatewayToken: test-token + # gatewayTokenSecretRef: # name: openclaw-gateway-token - # namespace: kagent # Optional: override the sandbox image used in the ActorTemplate.
# workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 - - # Optional: adopt existing resources instead of auto-create - # workerPoolRef: - # name: my-pool - # namespace: ate-system - # actorTemplateRef: - # name: my-template - # namespace: ate-system ``` -When `actorTemplateRef` is not set, kagent creates an `ActorTemplate` that looks roughly like this: +kagent creates an `ActorTemplate` that looks roughly like this: ```yaml apiVersion: ate.dev/v1alpha1 diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 52f814c1aa..9c4c1ee2a6 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -525,26 +525,6 @@ spec: substrate: description: Substrate is required when runtime is substrate. properties: - actorTemplateRef: - description: |- - ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. - When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - required: - - name - type: object - gatewayPort: - default: 80 - description: GatewayPort is the port OpenClaw listens on inside - the actor (Substrate routes to :80 today). - format: int32 - type: integer gatewayToken: description: |- GatewayToken is the OpenClaw gateway Bearer token for this harness. @@ -579,26 +559,10 @@ spec: required: - location type: object - workerPool: - description: WorkerPool creates a dedicated WorkerPool in the - harness namespace when workerPoolRef is unset. - properties: - ateomImage: - description: |- - AteomImage is the ateom herder image (pullable registry ref, not ko://). - Overrides the controller-wide substrate ateom image default for this WorkerPool. - type: string - replicas: - default: 1 - description: Replicas is the number of ateom worker pods. 
- Defaults to 1 when unset or zero. - format: int32 - type: integer - type: object workerPoolRef: description: |- - WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). - Mutually exclusive with workerPool. + WorkerPoolRef references an existing ate.dev WorkerPool in the harness namespace. + When unset, the controller uses its configured default WorkerPool. properties: apiGroup: type: string @@ -619,8 +583,6 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) - - message: workerPoolRef and workerPool are mutually exclusive - rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -726,72 +688,6 @@ spec: observedGeneration: format: int64 type: integer - substrate: - description: Substrate records observed Substrate provisioning state. - properties: - conditions: - description: Conditions describe substrate provisioning progress - (e.g. ActorTemplate golden snapshot). - items: - description: Condition contains details for one aspect of the - current state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. 
- format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, - Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object type: object type: object served: true diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index f0181a569b..e44c3d6924 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -56,36 +56,18 @@ type AgentHarnessSubstrateSnapshotsConfig struct { Location string `json:"location"` } -// AgentHarnessSubstrateWorkerPoolSpec creates a dedicated WorkerPool for this harness. -// Mutually exclusive with workerPoolRef. -type AgentHarnessSubstrateWorkerPoolSpec struct { - // Replicas is the number of ateom worker pods. Defaults to 1 when unset or zero. - // +optional - // +kubebuilder:default=1 - Replicas int32 `json:"replicas,omitempty"` - - // AteomImage is the ateom herder image (pullable registry ref, not ko://). - // Overrides the controller-wide substrate ateom image default for this WorkerPool. 
- // +optional - AteomImage string `json:"ateomImage,omitempty"` -} - // AgentHarnessSubstrateSpec configures Agent Substrate (WorkerPool + ActorTemplate + Actor). // -// By default kagent provisions a per-harness ActorTemplate (and optionally a WorkerPool). -// Set actorTemplateRef only to adopt an existing template (advanced / legacy). +// kagent generates a per-harness ActorTemplate and creates an Actor from it. WorkerPool +// capacity is referenced from workerPoolRef or the controller default; it is not +// created or deleted by the AgentHarness controller. // +kubebuilder:validation:XValidation:rule="(has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef))",message="Exactly one of gatewayToken or gatewayTokenSecretRef must be specified" -// +kubebuilder:validation:XValidation:rule="!(has(self.workerPoolRef) && has(self.workerPool))",message="workerPoolRef and workerPool are mutually exclusive" type AgentHarnessSubstrateSpec struct { - // WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). - // Mutually exclusive with workerPool. + // WorkerPoolRef references an existing ate.dev WorkerPool in the harness namespace. + // When unset, the controller uses its configured default WorkerPool. // +optional WorkerPoolRef *TypedLocalReference `json:"workerPoolRef,omitempty"` - // WorkerPool creates a dedicated WorkerPool in the harness namespace when workerPoolRef is unset. - // +optional - WorkerPool *AgentHarnessSubstrateWorkerPoolSpec `json:"workerPool,omitempty"` - // SnapshotsConfig configures actor memory snapshots. Defaults to // gs://ate-snapshots// when unset. // +optional @@ -95,16 +77,6 @@ type AgentHarnessSubstrateSpec struct { // +optional WorkloadImage string `json:"workloadImage,omitempty"` - // ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. 
- // When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. - // +optional - ActorTemplateRef *TypedLocalReference `json:"actorTemplateRef,omitempty"` - - // GatewayPort is the port OpenClaw listens on inside the actor (Substrate routes to :80 today). - // +optional - // +kubebuilder:default=80 - GatewayPort int32 `json:"gatewayPort,omitempty"` - // GatewayToken is the OpenClaw gateway Bearer token for this harness. // Prefer gatewayTokenSecretRef for production secrets. // +optional @@ -321,32 +293,14 @@ type AgentHarnessStatus struct { // Connection is populated by the controller when the harness is ready. // +optional Connection *AgentHarnessConnection `json:"connection,omitempty"` - - // Substrate records observed Substrate provisioning state. - // +optional - Substrate *AgentHarnessSubstrateStatus `json:"substrate,omitempty"` -} - -// AgentHarnessSubstrateStatus is observed Substrate control-plane state for this harness. -type AgentHarnessSubstrateStatus struct { - // Conditions describe substrate provisioning progress (e.g. ActorTemplate golden snapshot). - // +optional - // +listType=map - // +listMapKey=type - Conditions []metav1.Condition `json:"conditions,omitempty"` } -// AgentHarnessSubstrateConditionType enumerates substrate-specific condition types. -const ( - AgentHarnessSubstrateConditionTypeActorTemplateReady = "ActorTemplateReady" - // AgentHarnessSubstrateConditionTypeResourcesCleaned is True when managed Substrate CRs are gone during delete. - AgentHarnessSubstrateConditionTypeResourcesCleaned = "ResourcesCleaned" -) - // AgentHarnessConditionType enumerates the condition types an AgentHarness may report. 
const ( - AgentHarnessConditionTypeReady = "Ready" - AgentHarnessConditionTypeAccepted = "Accepted" + AgentHarnessConditionTypeReady = "Ready" + AgentHarnessConditionTypeAccepted = "Accepted" + AgentHarnessConditionTypeActorTemplateReady = "ActorTemplateReady" + AgentHarnessConditionTypeActorReady = "ActorReady" ) // +kubebuilder:object:root=true diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 9694e72608..1a7dfd4e90 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -351,11 +351,6 @@ func (in *AgentHarnessStatus) DeepCopyInto(out *AgentHarnessStatus) { *out = new(AgentHarnessConnection) **out = **in } - if in.Substrate != nil { - in, out := &in.Substrate, &out.Substrate - *out = new(AgentHarnessSubstrateStatus) - (*in).DeepCopyInto(*out) - } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessStatus. @@ -406,21 +401,11 @@ func (in *AgentHarnessSubstrateSpec) DeepCopyInto(out *AgentHarnessSubstrateSpec *out = new(TypedLocalReference) **out = **in } - if in.WorkerPool != nil { - in, out := &in.WorkerPool, &out.WorkerPool - *out = new(AgentHarnessSubstrateWorkerPoolSpec) - **out = **in - } if in.SnapshotsConfig != nil { in, out := &in.SnapshotsConfig, &out.SnapshotsConfig *out = new(AgentHarnessSubstrateSnapshotsConfig) **out = **in } - if in.ActorTemplateRef != nil { - in, out := &in.ActorTemplateRef, &out.ActorTemplateRef - *out = new(TypedLocalReference) - **out = **in - } if in.GatewayTokenSecretRef != nil { in, out := &in.GatewayTokenSecretRef, &out.GatewayTokenSecretRef *out = new(TypedLocalReference) @@ -438,43 +423,6 @@ func (in *AgentHarnessSubstrateSpec) DeepCopy() *AgentHarnessSubstrateSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *AgentHarnessSubstrateStatus) DeepCopyInto(out *AgentHarnessSubstrateStatus) { - *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]metav1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateStatus. -func (in *AgentHarnessSubstrateStatus) DeepCopy() *AgentHarnessSubstrateStatus { - if in == nil { - return nil - } - out := new(AgentHarnessSubstrateStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopyInto(out *AgentHarnessSubstrateWorkerPoolSpec) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentHarnessSubstrateWorkerPoolSpec. -func (in *AgentHarnessSubstrateWorkerPoolSpec) DeepCopy() *AgentHarnessSubstrateWorkerPoolSpec { - if in == nil { - return nil - } - out := new(AgentHarnessSubstrateWorkerPoolSpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *AgentHarnessTelegramChannelSpec) DeepCopyInto(out *AgentHarnessTelegramChannelSpec) { *out = *in diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go index 7e771b0ffb..96478d5a91 100644 --- a/go/core/internal/controller/agentharness_controller.go +++ b/go/core/internal/controller/agentharness_controller.go @@ -13,10 +13,12 @@ package controller import ( "context" "fmt" + "reflect" "strconv" "time" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/events" ctrl "sigs.k8s.io/controller-runtime" @@ -24,6 +26,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/predicate" "github.com/kagent-dev/kagent/go/api/v1alpha2" @@ -53,11 +56,11 @@ const ( // harness VMs are a generic exec/SSH-able environment with no in-cluster // workload owned by kagent. 
type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateProvisioner *substrate.Provisioner + Client client.Client + Recorder events.EventRecorder + OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + SubstrateLifecycle substrate.AgentHarnessLifecycle } func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { @@ -82,7 +85,7 @@ func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxba // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/finalizers,verbs=update -// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get // +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch @@ -128,47 +131,45 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request runtime := effectiveAgentHarnessRuntime(&ah) if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { - if r.SubstrateProvisioner == nil { - log.Error(nil, "substrate provisioner not configured") + if r.SubstrateLifecycle == nil { + log.Error(nil, "substrate lifecycle not configured") setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, 
metav1.ConditionFalse, - "SubstrateProvisionerUnavailable", - "substrate runtime requires a configured substrate provisioner (set --substrate-ate-api-endpoint)") + "SubstrateLifecycleUnavailable", + "substrate runtime requires configured substrate lifecycle (set --substrate-ate-api-endpoint)") setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "SubstrateProvisionerUnavailable", "") + "SubstrateLifecycleUnavailable", "") if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { return ctrl.Result{}, err } return ctrl.Result{}, nil } - provRes, err := r.SubstrateProvisioner.Ensure(ctx, &ah) + lifecycleState, err := r.SubstrateLifecycle.EnsureGeneratedTemplate(ctx, &ah) if err != nil { - log.Error(err, "substrate provision failed") + log.Error(err, "substrate lifecycle reconciliation failed") setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, - "SubstrateProvisionFailed", err.Error()) + "SubstrateLifecycleFailed", err.Error()) setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "SubstrateProvisionFailed", "") + "SubstrateLifecycleFailed", "") if perr := r.patchAgentHarnessStatus(ctx, &ah); perr != nil { return ctrl.Result{}, perr } return ctrl.Result{}, err } - if provRes.ActorTemplateReady { - setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + if lifecycleState.ActorTemplateReady { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionTrue, "Ready", "ActorTemplate golden snapshot is ready") } else { - setSubstrateCondition(&ah, v1alpha2.AgentHarnessSubstrateConditionTypeActorTemplateReady, + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionFalse, "NotReady", "waiting for ActorTemplate golden snapshot") } - // Persist status before metadata annotation patch (client Patch can refresh ah and drop 
in-memory status). if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { return ctrl.Result{}, err } - if err := r.patchAgentHarnessProvisionAnnotations(ctx, &ah, provRes); err != nil { - return ctrl.Result{}, err - } - if !provRes.ActorTemplateReady { + if !lifecycleState.ActorTemplateReady { setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, - "SubstrateProvisioning", "waiting for ActorTemplate golden snapshot") + "SubstrateLifecyclePending", "waiting for ActorTemplate golden snapshot") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, + "ActorNotCreated", "waiting for ActorTemplate before creating actor") setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "ActorTemplateNotReady", "ActorTemplate is not Ready yet") if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { @@ -177,7 +178,7 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil } if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { - return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate provision: %w", err) + return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate lifecycle reconciliation: %w", err) } } @@ -207,10 +208,12 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request st, reason, msg := backend.GetStatus(ctx, res.Handle) pending := r.postReadyBootstrapPending(&ah) if st == metav1.ConditionTrue && pending { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "BootstrapPending", "gateway sandbox is ready; waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") } else { + setAgentHarnessCondition(&ah, 
v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, st, reason, msg) } ah.Status.ObservedGeneration = ah.Generation @@ -232,6 +235,7 @@ func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{}, fmt.Errorf("get AgentHarness after bootstrap: %w", err) } st2, reason2, msg2 := backend.GetStatus(ctx, res.Handle) + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeActorReady, st2, reason2, msg2) setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeReady, st2, reason2, msg2) latest.Status.ObservedGeneration = latest.Generation if err := r.Client.Status().Update(ctx, &latest); err != nil { @@ -280,7 +284,7 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } if substrateDeleteTimedOut(ah) { - setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "DeleteTimeout", "substrate cleanup exceeded timeout") if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { return ctrl.Result{}, err @@ -295,13 +299,11 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } if actorID != "" { - var actorDone bool + backend := r.backendFor(ah) + actorDone := true var err error - if runtime == v1alpha2.AgentHarnessRuntimeSubstrate && r.SubstrateProvisioner != nil { - actorDone, err = r.SubstrateProvisioner.AdvanceActorDelete(ctx, actorID) - } else if del := r.backendFor(ah); del != nil { - err = del.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) - actorDone = err == nil + if backend != nil { + actorDone, err = backend.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) } else { actorDone = true } @@ -312,7 +314,7 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph return ctrl.Result{RequeueAfter: 
agentHarnessNotReadyRequeue}, err } if !actorDone { - setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for substrate actor %q deletion", actorID)) if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { return ctrl.Result{}, err @@ -326,24 +328,24 @@ func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alph } if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { - if r.SubstrateProvisioner == nil { + if r.SubstrateLifecycle == nil { return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, - fmt.Errorf("substrate provisioner is not configured") + fmt.Errorf("substrate lifecycle is not configured") } - complete, err := r.SubstrateProvisioner.AdvanceDelete(ctx, ah) + complete, err := r.SubstrateLifecycle.CleanupGeneratedTemplate(ctx, ah) if err != nil { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("delete substrate resources: %w", err) + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("cleanup substrate lifecycle: %w", err) } if !complete { - setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, - metav1.ConditionFalse, "CleanupInProgress", "waiting for managed Substrate resources to be removed") + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionFalse, "GoldenActorDeleting", "waiting for generated ActorTemplate golden actor deletion") if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { return ctrl.Result{}, err } return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil } - setSubstrateCondition(ah, v1alpha2.AgentHarnessSubstrateConditionTypeResourcesCleaned, - metav1.ConditionTrue, "Cleaned", "managed Substrate resources removed") + setAgentHarnessCondition(ah, 
v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionFalse, "Deleting", "generated ActorTemplate will be garbage collected") if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { return ctrl.Result{}, err } @@ -364,26 +366,19 @@ func substrateDeleteTimedOut(ah *v1alpha2.AgentHarness) bool { } func (r *AgentHarnessController) patchAgentHarnessStatus(ctx context.Context, ah *v1alpha2.AgentHarness) error { - if err := r.Client.Status().Update(ctx, ah); err != nil { - return fmt.Errorf("update AgentHarness status: %w", err) - } - return nil -} - -func (r *AgentHarnessController) patchAgentHarnessProvisionAnnotations(ctx context.Context, ah *v1alpha2.AgentHarness, prov substrate.EnsureResult) error { - base := ah.DeepCopy() - if ah.Annotations == nil { - ah.Annotations = map[string]string{} - } - if prov.ManagedWorkerPool { - ah.Annotations[substrate.AnnotationManagedWorkerPool] = "true" + var current v1alpha2.AgentHarness + if err := r.Client.Get(ctx, client.ObjectKeyFromObject(ah), &current); err != nil { + return fmt.Errorf("get AgentHarness before status update: %w", err) } - if prov.ManagedActorTemplate { - ah.Annotations[substrate.AnnotationManagedActorTemplate] = "true" + if reflect.DeepEqual(current.Status, ah.Status) { + *ah = current + return nil } - if err := r.Client.Patch(ctx, ah, client.MergeFrom(base)); err != nil { - return fmt.Errorf("patch AgentHarness substrate annotations: %w", err) + current.Status = ah.Status + if err := r.Client.Status().Update(ctx, &current); err != nil { + return fmt.Errorf("update AgentHarness status: %w", err) } + *ah = current return nil } @@ -395,39 +390,12 @@ func effectiveAgentHarnessRuntime(ah *v1alpha2.AgentHarness) v1alpha2.AgentHarne } func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { - setConditions(&ah.Status.Conditions, ah.Generation, t, s, reason, msg) -} - -func setSubstrateCondition(ah *v1alpha2.AgentHarness, t string, s 
metav1.ConditionStatus, reason, msg string) { - if ah.Status.Substrate == nil { - ah.Status.Substrate = &v1alpha2.AgentHarnessSubstrateStatus{} - } - setConditions(&ah.Status.Substrate.Conditions, ah.Generation, t, s, reason, msg) -} - -func setConditions(conditions *[]metav1.Condition, generation int64, t string, s metav1.ConditionStatus, reason, msg string) { - now := metav1.Now() - for i := range *conditions { - c := &(*conditions)[i] - if c.Type != t { - continue - } - if c.Status != s { - c.LastTransitionTime = now - } - c.Status = s - c.Reason = reason - c.Message = msg - c.ObservedGeneration = generation - return - } - *conditions = append(*conditions, metav1.Condition{ + meta.SetStatusCondition(&ah.Status.Conditions, metav1.Condition{ Type: t, Status: s, Reason: reason, Message: msg, - LastTransitionTime: now, - ObservedGeneration: generation, + ObservedGeneration: ah.Generation, }) } @@ -435,10 +403,26 @@ func setConditions(conditions *[]metav1.Condition, generation int64, t string, s func (r *AgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { b := ctrl.NewControllerManagedBy(mgr). WithOptions(controller.Options{NeedLeaderElection: new(true)}). 
- For(&v1alpha2.AgentHarness{}, builder.WithPredicates(predicate.Or( - predicate.GenerationChangedPredicate{}, - predicate.LabelChangedPredicate{}, - ))) + For(&v1alpha2.AgentHarness{}, builder.WithPredicates(agentHarnessPrimaryPredicate())) b = r.substrateWatches(b) return b.Named("agentharness").Complete(r) } + +func agentHarnessPrimaryPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(event.CreateEvent) bool { return true }, + DeleteFunc: func(event.DeleteEvent) bool { return true }, + UpdateFunc: func(e event.UpdateEvent) bool { + if e.ObjectOld == nil || e.ObjectNew == nil { + return true + } + if e.ObjectNew.GetGeneration() != e.ObjectOld.GetGeneration() { + return true + } + if !reflect.DeepEqual(e.ObjectNew.GetLabels(), e.ObjectOld.GetLabels()) { + return true + } + return e.ObjectOld.GetDeletionTimestamp().IsZero() && !e.ObjectNew.GetDeletionTimestamp().IsZero() + }, + } +} diff --git a/go/core/internal/controller/agentharness_controller_test.go b/go/core/internal/controller/agentharness_controller_test.go new file mode 100644 index 0000000000..5f82f1b010 --- /dev/null +++ b/go/core/internal/controller/agentharness_controller_test.go @@ -0,0 +1,296 @@ +package controller + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" +) + +type fakeSubstrateLifecycle struct { + state substrate.LifecycleState + ensureErr error + cleanupDone bool + cleanupErr error + 
ensureCalls int + cleanupCalls int +} + +func (f *fakeSubstrateLifecycle) EnsureGeneratedTemplate(_ context.Context, _ *v1alpha2.AgentHarness) (substrate.LifecycleState, error) { + f.ensureCalls++ + return f.state, f.ensureErr +} + +func (f *fakeSubstrateLifecycle) CleanupGeneratedTemplate(_ context.Context, _ *v1alpha2.AgentHarness) (bool, error) { + f.cleanupCalls++ + return f.cleanupDone, f.cleanupErr +} + +type fakeAgentHarnessBackend struct { + ensureCalls int + deleteCalls int + readyCalls int + + ensureHandle string + endpoint string + status metav1.ConditionStatus + reason string + message string + + deleteDone bool + deleteErr error + readyErr error +} + +func (f *fakeAgentHarnessBackend) Name() v1alpha2.AgentHarnessBackendType { + return v1alpha2.AgentHarnessBackendOpenClaw +} + +func (f *fakeAgentHarnessBackend) EnsureAgentHarness(context.Context, *v1alpha2.AgentHarness) (sandboxbackend.EnsureResult, error) { + f.ensureCalls++ + id := f.ensureHandle + if id == "" { + id = "actor-1" + } + return sandboxbackend.EnsureResult{ + Handle: sandboxbackend.Handle{ID: id}, + Endpoint: f.endpoint, + }, nil +} + +func (f *fakeAgentHarnessBackend) GetStatus(context.Context, sandboxbackend.Handle) (metav1.ConditionStatus, string, string) { + st := f.status + if st == "" { + st = metav1.ConditionTrue + } + reason := f.reason + if reason == "" { + reason = "Running" + } + return st, reason, f.message +} + +func (f *fakeAgentHarnessBackend) DeleteAgentHarness(context.Context, sandboxbackend.Handle) (bool, error) { + f.deleteCalls++ + return f.deleteDone, f.deleteErr +} + +func (f *fakeAgentHarnessBackend) OnAgentHarnessReady(context.Context, *v1alpha2.AgentHarness, sandboxbackend.Handle) error { + f.readyCalls++ + return f.readyErr +} + +func TestAgentHarnessController_SubstrateWaitsForGeneratedTemplate(t *testing.T) { + ctx := context.Background() + ah := newSubstrateHarness("kagent", "claw") + controller := newAgentHarnessTestController(t, ah) + lifecycle := 
&fakeSubstrateLifecycle{state: substrate.LifecycleState{ActorTemplateReady: false}} + backend := &fakeAgentHarnessBackend{} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, agentHarnessNotReadyRequeue, result.RequeueAfter) + require.Equal(t, 1, lifecycle.ensureCalls) + require.Zero(t, backend.ensureCalls, "actor backend must not run before ActorTemplate is ready") + + latest := getAgentHarness(t, controller.Client, ah) + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, "SubstrateLifecyclePending") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionFalse, "NotReady") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, "ActorNotCreated") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "ActorTemplateNotReady") +} + +func TestAgentHarnessController_SubstrateLifecycleErrorSetsStatus(t *testing.T) { + ctx := context.Background() + ah := newSubstrateHarness("kagent", "claw") + controller := newAgentHarnessTestController(t, ah) + lifecycle := &fakeSubstrateLifecycle{ensureErr: errors.New("workerpool missing")} + backend := &fakeAgentHarnessBackend{} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + _, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.ErrorContains(t, err, "workerpool missing") + require.Equal(t, 1, lifecycle.ensureCalls) + require.Zero(t, backend.ensureCalls) + + latest := 
getAgentHarness(t, controller.Client, ah) + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, "SubstrateLifecycleFailed") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, "SubstrateLifecycleFailed") +} + +func TestAgentHarnessController_SubstrateReadyCreatesActorAndRunsBootstrap(t *testing.T) { + ctx := context.Background() + ah := newSubstrateHarness("kagent", "claw") + controller := newAgentHarnessTestController(t, ah) + lifecycle := &fakeSubstrateLifecycle{state: substrate.LifecycleState{ActorTemplateReady: true}} + backend := &fakeAgentHarnessBackend{ensureHandle: "actor-1", endpoint: "kagent gateway: /api/agentharnesses/kagent/claw/gateway/"} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + require.Equal(t, 1, lifecycle.ensureCalls) + require.Equal(t, 1, backend.ensureCalls) + require.Equal(t, 1, backend.readyCalls) + + latest := getAgentHarness(t, controller.Client, ah) + require.NotNil(t, latest.Status.BackendRef) + require.Equal(t, "actor-1", latest.Status.BackendRef.ID) + require.NotNil(t, latest.Status.Connection) + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, "AgentHarnessAccepted") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionTrue, "Ready") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionTrue, "Running") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionTrue, "Running") + require.Equal(t, "1", latest.Annotations[annotationAgentHarnessBootstrapGeneration]) +} + +func 
TestAgentHarnessController_SubstrateDeleteWaitsForActorBeforeTemplateCleanup(t *testing.T) { + ctx := context.Background() + ah := newDeletingSubstrateHarness("kagent", "claw") + ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{Backend: v1alpha2.AgentHarnessBackendOpenClaw, ID: "actor-1"} + controller := newAgentHarnessTestController(t, ah) + lifecycle := &fakeSubstrateLifecycle{cleanupDone: true} + backend := &fakeAgentHarnessBackend{deleteDone: false} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, agentHarnessNotReadyRequeue, result.RequeueAfter) + require.Equal(t, 1, backend.deleteCalls) + require.Zero(t, lifecycle.cleanupCalls, "template cleanup must wait for harness actor deletion") + + latest := getAgentHarness(t, controller.Client, ah) + require.NotNil(t, latest.Status.BackendRef) + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, "ActorDeleting") + require.Contains(t, latest.Finalizers, agentHarnessFinalizer) +} + +func TestAgentHarnessController_SubstrateDeleteWaitsForGeneratedTemplateCleanup(t *testing.T) { + ctx := context.Background() + ah := newDeletingSubstrateHarness("kagent", "claw") + ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{Backend: v1alpha2.AgentHarnessBackendOpenClaw, ID: "actor-1"} + controller := newAgentHarnessTestController(t, ah) + lifecycle := &fakeSubstrateLifecycle{cleanupDone: false} + backend := &fakeAgentHarnessBackend{deleteDone: true} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + result, err := controller.Reconcile(ctx, 
ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, agentHarnessNotReadyRequeue, result.RequeueAfter) + require.Equal(t, 1, backend.deleteCalls) + require.Equal(t, 1, lifecycle.cleanupCalls) + + latest := getAgentHarness(t, controller.Client, ah) + require.Nil(t, latest.Status.BackendRef) + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionFalse, "GoldenActorDeleting") + require.Contains(t, latest.Finalizers, agentHarnessFinalizer) +} + +func TestAgentHarnessController_SubstrateDeleteRemovesFinalizerAfterCleanup(t *testing.T) { + ctx := context.Background() + ah := newDeletingSubstrateHarness("kagent", "claw") + ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{Backend: v1alpha2.AgentHarnessBackendOpenClaw, ID: "actor-1"} + controller := newAgentHarnessTestController(t, ah) + lifecycle := &fakeSubstrateLifecycle{cleanupDone: true} + backend := &fakeAgentHarnessBackend{deleteDone: true} + controller.SubstrateLifecycle = lifecycle + controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ + v1alpha2.AgentHarnessBackendOpenClaw: backend, + } + + result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + require.Equal(t, 1, backend.deleteCalls) + require.Equal(t, 1, lifecycle.cleanupCalls) + + var latest v1alpha2.AgentHarness + err = controller.Client.Get(ctx, client.ObjectKeyFromObject(ah), &latest) + require.True(t, apierrors.IsNotFound(err), "fake client should complete deletion after finalizer removal") +} + +func newAgentHarnessTestController(t *testing.T, objects ...client.Object) *AgentHarnessController { + t.Helper() + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + kube := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). 
+ WithStatusSubresource(&v1alpha2.AgentHarness{}). + Build() + return &AgentHarnessController{Client: kube} +} + +func newSubstrateHarness(namespace, name string) *v1alpha2.AgentHarness { + return &v1alpha2.AgentHarness{ + TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Generation: 1, + Finalizers: []string{agentHarnessFinalizer}, + }, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Backend: v1alpha2.AgentHarnessBackendOpenClaw, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "token", + }, + }, + } +} + +func newDeletingSubstrateHarness(namespace, name string) *v1alpha2.AgentHarness { + ah := newSubstrateHarness(namespace, name) + now := metav1.Now() + ah.DeletionTimestamp = &now + return ah +} + +func getAgentHarness(t *testing.T, kube client.Client, ah *v1alpha2.AgentHarness) *v1alpha2.AgentHarness { + t.Helper() + var latest v1alpha2.AgentHarness + err := kube.Get(context.Background(), client.ObjectKeyFromObject(ah), &latest) + if apierrors.IsNotFound(err) { + t.Fatalf("AgentHarness %s unexpectedly not found", client.ObjectKeyFromObject(ah)) + } + require.NoError(t, err) + return &latest +} + +func requireCondition(t *testing.T, ah *v1alpha2.AgentHarness, conditionType string, status metav1.ConditionStatus, reason string) { + t.Helper() + condition := meta.FindStatusCondition(ah.Status.Conditions, conditionType) + require.NotNil(t, condition, "missing condition %s", conditionType) + require.Equal(t, status, condition.Status, "condition %s status", conditionType) + require.Equal(t, reason, condition.Reason, "condition %s reason", conditionType) +} diff --git a/go/core/internal/controller/agentharness_substrate_watches.go b/go/core/internal/controller/agentharness_substrate_watches.go index 14dbf9b59e..948778a673 100644 --- a/go/core/internal/controller/agentharness_substrate_watches.go +++ 
b/go/core/internal/controller/agentharness_substrate_watches.go @@ -2,11 +2,8 @@ package controller import ( "context" - "strings" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" - appsv1 "k8s.io/api/apps/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -29,69 +26,13 @@ func (r *AgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx con }} } -func (r *AgentHarnessController) enqueueAgentHarnessForWorkerPoolDeployment(ctx context.Context, obj client.Object) []reconcile.Request { - deploy, ok := obj.(*appsv1.Deployment) - if !ok { - return nil - } - harnessName := substrate.HarnessNameFromLabels(deploy.GetLabels()) - if harnessName == "" { - harnessName = r.harnessNameFromWorkerPoolDeployment(ctx, deploy) - } - if harnessName == "" { - return nil - } - return []reconcile.Request{{ - NamespacedName: types.NamespacedName{ - Namespace: deploy.Namespace, - Name: harnessName, - }, - }} -} - -// harnessNameFromWorkerPoolDeployment resolves the harness via the owning WorkerPool's labels. -// Substrate names deployments "{workerPool}-deployment" and does not copy harness labels onto them. 
-func (r *AgentHarnessController) harnessNameFromWorkerPoolDeployment(ctx context.Context, deploy *appsv1.Deployment) string { - if r == nil || r.Client == nil || deploy == nil { - return "" - } - for _, ref := range deploy.GetOwnerReferences() { - if ref.Kind != "WorkerPool" || ref.Controller == nil || !*ref.Controller { - continue - } - if !strings.Contains(ref.APIVersion, "ate.dev") { - continue - } - var wp atev1alpha1.WorkerPool - key := types.NamespacedName{Namespace: deploy.Namespace, Name: ref.Name} - if err := r.Client.Get(ctx, key, &wp); err != nil { - if apierrors.IsNotFound(err) { - continue - } - return "" - } - if name := substrate.HarnessNameFromLabels(wp.GetLabels()); name != "" { - return name - } - } - return "" -} - func (r *AgentHarnessController) substrateWatches(b *builder.Builder) *builder.Builder { - if r == nil || r.SubstrateProvisioner == nil { + if r == nil || r.SubstrateLifecycle == nil { return b } return b. - Watches( - &atev1alpha1.WorkerPool{}, - handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), - ). Watches( &atev1alpha1.ActorTemplate{}, handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForSubstrateResource), - ). 
- Watches( - &appsv1.Deployment{}, - handler.EnqueueRequestsFromMapFunc(r.enqueueAgentHarnessForWorkerPoolDeployment), ) } diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 017c1ce7ee..6a17436ccc 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -154,18 +154,17 @@ type Config struct { CallTimeout time.Duration } Substrate struct { - AteAPIEndpoint string - Insecure bool - DialTimeout time.Duration - CallTimeout time.Duration - DefaultActorTemplateNamespace string - DefaultActorTemplateName string - PauseImage string - RunscAMD64URL string - RunscAMD64SHA256 string - RunscARM64URL string - RunscARM64SHA256 string - AteomImage string + AteAPIEndpoint string + Insecure bool + DialTimeout time.Duration + CallTimeout time.Duration + DefaultWorkerPoolNamespace string + DefaultWorkerPoolName string + PauseImage string + RunscAMD64URL string + RunscAMD64SHA256 string + RunscARM64URL string + RunscARM64SHA256 string } } @@ -230,14 +229,13 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.BoolVar(&cfg.Substrate.Insecure, "substrate-ate-api-insecure", false, "Dial ate-api without TLS (local dev only).") commandLine.DurationVar(&cfg.Substrate.DialTimeout, "substrate-dial-timeout", 10*time.Second, "Timeout for the initial dial to ate-api.") commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") - commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateNamespace, "substrate-default-actor-template-namespace", "", "Legacy fallback ActorTemplate namespace when adopting an external template (set spec.substrate.actorTemplateRef instead).") - commandLine.StringVar(&cfg.Substrate.DefaultActorTemplateName, "substrate-default-actor-template-name", "", "Legacy fallback ActorTemplate name when adopting an external template (set spec.substrate.actorTemplateRef instead).") - commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", 
"gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for auto-provisioned ActorTemplates.") + commandLine.StringVar(&cfg.Substrate.DefaultWorkerPoolNamespace, "substrate-default-workerpool-namespace", kagentNamespace, "Default Agent Substrate WorkerPool namespace when spec.substrate.workerPoolRef is unset.") + commandLine.StringVar(&cfg.Substrate.DefaultWorkerPoolName, "substrate-default-workerpool-name", "", "Default Agent Substrate WorkerPool name when spec.substrate.workerPoolRef is unset.") + commandLine.StringVar(&cfg.Substrate.PauseImage, "substrate-pause-image", "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", "Pause image for generated ActorTemplates.") commandLine.StringVar(&cfg.Substrate.RunscAMD64URL, "substrate-runsc-amd64-url", "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc", "gVisor runsc URL for amd64.") commandLine.StringVar(&cfg.Substrate.RunscAMD64SHA256, "substrate-runsc-amd64-sha256", "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63", "gVisor runsc sha256 for amd64.") commandLine.StringVar(&cfg.Substrate.RunscARM64URL, "substrate-runsc-arm64-url", "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc", "gVisor runsc URL for arm64.") commandLine.StringVar(&cfg.Substrate.RunscARM64SHA256, "substrate-runsc-arm64-sha256", "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9", "gVisor runsc sha256 for arm64.") - commandLine.StringVar(&cfg.Substrate.AteomImage, "substrate-ateom-image", "", "Default ateom herder image for auto-provisioned Substrate WorkerPools. Per-harness spec.substrate.workerPool.ateomImage overrides this.") commandLine.StringVar(&agent_translator.DefaultServiceAccountName, "default-service-account-name", "", "Global default ServiceAccount name for agent pods. 
When set, agents without an explicit serviceAccountName will use this instead of creating a per-agent ServiceAccount.") @@ -462,7 +460,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne clientOpts := client.Options{} if len(watchNamespacesList) > 0 { // In namespaced RBAC mode a Role cannot grant access to cluster-scoped - // resources, so prevent the cached client from starting a cluster-scoped + // resources, so prevent the cached client from starting a cluster-scoped // Namespace informer whose list/watch would keep crashing. clientOpts.Cache = &client.CacheOptions{ DisableFor: []client.Object{&corev1.Namespace{}}, @@ -616,16 +614,16 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne } } if len(openshellBackends) > 0 || len(substrateBackends) > 0 { - var substrateProvisioner *substrate.Provisioner + var substrateLifecycle *substrate.Lifecycle if len(substrateBackends) > 0 { - substrateProvisioner = substrateProvisionerFromConfig(kubeClient, &cfg, substrateAteClient) + substrateLifecycle = substrateLifecycleFromConfig(kubeClient, &cfg, substrateAteClient) } if err := (&controller.AgentHarnessController{ - Client: kubeClient, - Recorder: mgr.GetEventRecorder("agentharness-controller"), - OpenshellBackends: openshellBackends, - SubstrateBackends: substrateBackends, - SubstrateProvisioner: substrateProvisioner, + Client: kubeClient, + Recorder: mgr.GetEventRecorder("agentharness-controller"), + OpenshellBackends: openshellBackends, + SubstrateBackends: substrateBackends, + SubstrateLifecycle: substrateLifecycle, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "AgentHarness") os.Exit(1) @@ -659,7 +657,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne } if err := reconcilerutils.SetupOwnerIndexes(mgr, rcnclr.GetOwnedResourceTypes()); err != nil { - setupLog.Error(err, "failed to setup indexes for owned 
resources") + setupLog.Error(err, "failed to setup indexes for owned lifecycle") os.Exit(1) } @@ -823,8 +821,8 @@ func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alph return nil, nil, err } - ocl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendOpenClaw, nil) - ncl := substrate.NewOpenClawBackend(client, sc, v1alpha2.AgentHarnessBackendNemoClaw, nil) + ocl := substrate.NewOpenClawBackend(client, v1alpha2.AgentHarnessBackendOpenClaw, nil) + ncl := substrate.NewOpenClawBackend(client, v1alpha2.AgentHarnessBackendNemoClaw, nil) return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ v1alpha2.AgentHarnessBackendOpenClaw: ocl, v1alpha2.AgentHarnessBackendNemoClaw: ncl, @@ -833,39 +831,27 @@ func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alph func substrateAppConfig(cfg *Config) substrate.Config { sc := substrate.Config{ - AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, - Insecure: cfg.Substrate.Insecure, - DialTimeout: cfg.Substrate.DialTimeout, - CallTimeout: cfg.Substrate.CallTimeout, - DefaultActorTemplateNamespace: cfg.Substrate.DefaultActorTemplateNamespace, - DefaultActorTemplateName: cfg.Substrate.DefaultActorTemplateName, - ProvisionDefaults: substrate.ProvisionDefaults{ - PauseImage: cfg.Substrate.PauseImage, - RunscAMD64URL: cfg.Substrate.RunscAMD64URL, - RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, - RunscARM64URL: cfg.Substrate.RunscARM64URL, - RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, - DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, - }, + AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, + Insecure: cfg.Substrate.Insecure, + DialTimeout: cfg.Substrate.DialTimeout, + CallTimeout: cfg.Substrate.CallTimeout, } return sc } -func substrateProvisionerFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Provisioner { - return &substrate.Provisioner{ - Client: kubeClient, - 
Ate: ate, - Defaults: substrate.ProvisionDefaults{ - PauseImage: cfg.Substrate.PauseImage, - RunscAMD64URL: cfg.Substrate.RunscAMD64URL, - RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, - RunscARM64URL: cfg.Substrate.RunscARM64URL, - RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, - DefaultAteomImage: cfg.Substrate.AteomImage, - DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, +func substrateLifecycleFromConfig(kubeClient client.Client, cfg *Config, ate *substrate.Client) *substrate.Lifecycle { + return substrate.NewLifecycle(kubeClient, substrate.LifecycleDefaults{ + PauseImage: cfg.Substrate.PauseImage, + RunscAMD64URL: cfg.Substrate.RunscAMD64URL, + RunscAMD64SHA256: cfg.Substrate.RunscAMD64SHA256, + RunscARM64URL: cfg.Substrate.RunscARM64URL, + RunscARM64SHA256: cfg.Substrate.RunscARM64SHA256, + DefaultWorkloadImage: openclaw.NemoclawSandboxBaseImage, + DefaultWorkerPool: types.NamespacedName{ + Namespace: cfg.Substrate.DefaultWorkerPoolNamespace, + Name: cfg.Substrate.DefaultWorkerPoolName, }, - } + }, ate) } // configureNamespaceWatching sets up the controller manager to watch specific namespaces diff --git a/go/core/pkg/sandboxbackend/async.go b/go/core/pkg/sandboxbackend/async.go index e680259949..29112ba541 100644 --- a/go/core/pkg/sandboxbackend/async.go +++ b/go/core/pkg/sandboxbackend/async.go @@ -39,9 +39,11 @@ type AsyncBackend interface { // each reconcile. GetStatus(ctx context.Context, h Handle) (metav1.ConditionStatus, string, string) - // DeleteAgentHarness releases the sandbox. NotFound must be treated as - // success so the finalizer can be removed idempotently. - DeleteAgentHarness(ctx context.Context, h Handle) error + // DeleteAgentHarness releases the sandbox. It performs at most one + // reconcile-safe delete step and returns done=true once the sandbox is gone. + // NotFound must be treated as success so the finalizer can be removed + // idempotently. 
+ DeleteAgentHarness(ctx context.Context, h Handle) (done bool, err error) // OnAgentHarnessReady runs one-time work after the AgentHarness reports // Ready (for example ExecSandbox bootstrap inside the VM). Backends that diff --git a/go/core/pkg/sandboxbackend/openshell/agentharness_openshell_client.go b/go/core/pkg/sandboxbackend/openshell/agentharness_openshell_client.go index 7394beae62..f35fec7acf 100644 --- a/go/core/pkg/sandboxbackend/openshell/agentharness_openshell_client.go +++ b/go/core/pkg/sandboxbackend/openshell/agentharness_openshell_client.go @@ -129,9 +129,9 @@ func (c *AgentHarnessOpenShellClient) GetSandboxStatus(ctx context.Context, h sa } // DeleteAgentHarnessSandbox deletes the OpenShell sandbox; NotFound is success. -func (c *AgentHarnessOpenShellClient) DeleteAgentHarnessSandbox(ctx context.Context, h sandboxbackend.Handle) error { +func (c *AgentHarnessOpenShellClient) DeleteAgentHarnessSandbox(ctx context.Context, h sandboxbackend.Handle) (bool, error) { if h.ID == "" { - return nil + return true, nil } ctx, cancel := c.CallCtx(ctx) defer cancel() @@ -139,17 +139,17 @@ func (c *AgentHarnessOpenShellClient) DeleteAgentHarnessSandbox(ctx context.Cont osCli := c.openShell() if osCli == nil { - return fmt.Errorf("openshell: OpenShell client is required") + return false, fmt.Errorf("openshell: OpenShell client is required") } _, err := osCli.DeleteSandbox(ctx, &openshellv1.DeleteSandboxRequest{Name: h.ID}) if err == nil { - return nil + return true, nil } if status.Code(err) == codes.NotFound { - return nil + return true, nil } - return fmt.Errorf("openshell DeleteSandbox %s: %w", h.ID, err) + return false, fmt.Errorf("openshell DeleteSandbox %s: %w", h.ID, err) } // ExecSandboxID resolves metadata.id for ExecSandbox RPCs. 
diff --git a/go/core/pkg/sandboxbackend/openshell/openshell.go b/go/core/pkg/sandboxbackend/openshell/openshell.go index 9802b1b9e7..299617245f 100644 --- a/go/core/pkg/sandboxbackend/openshell/openshell.go +++ b/go/core/pkg/sandboxbackend/openshell/openshell.go @@ -82,6 +82,6 @@ func (b *agentHarnessOpenShellBackend) GetStatus(ctx context.Context, h sandboxb } // DeleteAgentHarness implements AsyncBackend. -func (b *agentHarnessOpenShellBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) error { +func (b *agentHarnessOpenShellBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) (bool, error) { return b.DeleteAgentHarnessSandbox(ctx, h) } diff --git a/go/core/pkg/sandboxbackend/openshell/openshell_test.go b/go/core/pkg/sandboxbackend/openshell/openshell_test.go index 7c55ea45ab..bf154f7bcd 100644 --- a/go/core/pkg/sandboxbackend/openshell/openshell_test.go +++ b/go/core/pkg/sandboxbackend/openshell/openshell_test.go @@ -304,14 +304,20 @@ func TestDeleteSandbox(t *testing.T) { r, err := b.EnsureAgentHarness(context.Background(), sampleClawSandbox()) require.NoError(t, err) - require.NoError(t, b.DeleteAgentHarness(context.Background(), r.Handle)) + done, err := b.DeleteAgentHarness(context.Background(), r.Handle) + require.NoError(t, err) + require.True(t, done) require.Equal(t, 1, fg.deleteCalls) - require.NoError(t, b.DeleteAgentHarness(context.Background(), r.Handle)) + done, err = b.DeleteAgentHarness(context.Background(), r.Handle) + require.NoError(t, err) + require.True(t, done) require.Equal(t, 2, fg.deleteCalls) before := fg.deleteCalls - require.NoError(t, b.DeleteAgentHarness(context.Background(), sandboxbackend.Handle{})) + done, err = b.DeleteAgentHarness(context.Background(), sandboxbackend.Handle{}) + require.NoError(t, err) + require.True(t, done) require.Equal(t, before, fg.deleteCalls) } diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go index 
092e68ef92..071478d0c1 100644 --- a/go/core/pkg/sandboxbackend/substrate/config.go +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -9,11 +9,4 @@ type Config struct { Insecure bool DialTimeout time.Duration CallTimeout time.Duration - - // DefaultActorTemplateNamespace/name is a legacy fallback when status/spec refs are unset. - DefaultActorTemplateNamespace string - DefaultActorTemplateName string - - // ProvisionDefaults configures auto-created WorkerPool/ActorTemplate resources. - ProvisionDefaults ProvisionDefaults } diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision.go b/go/core/pkg/sandboxbackend/substrate/delete_provision.go deleted file mode 100644 index a662454793..0000000000 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision.go +++ /dev/null @@ -1,137 +0,0 @@ -package substrate - -import ( - "context" - "fmt" - "strings" - - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" - "github.com/kagent-dev/kagent/go/api/v1alpha2" - appsv1 "k8s.io/api/apps/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// AdvanceActorDelete deletes a harness actor via ate-api (one RPC step per call). -func (p *Provisioner) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { - if p == nil || p.Ate == nil || strings.TrimSpace(actorID) == "" { - return true, nil - } - return p.Ate.AdvanceActorDelete(ctx, actorID) -} - -// AdvanceDelete issues delete requests and observes substrate cleanup progress without blocking. -// Returns true when all kagent-managed Substrate resources for this harness are gone. 
-func (p *Provisioner) AdvanceDelete(ctx context.Context, ah *v1alpha2.AgentHarness) (bool, error) { - if ah == nil || ah.Annotations == nil { - return true, nil - } - if p.Client == nil { - return true, nil - } - - if ah.Annotations[AnnotationManagedActorTemplate] == "true" { - tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} - goldenID, err := p.goldenActorID(ctx, tmplKey) - if err != nil { - return false, err - } - if goldenID != "" { - if p.Ate == nil { - return false, fmt.Errorf("substrate ate-api client is required to delete golden actor %q", goldenID) - } - done, err := p.Ate.AdvanceActorDelete(ctx, goldenID) - if err != nil { - return false, fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) - } - if !done { - return false, nil - } - } - var tmpl atev1alpha1.ActorTemplate - if done, err := p.advanceDeleteCR(ctx, tmplKey, &tmpl); err != nil || !done { - return false, err - } - } - - if ah.Annotations[AnnotationManagedWorkerPool] == "true" { - wpKey := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} - var wp atev1alpha1.WorkerPool - if done, err := p.advanceDeleteCR(ctx, wpKey, &wp); err != nil || !done { - return false, err - } - gone, err := p.workerPoolDeploymentGone(ctx, wpKey) - if err != nil { - return false, err - } - if !gone { - return false, nil - } - } - - return true, nil -} - -func (p *Provisioner) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { - var tmpl atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { - if apierrors.IsNotFound(err) { - return "", nil - } - return "", fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) - } - return strings.TrimSpace(tmpl.Status.GoldenActorID), nil -} - -// advanceDeleteCR deletes obj when present; returns true when the object is gone. 
-func (p *Provisioner) advanceDeleteCR(ctx context.Context, key types.NamespacedName, obj client.Object) (bool, error) { - if err := p.Client.Get(ctx, key, obj); err != nil { - if apierrors.IsNotFound(err) { - return true, nil - } - return false, err - } - if obj.GetDeletionTimestamp().IsZero() { - if err := p.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { - return false, fmt.Errorf("delete %s: %w", key, err) - } - return false, nil - } - return false, nil -} - -func workerPoolDeploymentName(wpName string) string { - return wpName + "-deployment" -} - -// workerPoolDeploymentGone reports whether the substrate WorkerPool deployment is absent or fully drained. -func (p *Provisioner) workerPoolDeploymentGone(ctx context.Context, wpKey types.NamespacedName) (bool, error) { - deployKey := types.NamespacedName{Namespace: wpKey.Namespace, Name: workerPoolDeploymentName(wpKey.Name)} - var deploy appsv1.Deployment - err := p.Client.Get(ctx, deployKey, &deploy) - if apierrors.IsNotFound(err) { - return true, nil - } - if err != nil { - return false, fmt.Errorf("get WorkerPool deployment %s: %w", deployKey, err) - } - if !deploy.DeletionTimestamp.IsZero() { - return false, nil - } - if deploy.Status.Replicas == 0 && deploy.Status.ReadyReplicas == 0 { - return true, nil - } - return false, nil -} - -// HarnessLabelKey labels substrate resources managed for an AgentHarness. -const HarnessLabelKey = "kagent.dev/agent-harness" - -// HarnessNameFromLabels returns the AgentHarness name from provision labels. 
-func HarnessNameFromLabels(labels map[string]string) string { - if labels == nil { - return "" - } - return strings.TrimSpace(labels[HarnessLabelKey]) -} diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle.go b/go/core/pkg/sandboxbackend/substrate/lifecycle.go new file mode 100644 index 0000000000..51649f1118 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle.go @@ -0,0 +1,61 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "k8s.io/apimachinery/pkg/types" +) + +// EnsureGeneratedTemplate creates or updates the generated ActorTemplate and reports whether it is Ready. +func (p *Lifecycle) EnsureGeneratedTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (LifecycleState, error) { + if ah == nil || ah.Spec.Substrate == nil { + return LifecycleState{}, fmt.Errorf("spec.substrate is required") + } + + wpKey, err := p.resolveWorkerPoolRef(ctx, ah) + if err != nil { + return LifecycleState{}, err + } + + tmplKey, err := p.ensureActorTemplate(ctx, ah, wpKey) + if err != nil { + return LifecycleState{}, err + } + + ready, err := p.actorTemplateReady(ctx, tmplKey) + if err != nil { + return LifecycleState{}, err + } + + return LifecycleState{ + ActorTemplateReady: ready, + }, nil +} + +func (p *Lifecycle) resolveWorkerPoolRef(ctx context.Context, ah *v1alpha2.AgentHarness) (types.NamespacedName, error) { + if p == nil || p.Client == nil { + return types.NamespacedName{}, fmt.Errorf("substrate lifecycle kubernetes client is required") + } + key := p.Defaults.DefaultWorkerPool + if sub := ah.Spec.Substrate; sub != nil && sub.WorkerPoolRef != nil { + if name := strings.TrimSpace(sub.WorkerPoolRef.Name); name != "" { + key = types.NamespacedName{Namespace: ah.Namespace, Name: name} + } + } + if key.Name == "" { + return types.NamespacedName{}, fmt.Errorf("spec.substrate.workerPoolRef is required when no default 
substrate WorkerPool is configured") + } + if key.Namespace == "" { + key.Namespace = ah.Namespace + } + + var wp atev1alpha1.WorkerPool + if err := p.Client.Get(ctx, key, &wp); err != nil { + return types.NamespacedName{}, fmt.Errorf("get WorkerPool %s: %w", key, err) + } + return key, nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go similarity index 56% rename from go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go rename to go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go index c1ae943125..c585663f73 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go @@ -9,13 +9,36 @@ import ( "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/openclaw" corev1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { +func (p *Lifecycle) ensureActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (types.NamespacedName, error) { + key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + desired, err := p.buildActorTemplate(ctx, ah, wpKey) + if err != nil { + return types.NamespacedName{}, err + } + + existing := &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + }, + } + if _, err := controllerutil.CreateOrUpdate(ctx, p.Client, existing, func() error { + existing.Labels = mergeLabels(existing.Labels, desired.Labels) + existing.OwnerReferences = desired.OwnerReferences + existing.Spec = 
desired.Spec + return nil + }); err != nil { + return types.NamespacedName{}, fmt.Errorf("reconcile ActorTemplate %s: %w", key, err) + } + return key, nil +} + +func (p *Lifecycle) buildActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness, wpKey types.NamespacedName) (*atev1alpha1.ActorTemplate, error) { key := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} workloadImage := strings.TrimSpace(ah.Spec.Substrate.WorkloadImage) if workloadImage == "" { @@ -26,14 +49,14 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen } startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) if err != nil { - return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) + return nil, fmt.Errorf("build openclaw actor startup: %w", err) } desired := &atev1alpha1.ActorTemplate{ ObjectMeta: metav1.ObjectMeta{ Name: key.Name, Namespace: key.Namespace, - Labels: provisionLabels(ah), + Labels: lifecycleLabels(ah), }, Spec: atev1alpha1.ActorTemplateSpec{ PauseImage: p.Defaults.PauseImage, @@ -61,26 +84,26 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen }, } if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, fmt.Errorf("set ActorTemplate owner ref: %w", err) + return nil, fmt.Errorf("set ActorTemplate owner ref: %w", err) } + return desired, nil +} - var existing atev1alpha1.ActorTemplate - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil { - return types.NamespacedName{}, fmt.Errorf("create ActorTemplate %s: %w", key, err) - } - return key, nil - } else if err != nil { - return types.NamespacedName{}, err +func mergeLabels(existing, desired map[string]string) map[string]string { + if len(existing) == 0 && len(desired) == 0 { + return nil } - existing.Spec = desired.Spec - if err := 
p.Client.Update(ctx, &existing); err != nil { - return types.NamespacedName{}, fmt.Errorf("update ActorTemplate %s: %w", key, err) + merged := make(map[string]string, len(existing)+len(desired)) + for k, v := range existing { + merged[k] = v } - return key, nil + for k, v := range desired { + merged[k] = v + } + return merged } -func (p *Provisioner) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { +func (p *Lifecycle) actorTemplateReady(ctx context.Context, key types.NamespacedName) (bool, error) { var tmpl atev1alpha1.ActorTemplate if err := p.Client.Get(ctx, key, &tmpl); err != nil { return false, fmt.Errorf("get ActorTemplate %s: %w", key, err) diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go new file mode 100644 index 0000000000..4063884849 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go @@ -0,0 +1,67 @@ +package substrate + +import ( + "context" + "fmt" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/kagent-dev/kagent/go/api/v1alpha2" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" +) + +// CleanupGeneratedTemplate removes external Substrate actors that Kubernetes garbage collection cannot see. +// The generated ActorTemplate CR is deleted by owner-reference garbage collection after the +// AgentHarness finalizer is removed. WorkerPools are externally owned and are never deleted here. 
+func (p *Lifecycle) CleanupGeneratedTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (bool, error) { + if ah == nil { + return true, nil + } + if p.Client == nil { + return true, nil + } + + tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: actorTemplateName(ah)} + goldenID, err := p.goldenActorID(ctx, tmplKey) + if err != nil { + return false, err + } + if goldenID == "" { + return true, nil + } + if p.deleteActor == nil { + return false, fmt.Errorf("substrate ate-api client is required to delete golden actor %q", goldenID) + } + done, err := p.deleteActor(ctx, goldenID) + if err != nil { + return false, fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) + } + if !done { + return false, nil + } + + return true, nil +} + +func (p *Lifecycle) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { + var tmpl atev1alpha1.ActorTemplate + if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { + if apierrors.IsNotFound(err) { + return "", nil + } + return "", fmt.Errorf("get ActorTemplate %s for golden actor cleanup: %w", tmplKey, err) + } + return strings.TrimSpace(tmpl.Status.GoldenActorID), nil +} + +// HarnessLabelKey is the label key set on Substrate resources generated for an AgentHarness. +const HarnessLabelKey = "kagent.dev/agent-harness" + +// HarnessNameFromLabels returns the AgentHarness name from generated lifecycle labels.
+func HarnessNameFromLabels(labels map[string]string) string { + if labels == nil { + return "" + } + return strings.TrimSpace(labels[HarnessLabelKey]) +} diff --git a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go similarity index 58% rename from go/core/pkg/sandboxbackend/substrate/delete_provision_test.go rename to go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go index ae316c43be..2651770244 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_provision_test.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go @@ -9,23 +9,22 @@ import ( "github.com/stretchr/testify/require" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" ) -type recordingActorDeleter struct { +type recordingActorClient struct { deleted []string } -func (r *recordingActorDeleter) AdvanceActorDelete(_ context.Context, actorID string) (bool, error) { +func (r *recordingActorClient) deleteActor(_ context.Context, actorID string) (bool, error) { r.deleted = append(r.deleted, actorID) return true, nil } -func TestProvisionerAdvanceDelete_DeletesGoldenActor(t *testing.T) { +func TestLifecycleCleanupGeneratedTemplate_DeletesGoldenActor(t *testing.T) { t.Parallel() scheme := runtime.NewScheme() utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -46,39 +45,25 @@ func TestProvisionerAdvanceDelete_DeletesGoldenActor(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "peterj-claw", Namespace: ns, - Annotations: map[string]string{ - AnnotationManagedActorTemplate: "true", - }, }, } kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmpl).Build() - rec := &recordingActorDeleter{} - p := &Provisioner{Client: kube, Ate: rec} 
+ rec := &recordingActorClient{} + p := &Lifecycle{Client: kube, deleteActor: rec.deleteActor} var complete bool var err error for range 5 { - complete, err = p.AdvanceDelete(context.Background(), ah) + complete, err = p.CleanupGeneratedTemplate(context.Background(), ah) require.NoError(t, err) if complete { break } } - require.True(t, complete, "AdvanceDelete should finish within a few reconcile passes") + require.True(t, complete, "CleanupGeneratedTemplate should finish within a few reconcile passes") require.Equal(t, []string{"golden-actor-uuid"}, rec.deleted) var got atev1alpha1.ActorTemplate - require.Error(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) -} - -func TestWorkerPoolDeploymentGoneNotFound(t *testing.T) { - t.Parallel() - scheme := runtime.NewScheme() - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - kube := fake.NewClientBuilder().WithScheme(scheme).Build() - p := &Provisioner{Client: kube} - gone, err := p.workerPoolDeploymentGone(context.Background(), types.NamespacedName{Namespace: "kagent", Name: "claw-wp"}) - require.NoError(t, err) - require.True(t, gone) + require.NoError(t, kube.Get(context.Background(), client.ObjectKeyFromObject(tmpl), &got)) } diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw.go similarity index 92% rename from go/core/pkg/sandboxbackend/substrate/provision_openclaw.go rename to go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw.go index 96927611b7..4bc5608a5b 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw.go @@ -29,12 +29,12 @@ type openClawStartupScriptData struct { // buildOpenClawActorStartup returns the ateom workload startup script and container env for OpenClaw on Substrate. // When spec.modelConfigRef is set, openclaw.json includes models/agents/channels like the OpenShell bootstrap path. 
-func (p *Provisioner) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { +func (p *Lifecycle) buildOpenClawActorStartup(ctx context.Context, ah *v1alpha2.AgentHarness) (script string, env []corev1.EnvVar, err error) { if ah == nil { return "", nil, fmt.Errorf("AgentHarness is required") } if p.Client == nil { - return "", nil, fmt.Errorf("substrate provisioner kubernetes client is required") + return "", nil, fmt.Errorf("substrate lifecycle kubernetes client is required") } token, err := ResolveGatewayToken(ctx, p.Client, ah) diff --git a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw_test.go similarity index 98% rename from go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go rename to go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw_test.go index 95e58211e7..6d8578f3c3 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_openclaw_test.go @@ -52,7 +52,7 @@ func TestBuildOpenClawActorStartup_WithModelConfig(t *testing.T) { } kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - p := &Provisioner{ + p := &Lifecycle{ Client: kube, } @@ -140,7 +140,7 @@ func TestBuildOpenClawActorStartup_WithHarnessGatewayToken(t *testing.T) { t.Parallel() kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret.DeepCopy()).Build() - p := &Provisioner{ + p := &Lifecycle{ Client: kube, } ah := &v1alpha2.AgentHarness{ @@ -192,7 +192,7 @@ func TestBuildOpenClawActorStartup_WithExplicitBaseURL(t *testing.T) { } kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(secret, mc).Build() - p := &Provisioner{Client: kube, Defaults: ProvisionDefaults{}} + p := &Lifecycle{Client: kube, Defaults: LifecycleDefaults{}} script, _, err := p.buildOpenClawActorStartup(context.Background(), ah) 
require.NoError(t, err) diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go similarity index 51% rename from go/core/pkg/sandboxbackend/substrate/provision_shared.go rename to go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go index 87a526b8bd..2ec6c1fb65 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go @@ -12,48 +12,56 @@ import ( ) const ( - AnnotationManagedWorkerPool = "kagent.dev/substrate-managed-workerpool" - AnnotationManagedActorTemplate = "kagent.dev/substrate-managed-actortemplate" - - defaultWorkerPoolReplicas = int32(1) - defaultSnapshotsBucket = "ate-snapshots" - defaultOpenClawContainer = "openclaw" + defaultSnapshotsBucket = "ate-snapshots" + defaultOpenClawContainer = "openclaw" ) -// ProvisionDefaults are cluster-wide defaults for auto-provisioned Substrate CRs. -type ProvisionDefaults struct { +// LifecycleDefaults are cluster-wide defaults for generated ActorTemplate lifecycle. +type LifecycleDefaults struct { PauseImage string RunscAMD64URL string RunscAMD64SHA256 string RunscARM64URL string RunscARM64SHA256 string - DefaultAteomImage string DefaultWorkloadImage string + DefaultWorkerPool types.NamespacedName +} + +// Lifecycle reconciles the Kubernetes lifecycle that kagent owns for a substrate AgentHarness. +// WorkerPools are externally owned; this helper only resolves the selected WorkerPool. +type Lifecycle struct { + Client client.Client + Defaults LifecycleDefaults + deleteActor func(context.Context, string) (bool, error) } -// ateActorDeleter removes actors from ate-api during harness teardown. -type ateActorDeleter interface { - AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) +// AgentHarnessLifecycle is the substrate lifecycle surface used by the +// AgentHarness controller. 
+type AgentHarnessLifecycle interface { + EnsureGeneratedTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (LifecycleState, error) + CleanupGeneratedTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (bool, error) } -// Provisioner ensures WorkerPool and ActorTemplate exist for a substrate AgentHarness. -type Provisioner struct { - Client client.Client - Defaults ProvisionDefaults - // Ate deletes harness and golden snapshot actors before Substrate CRs are removed. - Ate ateActorDeleter +var _ AgentHarnessLifecycle = (*Lifecycle)(nil) + +func NewLifecycle(kube client.Client, defaults LifecycleDefaults, actors *Client) *Lifecycle { + var deleteActor func(context.Context, string) (bool, error) + if actors != nil { + deleteActor = actors.AdvanceActorDelete + } + return &Lifecycle{ + Client: kube, + Defaults: defaults, + deleteActor: deleteActor, + } } -// EnsureResult describes provisioned Substrate resources. -type EnsureResult struct { - WorkerPoolRef types.NamespacedName - ActorTemplateRef types.NamespacedName - ActorTemplateReady bool - ManagedWorkerPool bool - ManagedActorTemplate bool +// LifecycleState describes the generated Substrate lifecycle for an AgentHarness. 
+type LifecycleState struct { + ActorTemplateReady bool } -func defaultRunscConfig(d ProvisionDefaults) atev1alpha1.RunscConfig { +func defaultRunscConfig(d LifecycleDefaults) atev1alpha1.RunscConfig { return atev1alpha1.RunscConfig{ AMD64: &atev1alpha1.RunscPlatformConfig{ URL: d.RunscAMD64URL, @@ -82,17 +90,13 @@ func defaultSubstrateSnapshotsLocation(namespace, name string) string { return fmt.Sprintf("gs://%s/%s/%s", defaultSnapshotsBucket, namespace, name) } -func provisionLabels(ah *v1alpha2.AgentHarness) map[string]string { +func lifecycleLabels(ah *v1alpha2.AgentHarness) map[string]string { return map[string]string{ "app.kubernetes.io/managed-by": "kagent", "kagent.dev/agent-harness": ah.Name, } } -func workerPoolName(ah *v1alpha2.AgentHarness) string { - return truncateDNS1123(ah.Name + "-wp") -} - func actorTemplateName(ah *v1alpha2.AgentHarness) string { return truncateDNS1123(ah.Name) } diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_test.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_test.go new file mode 100644 index 0000000000..f2c3c0ac9e --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_test.go @@ -0,0 +1,146 @@ +package substrate + +import ( + "context" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestSubstrateSnapshotsLocationDefault(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + 
Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "test-token", + }, + }, + } + if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { + t.Fatalf("got default snapshots location %q", got) + } +} + +func TestResolveWorkerPoolRef(t *testing.T) { + t.Parallel() + + for _, tt := range []struct { + name string + refName string + defaultRef types.NamespacedName + wantRef types.NamespacedName + }{ + { + name: "uses default workerpool", + defaultRef: types.NamespacedName{Namespace: "kagent", Name: "default-wp"}, + wantRef: types.NamespacedName{Namespace: "kagent", Name: "default-wp"}, + }, + { + name: "spec workerpool overrides default", + refName: "custom-wp", + defaultRef: types.NamespacedName{Namespace: "kagent", Name: "default-wp"}, + wantRef: types.NamespacedName{Namespace: "kagent", Name: "custom-wp"}, + }, + } { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + ah := &v1alpha2.AgentHarness{ + TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, + ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{}, + }, + } + if tt.refName != "" { + ah.Spec.Substrate.WorkerPoolRef = &v1alpha2.TypedLocalReference{Name: tt.refName} + } + wp := &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{Name: tt.wantRef.Name, Namespace: tt.wantRef.Namespace}, + Spec: atev1alpha1.WorkerPoolSpec{ + Replicas: 1, + AteomImage: "registry.example/ateom:default", + }, + } + p := &Lifecycle{ + Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(wp).Build(), + Defaults: LifecycleDefaults{ + DefaultWorkerPool: tt.defaultRef, + }, + } + + key, err := p.resolveWorkerPoolRef(context.Background(), ah) + require.NoError(t, 
err) + require.Equal(t, tt.wantRef, key) + }) + } +} + +func TestActorTemplateName(t *testing.T) { + t.Parallel() + ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} + if got := actorTemplateName(ah); got != "my-claw" { + t.Fatalf("got %q", got) + } +} + +func TestEnsureActorTemplateDoesNotUpdateWhenDesiredStateMatches(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + var updateCalls int + kube := fake.NewClientBuilder(). + WithScheme(scheme). + WithInterceptorFuncs(interceptor.Funcs{ + Update: func(ctx context.Context, c ctrlclient.WithWatch, obj ctrlclient.Object, opts ...ctrlclient.UpdateOption) error { + if _, ok := obj.(*atev1alpha1.ActorTemplate); ok { + updateCalls++ + } + return c.Update(ctx, obj, opts...) + }, + }). + Build() + + ah := &v1alpha2.AgentHarness{ + TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "kagent", + Name: "claw", + UID: "00000000-0000-0000-0000-000000000001", + }, + Spec: v1alpha2.AgentHarnessSpec{ + Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, + Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ + GatewayToken: "test-token", + }, + }, + } + lifecycle := &Lifecycle{Client: kube} + wpKey := types.NamespacedName{Namespace: "kagent", Name: "default-wp"} + + _, err := lifecycle.ensureActorTemplate(context.Background(), ah, wpKey) + require.NoError(t, err) + _, err = lifecycle.ensureActorTemplate(context.Background(), ah, wpKey) + require.NoError(t, err) + require.Zero(t, updateCalls, "matching desired ActorTemplate should not be updated") +} diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index a3374d01fc..a66a9e45f5 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -26,7 
+26,6 @@ var dns1123Label = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`) // ClawBackend implements AsyncBackend for OpenClaw/NemoClaw on Agent Substrate. type ClawBackend struct { client *Client - cfg Config backend v1alpha2.AgentHarnessBackendType recorder record.EventRecorder } @@ -34,10 +33,9 @@ type ClawBackend struct { var _ sandboxbackend.AsyncBackend = (*ClawBackend)(nil) // NewOpenClawBackend returns a substrate backend for openclaw/nemoclaw harness types. -func NewOpenClawBackend(client *Client, cfg Config, backend v1alpha2.AgentHarnessBackendType, recorder record.EventRecorder) *ClawBackend { +func NewOpenClawBackend(client *Client, backend v1alpha2.AgentHarnessBackendType, recorder record.EventRecorder) *ClawBackend { return &ClawBackend{ client: client, - cfg: cfg, backend: backend, recorder: recorder, } @@ -56,7 +54,7 @@ func (b *ClawBackend) EnsureAgentHarness(ctx context.Context, ah *v1alpha2.Agent } actorID := ActorID(ah) - tmplNS, tmplName := actorTemplateRef(ah, b.cfg) + tmplNS, tmplName := generatedActorTemplateKey(ah) actor, err := b.client.GetActor(ctx, actorID) if err != nil { @@ -103,23 +101,20 @@ func (b *ClawBackend) GetStatus(ctx context.Context, h sandboxbackend.Handle) (m return actorStatusToCondition(actor) } -func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) error { +func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.Handle) (bool, error) { if h.ID == "" { - return nil + return true, nil } done, err := b.client.AdvanceActorDelete(ctx, h.ID) if err != nil { - return fmt.Errorf("substrate delete actor %q: %w", h.ID, err) + return false, fmt.Errorf("substrate delete actor %q: %w", h.ID, err) } - if !done { - return fmt.Errorf("substrate delete actor %q in progress", h.ID) - } - return nil + return done, nil } func (b *ClawBackend) OnAgentHarnessReady(_ context.Context, _ *v1alpha2.AgentHarness, _ sandboxbackend.Handle) error { - // OpenClaw config is baked into the 
ActorTemplate golden snapshot at provision time - // (see substrate/provision_openclaw.go — openclaw.BuildSubstrateBootstrapJSON with secretKeyRef env). + // OpenClaw config is baked into the ActorTemplate golden snapshot when the + // generated ActorTemplate is reconciled. return nil } @@ -150,19 +145,7 @@ func ActorHost(actorID string, suffix string) string { return actorID + "." + suffix } -func actorTemplateRef(ah *v1alpha2.AgentHarness, cfg Config) (string, string) { - if ah.Spec.Substrate != nil && ah.Spec.Substrate.ActorTemplateRef != nil { - if ref := ah.Spec.Substrate.ActorTemplateRef; ref.Name != "" { - return ah.Namespace, ref.Name - } - } - // Auto-provisioned template in the harness namespace (also when status was not persisted yet). - if ah.Annotations != nil && ah.Annotations[AnnotationManagedActorTemplate] == "true" { - return ah.Namespace, actorTemplateName(ah) - } - if cfg.DefaultActorTemplateNamespace != "" && cfg.DefaultActorTemplateName != "" { - return cfg.DefaultActorTemplateNamespace, cfg.DefaultActorTemplateName - } +func generatedActorTemplateKey(ah *v1alpha2.AgentHarness) (string, string) { return ah.Namespace, actorTemplateName(ah) } diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go index fa7c6c8d75..5e5b752f5f 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw_test.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw_test.go @@ -32,21 +32,15 @@ func TestActorHost(t *testing.T) { } } -func TestActorTemplateRefManagedProvisioner(t *testing.T) { +func TestGeneratedActorTemplateKey(t *testing.T) { t.Parallel() ah := &v1alpha2.AgentHarness{ ObjectMeta: metav1.ObjectMeta{ Namespace: "kagent", Name: "peterj-claw", - Annotations: map[string]string{ - AnnotationManagedActorTemplate: "true", - }, }, } - ns, name := actorTemplateRef(ah, Config{ - DefaultActorTemplateNamespace: "ate-demo-openclaw", - DefaultActorTemplateName: "openclaw", - }) + ns, name := 
generatedActorTemplateKey(ah) if ns != "kagent" || name != "peterj-claw" { t.Fatalf("got %s/%s, want kagent/peterj-claw", ns, name) } diff --git a/go/core/pkg/sandboxbackend/substrate/provision.go b/go/core/pkg/sandboxbackend/substrate/provision.go deleted file mode 100644 index 156f10aa9a..0000000000 --- a/go/core/pkg/sandboxbackend/substrate/provision.go +++ /dev/null @@ -1,58 +0,0 @@ -package substrate - -import ( - "context" - "fmt" - "strings" - - "github.com/kagent-dev/kagent/go/api/v1alpha2" - "k8s.io/apimachinery/pkg/types" -) - -// Ensure creates or updates Substrate CRs and reports whether ActorTemplate is Ready (controller requeues until true). -func (p *Provisioner) Ensure(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { - if ah == nil || ah.Spec.Substrate == nil { - return EnsureResult{}, fmt.Errorf("spec.substrate is required") - } - - if ah.Spec.Substrate.ActorTemplateRef != nil && strings.TrimSpace(ah.Spec.Substrate.ActorTemplateRef.Name) != "" { - return p.ensureAdoptedActorTemplate(ctx, ah) - } - - wpKey, managedWP, err := p.ensureWorkerPool(ctx, ah) - if err != nil { - return EnsureResult{}, err - } - - tmplKey, err := p.ensureActorTemplate(ctx, ah, wpKey) - if err != nil { - return EnsureResult{}, err - } - - ready, err := p.actorTemplateReady(ctx, tmplKey) - if err != nil { - return EnsureResult{}, err - } - - return EnsureResult{ - WorkerPoolRef: wpKey, - ActorTemplateRef: tmplKey, - ActorTemplateReady: ready, - ManagedWorkerPool: managedWP, - ManagedActorTemplate: true, - }, nil -} - -func (p *Provisioner) ensureAdoptedActorTemplate(ctx context.Context, ah *v1alpha2.AgentHarness) (EnsureResult, error) { - ref := ah.Spec.Substrate.ActorTemplateRef - tmplKey := types.NamespacedName{Namespace: ah.Namespace, Name: ref.Name} - ready, err := p.actorTemplateReady(ctx, tmplKey) - if err != nil { - return EnsureResult{}, err - } - return EnsureResult{ - ActorTemplateRef: tmplKey, - ActorTemplateReady: ready, - 
ManagedActorTemplate: false, - }, nil -} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_test.go b/go/core/pkg/sandboxbackend/substrate/provision_test.go deleted file mode 100644 index c08e87f8e0..0000000000 --- a/go/core/pkg/sandboxbackend/substrate/provision_test.go +++ /dev/null @@ -1,98 +0,0 @@ -package substrate - -import ( - "context" - "testing" - - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" - "github.com/stretchr/testify/require" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "sigs.k8s.io/controller-runtime/pkg/client/fake" - - "github.com/kagent-dev/kagent/go/api/v1alpha2" -) - -func TestSubstrateSnapshotsLocationDefault(t *testing.T) { - t.Parallel() - ah := &v1alpha2.AgentHarness{ - ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, - Spec: v1alpha2.AgentHarnessSpec{ - Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, - Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - GatewayToken: "test-token", - }, - }, - } - if got := substrateSnapshotsLocation(ah); got != "gs://ate-snapshots/kagent/claw" { - t.Fatalf("got default snapshots location %q", got) - } -} - -func TestEnsureWorkerPoolUsesDefaultAteomImage(t *testing.T) { - t.Parallel() - - for _, tt := range []struct { - name string - defaultImg string - workerPool *v1alpha2.AgentHarnessSubstrateWorkerPoolSpec - wantImage string - wantReplica int32 - }{ - { - name: "defaults omitted replicas", - defaultImg: "registry.example/ateom:default", - workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{}, - wantImage: "registry.example/ateom:default", - wantReplica: 1, - }, - { - name: "workerpool override", - defaultImg: "registry.example/ateom:default", - workerPool: &v1alpha2.AgentHarnessSubstrateWorkerPoolSpec{Replicas: 3, AteomImage: "registry.example/ateom:override"}, - wantImage: "registry.example/ateom:override", - wantReplica: 3, - }, - } { - t.Run(tt.name, func(t 
*testing.T) { - t.Parallel() - - scheme := runtime.NewScheme() - utilruntime.Must(v1alpha2.AddToScheme(scheme)) - utilruntime.Must(atev1alpha1.AddToScheme(scheme)) - - ah := &v1alpha2.AgentHarness{ - TypeMeta: metav1.TypeMeta{APIVersion: v1alpha2.GroupVersion.String(), Kind: "AgentHarness"}, - ObjectMeta: metav1.ObjectMeta{Namespace: "kagent", Name: "claw"}, - Spec: v1alpha2.AgentHarnessSpec{ - Runtime: v1alpha2.AgentHarnessRuntimeSubstrate, - Substrate: &v1alpha2.AgentHarnessSubstrateSpec{ - WorkerPool: tt.workerPool, - }, - }, - } - p := &Provisioner{ - Client: fake.NewClientBuilder().WithScheme(scheme).Build(), - Defaults: ProvisionDefaults{DefaultAteomImage: tt.defaultImg}, - } - - key, managed, err := p.ensureWorkerPool(context.Background(), ah) - require.NoError(t, err) - require.True(t, managed) - - var wp atev1alpha1.WorkerPool - require.NoError(t, p.Client.Get(context.Background(), key, &wp)) - require.Equal(t, tt.wantImage, wp.Spec.AteomImage) - require.Equal(t, tt.wantReplica, wp.Spec.Replicas) - }) - } -} - -func TestActorTemplateName(t *testing.T) { - t.Parallel() - ah := &v1alpha2.AgentHarness{ObjectMeta: metav1.ObjectMeta{Name: "my-claw"}} - if got := actorTemplateName(ah); got != "my-claw" { - t.Fatalf("got %q", got) - } -} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go b/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go deleted file mode 100644 index f715aa1651..0000000000 --- a/go/core/pkg/sandboxbackend/substrate/provision_workerpool.go +++ /dev/null @@ -1,73 +0,0 @@ -package substrate - -import ( - "context" - "fmt" - "strings" - - atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" - "github.com/kagent-dev/kagent/go/api/v1alpha2" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" -) - -func (p *Provisioner) ensureWorkerPool(ctx context.Context, ah 
*v1alpha2.AgentHarness) (types.NamespacedName, bool, error) { - sub := ah.Spec.Substrate - if sub.WorkerPoolRef != nil && strings.TrimSpace(sub.WorkerPoolRef.Name) != "" { - key := types.NamespacedName{Namespace: ah.Namespace, Name: sub.WorkerPoolRef.Name} - var wp atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &wp); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("get WorkerPool %s: %w", key, err) - } - return key, false, nil - } - - key := types.NamespacedName{Namespace: ah.Namespace, Name: workerPoolName(ah)} - replicas := defaultWorkerPoolReplicas - ateomImage := "" - if sub.WorkerPool != nil { - if sub.WorkerPool.Replicas > 0 { - replicas = sub.WorkerPool.Replicas - } - ateomImage = strings.TrimSpace(sub.WorkerPool.AteomImage) - } - if ateomImage == "" { - ateomImage = strings.TrimSpace(p.Defaults.DefaultAteomImage) - } - if ateomImage == "" { - return types.NamespacedName{}, false, fmt.Errorf("ateom image is not configured (set controller substrate ateomImage or spec.substrate.workerPool.ateomImage)") - } - - desired := &atev1alpha1.WorkerPool{ - ObjectMeta: metav1.ObjectMeta{ - Name: key.Name, - Namespace: key.Namespace, - Labels: provisionLabels(ah), - }, - Spec: atev1alpha1.WorkerPoolSpec{ - Replicas: replicas, - AteomImage: ateomImage, - }, - } - if err := controllerutil.SetControllerReference(ah, desired, p.Client.Scheme()); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("set WorkerPool owner ref: %w", err) - } - - var existing atev1alpha1.WorkerPool - if err := p.Client.Get(ctx, key, &existing); apierrors.IsNotFound(err) { - if err := p.Client.Create(ctx, desired); err != nil { - return types.NamespacedName{}, false, fmt.Errorf("create WorkerPool %s: %w", key, err) - } - return key, true, nil - } else if err != nil { - return types.NamespacedName{}, false, err - } - existing.Spec.Replicas = desired.Spec.Replicas - existing.Spec.AteomImage = desired.Spec.AteomImage - if err := p.Client.Update(ctx, &existing); 
err != nil { - return types.NamespacedName{}, false, fmt.Errorf("update WorkerPool %s: %w", key, err) - } - return key, true, nil -} diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 52f814c1aa..9c4c1ee2a6 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -525,26 +525,6 @@ spec: substrate: description: Substrate is required when runtime is substrate. properties: - actorTemplateRef: - description: |- - ActorTemplateRef adopts an existing ate.dev ActorTemplate instead of auto-provisioning. - When set, workerPoolRef/workerPool/snapshotsConfig are ignored for template creation. - properties: - apiGroup: - type: string - kind: - type: string - name: - type: string - required: - - name - type: object - gatewayPort: - default: 80 - description: GatewayPort is the port OpenClaw listens on inside - the actor (Substrate routes to :80 today). - format: int32 - type: integer gatewayToken: description: |- GatewayToken is the OpenClaw gateway Bearer token for this harness. @@ -579,26 +559,10 @@ spec: required: - location type: object - workerPool: - description: WorkerPool creates a dedicated WorkerPool in the - harness namespace when workerPoolRef is unset. - properties: - ateomImage: - description: |- - AteomImage is the ateom herder image (pullable registry ref, not ko://). - Overrides the controller-wide substrate ateom image default for this WorkerPool. - type: string - replicas: - default: 1 - description: Replicas is the number of ateom worker pods. - Defaults to 1 when unset or zero. - format: int32 - type: integer - type: object workerPoolRef: description: |- - WorkerPoolRef references an existing ate.dev WorkerPool (namespace/name). - Mutually exclusive with workerPool. + WorkerPoolRef references an existing ate.dev WorkerPool in the harness namespace. 
+ When unset, the controller uses its configured default WorkerPool. properties: apiGroup: type: string @@ -619,8 +583,6 @@ spec: be specified rule: (has(self.gatewayToken) && !has(self.gatewayTokenSecretRef)) || (!has(self.gatewayToken) && has(self.gatewayTokenSecretRef)) - - message: workerPoolRef and workerPool are mutually exclusive - rule: '!(has(self.workerPoolRef) && has(self.workerPool))' required: - backend type: object @@ -726,72 +688,6 @@ spec: observedGeneration: format: int64 type: integer - substrate: - description: Substrate records observed Substrate provisioning state. - properties: - conditions: - description: Conditions describe substrate provisioning progress - (e.g. ActorTemplate golden snapshot). - items: - description: Condition contains details for one aspect of the - current state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. 
- The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, - Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object type: object type: object served: true diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index 2727ace3a6..21d5d7b845 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -94,10 +94,10 @@ spec: - name: SUBSTRATE_ATE_API_INSECURE value: "true" {{- end }} - - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAMESPACE - value: {{ .Values.controller.substrate.defaultActorTemplateNamespace | quote }} - - name: SUBSTRATE_DEFAULT_ACTOR_TEMPLATE_NAME - value: {{ .Values.controller.substrate.defaultActorTemplateName | quote }} + - name: SUBSTRATE_DEFAULT_WORKERPOOL_NAMESPACE + value: {{ .Values.controller.substrate.defaultWorkerPool.namespace | default (include "kagent.namespace" .) | quote }} + - name: SUBSTRATE_DEFAULT_WORKERPOOL_NAME + value: {{ .Values.controller.substrate.defaultWorkerPool.name | default (ternary .Values.substrateWorkerPool.name "" .Values.substrateWorkerPool.create) | quote }} {{- with .Values.controller.substrate.pauseImage }} - name: SUBSTRATE_PAUSE_IMAGE value: {{ . | quote }} @@ -118,10 +118,6 @@ spec: - name: SUBSTRATE_RUNSC_ARM64_SHA256 value: {{ . 
| quote }} {{- end }} - {{- with .Values.controller.substrate.ateomImage }} - - name: SUBSTRATE_ATEOM_IMAGE - value: {{ . | quote }} - {{- end }} {{- end }} envFrom: - configMapRef: diff --git a/helm/kagent/templates/rbac/writer-role.yaml b/helm/kagent/templates/rbac/writer-role.yaml index b9516cae03..551f52d5fc 100644 --- a/helm/kagent/templates/rbac/writer-role.yaml +++ b/helm/kagent/templates/rbac/writer-role.yaml @@ -78,7 +78,6 @@ - apiGroups: - ate.dev resources: - - workerpools - actortemplates verbs: - create @@ -111,4 +110,4 @@ metadata: {{- include "kagent.labels" . | nindent 4 }} rules: {{- include "kagent.writer.rules" . | nindent 2 }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/helm/kagent/templates/substrate-workerpool.yaml b/helm/kagent/templates/substrate-workerpool.yaml new file mode 100644 index 0000000000..4cc12119c7 --- /dev/null +++ b/helm/kagent/templates/substrate-workerpool.yaml @@ -0,0 +1,15 @@ +{{- if and .Values.controller.substrate.enabled .Values.substrateWorkerPool.create }} +{{- if not .Values.substrateWorkerPool.ateomImage }} +{{- fail "substrateWorkerPool.ateomImage is required when substrateWorkerPool.create=true" }} +{{- end }} +apiVersion: ate.dev/v1alpha1 +kind: WorkerPool +metadata: + name: {{ .Values.substrateWorkerPool.name | quote }} + namespace: {{ include "kagent.namespace" . }} + labels: + {{- include "kagent.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.substrateWorkerPool.replicas }} + ateomImage: {{ .Values.substrateWorkerPool.ateomImage | quote }} +{{- end }} diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 39e97ba23d..fab50dcf55 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -232,23 +232,24 @@ controller: # value: "true" # Agent Substrate (OpenClaw harness runtime=substrate). Requires ate-system installed. - # kagent auto-provisions per-harness ActorTemplate (+ optional WorkerPool). 
Per-harness - # spec.substrate.workerPool.ateomImage overrides the controller-wide ateomImage below. + # kagent generates per-harness ActorTemplates and references an existing WorkerPool. substrate: enabled: false ateApiEndpoint: "" ateApiInsecure: false - defaultActorTemplateNamespace: "" - defaultActorTemplateName: "" + defaultWorkerPool: + namespace: "" + name: "" # pauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da" # runscAMD64URL: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" # runscAMD64SHA256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" # runscARM64URL: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" # runscARM64SHA256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" - # ateomImage: "localhost:5001/ateom-gvisor:latest" # Example when enabled: # enabled: true # ateApiEndpoint: "dns:///api.ate-system.svc:443" + # defaultWorkerPool: + # name: "kagent-default" envFrom: [] @@ -277,6 +278,14 @@ controller: # @default -- httpGet /health on port http, periodSeconds=30 readinessProbe: {} +# -- Optional Agent Substrate WorkerPool installed by this chart. This is platform +# capacity and is not owned by individual AgentHarness resources. +substrateWorkerPool: + create: false + name: kagent-default + replicas: 1 + ateomImage: "" + # ============================================================================== # UI CONFIGURATION # ============================================================================== diff --git a/ui/src/components/agent-form/OpenClawSandboxFields.tsx b/ui/src/components/agent-form/OpenClawSandboxFields.tsx index 54e7cc6cf4..ee830023ad 100644 --- a/ui/src/components/agent-form/OpenClawSandboxFields.tsx +++ b/ui/src/components/agent-form/OpenClawSandboxFields.tsx @@ -165,7 +165,7 @@ export function OpenClawSandboxFields({ Control plane @@ -184,6 +184,19 @@ export function OpenClawSandboxFields({ {value.runtime === "substrate" ? (
+ + Gateway token + set({ substrateGatewayToken: e.target.value })} + /> +

+ Bearer token used by kagent when proxying the generated OpenClaw gateway. +

+
Snapshot location (GCS) - Worker pool - - set({ - substrateWorkerPoolMode: e.target.value === "existing" ? "existing" : "create", - }) - } - > - - - + placeholder="controller default" + value={value.substrateWorkerPoolRefName} + onChange={(e) => set({ substrateWorkerPoolRefName: e.target.value })} + /> +

+ Leave empty to use the controller default WorkerPool. +

- {value.substrateWorkerPoolMode === "existing" ? ( -
- - WorkerPool namespace - set({ substrateWorkerPoolRefNamespace: e.target.value })} - /> - - - WorkerPool name - set({ substrateWorkerPoolRefName: e.target.value })} - /> - -
- ) : ( - - Worker replicas - set({ substrateWorkerPoolReplicas: e.target.value })} - /> - - )}
) : null}
diff --git a/ui/src/lib/__tests__/openClawSandboxForm.test.ts b/ui/src/lib/__tests__/openClawSandboxForm.test.ts index 7402d618c4..512f4c610c 100644 --- a/ui/src/lib/__tests__/openClawSandboxForm.test.ts +++ b/ui/src/lib/__tests__/openClawSandboxForm.test.ts @@ -33,18 +33,27 @@ describe("validateOpenClawSandboxForm sections", () => { expect(r?.message).toContain("not a valid hostname"); }); - it("tags channel credential failures as channels", () => { - const row = newOpenClawChannelRow(); - row.name = "slack1"; - row.channelType = "slack"; - row.botToken = ""; - const r = validateOpenClawSandboxForm({ - openClaw: { ...defaultOpenClawSandboxFormSlice(), channels: [row] }, - modelRef: "ns/m1", - }); - expect(r?.section).toBe("channels"); - expect(r?.message).toContain("slack1"); + it("tags missing substrate gateway token as general", () => { + const r = validateOpenClawSandboxForm({ + openClaw: { ...defaultOpenClawSandboxFormSlice(), runtime: "substrate" }, + modelRef: "ns/m1", }); + expect(r?.section).toBe("general"); + expect(r?.message).toContain("gateway token"); + }); + + it("tags channel credential failures as channels", () => { + const row = newOpenClawChannelRow(); + row.name = "slack1"; + row.channelType = "slack"; + row.botToken = ""; + const r = validateOpenClawSandboxForm({ + openClaw: { ...defaultOpenClawSandboxFormSlice(), channels: [row] }, + modelRef: "ns/m1", + }); + expect(r?.section).toBe("channels"); + expect(r?.message).toContain("slack1"); + }); it("rejects duplicate channel binding names", () => { const row = newOpenClawChannelRow(); @@ -183,6 +192,29 @@ describe("openClawSandboxForm allowedDomains", () => { expect(draft.spec.backend).toBe("openclaw"); }); + it("writes substrate config without creating a WorkerPool", () => { + const draft = buildSandboxCRDraft({ + name: "h1", + namespace: "ns", + description: "", + modelRef: "m1", + openClaw: { + ...defaultOpenClawSandboxFormSlice(), + runtime: "substrate", + substrateGatewayToken: "tok", + 
substrateWorkerPoolRefName: "default-wp", + }, + }); + expect("error" in draft).toBe(false); + if ("error" in draft) return; + expect(draft.spec.substrate).toEqual({ + gatewayToken: "tok", + snapshotsConfig: { location: "gs://ate-snapshots/kagent/" }, + workerPoolRef: { name: "default-wp" }, + }); + expect(draft.spec.substrate).not.toHaveProperty("workerPool"); + }); + it("writes Hermes slack allowedUserIDs and home channel fields", () => { const row = newOpenClawChannelRow(); row.name = "slack-main"; diff --git a/ui/src/lib/openClawSandboxForm.ts b/ui/src/lib/openClawSandboxForm.ts index 46608384f1..33cf3ce49b 100644 --- a/ui/src/lib/openClawSandboxForm.ts +++ b/ui/src/lib/openClawSandboxForm.ts @@ -70,12 +70,9 @@ export type HarnessRuntimeForm = "openshell" | "substrate"; export interface OpenClawSandboxFormSlice { /** Harness control plane: OpenShell (default) or Agent Substrate. */ runtime: HarnessRuntimeForm; - /** Use an existing Substrate WorkerPool or let kagent create one per harness. */ - substrateWorkerPoolMode: "create" | "existing"; - substrateWorkerPoolRefNamespace: string; substrateWorkerPoolRefName: string; - substrateWorkerPoolReplicas: string; - /** GCS snapshot prefix (gs://bucket/path/) — required for auto-provisioned templates. */ + substrateGatewayToken: string; + /** GCS snapshot prefix (gs://bucket/path/) — required for generated templates. */ substrateSnapshotsLocation: string; /** Optional override for Sandbox.spec.image (OpenShell VM template image). Empty → controller default. 
*/ image: string; @@ -92,10 +89,8 @@ export interface OpenClawSandboxFormSlice { export function defaultOpenClawSandboxFormSlice(): OpenClawSandboxFormSlice { return { runtime: "openshell", - substrateWorkerPoolMode: "create", - substrateWorkerPoolRefNamespace: "", substrateWorkerPoolRefName: "", - substrateWorkerPoolReplicas: "2", + substrateGatewayToken: "", substrateSnapshotsLocation: "gs://ate-snapshots/kagent/", image: "", channels: [], @@ -198,6 +193,9 @@ export function validateOpenClawSandboxForm(args: { if (!mr) { return openClawValidationFail("general", "Please select a model config for this sandbox."); } + if (args.openClaw.runtime === "substrate" && !args.openClaw.substrateGatewayToken.trim()) { + return openClawValidationFail("general", "Substrate gateway token is required."); + } for (const entry of trimSplitList(args.openClaw.allowedDomains)) { if (!isPlausibleAllowedDomainHost(entry)) { @@ -391,22 +389,18 @@ export function buildSandboxCRDraft(args: { if (!snapshots) { return { error: "Substrate snapshots location (gs://…) is required." }; } + const gatewayToken = args.openClaw.substrateGatewayToken?.trim(); + if (!gatewayToken) { + return { error: "Substrate gateway token is required." }; + } const substrate: Record = { + gatewayToken, snapshotsConfig: { location: snapshots }, }; - if (args.openClaw.substrateWorkerPoolMode === "existing") { - const wpName = args.openClaw.substrateWorkerPoolRefName?.trim(); - if (!wpName) { - return { error: "WorkerPool name is required when using an existing pool." }; - } + const wpName = args.openClaw.substrateWorkerPoolRefName?.trim(); + if (wpName) { substrate.workerPoolRef = { name: wpName, - namespace: args.openClaw.substrateWorkerPoolRefNamespace?.trim() || args.namespace.trim(), - }; - } else { - const replicas = Number.parseInt(args.openClaw.substrateWorkerPoolReplicas?.trim() || "2", 10); - substrate.workerPool = { - replicas: Number.isFinite(replicas) && replicas > 0 ? 
replicas : 2, }; } spec.substrate = substrate; From efc122d7f209383da4854615dc7f9aaf1624719d Mon Sep 17 00:00:00 2001 From: Eitan Yarmush Date: Mon, 1 Jun 2026 19:11:19 +0000 Subject: [PATCH 24/32] Simplify substrate actor cleanup wiring Signed-off-by: Eitan Yarmush --- .../substrate/lifecycle_delete.go | 12 +++-- .../substrate/lifecycle_delete_test.go | 50 +++++++++++++++++-- .../substrate/lifecycle_shared.go | 18 +++---- go/go.mod | 7 +-- go/go.sum | 20 +++----- 5 files changed, 71 insertions(+), 36 deletions(-) diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go index 4063884849..07534b4f4f 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go @@ -30,10 +30,7 @@ func (p *Lifecycle) CleanupGeneratedTemplate(ctx context.Context, ah *v1alpha2.A if goldenID == "" { return true, nil } - if p.deleteActor == nil { - return false, fmt.Errorf("substrate ate-api client is required to delete golden actor %q", goldenID) - } - done, err := p.deleteActor(ctx, goldenID) + done, err := deleteGoldenActor(ctx, p.AteClient, goldenID) if err != nil { return false, fmt.Errorf("delete golden actor %q for ActorTemplate %s: %w", goldenID, tmplKey, err) } @@ -44,6 +41,13 @@ func (p *Lifecycle) CleanupGeneratedTemplate(ctx context.Context, ah *v1alpha2.A return true, nil } +func deleteGoldenActor(ctx context.Context, ateClient *Client, actorID string) (bool, error) { + if ateClient == nil { + return false, fmt.Errorf("substrate ate-api client is required") + } + return ateClient.AdvanceActorDelete(ctx, actorID) +} + func (p *Lifecycle) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { var tmpl atev1alpha1.ActorTemplate if err := p.Client.Get(ctx, tmplKey, &tmpl); err != nil { diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go 
b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go index 2651770244..e772664133 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go @@ -5,8 +5,12 @@ import ( "testing" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/agent-substrate/substrate/proto/ateapipb" "github.com/kagent-dev/kagent/go/api/v1alpha2" "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -19,9 +23,47 @@ type recordingActorClient struct { deleted []string } -func (r *recordingActorClient) deleteActor(_ context.Context, actorID string) (bool, error) { - r.deleted = append(r.deleted, actorID) - return true, nil +func (r *recordingActorClient) GetActor(_ context.Context, in *ateapipb.GetActorRequest, _ ...grpc.CallOption) (*ateapipb.GetActorResponse, error) { + for _, deleted := range r.deleted { + if deleted == in.GetActorId() { + return nil, status.Error(codes.NotFound, "actor deleted") + } + } + return &ateapipb.GetActorResponse{ + Actor: &ateapipb.Actor{ + ActorId: in.GetActorId(), + Status: ateapipb.Actor_STATUS_SUSPENDED, + }, + }, nil +} + +func (r *recordingActorClient) DeleteActor(_ context.Context, in *ateapipb.DeleteActorRequest, _ ...grpc.CallOption) (*ateapipb.DeleteActorResponse, error) { + r.deleted = append(r.deleted, in.GetActorId()) + return &ateapipb.DeleteActorResponse{}, nil +} + +func (r *recordingActorClient) CreateActor(context.Context, *ateapipb.CreateActorRequest, ...grpc.CallOption) (*ateapipb.CreateActorResponse, error) { + panic("not used") +} + +func (r *recordingActorClient) SuspendActor(context.Context, *ateapipb.SuspendActorRequest, ...grpc.CallOption) (*ateapipb.SuspendActorResponse, error) { + panic("not used") +} + +func (r 
*recordingActorClient) ResumeActor(context.Context, *ateapipb.ResumeActorRequest, ...grpc.CallOption) (*ateapipb.ResumeActorResponse, error) { + panic("not used") +} + +func (r *recordingActorClient) ListWorkers(context.Context, *ateapipb.ListWorkersRequest, ...grpc.CallOption) (*ateapipb.ListWorkersResponse, error) { + panic("not used") +} + +func (r *recordingActorClient) ListActors(context.Context, *ateapipb.ListActorsRequest, ...grpc.CallOption) (*ateapipb.ListActorsResponse, error) { + panic("not used") +} + +func (r *recordingActorClient) DebugClear(context.Context, *ateapipb.DebugClearRequest, ...grpc.CallOption) (*ateapipb.DebugClearResponse, error) { + panic("not used") } func TestLifecycleCleanupGeneratedTemplate_DeletesGoldenActor(t *testing.T) { @@ -50,7 +92,7 @@ func TestLifecycleCleanupGeneratedTemplate_DeletesGoldenActor(t *testing.T) { kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tmpl).Build() rec := &recordingActorClient{} - p := &Lifecycle{Client: kube, deleteActor: rec.deleteActor} + p := &Lifecycle{Client: kube, AteClient: &Client{ControlClient: rec}} var complete bool var err error diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go index 2ec6c1fb65..3c590af7fb 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_shared.go @@ -30,9 +30,9 @@ type LifecycleDefaults struct { // Lifecycle reconciles the Kubernetes lifecycle that kagent owns for a substrate AgentHarness. // WorkerPools are externally owned; this helper only resolves the selected WorkerPool. 
type Lifecycle struct { - Client client.Client - Defaults LifecycleDefaults - deleteActor func(context.Context, string) (bool, error) + Client client.Client + Defaults LifecycleDefaults + AteClient *Client } // AgentHarnessLifecycle is the substrate lifecycle surface used by the @@ -44,15 +44,11 @@ type AgentHarnessLifecycle interface { var _ AgentHarnessLifecycle = (*Lifecycle)(nil) -func NewLifecycle(kube client.Client, defaults LifecycleDefaults, actors *Client) *Lifecycle { - var deleteActor func(context.Context, string) (bool, error) - if actors != nil { - deleteActor = actors.AdvanceActorDelete - } +func NewLifecycle(kube client.Client, defaults LifecycleDefaults, ateClient *Client) *Lifecycle { return &Lifecycle{ - Client: kube, - Defaults: defaults, - deleteActor: deleteActor, + Client: kube, + Defaults: defaults, + AteClient: ateClient, } } diff --git a/go/go.mod b/go/go.mod index 5ff82a4418..1bd839cccf 100644 --- a/go/go.mod +++ b/go/go.mod @@ -64,8 +64,7 @@ require ( github.com/agent-substrate/substrate v0.0.0 github.com/aws/aws-sdk-go-v2 v1.41.7 github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.50.6 - github.com/golang/protobuf v1.5.4 - github.com/google/go-containerregistry v0.21.2 + github.com/google/go-containerregistry v0.21.5 github.com/google/jsonschema-go v0.4.3 github.com/jackc/pgx/v5 v5.9.2 github.com/ollama/ollama v0.24.0 @@ -179,8 +178,7 @@ require ( github.com/denis-tingaikin/go-header v0.5.0 // indirect github.com/distribution/reference v0.6.0 // indirect github.com/dlclark/regexp2 v1.12.0 // indirect - github.com/docker/cli v29.2.1+incompatible // indirect - github.com/docker/distribution v2.8.3+incompatible // indirect + github.com/docker/cli v29.4.0+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/docker/go-connections v0.6.0 // indirect github.com/docker/go-units v0.5.0 // indirect @@ -309,7 +307,6 @@ require ( github.com/moby/moby/api v1.54.1 // indirect github.com/moby/moby/client 
v0.4.0 // indirect github.com/moby/patternmatcher v0.6.1 // indirect - github.com/moby/spdystream v0.5.1 // indirect github.com/moby/sys/sequential v0.6.0 // indirect github.com/moby/sys/user v0.4.0 // indirect github.com/moby/sys/userns v0.1.0 // indirect diff --git a/go/go.sum b/go/go.sum index 63b6d6f3e2..753a6fe935 100644 --- a/go/go.sum +++ b/go/go.sum @@ -94,8 +94,6 @@ github.com/anthropics/anthropic-sdk-go v1.43.0 h1:ShY3C7lafzHP0ze1dCxL3ZFZzvkGfX github.com/anthropics/anthropic-sdk-go v1.43.0/go.mod h1:5cEaslQ6A9ajdL5YUvhNW57LKxEz0OAZ7WEzgZWLD7k= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/ashanbrown/forbidigo/v2 v2.3.1 h1:KAZijvQ7zeIBKbhikT4jCm0TLYXC4u78bTiLh/8JROI= github.com/ashanbrown/forbidigo/v2 v2.3.1/go.mod h1:2QDkLTzU6TV937eFROamXrW92M3paehdae4HCDCOZCM= github.com/ashanbrown/makezero/v2 v2.2.1 h1:A7uU8dgB1PA9aelTxHMfHIQ8Qev8AB3JLxJUBUsejqM= @@ -249,12 +247,10 @@ github.com/dlclark/regexp2 v1.12.0 h1:0j4c5qQmnC6XOWNjP3PIXURXN2gWx76rd3KvgdPkCz github.com/dlclark/regexp2 v1.12.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI= github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= -github.com/docker/cli v29.2.1+incompatible h1:n3Jt0QVCN65eiVBoUTZQM9mcQICCJt3akW4pKAbKdJg= -github.com/docker/cli v29.2.1+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= -github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBirtxJnzDrHLEKxTAYk= -github.com/docker/distribution v2.8.3+incompatible/go.mod 
h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= -github.com/docker/docker v28.5.2+incompatible h1:DBX0Y0zAjZbSrm1uzOkdr1onVghKaftjlSWt4AFexzM= -github.com/docker/docker v28.5.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/cli v29.4.0+incompatible h1:+IjXULMetlvWJiuSI0Nbor36lcJ5BTcVpUmB21KBoVM= +github.com/docker/cli v29.4.0+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= +github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= +github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= github.com/docker/docker-credential-helpers v0.9.3 h1:gAm/VtF9wgqJMoxzT3Gj5p4AqIjCBS4wrsOh9yRqcz8= github.com/docker/docker-credential-helpers v0.9.3/go.mod h1:x+4Gbw9aGmChi3qTLZj8Dfn0TD20M/fuWy0E5+WDeCo= github.com/docker/go-connections v0.6.0 h1:LlMG9azAe1TqfR7sO+NJttz1gy6KO7VJBh+pMmjSD94= @@ -385,6 +381,8 @@ github.com/godoc-lint/godoc-lint v0.11.2 h1:Bp0FkJWoSdNsBikdNgIcgtaoo+xz6I/Y9s5W github.com/godoc-lint/godoc-lint v0.11.2/go.mod h1:iVpGdL1JCikNH2gGeAn3Hh+AgN5Gx/I/cxV+91L41jo= github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA= @@ -425,8 +423,8 @@ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= 
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= -github.com/google/go-containerregistry v0.21.2 h1:vYaMU4nU55JJGFC9JR/s8NZcTjbE9DBBbvusTW9NeS0= -github.com/google/go-containerregistry v0.21.2/go.mod h1:ctO5aCaewH4AK1AumSF5DPW+0+R+d2FmylMJdp5G7p0= +github.com/google/go-containerregistry v0.21.5 h1:KTJG9Pn/jC0VdZR6ctV3/jcN+q6/Iqlx0sTVz3ywZlM= +github.com/google/go-containerregistry v0.21.5/go.mod h1:ySvMuiWg+dOsRW0Hw8GYwfMwBlNRTmpYBFJPlkco5zU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -611,8 +609,6 @@ github.com/moby/moby/client v0.4.0 h1:S+2XegzHQrrvTCvF6s5HFzcrywWQmuVnhOXe2kiWjI github.com/moby/moby/client v0.4.0/go.mod h1:QWPbvWchQbxBNdaLSpoKpCdf5E+WxFAgNHogCWDoa7g= github.com/moby/patternmatcher v0.6.1 h1:qlhtafmr6kgMIJjKJMDmMWq7WLkKIo23hsrpR3x084U= github.com/moby/patternmatcher v0.6.1/go.mod h1:hDPoyOpDY7OrrMDLaYoY3hf52gNCR/YOUYxkhApJIxc= -github.com/moby/spdystream v0.5.1 h1:9sNYeYZUcci9R6/w7KDaFWEWeV4LStVG78Mpyq/Zm/Y= -github.com/moby/spdystream v0.5.1/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/sys/sequential v0.6.0 h1:qrx7XFUd/5DxtqcoH1h438hF5TmOvzC/lspjy7zgvCU= github.com/moby/sys/sequential v0.6.0/go.mod h1:uyv8EUTrca5PnDsdMGXhZe6CCe8U/UiTWd+lL+7b/Ko= github.com/moby/sys/user v0.4.0 h1:jhcMKit7SA80hivmFJcbB1vqmw//wU61Zdui2eQXuMs= From 96649363e6b966dfa99c4743c2a483dbe4dc2623 Mon Sep 17 00:00:00 2001 From: Eitan Yarmush Date: Mon, 1 Jun 2026 19:57:04 +0000 Subject: [PATCH 25/32] Split AgentHarness controllers by runtime Signed-off-by: Eitan Yarmush --- go/api/v1alpha2/agentharness_types.go | 1 + .../controller/agentharness_controller.go | 428 ------------------ .../agentharness_controller_test.go | 35 +- .../agentharness_openshell_controller.go | 205 +++++++++ 
.../controller/agentharness_shared.go | 148 ++++++ .../agentharness_substrate_controller.go | 290 ++++++++++++ .../agentharness_substrate_watches.go | 6 +- go/core/pkg/app/app.go | 63 +-- .../sandboxbackend/substrate/delete_actor.go | 7 +- .../substrate/delete_actor_test.go | 5 +- .../substrate/lifecycle_delete.go | 5 +- .../pkg/sandboxbackend/substrate/openclaw.go | 2 +- 12 files changed, 703 insertions(+), 492 deletions(-) delete mode 100644 go/core/internal/controller/agentharness_controller.go create mode 100644 go/core/internal/controller/agentharness_openshell_controller.go create mode 100644 go/core/internal/controller/agentharness_shared.go create mode 100644 go/core/internal/controller/agentharness_substrate_controller.go diff --git a/go/api/v1alpha2/agentharness_types.go b/go/api/v1alpha2/agentharness_types.go index e44c3d6924..ee74118827 100644 --- a/go/api/v1alpha2/agentharness_types.go +++ b/go/api/v1alpha2/agentharness_types.go @@ -301,6 +301,7 @@ const ( AgentHarnessConditionTypeAccepted = "Accepted" AgentHarnessConditionTypeActorTemplateReady = "ActorTemplateReady" AgentHarnessConditionTypeActorReady = "ActorReady" + AgentHarnessConditionTypeBootstrapReady = "BootstrapReady" ) // +kubebuilder:object:root=true diff --git a/go/core/internal/controller/agentharness_controller.go b/go/core/internal/controller/agentharness_controller.go deleted file mode 100644 index 96478d5a91..0000000000 --- a/go/core/internal/controller/agentharness_controller.go +++ /dev/null @@ -1,428 +0,0 @@ -/* -Copyright 2026. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 -*/ - -package controller - -import ( - "context" - "fmt" - "reflect" - "strconv" - "time" - - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/events" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" - "sigs.k8s.io/controller-runtime/pkg/predicate" - - "github.com/kagent-dev/kagent/go/api/v1alpha2" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" - "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" -) - -const ( - // agentHarnessFinalizer guarantees the backend sandbox is deleted before the - // Kubernetes object is removed. - agentHarnessFinalizer = "kagent.dev/agent-harness-backend-cleanup" - - // agentHarnessNotReadyRequeue is how long we wait before re-polling backend - // status while the sandbox is still provisioning. - agentHarnessNotReadyRequeue = 10 * time.Second - - // substrateDeleteTimeout is the maximum time to wait for substrate cleanup during delete. - substrateDeleteTimeout = 5 * time.Minute - - // annotationAgentHarnessBootstrapGeneration records the AgentHarness metadata.generation for which - // post-ready bootstrap (backend OnAgentHarnessReady, e.g. exec hooks) already completed. - annotationAgentHarnessBootstrapGeneration = "kagent.dev/agent-harness-bootstrap-generation" -) - -// AgentHarnessController reconciles a kagent.dev/v1alpha2 AgentHarness against an -// AsyncBackend. It is intentionally independent of the SandboxAgent path — -// harness VMs are a generic exec/SSH-able environment with no in-cluster -// workload owned by kagent. 
-type AgentHarnessController struct { - Client client.Client - Recorder events.EventRecorder - OpenshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - SubstrateLifecycle substrate.AgentHarnessLifecycle -} - -func (r *AgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { - runtime := ah.Spec.Runtime - if runtime == "" { - runtime = v1alpha2.AgentHarnessRuntimeOpenshell - } - switch runtime { - case v1alpha2.AgentHarnessRuntimeSubstrate: - if r.SubstrateBackends == nil { - return nil - } - return r.SubstrateBackends[ah.Spec.Backend] - default: - if r.OpenshellBackends == nil { - return nil - } - return r.OpenshellBackends[ah.Spec.Backend] - } -} - -// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/status,verbs=get;update;patch -// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/finalizers,verbs=update -// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch -// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete -// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get -// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch - -func (r *AgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) - - var ah v1alpha2.AgentHarness - if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { - if apierrors.IsNotFound(err) { - return ctrl.Result{}, nil - } - return ctrl.Result{}, fmt.Errorf("get AgentHarness: %w", err) - } - - if !ah.DeletionTimestamp.IsZero() { - return r.reconcileDelete(ctx, &ah) - } - - if controllerutil.AddFinalizer(&ah, 
agentHarnessFinalizer) { - if err := r.Client.Update(ctx, &ah); err != nil { - return ctrl.Result{}, fmt.Errorf("add finalizer: %w", err) - } - return ctrl.Result{Requeue: true}, nil - } - - backend := r.backendFor(&ah) - if backend == nil { - runtime := ah.Spec.Runtime - if runtime == "" { - runtime = v1alpha2.AgentHarnessRuntimeOpenshell - } - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, - "BackendUnavailable", - fmt.Sprintf("no %s backend configured for %q", runtime, ah.Spec.Backend)) - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "BackendUnavailable", "") - if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{}, nil - } - - runtime := effectiveAgentHarnessRuntime(&ah) - if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { - if r.SubstrateLifecycle == nil { - log.Error(nil, "substrate lifecycle not configured") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, - "SubstrateLifecycleUnavailable", - "substrate runtime requires configured substrate lifecycle (set --substrate-ate-api-endpoint)") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "SubstrateLifecycleUnavailable", "") - if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{}, nil - } - lifecycleState, err := r.SubstrateLifecycle.EnsureGeneratedTemplate(ctx, &ah) - if err != nil { - log.Error(err, "substrate lifecycle reconciliation failed") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, - "SubstrateLifecycleFailed", err.Error()) - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "SubstrateLifecycleFailed", "") - if perr := r.patchAgentHarnessStatus(ctx, &ah); perr != nil { - return 
ctrl.Result{}, perr - } - return ctrl.Result{}, err - } - if lifecycleState.ActorTemplateReady { - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, - metav1.ConditionTrue, "Ready", "ActorTemplate golden snapshot is ready") - } else { - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, - metav1.ConditionFalse, "NotReady", "waiting for ActorTemplate golden snapshot") - } - if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { - return ctrl.Result{}, err - } - if !lifecycleState.ActorTemplateReady { - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, - "SubstrateLifecyclePending", "waiting for ActorTemplate golden snapshot") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, - "ActorNotCreated", "waiting for ActorTemplate before creating actor") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "ActorTemplateNotReady", "ActorTemplate is not Ready yet") - if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil - } - if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { - return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate lifecycle reconciliation: %w", err) - } - } - - res, err := backend.EnsureAgentHarness(ctx, &ah) - if err != nil { - log.Error(err, "EnsureAgentHarness failed") - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, - "EnsureFailed", err.Error()) - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "EnsureFailed", err.Error()) - if perr := r.patchAgentHarnessStatus(ctx, &ah); perr != nil { - return ctrl.Result{}, perr - } - return ctrl.Result{}, err - } - - ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{ - 
Backend: ah.Spec.Backend, - ID: res.Handle.ID, - } - if res.Endpoint != "" { - ah.Status.Connection = &v1alpha2.AgentHarnessConnection{Endpoint: res.Endpoint} - } - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, - "AgentHarnessAccepted", "backend accepted sandbox request") - - st, reason, msg := backend.GetStatus(ctx, res.Handle) - pending := r.postReadyBootstrapPending(&ah) - if st == metav1.ConditionTrue && pending { - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, - "BootstrapPending", - "gateway sandbox is ready; waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") - } else { - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) - setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, st, reason, msg) - } - ah.Status.ObservedGeneration = ah.Generation - - if err := r.patchAgentHarnessStatus(ctx, &ah); err != nil { - return ctrl.Result{}, err - } - - if st != metav1.ConditionTrue { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil - } - if pending { - if err := r.maybePostReadyBootstrap(ctx, client.ObjectKeyFromObject(&ah), &ah, res.Handle, backend); err != nil { - log.Error(err, "post-ready sandbox bootstrap failed") - return ctrl.Result{}, err - } - var latest v1alpha2.AgentHarness - if err := r.Client.Get(ctx, req.NamespacedName, &latest); err != nil { - return ctrl.Result{}, fmt.Errorf("get AgentHarness after bootstrap: %w", err) - } - st2, reason2, msg2 := backend.GetStatus(ctx, res.Handle) - setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeActorReady, st2, reason2, msg2) - setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeReady, st2, reason2, msg2) - latest.Status.ObservedGeneration = latest.Generation - if err := r.Client.Status().Update(ctx, 
&latest); err != nil { - return ctrl.Result{}, fmt.Errorf("update AgentHarness status after bootstrap: %w", err) - } - } - return ctrl.Result{}, nil -} - -func (r *AgentHarnessController) postReadyBootstrapPending(ah *v1alpha2.AgentHarness) bool { - wantGen := strconv.FormatInt(ah.Generation, 10) - if ah.Annotations != nil && ah.Annotations[annotationAgentHarnessBootstrapGeneration] == wantGen { - return false - } - return true -} - -func (r *AgentHarnessController) maybePostReadyBootstrap(ctx context.Context, key client.ObjectKey, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle, async sandboxbackend.AsyncBackend) error { - if !r.postReadyBootstrapPending(ah) { - return nil - } - wantGen := strconv.FormatInt(ah.Generation, 10) - if err := async.OnAgentHarnessReady(ctx, ah, h); err != nil { - return err - } - var fresh v1alpha2.AgentHarness - if err := r.Client.Get(ctx, key, &fresh); err != nil { - return fmt.Errorf("get AgentHarness after bootstrap: %w", err) - } - base := fresh.DeepCopy() - if fresh.Annotations == nil { - fresh.Annotations = map[string]string{} - } - fresh.Annotations[annotationAgentHarnessBootstrapGeneration] = wantGen - if err := r.Client.Patch(ctx, &fresh, client.MergeFrom(base)); err != nil { - return fmt.Errorf("patch AgentHarness bootstrap-generation annotation: %w", err) - } - ctrl.LoggerFrom(ctx).WithValues("agentHarness", key.String()).Info( - "recorded post-ready bootstrap for AgentHarness generation", "generation", ah.Generation) - return nil -} - -func (r *AgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alpha2.AgentHarness) (ctrl.Result, error) { - if !controllerutil.ContainsFinalizer(ah, agentHarnessFinalizer) { - return ctrl.Result{}, nil - } - - if substrateDeleteTimedOut(ah) { - setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, - metav1.ConditionFalse, "DeleteTimeout", "substrate cleanup exceeded timeout") - if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { - return ctrl.Result{}, 
err - } - return ctrl.Result{}, fmt.Errorf("substrate cleanup timed out for AgentHarness %s", ah.Name) - } - - runtime := effectiveAgentHarnessRuntime(ah) - actorID := "" - if ah.Status.BackendRef != nil { - actorID = ah.Status.BackendRef.ID - } - - if actorID != "" { - backend := r.backendFor(ah) - actorDone := true - var err error - if backend != nil { - actorDone, err = backend.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) - } else { - actorDone = true - } - if err != nil { - if r.Recorder != nil { - r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err - } - if !actorDone { - setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, - metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for substrate actor %q deletion", actorID)) - if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil - } - ah.Status.BackendRef = nil - if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { - return ctrl.Result{}, err - } - } - - if runtime == v1alpha2.AgentHarnessRuntimeSubstrate { - if r.SubstrateLifecycle == nil { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, - fmt.Errorf("substrate lifecycle is not configured") - } - complete, err := r.SubstrateLifecycle.CleanupGeneratedTemplate(ctx, ah) - if err != nil { - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("cleanup substrate lifecycle: %w", err) - } - if !complete { - setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, - metav1.ConditionFalse, "GoldenActorDeleting", "waiting for generated ActorTemplate golden actor deletion") - if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, 
nil - } - setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, - metav1.ConditionFalse, "Deleting", "generated ActorTemplate will be garbage collected") - if err := r.patchAgentHarnessStatus(ctx, ah); err != nil { - return ctrl.Result{}, err - } - } - - controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) - if err := r.Client.Update(ctx, ah); err != nil { - return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) - } - return ctrl.Result{}, nil -} - -func substrateDeleteTimedOut(ah *v1alpha2.AgentHarness) bool { - if ah == nil || ah.DeletionTimestamp.IsZero() { - return false - } - return time.Since(ah.DeletionTimestamp.Time) > substrateDeleteTimeout -} - -func (r *AgentHarnessController) patchAgentHarnessStatus(ctx context.Context, ah *v1alpha2.AgentHarness) error { - var current v1alpha2.AgentHarness - if err := r.Client.Get(ctx, client.ObjectKeyFromObject(ah), &current); err != nil { - return fmt.Errorf("get AgentHarness before status update: %w", err) - } - if reflect.DeepEqual(current.Status, ah.Status) { - *ah = current - return nil - } - current.Status = ah.Status - if err := r.Client.Status().Update(ctx, &current); err != nil { - return fmt.Errorf("update AgentHarness status: %w", err) - } - *ah = current - return nil -} - -func effectiveAgentHarnessRuntime(ah *v1alpha2.AgentHarness) v1alpha2.AgentHarnessRuntime { - if ah.Spec.Runtime == "" { - return v1alpha2.AgentHarnessRuntimeOpenshell - } - return ah.Spec.Runtime -} - -func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { - meta.SetStatusCondition(&ah.Status.Conditions, metav1.Condition{ - Type: t, - Status: s, - Reason: reason, - Message: msg, - ObservedGeneration: ah.Generation, - }) -} - -// SetupWithManager registers the controller with the manager. -func (r *AgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { - b := ctrl.NewControllerManagedBy(mgr).
- WithOptions(controller.Options{NeedLeaderElection: new(true)}). - For(&v1alpha2.AgentHarness{}, builder.WithPredicates(agentHarnessPrimaryPredicate())) - b = r.substrateWatches(b) - return b.Named("agentharness").Complete(r) -} - -func agentHarnessPrimaryPredicate() predicate.Predicate { - return predicate.Funcs{ - CreateFunc: func(event.CreateEvent) bool { return true }, - DeleteFunc: func(event.DeleteEvent) bool { return true }, - UpdateFunc: func(e event.UpdateEvent) bool { - if e.ObjectOld == nil || e.ObjectNew == nil { - return true - } - if e.ObjectNew.GetGeneration() != e.ObjectOld.GetGeneration() { - return true - } - if !reflect.DeepEqual(e.ObjectNew.GetLabels(), e.ObjectOld.GetLabels()) { - return true - } - return e.ObjectOld.GetDeletionTimestamp().IsZero() && !e.ObjectNew.GetDeletionTimestamp().IsZero() - }, - } -} diff --git a/go/core/internal/controller/agentharness_controller_test.go b/go/core/internal/controller/agentharness_controller_test.go index 5f82f1b010..7d09b37dee 100644 --- a/go/core/internal/controller/agentharness_controller_test.go +++ b/go/core/internal/controller/agentharness_controller_test.go @@ -100,9 +100,7 @@ func TestAgentHarnessController_SubstrateWaitsForGeneratedTemplate(t *testing.T) lifecycle := &fakeSubstrateLifecycle{state: substrate.LifecycleState{ActorTemplateReady: false}} backend := &fakeAgentHarnessBackend{} controller.SubstrateLifecycle = lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.NoError(t, err) @@ -124,9 +122,7 @@ func TestAgentHarnessController_SubstrateLifecycleErrorSetsStatus(t *testing.T) lifecycle := &fakeSubstrateLifecycle{ensureErr: errors.New("workerpool missing")} backend := &fakeAgentHarnessBackend{} controller.SubstrateLifecycle = 
lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend _, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.ErrorContains(t, err, "workerpool missing") @@ -145,9 +141,7 @@ func TestAgentHarnessController_SubstrateReadyCreatesActorAndRunsBootstrap(t *te lifecycle := &fakeSubstrateLifecycle{state: substrate.LifecycleState{ActorTemplateReady: true}} backend := &fakeAgentHarnessBackend{ensureHandle: "actor-1", endpoint: "kagent gateway: /api/agentharnesses/kagent/claw/gateway/"} controller.SubstrateLifecycle = lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.NoError(t, err) @@ -163,8 +157,13 @@ func TestAgentHarnessController_SubstrateReadyCreatesActorAndRunsBootstrap(t *te requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, "AgentHarnessAccepted") requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, metav1.ConditionTrue, "Ready") requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionTrue, "Running") + requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionTrue, "BootstrapComplete") requireCondition(t, latest, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionTrue, "Running") - require.Equal(t, "1", latest.Annotations[annotationAgentHarnessBootstrapGeneration]) + + result, err = controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) + require.NoError(t, err) + require.Equal(t, ctrl.Result{}, result) + require.Equal(t, 1, 
backend.readyCalls, "bootstrap should not rerun for an already bootstrapped generation") } func TestAgentHarnessController_SubstrateDeleteWaitsForActorBeforeTemplateCleanup(t *testing.T) { @@ -175,9 +174,7 @@ func TestAgentHarnessController_SubstrateDeleteWaitsForActorBeforeTemplateCleanu lifecycle := &fakeSubstrateLifecycle{cleanupDone: true} backend := &fakeAgentHarnessBackend{deleteDone: false} controller.SubstrateLifecycle = lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.NoError(t, err) @@ -199,9 +196,7 @@ func TestAgentHarnessController_SubstrateDeleteWaitsForGeneratedTemplateCleanup( lifecycle := &fakeSubstrateLifecycle{cleanupDone: false} backend := &fakeAgentHarnessBackend{deleteDone: true} controller.SubstrateLifecycle = lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.NoError(t, err) @@ -223,9 +218,7 @@ func TestAgentHarnessController_SubstrateDeleteRemovesFinalizerAfterCleanup(t *t lifecycle := &fakeSubstrateLifecycle{cleanupDone: true} backend := &fakeAgentHarnessBackend{deleteDone: true} controller.SubstrateLifecycle = lifecycle - controller.SubstrateBackends = map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: backend, - } + controller.OpenClawBackend = backend result, err := controller.Reconcile(ctx, ctrl.Request{NamespacedName: client.ObjectKeyFromObject(ah)}) require.NoError(t, err) @@ -238,7 +231,7 @@ func 
TestAgentHarnessController_SubstrateDeleteRemovesFinalizerAfterCleanup(t *t require.True(t, apierrors.IsNotFound(err), "fake client should complete deletion after finalizer removal") } -func newAgentHarnessTestController(t *testing.T, objects ...client.Object) *AgentHarnessController { +func newAgentHarnessTestController(t *testing.T, objects ...client.Object) *SubstrateAgentHarnessController { t.Helper() scheme := runtime.NewScheme() utilruntime.Must(v1alpha2.AddToScheme(scheme)) @@ -247,7 +240,7 @@ func newAgentHarnessTestController(t *testing.T, objects ...client.Object) *Agen WithObjects(objects...). WithStatusSubresource(&v1alpha2.AgentHarness{}). Build() - return &AgentHarnessController{Client: kube} + return &SubstrateAgentHarnessController{Client: kube} } func newSubstrateHarness(namespace, name string) *v1alpha2.AgentHarness { diff --git a/go/core/internal/controller/agentharness_openshell_controller.go b/go/core/internal/controller/agentharness_openshell_controller.go new file mode 100644 index 0000000000..be25f2f7d0 --- /dev/null +++ b/go/core/internal/controller/agentharness_openshell_controller.go @@ -0,0 +1,205 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package controller + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/events" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" +) + +// OpenShellAgentHarnessController reconciles AgentHarness resources that use the +// OpenShell runtime. +type OpenShellAgentHarnessController struct { + Client client.Client + Recorder events.EventRecorder + OpenClawBackend sandboxbackend.AsyncBackend + HermesBackend sandboxbackend.AsyncBackend +} + +func (r *OpenShellAgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { + switch ah.Spec.Backend { + case v1alpha2.AgentHarnessBackendOpenClaw, v1alpha2.AgentHarnessBackendNemoClaw: + return r.OpenClawBackend + case v1alpha2.AgentHarnessBackendHermes: + return r.HermesBackend + default: + return nil + } +} + +func (r *OpenShellAgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) + + var ah v1alpha2.AgentHarness + if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("get AgentHarness: %w", err) + } + if effectiveAgentHarnessRuntime(&ah) != v1alpha2.AgentHarnessRuntimeOpenshell { + return ctrl.Result{}, nil + } + + if !ah.DeletionTimestamp.IsZero() { + return r.reconcileDelete(ctx, &ah) + } + + if controllerutil.AddFinalizer(&ah, 
agentHarnessFinalizer) { + if err := r.Client.Update(ctx, &ah); err != nil { + return ctrl.Result{}, fmt.Errorf("add finalizer: %w", err) + } + return ctrl.Result{Requeue: true}, nil + } + + backend := r.backendFor(&ah) + if backend == nil { + return reconcileBackendUnavailable(ctx, r.Client, &ah, v1alpha2.AgentHarnessRuntimeOpenshell) + } + + return r.reconcileBackend(ctx, req, &ah, backend, log) +} + +func (r *OpenShellAgentHarnessController) reconcileBackend(ctx context.Context, req ctrl.Request, ah *v1alpha2.AgentHarness, backend sandboxbackend.AsyncBackend, log logr.Logger) (ctrl.Result, error) { + res, err := backend.EnsureAgentHarness(ctx, ah) + if err != nil { + log.Error(err, "EnsureAgentHarness failed") + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "EnsureFailed", err.Error()) + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "EnsureFailed", err.Error()) + if perr := patchAgentHarnessStatus(ctx, r.Client, ah); perr != nil { + return ctrl.Result{}, perr + } + return ctrl.Result{}, err + } + + ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{ + Backend: ah.Spec.Backend, + ID: res.Handle.ID, + } + if res.Endpoint != "" { + ah.Status.Connection = &v1alpha2.AgentHarnessConnection{Endpoint: res.Endpoint} + } + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, + "AgentHarnessAccepted", "backend accepted sandbox request") + + st, reason, msg := backend.GetStatus(ctx, res.Handle) + pending := postReadyBootstrapPending(ah) + if st == metav1.ConditionTrue && pending { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionFalse, + "BootstrapPending", + "waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") + setAgentHarnessCondition(ah, 
v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "BootstrapPending", + "gateway sandbox is ready; waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") + } else { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) + if pending { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionFalse, + "ActorNotReady", "waiting for actor before post-ready bootstrap") + } + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, st, reason, msg) + } + ah.Status.ObservedGeneration = ah.Generation + + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + + if st != metav1.ConditionTrue { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + if pending { + if err := maybePostReadyBootstrap(ctx, client.ObjectKeyFromObject(ah), ah, res.Handle, backend); err != nil { + log.Error(err, "post-ready sandbox bootstrap failed") + return ctrl.Result{}, err + } + var latest v1alpha2.AgentHarness + if err := r.Client.Get(ctx, req.NamespacedName, &latest); err != nil { + return ctrl.Result{}, fmt.Errorf("get AgentHarness after bootstrap: %w", err) + } + st2, reason2, msg2 := backend.GetStatus(ctx, res.Handle) + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeActorReady, st2, reason2, msg2) + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionTrue, + "BootstrapComplete", "post-ready bootstrap completed") + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeReady, st2, reason2, msg2) + latest.Status.ObservedGeneration = latest.Generation + if err := r.Client.Status().Update(ctx, &latest); err != nil { + return ctrl.Result{}, fmt.Errorf("update AgentHarness status after bootstrap: %w", err) + } + } + return ctrl.Result{}, nil +} + +func (r *OpenShellAgentHarnessController) reconcileDelete(ctx context.Context, ah 
*v1alpha2.AgentHarness) (ctrl.Result, error) { + if !controllerutil.ContainsFinalizer(ah, agentHarnessFinalizer) { + return ctrl.Result{}, nil + } + + if ah.Status.BackendRef != nil { + actorID := ah.Status.BackendRef.ID + if actorID != "" { + backend := r.backendFor(ah) + actorDone := true + var err error + if backend != nil { + actorDone, err = backend.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) + } + if err != nil { + if r.Recorder != nil { + r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } + if !actorDone { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, + metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for backend actor %q deletion", actorID)) + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + } + ah.Status.BackendRef = nil + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + } + + controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) + if err := r.Client.Update(ctx, ah); err != nil { + return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) + } + return ctrl.Result{}, nil +} + +// SetupWithManager registers the OpenShell AgentHarness controller with the manager. +func (r *OpenShellAgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { + b := ctrl.NewControllerManagedBy(mgr). + WithOptions(controller.Options{NeedLeaderElection: new(true)}). 
+ For(&v1alpha2.AgentHarness{}, builder.WithPredicates(agentHarnessRuntimePredicate(v1alpha2.AgentHarnessRuntimeOpenshell))) + return b.Named("agentharness-openshell").Complete(r) +} diff --git a/go/core/internal/controller/agentharness_shared.go b/go/core/internal/controller/agentharness_shared.go new file mode 100644 index 0000000000..d0fd40cb22 --- /dev/null +++ b/go/core/internal/controller/agentharness_shared.go @@ -0,0 +1,148 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package controller + +import ( + "context" + "fmt" + "reflect" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" +) + +const ( + // agentHarnessFinalizer guarantees the backend sandbox is deleted before the + // Kubernetes object is removed. + agentHarnessFinalizer = "kagent.dev/agent-harness-backend-cleanup" + + // agentHarnessNotReadyRequeue is how long we wait before re-polling backend + // status while the sandbox is still provisioning. 
+ agentHarnessNotReadyRequeue = 10 * time.Second +) + +// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=kagent.dev,resources=agentharnesses/finalizers,verbs=update + +func reconcileBackendUnavailable(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness, runtime v1alpha2.AgentHarnessRuntime) (ctrl.Result, error) { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "BackendUnavailable", + fmt.Sprintf("no %s backend configured for %q", runtime, ah.Spec.Backend)) + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "BackendUnavailable", "") + if err := patchAgentHarnessStatus(ctx, kube, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil +} + +func postReadyBootstrapPending(ah *v1alpha2.AgentHarness) bool { + cond := meta.FindStatusCondition(ah.Status.Conditions, v1alpha2.AgentHarnessConditionTypeBootstrapReady) + return cond == nil || cond.ObservedGeneration != ah.Generation || cond.Status != metav1.ConditionTrue +} + +func maybePostReadyBootstrap(ctx context.Context, key client.ObjectKey, ah *v1alpha2.AgentHarness, h sandboxbackend.Handle, async sandboxbackend.AsyncBackend) error { + if !postReadyBootstrapPending(ah) { + return nil + } + if err := async.OnAgentHarnessReady(ctx, ah, h); err != nil { + return err + } + ctrl.LoggerFrom(ctx).WithValues("agentHarness", key.String()).Info( + "recorded post-ready bootstrap for AgentHarness generation", "generation", ah.Generation) + return nil +} + +func patchAgentHarnessStatus(ctx context.Context, kube client.Client, ah *v1alpha2.AgentHarness) error { + var current v1alpha2.AgentHarness + if err := kube.Get(ctx, client.ObjectKeyFromObject(ah), ¤t); err != nil { + return fmt.Errorf("get AgentHarness before 
status update: %w", err) + } + if reflect.DeepEqual(current.Status, ah.Status) { + *ah = current + return nil + } + current.Status = ah.Status + if err := kube.Status().Update(ctx, ¤t); err != nil { + return fmt.Errorf("update AgentHarness status: %w", err) + } + *ah = current + return nil +} + +func effectiveAgentHarnessRuntime(ah *v1alpha2.AgentHarness) v1alpha2.AgentHarnessRuntime { + if ah.Spec.Runtime == "" { + return v1alpha2.AgentHarnessRuntimeOpenshell + } + return ah.Spec.Runtime +} + +func setAgentHarnessCondition(ah *v1alpha2.AgentHarness, t string, s metav1.ConditionStatus, reason, msg string) { + meta.SetStatusCondition(&ah.Status.Conditions, metav1.Condition{ + Type: t, + Status: s, + Reason: reason, + Message: msg, + ObservedGeneration: ah.Generation, + }) +} + +func agentHarnessPrimaryPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(event.CreateEvent) bool { return true }, + DeleteFunc: func(event.DeleteEvent) bool { return true }, + UpdateFunc: func(e event.UpdateEvent) bool { + if e.ObjectOld == nil || e.ObjectNew == nil { + return true + } + if e.ObjectNew.GetGeneration() != e.ObjectOld.GetGeneration() { + return true + } + if !reflect.DeepEqual(e.ObjectNew.GetLabels(), e.ObjectOld.GetLabels()) { + return true + } + return e.ObjectOld.GetDeletionTimestamp().IsZero() && !e.ObjectNew.GetDeletionTimestamp().IsZero() + }, + } +} + +func agentHarnessRuntimePredicate(runtime v1alpha2.AgentHarnessRuntime) predicate.Predicate { + primary := agentHarnessPrimaryPredicate() + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return primary.Create(e) && agentHarnessObjectMatchesRuntime(e.Object, runtime) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return primary.Delete(e) && agentHarnessObjectMatchesRuntime(e.Object, runtime) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return primary.Update(e) && + (agentHarnessObjectMatchesRuntime(e.ObjectOld, runtime) || 
agentHarnessObjectMatchesRuntime(e.ObjectNew, runtime)) + }, + } +} + +func agentHarnessObjectMatchesRuntime(obj client.Object, runtime v1alpha2.AgentHarnessRuntime) bool { + ah, ok := obj.(*v1alpha2.AgentHarness) + if !ok || ah == nil { + return false + } + return effectiveAgentHarnessRuntime(ah) == runtime +} diff --git a/go/core/internal/controller/agentharness_substrate_controller.go b/go/core/internal/controller/agentharness_substrate_controller.go new file mode 100644 index 0000000000..3c56e2073c --- /dev/null +++ b/go/core/internal/controller/agentharness_substrate_controller.go @@ -0,0 +1,290 @@ +/* +Copyright 2026. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 +*/ + +package controller + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/events" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" +) + +const ( + // substrateDeleteTimeout is the maximum time to wait for substrate cleanup during delete. 
+ substrateDeleteTimeout = 5 * time.Minute +) + +// +kubebuilder:rbac:groups=ate.dev,resources=workerpools,verbs=get;list;watch +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=ate.dev,resources=actortemplates/status,verbs=get + +// SubstrateAgentHarnessController reconciles AgentHarness resources that use the +// Substrate runtime. +type SubstrateAgentHarnessController struct { + Client client.Client + Recorder events.EventRecorder + OpenClawBackend sandboxbackend.AsyncBackend + NemoClawBackend sandboxbackend.AsyncBackend + SubstrateLifecycle substrate.AgentHarnessLifecycle +} + +func (r *SubstrateAgentHarnessController) backendFor(ah *v1alpha2.AgentHarness) sandboxbackend.AsyncBackend { + switch ah.Spec.Backend { + case v1alpha2.AgentHarnessBackendOpenClaw: + return r.OpenClawBackend + case v1alpha2.AgentHarnessBackendNemoClaw: + return r.NemoClawBackend + default: + return nil + } +} + +func (r *SubstrateAgentHarnessController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx).WithValues("agentHarness", req.NamespacedName) + + var ah v1alpha2.AgentHarness + if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, fmt.Errorf("get AgentHarness: %w", err) + } + if effectiveAgentHarnessRuntime(&ah) != v1alpha2.AgentHarnessRuntimeSubstrate { + return ctrl.Result{}, nil + } + + if !ah.DeletionTimestamp.IsZero() { + return r.reconcileDelete(ctx, &ah) + } + + if controllerutil.AddFinalizer(&ah, agentHarnessFinalizer) { + if err := r.Client.Update(ctx, &ah); err != nil { + return ctrl.Result{}, fmt.Errorf("add finalizer: %w", err) + } + return ctrl.Result{Requeue: true}, nil + } + + backend := r.backendFor(&ah) + if backend == nil { + return reconcileBackendUnavailable(ctx, r.Client, &ah, v1alpha2.AgentHarnessRuntimeSubstrate) + } + + 
lifecycleState, err := r.SubstrateLifecycle.EnsureGeneratedTemplate(ctx, &ah) + if err != nil { + log.Error(err, "substrate lifecycle reconciliation failed") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "SubstrateLifecycleFailed", err.Error()) + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "SubstrateLifecycleFailed", "") + if perr := patchAgentHarnessStatus(ctx, r.Client, &ah); perr != nil { + return ctrl.Result{}, perr + } + return ctrl.Result{}, err + } + if lifecycleState.ActorTemplateReady { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionTrue, "Ready", "ActorTemplate golden snapshot is ready") + } else { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionFalse, "NotReady", "waiting for ActorTemplate golden snapshot") + } + if err := patchAgentHarnessStatus(ctx, r.Client, &ah); err != nil { + return ctrl.Result{}, err + } + if !lifecycleState.ActorTemplateReady { + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, + "SubstrateLifecyclePending", "waiting for ActorTemplate golden snapshot") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeActorReady, metav1.ConditionFalse, + "ActorNotCreated", "waiting for ActorTemplate before creating actor") + setAgentHarnessCondition(&ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "ActorTemplateNotReady", "ActorTemplate is not Ready yet") + if err := patchAgentHarnessStatus(ctx, r.Client, &ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + if err := r.Client.Get(ctx, req.NamespacedName, &ah); err != nil { + return ctrl.Result{}, fmt.Errorf("reload AgentHarness after substrate lifecycle reconciliation: %w", err) + } + + return r.reconcileBackend(ctx, req, 
&ah, backend, log) +} + +func (r *SubstrateAgentHarnessController) reconcileBackend(ctx context.Context, req ctrl.Request, ah *v1alpha2.AgentHarness, backend sandboxbackend.AsyncBackend, log logr.Logger) (ctrl.Result, error) { + res, err := backend.EnsureAgentHarness(ctx, ah) + if err != nil { + log.Error(err, "EnsureAgentHarness failed") + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionFalse, + "EnsureFailed", err.Error()) + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "EnsureFailed", err.Error()) + if perr := patchAgentHarnessStatus(ctx, r.Client, ah); perr != nil { + return ctrl.Result{}, perr + } + return ctrl.Result{}, err + } + + ah.Status.BackendRef = &v1alpha2.AgentHarnessStatusRef{ + Backend: ah.Spec.Backend, + ID: res.Handle.ID, + } + if res.Endpoint != "" { + ah.Status.Connection = &v1alpha2.AgentHarnessConnection{Endpoint: res.Endpoint} + } + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeAccepted, metav1.ConditionTrue, + "AgentHarnessAccepted", "backend accepted sandbox request") + + st, reason, msg := backend.GetStatus(ctx, res.Handle) + pending := postReadyBootstrapPending(ah) + if st == metav1.ConditionTrue && pending { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionFalse, + "BootstrapPending", + "waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, metav1.ConditionFalse, + "BootstrapPending", + "gateway sandbox is ready; waiting for post-ready bootstrap (OnAgentHarnessReady) to finish") + } else { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, st, reason, msg) + if pending { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionFalse, + 
"ActorNotReady", "waiting for actor before post-ready bootstrap") + } + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, st, reason, msg) + } + ah.Status.ObservedGeneration = ah.Generation + + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + + if st != metav1.ConditionTrue { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + if pending { + if err := maybePostReadyBootstrap(ctx, client.ObjectKeyFromObject(ah), ah, res.Handle, backend); err != nil { + log.Error(err, "post-ready sandbox bootstrap failed") + return ctrl.Result{}, err + } + var latest v1alpha2.AgentHarness + if err := r.Client.Get(ctx, req.NamespacedName, &latest); err != nil { + return ctrl.Result{}, fmt.Errorf("get AgentHarness after bootstrap: %w", err) + } + st2, reason2, msg2 := backend.GetStatus(ctx, res.Handle) + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeActorReady, st2, reason2, msg2) + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeBootstrapReady, metav1.ConditionTrue, + "BootstrapComplete", "post-ready bootstrap completed") + setAgentHarnessCondition(&latest, v1alpha2.AgentHarnessConditionTypeReady, st2, reason2, msg2) + latest.Status.ObservedGeneration = latest.Generation + if err := r.Client.Status().Update(ctx, &latest); err != nil { + return ctrl.Result{}, fmt.Errorf("update AgentHarness status after bootstrap: %w", err) + } + } + return ctrl.Result{}, nil +} + +func (r *SubstrateAgentHarnessController) reconcileDelete(ctx context.Context, ah *v1alpha2.AgentHarness) (ctrl.Result, error) { + if !controllerutil.ContainsFinalizer(ah, agentHarnessFinalizer) { + return ctrl.Result{}, nil + } + + if substrateDeleteTimedOut(ah) { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeReady, + metav1.ConditionFalse, "DeleteTimeout", "substrate cleanup exceeded timeout") + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return 
ctrl.Result{}, err + } + return ctrl.Result{}, fmt.Errorf("substrate cleanup timed out for AgentHarness %s", ah.Name) + } + + if ah.Status.BackendRef != nil { + actorID := ah.Status.BackendRef.ID + if actorID != "" { + backend := r.backendFor(ah) + actorDone := true + var err error + if backend != nil { + actorDone, err = backend.DeleteAgentHarness(ctx, sandboxbackend.Handle{ID: actorID}) + } + if err != nil { + if r.Recorder != nil { + r.Recorder.Eventf(ah, nil, "Warning", "AgentHarnessDeleteFailed", "DeleteAgentHarness", "%s", err.Error()) + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, err + } + if !actorDone { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorReady, + metav1.ConditionFalse, "ActorDeleting", fmt.Sprintf("waiting for substrate actor %q deletion", actorID)) + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + } + ah.Status.BackendRef = nil + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + } + + complete, err := r.SubstrateLifecycle.CleanupGeneratedTemplate(ctx, ah) + if err != nil { + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, fmt.Errorf("cleanup substrate lifecycle: %w", err) + } + if !complete { + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionFalse, "GoldenActorDeleting", "waiting for generated ActorTemplate golden actor deletion") + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: agentHarnessNotReadyRequeue}, nil + } + setAgentHarnessCondition(ah, v1alpha2.AgentHarnessConditionTypeActorTemplateReady, + metav1.ConditionFalse, "Deleting", "generated ActorTemplate will be garbage collected") + if err := patchAgentHarnessStatus(ctx, r.Client, ah); err != nil { + return 
ctrl.Result{}, err + } + + controllerutil.RemoveFinalizer(ah, agentHarnessFinalizer) + if err := r.Client.Update(ctx, ah); err != nil { + return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err) + } + return ctrl.Result{}, nil +} + +func substrateDeleteTimedOut(ah *v1alpha2.AgentHarness) bool { + if ah == nil || ah.DeletionTimestamp.IsZero() { + return false + } + return time.Since(ah.DeletionTimestamp.Time) > substrateDeleteTimeout +} + +// SetupWithManager registers the Substrate AgentHarness controller with the manager. +func (r *SubstrateAgentHarnessController) SetupWithManager(mgr ctrl.Manager) error { + b := ctrl.NewControllerManagedBy(mgr). + WithOptions(controller.Options{NeedLeaderElection: new(true)}). + For(&v1alpha2.AgentHarness{}, builder.WithPredicates(agentHarnessRuntimePredicate(v1alpha2.AgentHarnessRuntimeSubstrate))) + b = r.substrateWatches(b) + return b.Named("agentharness-substrate").Complete(r) +} diff --git a/go/core/internal/controller/agentharness_substrate_watches.go b/go/core/internal/controller/agentharness_substrate_watches.go index 948778a673..9b3153f92b 100644 --- a/go/core/internal/controller/agentharness_substrate_watches.go +++ b/go/core/internal/controller/agentharness_substrate_watches.go @@ -13,7 +13,7 @@ import ( "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) -func (r *AgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx context.Context, obj client.Object) []reconcile.Request { +func (r *SubstrateAgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx context.Context, obj client.Object) []reconcile.Request { harnessName := substrate.HarnessNameFromLabels(obj.GetLabels()) if harnessName == "" { return nil @@ -26,8 +26,8 @@ func (r *AgentHarnessController) enqueueAgentHarnessForSubstrateResource(ctx con }} } -func (r *AgentHarnessController) substrateWatches(b *builder.Builder) *builder.Builder { - if r == nil || r.SubstrateLifecycle == nil { +func (r 
*SubstrateAgentHarnessController) substrateWatches(b *builder.Builder) *builder.Builder { + if r == nil { return b } return b. diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 6a17436ccc..5c683e0f42 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -594,41 +594,51 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne } kubeClient := mgr.GetClient() - var openshellBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend - var substrateBackends map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend + var openshellOpenClawBackend sandboxbackend.AsyncBackend + var openshellHermesBackend sandboxbackend.AsyncBackend if cfg.Openshell.GatewayURL != "" { var err error - openshellBackends, err = buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) + openshellOpenClawBackend, openshellHermesBackend, err = buildOpenshellSandboxBackends(ctx, &cfg, kubeClient) if err != nil { setupLog.Error(err, "unable to build openshell sandbox backends") os.Exit(1) } } var substrateAteClient *substrate.Client + var substrateOpenClawBackend sandboxbackend.AsyncBackend + var substrateNemoClawBackend sandboxbackend.AsyncBackend if cfg.Substrate.AteAPIEndpoint != "" { var err error - substrateBackends, substrateAteClient, err = buildSubstrateSandboxBackends(ctx, &cfg) + substrateOpenClawBackend, substrateNemoClawBackend, substrateAteClient, err = buildSubstrateSandboxBackends(ctx, &cfg) if err != nil { setupLog.Error(err, "unable to build substrate sandbox backends") os.Exit(1) } } - if len(openshellBackends) > 0 || len(substrateBackends) > 0 { - var substrateLifecycle *substrate.Lifecycle - if len(substrateBackends) > 0 { - substrateLifecycle = substrateLifecycleFromConfig(kubeClient, &cfg, substrateAteClient) + if openshellOpenClawBackend != nil || openshellHermesBackend != nil { + if err := (&controller.OpenShellAgentHarnessController{ + Client: kubeClient, + Recorder: 
mgr.GetEventRecorder("agentharness-openshell-controller"), + OpenClawBackend: openshellOpenClawBackend, + HermesBackend: openshellHermesBackend, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "OpenShellAgentHarness") + os.Exit(1) } - if err := (&controller.AgentHarnessController{ + } + if substrateOpenClawBackend != nil || substrateNemoClawBackend != nil { + if err := (&controller.SubstrateAgentHarnessController{ Client: kubeClient, - Recorder: mgr.GetEventRecorder("agentharness-controller"), - OpenshellBackends: openshellBackends, - SubstrateBackends: substrateBackends, - SubstrateLifecycle: substrateLifecycle, + Recorder: mgr.GetEventRecorder("agentharness-substrate-controller"), + OpenClawBackend: substrateOpenClawBackend, + NemoClawBackend: substrateNemoClawBackend, + SubstrateLifecycle: substrateLifecycleFromConfig(kubeClient, &cfg, substrateAteClient), }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "AgentHarness") + setupLog.Error(err, "unable to create controller", "controller", "SubstrateAgentHarness") os.Exit(1) } - } else { + } + if openshellOpenClawBackend == nil && openshellHermesBackend == nil && substrateOpenClawBackend == nil && substrateNemoClawBackend == nil { setupLog.Info("AgentHarness controller disabled: set --openshell-gateway-url and/or --substrate-ate-api-endpoint") } @@ -778,7 +788,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne // nemoclaw from flag config. It dials the gateway once; OpenShell and Inference RPCs // share that connection (see openshell.OpenShellClients). The connection is not explicitly // closed today — same lifetime as the process. 
-func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient client.Client) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, error) { +func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient client.Client) (sandboxbackend.AsyncBackend, sandboxbackend.AsyncBackend, error) { oc := openshell.Config{ GatewayURL: cfg.Openshell.GatewayURL, Token: cfg.Openshell.Token, @@ -789,44 +799,37 @@ func buildOpenshellSandboxBackends(ctx context.Context, cfg *Config, kubeClient if cfg.Openshell.TokenFile != "" { data, err := os.ReadFile(cfg.Openshell.TokenFile) if err != nil { - return nil, fmt.Errorf("read openshell token file: %w", err) + return nil, nil, fmt.Errorf("read openshell token file: %w", err) } oc.Token = strings.TrimSpace(string(data)) } if cfg.Openshell.CAFile != "" { data, err := os.ReadFile(cfg.Openshell.CAFile) if err != nil { - return nil, fmt.Errorf("read openshell CA file: %w", err) + return nil, nil, fmt.Errorf("read openshell CA file: %w", err) } oc.TLSCAPEM = data } clients, err := openshell.Dial(ctx, oc) if err != nil { - return nil, err + return nil, nil, err } ocl := openshell.NewOpenClawBackend(kubeClient, clients, oc, nil) hermesBackend := openshell.NewHermesBackend(kubeClient, clients, oc, nil) - return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, - v1alpha2.AgentHarnessBackendNemoClaw: ocl, - v1alpha2.AgentHarnessBackendHermes: hermesBackend, - }, nil + return ocl, hermesBackend, nil } -func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend, *substrate.Client, error) { +func buildSubstrateSandboxBackends(ctx context.Context, cfg *Config) (sandboxbackend.AsyncBackend, sandboxbackend.AsyncBackend, *substrate.Client, error) { sc := substrateAppConfig(cfg) client, err := substrate.Dial(ctx, sc) if err != nil { - return nil, nil, err + return nil, nil, 
nil, err } ocl := substrate.NewOpenClawBackend(client, v1alpha2.AgentHarnessBackendOpenClaw, nil) ncl := substrate.NewOpenClawBackend(client, v1alpha2.AgentHarnessBackendNemoClaw, nil) - return map[v1alpha2.AgentHarnessBackendType]sandboxbackend.AsyncBackend{ - v1alpha2.AgentHarnessBackendOpenClaw: ocl, - v1alpha2.AgentHarnessBackendNemoClaw: ncl, - }, client, nil + return ocl, ncl, client, nil } func substrateAppConfig(cfg *Config) substrate.Config { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor.go b/go/core/pkg/sandboxbackend/substrate/delete_actor.go index 462a57de26..34c9e5cc45 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor.go @@ -9,12 +9,15 @@ import ( "google.golang.org/grpc/status" ) -// AdvanceActorDelete performs at most one mutating ate-api step per call. +// deleteActor performs at most one mutating ate-api step per call. // Returns true when the actor no longer exists. Callers should requeue until true. 
-func (c *Client) AdvanceActorDelete(ctx context.Context, actorID string) (bool, error) { +func deleteActor(ctx context.Context, c *Client, actorID string) (bool, error) { if actorID == "" { return true, nil } + if c == nil { + return false, fmt.Errorf("substrate ate-api client is required") + } actor, err := c.GetActor(ctx, actorID) if err != nil { diff --git a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go index 9453005fc9..c61ab4edf4 100644 --- a/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go +++ b/go/core/pkg/sandboxbackend/substrate/delete_actor_test.go @@ -2,10 +2,9 @@ package substrate import "testing" -func TestAdvanceActorDeleteEmptyID(t *testing.T) { +func TestDeleteActorEmptyID(t *testing.T) { t.Parallel() - c := &Client{} - done, err := c.AdvanceActorDelete(t.Context(), "") + done, err := deleteActor(t.Context(), nil, "") if err != nil { t.Fatalf("unexpected error: %v", err) } diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go index 07534b4f4f..3b9a51b09f 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete.go @@ -42,10 +42,7 @@ func (p *Lifecycle) CleanupGeneratedTemplate(ctx context.Context, ah *v1alpha2.A } func deleteGoldenActor(ctx context.Context, ateClient *Client, actorID string) (bool, error) { - if ateClient == nil { - return false, fmt.Errorf("substrate ate-api client is required") - } - return ateClient.AdvanceActorDelete(ctx, actorID) + return deleteActor(ctx, ateClient, actorID) } func (p *Lifecycle) goldenActorID(ctx context.Context, tmplKey types.NamespacedName) (string, error) { diff --git a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index a66a9e45f5..02e059bfd6 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ 
b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -105,7 +105,7 @@ func (b *ClawBackend) DeleteAgentHarness(ctx context.Context, h sandboxbackend.H if h.ID == "" { return true, nil } - done, err := b.client.AdvanceActorDelete(ctx, h.ID) + done, err := deleteActor(ctx, b.client, h.ID) if err != nil { return false, fmt.Errorf("substrate delete actor %q: %w", h.ID, err) } From 4d379f14e9dee093f1ded976b831a8adf1d82092 Mon Sep 17 00:00:00 2001 From: Eitan Yarmush Date: Mon, 1 Jun 2026 20:06:32 +0000 Subject: [PATCH 26/32] Fix AgentHarness CI failures Signed-off-by: Eitan Yarmush --- .github/workflows/ci.yaml | 2 +- .../sandboxbackend/substrate/lifecycle_actortemplate.go | 9 +++------ .../sandboxbackend/substrate/lifecycle_delete_test.go | 7 +++---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 40b59b930a..b8e3768d9a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -81,7 +81,7 @@ jobs: - name: Install agent-sandbox run: | kubectl apply -f "https://github.com/kubernetes-sigs/agent-sandbox/releases/download/${AGENT_SANDBOX_VERSION}/manifest.yaml" - kubectl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=90s + timeout 90s bash -c 'until [ "$(kubectl get crd sandboxes.agents.x-k8s.io -o jsonpath="{.status.conditions[?(@.type==\"Established\")].status}" 2>/dev/null)" = "True" ]; do sleep 1; done' kubectl rollout status deployment/agent-sandbox-controller -n agent-sandbox-system --timeout=120s kubectl wait --for=condition=Ready pod -l app=agent-sandbox-controller -n agent-sandbox-system --timeout=120s diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go index c585663f73..88a0872dac 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_actortemplate.go @@ -3,6 +3,7 @@ package substrate 
import ( "context" "fmt" + "maps" "strings" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" @@ -94,12 +95,8 @@ func mergeLabels(existing, desired map[string]string) map[string]string { return nil } merged := make(map[string]string, len(existing)+len(desired)) - for k, v := range existing { - merged[k] = v - } - for k, v := range desired { - merged[k] = v - } + maps.Copy(merged, existing) + maps.Copy(merged, desired) return merged } diff --git a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go index e772664133..4970d7ad40 100644 --- a/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go +++ b/go/core/pkg/sandboxbackend/substrate/lifecycle_delete_test.go @@ -2,6 +2,7 @@ package substrate import ( "context" + "slices" "testing" atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" @@ -24,10 +25,8 @@ type recordingActorClient struct { } func (r *recordingActorClient) GetActor(_ context.Context, in *ateapipb.GetActorRequest, _ ...grpc.CallOption) (*ateapipb.GetActorResponse, error) { - for _, deleted := range r.deleted { - if deleted == in.GetActorId() { - return nil, status.Error(codes.NotFound, "actor deleted") - } + if slices.Contains(r.deleted, in.GetActorId()) { + return nil, status.Error(codes.NotFound, "actor deleted") } return &ateapipb.GetActorResponse{ Actor: &ateapipb.Actor{ From fa30b7889e9945675676cb4054b41a678f76fcfd Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 14:16:03 -0700 Subject: [PATCH 27/32] fix minor issues Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 27 ++++++----- .../crd/bases/kagent.dev_agentharnesses.yaml | 7 +-- .../pkg/sandboxbackend/openclaw/constants.go | 12 ++++- .../pkg/sandboxbackend/substrate/client.go | 17 ++++--- .../sandboxbackend/substrate/client_test.go | 45 ++++++++++++++++--- .../pkg/sandboxbackend/substrate/config.go | 3 +- .../substrate/pin_image_test.go | 27 
+++++++++++ .../substrate/provision_actortemplate.go | 4 ++ .../substrate/provision_shared.go | 15 +++++++ .../templates/kagent.dev_agentharnesses.yaml | 7 +-- 10 files changed, 133 insertions(+), 31 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/pin_image_test.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index bb5964f663..d9176c1b82 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -4,24 +4,25 @@ Uses cluster `kind` (`KIND_CLUSTER_NAME=kind`; or set `KUBECONFIG` / context accordingly). -```bash -cd substrate +From the [Agent Substrate](https://github.com/agent-substrate/substrate) repository root: +```bash ./hack/create-kind-cluster.sh ./hack/install-ate-kind.sh --deploy-ate-system ``` -`--deploy-ate-system` installs the **control plane only** (ate-api, ate-controller, atelet, atenet, …). Your registry catalog will show `ateapi-*`, `atelet-*`, etc., but **not** ateom until you build it. +`hack/install-ate-kind.sh` sets `KO_DOCKER_REPO=localhost:5001` and `KO_DEFAULTPLATFORMS=linux/$(go env GOARCH)` for that shell. `--deploy-ate-system` installs the **control plane only** (ate-api, ate-controller, atelet, atenet, …). Your registry catalog will show `ateapi-*`, `atelet-*`, etc., but **not** ateom until you build it. -Build and push **ateom-gvisor** (required for kagent `workerPool.ateomImage`): +Build and push **ateom-gvisor** (required for kagent `workerPool.ateomImage`). 
Substrate pins `ko` via `hack/tools/ko` and invokes it with `hack/run-tool.sh` (the old `hack/ko.sh` wrapper was removed): ```bash -# build the ateom-gvisor image from the substrate folder export KO_DOCKER_REPO=localhost:5001 export KO_DEFAULTPLATFORMS=linux/$(go env GOARCH) -./hack/ko.sh build -B ./cmd/servers/ateom-gvisor +./hack/run-tool.sh ko build -B ./cmd/ateom-gvisor ``` +`-B` (`--base-import-paths`) publishes `localhost:5001/ateom-gvisor:latest`, matching the default `controller.substrate.ateomImage` in kagent Helm values. Do not use `--bare` here: it treats `KO_DOCKER_REPO` as the entire image name and fails on `localhost:5001`. + ## 2. Load nemoclaw image The image is a multi-arch manifest list. On Apple Silicon, `kind load docker-image` often fails with `content digest ... not found` because Docker only has the local arch locally while kind imports with `--all-platforms`. Use `docker save` + `ctr import` instead (match `--name` to your cluster, e.g. `agent` for context `kind-agent`): @@ -42,10 +43,14 @@ Install kagent (Substrate must already be running in the cluster): ```bash export KIND_CLUSTER_NAME=kind -make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" +make helm-install KAGENT_HELM_EXTRA_ARGS="\ + --set controller.substrate.enabled=true \ + --set controller.substrate.ateApiEndpoint=dns:///api.ate-system.svc:443 \ + --set controller.substrate.ateApiInsecure=true \ + --set controller.substrate.ateomImage=localhost:5001/ateom-gvisor:latest" ``` -The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values Override them with `--set` or a values file when you need to pin a different gVisor build. 
+The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values. Override them with `--set` or a values file when you need to pin a different gVisor build. Create a harness. If `snapshotsConfig` is omitted, kagent defaults it to `gs://ate-snapshots//`. If Helm sets `controller.substrate.ateomImage`, the per-harness `workerPool.ateomImage` can be omitted unless you want to override it. @@ -83,8 +88,8 @@ spec: # name: openclaw-gateway-token # namespace: kagent - # Optional: override the sandbox image used in the ActorTemplate. - # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + # Optional: override the sandbox image (must be digest-pinned for Substrate). + # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4 # Optional: adopt existing resources instead of auto-create # workerPoolRef: @@ -122,7 +127,7 @@ spec: location: gs://ate-snapshots/kagent/peterj-claw containers: - name: openclaw - image: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + image: ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4 ports: - containerPort: 80 command: diff --git a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml index 52f814c1aa..49d74db205 100644 --- a/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml +++ b/go/api/config/crd/bases/kagent.dev_agentharnesses.yaml @@ -542,7 +542,7 @@ spec: gatewayPort: default: 80 description: GatewayPort is the port OpenClaw listens on inside - the actor (Substrate routes to :80 today). + the actor. Defaults to 80. 
format: int32 type: integer gatewayToken: @@ -719,8 +719,9 @@ spec: properties: endpoint: description: |- - Endpoint is the backend-specific address (gRPC target, SSH host:port, - ...) clients should use to reach the harness. + Endpoint is the backend-specific address clients should use to reach the harness. + OpenShell: gRPC gateway URL with sandbox id (gateway#sandbox). Substrate: kagent + gateway proxy path (/api/agentharnesses///gateway/). type: string type: object observedGeneration: diff --git a/go/core/pkg/sandboxbackend/openclaw/constants.go b/go/core/pkg/sandboxbackend/openclaw/constants.go index bf696bd59d..9b43ce6f24 100644 --- a/go/core/pkg/sandboxbackend/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openclaw/constants.go @@ -1,8 +1,16 @@ package openclaw const ( - // NemoclawSandboxBaseImage is the default OpenShell VM image for OpenClaw/NemoClaw harnesses. - NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4" + // NemoclawSandboxBaseImage is the default VM image for OpenClaw/NemoClaw harnesses (OpenShell and Substrate). + // Human tag: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + // + // Substrate ActorTemplates require a digest pin (image must contain "@"); OpenShell accepts tags or digests. + // To resolve a tag to a digest when bumping this constant: + // + // docker buildx imagetools inspect ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 --format '{{.Manifest.Digest}}' + // + // Then set NemoclawSandboxBaseImage to repo@, e.g. ghcr.io/.../sandbox-base@sha256:... + NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4" // openshellSecretProviderID is the secrets.providers key written into openclaw.json for OpenShell sandboxes. 
openshellSecretProviderID = "kagent" diff --git a/go/core/pkg/sandboxbackend/substrate/client.go b/go/core/pkg/sandboxbackend/substrate/client.go index b7987c668b..bb23ebbf32 100644 --- a/go/core/pkg/sandboxbackend/substrate/client.go +++ b/go/core/pkg/sandboxbackend/substrate/client.go @@ -10,7 +10,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials" - "google.golang.org/grpc/credentials/insecure" ) // Client wraps ate-api Control gRPC. @@ -32,11 +31,8 @@ func Dial(ctx context.Context, cfg Config) (*Client, error) { dialCtx, cancel := context.WithTimeout(ctx, dialTimeout) defer cancel() - var opts []grpc.DialOption - if cfg.Insecure { - opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials())) - } else { - opts = append(opts, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{MinVersion: tls.VersionTLS12}))) + opts := []grpc.DialOption{ + grpc.WithTransportCredentials(credentials.NewTLS(ateAPITLSConfig(cfg.Insecure))), } conn, err := grpc.NewClient(cfg.AteAPIEndpoint, opts...) @@ -57,6 +53,15 @@ func Dial(ctx context.Context, cfg Config) (*Client, error) { }, nil } +func ateAPITLSConfig(insecure bool) *tls.Config { + tlsCfg := &tls.Config{MinVersion: tls.VersionTLS12} + if insecure { + // Kind/local ate-api uses pod-issued certs; skip verification (same as grpcurl -insecure). 
+ tlsCfg.InsecureSkipVerify = true + } + return tlsCfg +} + func waitConnReady(ctx context.Context, conn *grpc.ClientConn) error { for { switch s := conn.GetState(); s { diff --git a/go/core/pkg/sandboxbackend/substrate/client_test.go b/go/core/pkg/sandboxbackend/substrate/client_test.go index d1c1417db2..cc381cd350 100644 --- a/go/core/pkg/sandboxbackend/substrate/client_test.go +++ b/go/core/pkg/sandboxbackend/substrate/client_test.go @@ -2,30 +2,65 @@ package substrate import ( "context" + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "math/big" "net" "testing" "time" "github.com/stretchr/testify/require" "google.golang.org/grpc" + "google.golang.org/grpc/credentials" ) -func TestDial_tcpReachesReady(t *testing.T) { +func TestAteAPITLSConfig(t *testing.T) { + cfg := ateAPITLSConfig(false) + require.False(t, cfg.InsecureSkipVerify) + + cfg = ateAPITLSConfig(true) + require.True(t, cfg.InsecureSkipVerify) + require.Equal(t, uint16(tls.VersionTLS12), cfg.MinVersion) +} + +func TestDial_tlsSkipVerifyReachesReady(t *testing.T) { + cert := newTestTLSCert(t) + lis, err := net.Listen("tcp", "127.0.0.1:0") require.NoError(t, err) - srv := grpc.NewServer() + srv := grpc.NewServer(grpc.Creds(credentials.NewTLS(&tls.Config{ + Certificates: []tls.Certificate{cert}, + MinVersion: tls.VersionTLS12, + }))) go func() { _ = srv.Serve(lis) }() t.Cleanup(func() { srv.Stop() _ = lis.Close() }) - cfg := Config{ + c, err := Dial(context.Background(), Config{ AteAPIEndpoint: lis.Addr().String(), Insecure: true, DialTimeout: 2 * time.Second, - } - c, err := Dial(context.Background(), cfg) + }) require.NoError(t, err) require.NoError(t, c.Close()) } + +func newTestTLSCert(t *testing.T) tls.Certificate { + t.Helper() + key, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + template := x509.Certificate{ + SerialNumber: big.NewInt(1), + NotBefore: time.Now(), + NotAfter: time.Now().Add(time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature | 
x509.KeyUsageKeyEncipherment, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + der, err := x509.CreateCertificate(rand.Reader, &template, &template, &key.PublicKey, key) + require.NoError(t, err) + return tls.Certificate{Certificate: [][]byte{der}, PrivateKey: key} +} diff --git a/go/core/pkg/sandboxbackend/substrate/config.go b/go/core/pkg/sandboxbackend/substrate/config.go index 793a4c98d5..2a83fb1064 100644 --- a/go/core/pkg/sandboxbackend/substrate/config.go +++ b/go/core/pkg/sandboxbackend/substrate/config.go @@ -6,7 +6,8 @@ import "time" type Config struct { // AteAPIEndpoint is a gRPC target (e.g. dns:///api.ate-system.svc:443). AteAPIEndpoint string - Insecure bool + // Insecure skips TLS certificate verification (local Kind / private CA). ate-api still uses TLS on :443. + Insecure bool // DialTimeout bounds the initial dial. Zero defaults to 10s in Dial. DialTimeout time.Duration CallTimeout time.Duration diff --git a/go/core/pkg/sandboxbackend/substrate/pin_image_test.go b/go/core/pkg/sandboxbackend/substrate/pin_image_test.go new file mode 100644 index 0000000000..2580dc97f5 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/pin_image_test.go @@ -0,0 +1,27 @@ +package substrate + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPinImageRef(t *testing.T) { + t.Run("accepts digest pin", func(t *testing.T) { + ref := "ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:abc" + got, err := pinImageRef(ref) + require.NoError(t, err) + require.Equal(t, ref, got) + }) + + t.Run("rejects tag", func(t *testing.T) { + _, err := pinImageRef("ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4") + require.Error(t, err) + require.Contains(t, err.Error(), "must be pinned with a digest") + }) + + t.Run("rejects empty", func(t *testing.T) { + _, err := pinImageRef(" ") + require.Error(t, err) + }) +} diff --git a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go 
b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go index dcee1f5c14..233e91c0f4 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_actortemplate.go @@ -24,6 +24,10 @@ func (p *Provisioner) ensureActorTemplate(ctx context.Context, ah *v1alpha2.Agen if workloadImage == "" { workloadImage = openclaw.NemoclawSandboxBaseImage } + workloadImage, err := pinImageRef(workloadImage) + if err != nil { + return types.NamespacedName{}, err + } startupScript, containerEnv, err := p.buildOpenClawActorStartup(ctx, ah) if err != nil { return types.NamespacedName{}, fmt.Errorf("build openclaw actor startup: %w", err) diff --git a/go/core/pkg/sandboxbackend/substrate/provision_shared.go b/go/core/pkg/sandboxbackend/substrate/provision_shared.go index 2344b514d4..466f02b6d8 100644 --- a/go/core/pkg/sandboxbackend/substrate/provision_shared.go +++ b/go/core/pkg/sandboxbackend/substrate/provision_shared.go @@ -109,6 +109,21 @@ func actorTemplateName(ah *v1alpha2.AgentHarness) string { return truncateDNS1123(ah.Name) } +// pinImageRef ensures image refs satisfy Substrate ActorTemplate validation (must contain "@"). 
+func pinImageRef(image string) (string, error) { + image = strings.TrimSpace(image) + if image == "" { + return "", fmt.Errorf("image is required") + } + if strings.Contains(image, "@") { + return image, nil + } + return "", fmt.Errorf( + "image %q must be pinned with a digest (for example repo/name@sha256:...); tags are rejected because changing the image invalidates snapshots", + image, + ) +} + func truncateDNS1123(s string) string { s = strings.ToLower(strings.ReplaceAll(s, "_", "-")) if len(s) > 63 { diff --git a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml index 52f814c1aa..49d74db205 100644 --- a/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml +++ b/helm/kagent-crds/templates/kagent.dev_agentharnesses.yaml @@ -542,7 +542,7 @@ spec: gatewayPort: default: 80 description: GatewayPort is the port OpenClaw listens on inside - the actor (Substrate routes to :80 today). + the actor. Defaults to 80. format: int32 type: integer gatewayToken: @@ -719,8 +719,9 @@ spec: properties: endpoint: description: |- - Endpoint is the backend-specific address (gRPC target, SSH host:port, - ...) clients should use to reach the harness. + Endpoint is the backend-specific address clients should use to reach the harness. + OpenShell: gRPC gateway URL with sandbox id (gateway#sandbox). Substrate: kagent + gateway proxy path (/api/agentharnesses///gateway/). 
type: string type: object observedGeneration: From 600d717f3566a329dd23ab35b3a79a7205d2f23d Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Mon, 1 Jun 2026 15:32:20 -0700 Subject: [PATCH 28/32] update image and rbac for secrets Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 19 ++-- .../pkg/sandboxbackend/openclaw/constants.go | 4 +- helm/kagent/templates/_helpers.tpl | 11 +++ .../templates/substrate-ate-api-rbac.yaml | 39 ++++++++ .../tests/substrate-ate-api-rbac_test.yaml | 97 +++++++++++++++++++ helm/kagent/values.yaml | 4 + 6 files changed, 167 insertions(+), 7 deletions(-) create mode 100644 helm/kagent/templates/substrate-ate-api-rbac.yaml create mode 100644 helm/kagent/tests/substrate-ate-api-rbac_test.yaml diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index 6b234a39db..91c94b3dc8 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -16,10 +16,10 @@ cd substrate Build and push **ateom-gvisor** (required for the WorkerPool `ateomImage`): ```bash -# build the ateom-gvisor image from the substrate folder +# build the ateom-gvisor image from the substrate repo root export KO_DOCKER_REPO=localhost:5001 export KO_DEFAULTPLATFORMS=linux/$(go env GOARCH) -./hack/ko.sh build -B ./cmd/servers/ateom-gvisor +./hack/run-tool.sh ko build -B ./cmd/ateom-gvisor ``` ## 2. 
Load nemoclaw image @@ -42,7 +42,12 @@ Install kagent (Substrate must already be running in the cluster): ```bash export KIND_CLUSTER_NAME=kind -make helm-install KAGENT_HELM_EXTRA_ARGS="--set controller.substrate.enabled=true --set substrateWorkerPool.create=true --set substrateWorkerPool.ateomImage=localhost:5001/ateom-gvisor:latest" +make helm-install KAGENT_HELM_EXTRA_ARGS="\ + --set controller.substrate.enabled=true \ + --set controller.substrate.ateApiEndpoint=dns:///api.ate-system.svc:443 \ + --set controller.substrate.ateApiInsecure=true \ + --set substrateWorkerPool.create=true \ + --set substrateWorkerPool.ateomImage=localhost:5001/ateom-gvisor:latest" ``` The generated `ActorTemplate` uses `controller.substrate.pauseImage`, `controller.substrate.runscAMD64URL`, `controller.substrate.runscAMD64SHA256`, `controller.substrate.runscARM64URL`, and `controller.substrate.runscARM64SHA256` from the Helm values Override them with `--set` or a values file when you need to pin a different gVisor build. @@ -79,8 +84,8 @@ spec: # gatewayTokenSecretRef: # name: openclaw-gateway-token - # Optional: override the sandbox image used in the ActorTemplate. - # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + # Optional: override the sandbox image used in the ActorTemplate (must be digest-pinned). 
+ # workloadImage: ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4 ``` kagent creates an `ActorTemplate` that looks roughly like this: @@ -110,7 +115,7 @@ spec: location: gs://ate-snapshots/kagent/peterj-claw containers: - name: openclaw - image: ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 + image: ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4 ports: - containerPort: 80 command: @@ -131,6 +136,8 @@ The generated `command` contains a base64-encoded `openclaw.json`, so the live o When `modelConfigRef` or `spec.channels` are set, credentials are **not** copied into the ActorTemplate or `openclaw.json` as plaintext. kagent writes `valueFrom.secretKeyRef` (or inline `value` for harness inline tokens) on the ActorTemplate container env; Substrate `ate-api` resolves those refs at actor resume. In `openclaw.json`, kagent uses OpenClaw [env SecretRefs](https://docs.openclaw.ai/gateway/secrets) (`{source:"env",provider:"default",id:""}`) for `models.providers.*.apiKey`, `channels.telegram.accounts.*.botToken`, and `channels.slack.accounts.*.botToken` / `appToken`. Rotate a Secret and recreate the ActorTemplate golden snapshot when keys change. +With `controller.substrate.enabled=true`, the kagent Helm chart installs a namespace-scoped Role and RoleBinding so `ate-api-server` (in `ate-system` by default) can `get` Secrets and ConfigMaps referenced by generated ActorTemplates. Harnesses in other namespaces need that namespace listed in `rbac.namespaces` (or a matching RoleBinding applied manually). 
+ Port-forward the UI: ```bash diff --git a/go/core/pkg/sandboxbackend/openclaw/constants.go b/go/core/pkg/sandboxbackend/openclaw/constants.go index bf696bd59d..1f6f88b687 100644 --- a/go/core/pkg/sandboxbackend/openclaw/constants.go +++ b/go/core/pkg/sandboxbackend/openclaw/constants.go @@ -2,7 +2,9 @@ package openclaw const ( // NemoclawSandboxBaseImage is the default OpenShell VM image for OpenClaw/NemoClaw harnesses. - NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4" + // Substrate requires workload images to use @sha256:... refs (see pinImageRef). (OpenShell doesn't care) + // Tag: 2026.5.4 + NemoclawSandboxBaseImage = "ghcr.io/kagent-dev/nemoclaw/sandbox-base@sha256:d52bee415dc4c0dba7164f9eabe727574c056d4f211781f20af249707883a3b4" // openshellSecretProviderID is the secrets.providers key written into openclaw.json for OpenShell sandboxes. openshellSecretProviderID = "kagent" diff --git a/helm/kagent/templates/_helpers.tpl b/helm/kagent/templates/_helpers.tpl index 22c358bce3..761d225696 100644 --- a/helm/kagent/templates/_helpers.tpl +++ b/helm/kagent/templates/_helpers.tpl @@ -50,6 +50,17 @@ Allows overriding it for multi-namespace deployments in combined charts. {{- default .Release.Namespace .Values.namespaceOverride | trunc 63 | trimSuffix "-" -}} {{- end }} +{{/* +Namespaces where Substrate ate-api-server needs read access to Secrets and ConfigMaps +referenced by generated ActorTemplates (install namespace plus rbac.namespaces). +*/}} +{{- define "kagent.substrate.envSourceNamespaces" -}} +{{- $installNs := include "kagent.namespace" . -}} +{{- $extra := .Values.rbac.namespaces | default list -}} +{{- $all := append $extra $installNs | uniq | sortAlpha -}} +{{- join "," $all -}} +{{- end }} + {{/* Watch namespaces - transforms list of namespaces cached by the controller into comma-separated string. Precedence: controller.watchNamespaces (explicit override) > rbac.namespaces > empty (watch all). 
diff --git a/helm/kagent/templates/substrate-ate-api-rbac.yaml b/helm/kagent/templates/substrate-ate-api-rbac.yaml new file mode 100644 index 0000000000..ea7665c3b4 --- /dev/null +++ b/helm/kagent/templates/substrate-ate-api-rbac.yaml @@ -0,0 +1,39 @@ +{{- if .Values.controller.substrate.enabled }} +{{- $namespaces := splitList "," (include "kagent.substrate.envSourceNamespaces" .) }} +{{- $ateNs := .Values.controller.substrate.ateApiServer.namespace | default "ate-system" }} +{{- $ateSA := .Values.controller.substrate.ateApiServer.serviceAccount | default "ate-api-server" }} +{{- range $namespace := $namespaces }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "kagent.fullname" $ }}-ate-api-env-sources + namespace: {{ $namespace }} + labels: + {{- include "kagent.labels" $ | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - secrets + - configmaps + verbs: + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "kagent.fullname" $ }}-ate-api-env-sources + namespace: {{ $namespace }} + labels: + {{- include "kagent.labels" $ | nindent 4 }} +subjects: +- kind: ServiceAccount + name: {{ $ateSA | quote }} + namespace: {{ $ateNs | quote }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "kagent.fullname" $ }}-ate-api-env-sources +{{- end }} +{{- end }} diff --git a/helm/kagent/tests/substrate-ate-api-rbac_test.yaml b/helm/kagent/tests/substrate-ate-api-rbac_test.yaml new file mode 100644 index 0000000000..2bcee7d040 --- /dev/null +++ b/helm/kagent/tests/substrate-ate-api-rbac_test.yaml @@ -0,0 +1,97 @@ +suite: test substrate ate-api rbac +templates: + - substrate-ate-api-rbac.yaml +tests: + - it: should not render when substrate is disabled + set: + controller.substrate.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: should render Role and RoleBinding in the release namespace when substrate is enabled + set: + 
controller.substrate.enabled: true + asserts: + - hasDocuments: + count: 2 + - isKind: + of: Role + documentIndex: 0 + - isKind: + of: RoleBinding + documentIndex: 1 + - equal: + path: metadata.namespace + value: NAMESPACE + documentIndex: 0 + - equal: + path: metadata.name + value: RELEASE-NAME-ate-api-env-sources + documentIndex: 0 + - contains: + path: rules + content: + apiGroups: [""] + resources: ["secrets", "configmaps"] + verbs: ["get"] + documentIndex: 0 + - equal: + path: subjects[0].name + value: ate-api-server + documentIndex: 1 + - equal: + path: subjects[0].namespace + value: ate-system + documentIndex: 1 + + - it: should render RBAC in each rbac.namespaces entry plus the release namespace + set: + controller.substrate.enabled: true + rbac: + namespaces: + - team-a + - team-b + asserts: + - hasDocuments: + count: 6 + - equal: + path: metadata.namespace + value: NAMESPACE + documentIndex: 0 + - equal: + path: metadata.namespace + value: NAMESPACE + documentIndex: 1 + - equal: + path: metadata.namespace + value: team-a + documentIndex: 2 + - equal: + path: metadata.namespace + value: team-a + documentIndex: 3 + - equal: + path: metadata.namespace + value: team-b + documentIndex: 4 + - equal: + path: metadata.namespace + value: team-b + documentIndex: 5 + + - it: should allow overriding ate-api-server service account identity + set: + controller.substrate.enabled: true + controller.substrate.ateApiServer: + namespace: custom-ate + serviceAccount: custom-api + asserts: + - equal: + path: subjects[0].name + value: custom-api + documentIndex: 1 + - equal: + path: subjects[0].namespace + value: custom-ate + documentIndex: 1 diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index fab50dcf55..42e06e5b3a 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -237,6 +237,10 @@ controller: enabled: false ateApiEndpoint: "" ateApiInsecure: false + # Substrate ate-api-server identity for env source resolution 
(secretKeyRef/configMapKeyRef on ActorTemplates). + ateApiServer: + namespace: ate-system + serviceAccount: ate-api-server defaultWorkerPool: namespace: "" name: "" From 8c90bd45aeb60c72a397b41231e86d4bc1b27ebf Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 2 Jun 2026 08:50:12 -0700 Subject: [PATCH 29/32] fix the readme Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index 91c94b3dc8..cc14385ce0 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -2,7 +2,9 @@ ## 1. Install Substrate on your Kind cluster -Uses cluster `kind` (`KIND_CLUSTER_NAME=kind`; or set `KUBECONFIG` / context accordingly). +You can clone the kagent fork of substrate [here](https://github.com/kagent-dev/substrate). + +These instructions use a Kind cluster called `kind` (`KIND_CLUSTER_NAME=kind`). ```bash cd substrate @@ -22,18 +24,6 @@ export KO_DEFAULTPLATFORMS=linux/$(go env GOARCH) ./hack/run-tool.sh ko build -B ./cmd/ateom-gvisor ``` -## 2. Load nemoclaw image - -The image is a multi-arch manifest list. On Apple Silicon, `kind load docker-image` often fails with `content digest ... not found` because Docker only has the local arch locally while kind imports with `--all-platforms`. Use `docker save` + `ctr import` instead (match `--name` to your cluster, e.g. `agent` for context `kind-agent`): - -```bash -docker pull --platform linux/arm64 ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 -docker save ghcr.io/kagent-dev/nemoclaw/sandbox-base:2026.5.4 | \ - docker exec -i kind-control-plane ctr --namespace=k8s.io images import - -``` - -On amd64 hosts, use `--platform linux/amd64` in the pull step. - ## kagent AgentHarness with substrate runtime kagent generates a per-harness `ActorTemplate` and uses an existing `WorkerPool`. 
From 9a9c281ff35b3bb0c3c5b078afe51dcf726ba7d7 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 2 Jun 2026 10:54:45 -0700 Subject: [PATCH 30/32] go through the atenet router for the UI Signed-off-by: Peter Jausovec --- examples/substrate-openclaw/README.md | 2 + .../handlers/agentharness_gateway.go | 55 +++---------------- .../handlers/agentharness_gateway_test.go | 50 +++++------------ go/core/internal/httpserver/server.go | 2 +- go/core/pkg/app/app.go | 7 +-- .../pkg/sandboxbackend/substrate/gateway.go | 31 +++++++++++ .../sandboxbackend/substrate/gateway_test.go | 41 ++++++++++++++ .../pkg/sandboxbackend/substrate/openclaw.go | 4 +- .../templates/controller-deployment.yaml | 4 ++ helm/kagent/values.yaml | 2 + 10 files changed, 110 insertions(+), 88 deletions(-) create mode 100644 go/core/pkg/sandboxbackend/substrate/gateway.go create mode 100644 go/core/pkg/sandboxbackend/substrate/gateway_test.go diff --git a/examples/substrate-openclaw/README.md b/examples/substrate-openclaw/README.md index cc14385ce0..3ddfc5cd99 100644 --- a/examples/substrate-openclaw/README.md +++ b/examples/substrate-openclaw/README.md @@ -140,3 +140,5 @@ Navigate to the deployed agent harness. If the OpenClaw Control UI asks for a ga - Gateway token: `test-token` The gateway URL must include the trailing slash. The token is the value configured in `spec.substrate.gatewayToken`, or the Secret value referenced by `spec.substrate.gatewayTokenSecretRef`; enter it in the token/credentials field rather than relying on a `token` query parameter. + +kagent proxies UI traffic to the actor OpenClaw gateway through Substrate's **atenet-router** (Envoy) using the actor `Host` header (`<actor-id>.actors.resources.substrate.ate.dev`). The default router URL is `http://atenet-router.ate-system.svc:80`; override with `controller.substrate.atenetRouterURL` when needed. 
diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway.go b/go/core/internal/httpserver/handlers/agentharness_gateway.go index 5215fe82cb..2762605c37 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway.go @@ -3,7 +3,6 @@ package handlers import ( "context" "fmt" - "net" "net/http" "net/http/httputil" "net/url" @@ -26,15 +25,12 @@ const ( ) // AgentHarnessGatewayConfig configures Substrate harness HTTP/WebSocket proxy. -// Traffic is proxied directly to the actor ateom pod IP on port 80 (no atenet-router fallback). +// Traffic is proxied through atenet-router (Envoy) using actor Host-based routing. type AgentHarnessGatewayConfig struct { - AteAPIEndpoint string - AteAPIInsecure bool - DialTimeout time.Duration - CallTimeout time.Duration + AtenetRouterURL string } -// HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway (pod IP when available). +// HandleAgentHarnessGateway proxies browser traffic to the actor OpenClaw gateway via atenet-router. 
func (h *Handlers) HandleAgentHarnessGateway(w ErrorResponseWriter, r *http.Request) { log := ctrllog.FromContext(r.Context()).WithName("agentharness-gateway") if h.AgentHarnessGateway == nil { @@ -114,54 +110,21 @@ func (h *Handlers) resolveSubstrateGatewayTarget(ctx context.Context, ah *v1alph if cfg == nil { return nil, "", fmt.Errorf("substrate gateway is not configured") } - if cfg.AteAPIEndpoint == "" { - return nil, "", fmt.Errorf("substrate ate-api is not configured on the controller") - } - - ateClient, err := substrate.Dial(ctx, substrate.Config{ - AteAPIEndpoint: cfg.AteAPIEndpoint, - Insecure: cfg.AteAPIInsecure, - DialTimeout: cfg.DialTimeout, - CallTimeout: cfg.CallTimeout, - }) - if err != nil { - return nil, "", fmt.Errorf("dial ate-api: %w", err) - } - defer ateClient.Close() - actorID := ah.Status.BackendRef.ID - actor, err := ateClient.GetActor(ctx, actorID) - if err != nil { - return nil, "", fmt.Errorf("get substrate actor %q: %w", actorID, err) - } - podIP := strings.TrimSpace(actor.GetAteomPodIp()) - if podIP == "" { - return nil, "", fmt.Errorf("substrate actor %q has no pod IP (status %s; resume the actor and wait until running)", actorID, actor.GetStatus()) - } - target, host, err := substrateGatewayPodTarget(podIP) + actorID := strings.TrimSpace(ah.Status.BackendRef.ID) + target, host, err := substrate.GatewayRouterTarget(cfg.AtenetRouterURL, actorID) if err != nil { - return nil, "", fmt.Errorf("substrate actor %q pod IP %q: %w", actorID, podIP, err) + return nil, "", fmt.Errorf("substrate actor %q: %w", actorID, err) } ctrllog.FromContext(ctx).WithName("agentharness-gateway").Info( - "proxying via actor pod IP", + "proxying via atenet-router", "actor", actorID, - "podIP", host, + "router", target.String(), + "host", host, ) return target, host, nil } -func substrateGatewayPodTarget(podIP string) (*url.URL, string, error) { - ip := strings.TrimSpace(podIP) - if ip == "" || net.ParseIP(ip) == nil { - return nil, "", fmt.Errorf("invalid 
actor pod IP %q", podIP) - } - target, err := url.Parse("http://" + net.JoinHostPort(ip, "80")) - if err != nil { - return nil, "", fmt.Errorf("parse actor pod target: %w", err) - } - return target, ip, nil -} - func agentHarnessHarnessBase(namespace, name string) string { return "/api/agentharnesses/" + namespace + "/" + name } diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go index 5e9775a4fa..724130bcb8 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go @@ -8,33 +8,13 @@ import ( "net/url" "strings" "testing" -) -func TestSubstrateGatewayPodTarget(t *testing.T) { - t.Parallel() - target, host, err := substrateGatewayPodTarget("10.244.0.29") - if err != nil { - t.Fatal(err) - } - if host != "10.244.0.29" { - t.Fatalf("host = %q", host) - } - if target.Scheme != "http" || target.Host != "10.244.0.29:80" { - t.Fatalf("target = %s", target.String()) - } -} - -func TestSubstrateGatewayPodTargetRejectsInvalidIP(t *testing.T) { - t.Parallel() - _, _, err := substrateGatewayPodTarget("not-an-ip") - if err == nil { - t.Fatal("expected error for invalid pod IP") - } -} + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" +) -func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { +func TestGatewayProxyForwardsToAtenetRouterWithActorHost(t *testing.T) { t.Parallel() - const podIP = "10.244.0.29" + const actorHost = "ahr-kagent-my-claw.actors.resources.substrate.ate.dev" const token = "some-token" ns, name := "kagent", "my-claw" publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) @@ -55,7 +35,7 @@ func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { t.Fatal(err) } - proxy := newAgentHarnessGatewayProxy(target, podIP, token, publicPrefix, ns, name, testLog{t}) + proxy := newAgentHarnessGatewayProxy(target, actorHost, token, 
publicPrefix, ns, name, testLog{t}) req := httptest.NewRequest(http.MethodGet, publicPrefix, nil) rec := httptest.NewRecorder() proxy.ServeHTTP(rec, req) @@ -63,8 +43,8 @@ func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { if rec.Code != http.StatusOK { t.Fatalf("status = %d, body = %s", rec.Code, rec.Body.String()) } - if gotHost != podIP { - t.Fatalf("upstream Host = %q, want %q", gotHost, podIP) + if gotHost != actorHost { + t.Fatalf("upstream Host = %q, want %q", gotHost, actorHost) } if gotAuth != "Bearer "+token { t.Fatalf("Authorization = %q", gotAuth) @@ -81,17 +61,17 @@ func TestGatewayProxyForwardsToPodIPWithAuthHeaders(t *testing.T) { } } -func TestGatewayProxyRewriteTargetsPodIPOnWebSocketPath(t *testing.T) { +func TestGatewayProxyRewriteTargetsAtenetRouterHostOnWebSocketPath(t *testing.T) { t.Parallel() - const podIP = "10.244.0.29" + const actorHost = "ahr-kagent-my-claw.actors.resources.substrate.ate.dev" ns, name := "kagent", "my-claw" publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) - target, err := url.Parse("http://" + podIP + ":80") + target, err := substrate.GatewayRouterTarget(substrate.DefaultAtenetRouterURL, "ahr-kagent-my-claw") if err != nil { t.Fatal(err) } - proxy := newAgentHarnessGatewayProxy(target, podIP, "tok", publicPrefix, ns, name, testLog{t}) + proxy := newAgentHarnessGatewayProxy(target, actorHost, "tok", publicPrefix, ns, name, testLog{t}) req := httptest.NewRequest(http.MethodGet, strings.TrimSuffix(publicPrefix, "/"), nil) req.Header.Set("Connection", "Upgrade") req.Header.Set("Upgrade", "websocket") @@ -101,11 +81,11 @@ func TestGatewayProxyRewriteTargetsPodIPOnWebSocketPath(t *testing.T) { proxy.Rewrite(&httputil.ProxyRequest{In: req, Out: outReq}) - if outReq.Host != podIP { - t.Fatalf("Host = %q, want pod IP", outReq.Host) + if outReq.Host != actorHost { + t.Fatalf("Host = %q, want actor host", outReq.Host) } - if outReq.URL.Host != podIP+":80" { - t.Fatalf("URL.Host = %q", outReq.URL.Host) + if 
outReq.URL.Host != target.Host { + t.Fatalf("URL.Host = %q, want router %q", outReq.URL.Host, target.Host) } if outReq.URL.Path != publicPrefix { t.Fatalf("URL.Path = %q, want %q", outReq.URL.Path, publicPrefix) diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index 037393381f..c7da966cb9 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -315,7 +315,7 @@ func (s *HTTPServer) setupRoutes() { // OpenShell sandbox PTY (browser WebSocket → gateway CONNECT → SSH). Authenticated like other /api routes. s.router.HandleFunc(APIPathSandboxSSH, adaptHandler(s.handlers.HandleSandboxSSHWebSocket)).Methods(http.MethodGet) - // Substrate OpenClaw gateway proxy (HTTP + WebSocket) to the actor pod IP :80. + // Substrate OpenClaw gateway proxy (HTTP + WebSocket) via atenet-router. s.router.PathPrefix(APIPathAgentHarnessHarness).Handler( adaptHandler(s.handlers.HandleAgentHarnessGateway), ) diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index 5c683e0f42..a505e423de 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -155,6 +155,7 @@ type Config struct { } Substrate struct { AteAPIEndpoint string + AtenetRouterURL string Insecure bool DialTimeout time.Duration CallTimeout time.Duration @@ -226,6 +227,7 @@ func (cfg *Config) SetFlags(commandLine *flag.FlagSet) { commandLine.DurationVar(&cfg.Openshell.CallTimeout, "openshell-call-timeout", 30*time.Second, "Per-RPC timeout for OpenShell gateway calls.") commandLine.StringVar(&cfg.Substrate.AteAPIEndpoint, "substrate-ate-api-endpoint", "", "gRPC target for Agent Substrate ate-api (e.g. dns:///api.ate-system.svc:443). Enables substrate AgentHarness runtime when set.") + commandLine.StringVar(&cfg.Substrate.AtenetRouterURL, "substrate-atenet-router-url", "", "HTTP URL for Substrate atenet-router (Envoy). 
Defaults to http://atenet-router.ate-system.svc:80 when unset.") commandLine.BoolVar(&cfg.Substrate.Insecure, "substrate-ate-api-insecure", false, "Dial ate-api without TLS (local dev only).") commandLine.DurationVar(&cfg.Substrate.DialTimeout, "substrate-dial-timeout", 10*time.Second, "Timeout for the initial dial to ate-api.") commandLine.DurationVar(&cfg.Substrate.CallTimeout, "substrate-call-timeout", 30*time.Second, "Per-RPC timeout for ate-api calls.") @@ -740,10 +742,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne var agentHarnessGateway *handlers.AgentHarnessGatewayConfig if cfg.Substrate.AteAPIEndpoint != "" { agentHarnessGateway = &handlers.AgentHarnessGatewayConfig{ - AteAPIEndpoint: cfg.Substrate.AteAPIEndpoint, - AteAPIInsecure: cfg.Substrate.Insecure, - DialTimeout: cfg.Substrate.DialTimeout, - CallTimeout: cfg.Substrate.CallTimeout, + AtenetRouterURL: cfg.Substrate.AtenetRouterURL, } } diff --git a/go/core/pkg/sandboxbackend/substrate/gateway.go b/go/core/pkg/sandboxbackend/substrate/gateway.go new file mode 100644 index 0000000000..b1631d94e5 --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/gateway.go @@ -0,0 +1,31 @@ +package substrate + +import ( + "fmt" + "net/url" + "strings" +) + +// DefaultAtenetRouterURL is the in-cluster HTTP endpoint for Substrate's Envoy router. +const DefaultAtenetRouterURL = "http://atenet-router.ate-system.svc:80" + +// GatewayRouterTarget returns the atenet-router reverse-proxy URL and Host header for an actor. 
+func GatewayRouterTarget(routerURL, actorID string) (*url.URL, string, error) { + routerURL = strings.TrimSpace(routerURL) + if routerURL == "" { + routerURL = DefaultAtenetRouterURL + } + actorID = strings.TrimSpace(actorID) + if actorID == "" { + return nil, "", fmt.Errorf("actor id is required") + } + target, err := url.Parse(routerURL) + if err != nil { + return nil, "", fmt.Errorf("parse atenet-router URL %q: %w", routerURL, err) + } + if target.Scheme == "" { + return nil, "", fmt.Errorf("atenet-router URL %q must include a scheme (http or https)", routerURL) + } + host := ActorHost(actorID, "") + return target, host, nil +} diff --git a/go/core/pkg/sandboxbackend/substrate/gateway_test.go b/go/core/pkg/sandboxbackend/substrate/gateway_test.go new file mode 100644 index 0000000000..8d89cf7e5b --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/gateway_test.go @@ -0,0 +1,41 @@ +package substrate + +import ( + "testing" +) + +func TestGatewayRouterTarget(t *testing.T) { + t.Parallel() + target, host, err := GatewayRouterTarget("", "ahr-kagent-my-claw") + if err != nil { + t.Fatal(err) + } + if target.String() != DefaultAtenetRouterURL { + t.Fatalf("target = %s, want %s", target, DefaultAtenetRouterURL) + } + if host != "ahr-kagent-my-claw.actors.resources.substrate.ate.dev" { + t.Fatalf("host = %q", host) + } +} + +func TestGatewayRouterTargetCustomURL(t *testing.T) { + t.Parallel() + target, host, err := GatewayRouterTarget("http://atenet-router.custom.svc:8080", "actor-1") + if err != nil { + t.Fatal(err) + } + if target.Host != "atenet-router.custom.svc:8080" { + t.Fatalf("target host = %q", target.Host) + } + if host != "actor-1.actors.resources.substrate.ate.dev" { + t.Fatalf("host = %q", host) + } +} + +func TestGatewayRouterTargetRejectsEmptyActor(t *testing.T) { + t.Parallel() + _, _, err := GatewayRouterTarget("", "") + if err == nil { + t.Fatal("expected error for empty actor id") + } +} diff --git 
a/go/core/pkg/sandboxbackend/substrate/openclaw.go b/go/core/pkg/sandboxbackend/substrate/openclaw.go index 02e059bfd6..04a559c6a1 100644 --- a/go/core/pkg/sandboxbackend/substrate/openclaw.go +++ b/go/core/pkg/sandboxbackend/substrate/openclaw.go @@ -154,8 +154,8 @@ func substrateConnectionEndpoint(namespace, name string, actor *ateapipb.Actor) if actor == nil { return "kagent gateway: " + gw } - if podIP := strings.TrimSpace(actor.GetAteomPodIp()); podIP != "" { - return fmt.Sprintf("http://%s:80 (pod IP; UI via kagent %s)", podIP, gw) + if actorID := strings.TrimSpace(actor.GetActorId()); actorID != "" { + return fmt.Sprintf("atenet-router Host %s (UI via kagent %s)", ActorHost(actorID, ""), gw) } return fmt.Sprintf("kagent gateway: %s (actor status %s)", gw, actor.GetStatus()) } diff --git a/helm/kagent/templates/controller-deployment.yaml b/helm/kagent/templates/controller-deployment.yaml index 21d5d7b845..6ef3af61e0 100644 --- a/helm/kagent/templates/controller-deployment.yaml +++ b/helm/kagent/templates/controller-deployment.yaml @@ -90,6 +90,10 @@ spec: {{- if and .Values.controller.substrate .Values.controller.substrate.enabled }} - name: SUBSTRATE_ATE_API_ENDPOINT value: {{ .Values.controller.substrate.ateApiEndpoint | quote }} + {{- with .Values.controller.substrate.atenetRouterURL }} + - name: SUBSTRATE_ATENET_ROUTER_URL + value: {{ . | quote }} + {{- end }} {{- if .Values.controller.substrate.ateApiInsecure }} - name: SUBSTRATE_ATE_API_INSECURE value: "true" diff --git a/helm/kagent/values.yaml b/helm/kagent/values.yaml index 42e06e5b3a..e354d25310 100644 --- a/helm/kagent/values.yaml +++ b/helm/kagent/values.yaml @@ -236,6 +236,7 @@ controller: substrate: enabled: false ateApiEndpoint: "" + atenetRouterURL: "" ateApiInsecure: false # Substrate ate-api-server identity for env source resolution (secretKeyRef/configMapKeyRef on ActorTemplates). 
ateApiServer: @@ -252,6 +253,7 @@ controller: # Example when enabled: # enabled: true # ateApiEndpoint: "dns:///api.ate-system.svc:443" + # atenetRouterURL: "http://atenet-router.ate-system.svc:80" # defaultWorkerPool: # name: "kagent-default" From 9c93632920acfefa1e9e086c32b725d2ef22927b Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 2 Jun 2026 14:20:00 -0700 Subject: [PATCH 31/32] add /substrate page to the ui Signed-off-by: Peter Jausovec --- go/api/httpapi/substrate.go | 60 +++ .../handlers/agentharness_gateway_test.go | 7 +- .../internal/httpserver/handlers/handlers.go | 4 + .../internal/httpserver/handlers/substrate.go | 249 +++++++++++++ .../httpserver/handlers/substrate_test.go | 104 ++++++ go/core/internal/httpserver/server.go | 7 + go/core/pkg/app/app.go | 1 + go/core/pkg/sandboxbackend/substrate/list.go | 53 +++ .../templates/openclaw_startup.sh.tmpl | 7 +- ui/src/app/actions/substrate.ts | 22 ++ ui/src/app/substrate/SubstrateStatusPage.tsx | 72 ++++ ui/src/app/substrate/page.tsx | 14 + ui/src/components/Header.tsx | 14 +- .../substrate/SubstrateStatusView.tsx | 352 ++++++++++++++++++ ui/src/types/index.ts | 52 +++ 15 files changed, 1009 insertions(+), 9 deletions(-) create mode 100644 go/api/httpapi/substrate.go create mode 100644 go/core/internal/httpserver/handlers/substrate.go create mode 100644 go/core/internal/httpserver/handlers/substrate_test.go create mode 100644 go/core/pkg/sandboxbackend/substrate/list.go create mode 100644 ui/src/app/actions/substrate.ts create mode 100644 ui/src/app/substrate/SubstrateStatusPage.tsx create mode 100644 ui/src/app/substrate/page.tsx create mode 100644 ui/src/components/substrate/SubstrateStatusView.tsx diff --git a/go/api/httpapi/substrate.go b/go/api/httpapi/substrate.go new file mode 100644 index 0000000000..bbaf83f63c --- /dev/null +++ b/go/api/httpapi/substrate.go @@ -0,0 +1,60 @@ +package httpapi + +// SubstrateStatusResponse aggregates Agent Substrate control-plane and Kubernetes state. 
+type SubstrateStatusResponse struct { + // Enabled is true when the controller is configured with an ate-api endpoint. + Enabled bool `json:"enabled"` + // AteAPIError is set when ate-api list calls fail (actors/workers may be partial or empty). + AteAPIError string `json:"ateApiError,omitempty"` + + WorkerPools []SubstrateWorkerPoolEntry `json:"workerPools"` + ActorTemplates []SubstrateActorTemplateEntry `json:"actorTemplates"` + Actors []SubstrateActorEntry `json:"actors"` + Workers []SubstrateWorkerEntry `json:"workers"` +} + +// SubstrateWorkerPoolEntry is an ate.dev WorkerPool CR. +type SubstrateWorkerPoolEntry struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + Replicas int32 `json:"replicas"` + AteomImage string `json:"ateomImage"` +} + +// SubstrateActorTemplateEntry is an ate.dev ActorTemplate CR. +type SubstrateActorTemplateEntry struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + Phase string `json:"phase,omitempty"` + GoldenActorID string `json:"goldenActorId,omitempty"` + GoldenSnapshot string `json:"goldenSnapshot,omitempty"` + WorkerPoolRef string `json:"workerPoolRef,omitempty"` + HarnessName string `json:"harnessName,omitempty"` + ManagedByKagent bool `json:"managedByKagent"` +} + +// SubstrateActorEntry is runtime state from ate-api (redis). +type SubstrateActorEntry struct { + ActorID string `json:"actorId"` + Status string `json:"status"` + ActorTemplateNamespace string `json:"actorTemplateNamespace,omitempty"` + ActorTemplateName string `json:"actorTemplateName,omitempty"` + AteomPodNamespace string `json:"ateomPodNamespace,omitempty"` + AteomPodName string `json:"ateomPodName,omitempty"` + AteomPodIP string `json:"ateomPodIp,omitempty"` + LastSnapshot string `json:"lastSnapshot,omitempty"` + InProgressSnapshot string `json:"inProgressSnapshot,omitempty"` + Version int64 `json:"version,omitempty"` +} + +// SubstrateWorkerEntry is a worker assignment from ate-api (redis). 
+type SubstrateWorkerEntry struct { + WorkerNamespace string `json:"workerNamespace"` + WorkerPool string `json:"workerPool"` + WorkerPod string `json:"workerPod"` + ActorNamespace string `json:"actorNamespace,omitempty"` + ActorTemplate string `json:"actorTemplate,omitempty"` + ActorID string `json:"actorId,omitempty"` + IP string `json:"ip,omitempty"` + Version int64 `json:"version,omitempty"` +} diff --git a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go index 724130bcb8..943aff65ce 100644 --- a/go/core/internal/httpserver/handlers/agentharness_gateway_test.go +++ b/go/core/internal/httpserver/handlers/agentharness_gateway_test.go @@ -67,11 +67,14 @@ func TestGatewayProxyRewriteTargetsAtenetRouterHostOnWebSocketPath(t *testing.T) ns, name := "kagent", "my-claw" publicPrefix := agentHarnessGatewayPublicPrefix(ns, name) - target, err := substrate.GatewayRouterTarget(substrate.DefaultAtenetRouterURL, "ahr-kagent-my-claw") + target, host, err := substrate.GatewayRouterTarget(substrate.DefaultAtenetRouterURL, "ahr-kagent-my-claw") if err != nil { t.Fatal(err) } - proxy := newAgentHarnessGatewayProxy(target, actorHost, "tok", publicPrefix, ns, name, testLog{t}) + if host != actorHost { + t.Fatalf("host = %q, want %q", host, actorHost) + } + proxy := newAgentHarnessGatewayProxy(target, host, "tok", publicPrefix, ns, name, testLog{t}) req := httptest.NewRequest(http.MethodGet, strings.TrimSuffix(publicPrefix, "/"), nil) req.Header.Set("Connection", "Upgrade") req.Header.Set("Upgrade", "websocket") diff --git a/go/core/internal/httpserver/handlers/handlers.go b/go/core/internal/httpserver/handlers/handlers.go index 3d854bd134..a0fab3a3a7 100644 --- a/go/core/internal/httpserver/handlers/handlers.go +++ b/go/core/internal/httpserver/handlers/handlers.go @@ -8,6 +8,7 @@ import ( "github.com/kagent-dev/kagent/go/core/internal/controller/reconciler" 
"github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" ) // Handlers holds all the HTTP handler components @@ -32,6 +33,7 @@ type Handlers struct { Checkpoints *CheckpointsHandler CrewAI *CrewAIHandler CurrentUser *CurrentUserHandler + Substrate *SubstrateHandler } // Base holds common dependencies for all handlers @@ -56,6 +58,7 @@ func NewHandlers( rcnclr reconciler.KagentReconciler, sandboxBackend sandboxbackend.Backend, agentHarnessGateway *AgentHarnessGatewayConfig, + substrateAteClient *substrate.Client, ) *Handlers { base := &Base{ KubeClient: kubeClient, @@ -87,5 +90,6 @@ func NewHandlers( Checkpoints: NewCheckpointsHandler(base), CrewAI: NewCrewAIHandler(base), CurrentUser: NewCurrentUserHandler(), + Substrate: NewSubstrateHandler(base, substrateAteClient), } } diff --git a/go/core/internal/httpserver/handlers/substrate.go b/go/core/internal/httpserver/handlers/substrate.go new file mode 100644 index 0000000000..47c87d4351 --- /dev/null +++ b/go/core/internal/httpserver/handlers/substrate.go @@ -0,0 +1,249 @@ +package handlers + +import ( + "context" + "fmt" + "net/http" + "slices" + "strings" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/agent-substrate/substrate/proto/ateapipb" + api "github.com/kagent-dev/kagent/go/api/httpapi" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/errors" + "github.com/kagent-dev/kagent/go/core/pkg/auth" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" + utilvalidation "k8s.io/apimachinery/pkg/util/validation" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrllog "sigs.k8s.io/controller-runtime/pkg/log" +) + +// SubstrateHandler exposes Agent Substrate inventory for the UI. +type SubstrateHandler struct { + *Base + AteClient *substrate.Client +} + +// NewSubstrateHandler creates a SubstrateHandler. 
+func NewSubstrateHandler(base *Base, ateClient *substrate.Client) *SubstrateHandler { + return &SubstrateHandler{Base: base, AteClient: ateClient} +} + +// HandleGetSubstrateStatus handles GET /api/substrate/status?namespace=… +func (h *SubstrateHandler) HandleGetSubstrateStatus(w ErrorResponseWriter, r *http.Request) { + log := ctrllog.FromContext(r.Context()).WithName("substrate-handler").WithValues("operation", "status") + if err := Check(h.Authorizer, r, auth.Resource{Type: "Agent"}); err != nil { + w.RespondWithError(err) + return + } + + namespace := strings.TrimSpace(r.URL.Query().Get("namespace")) + if namespace != "" { + if errs := utilvalidation.IsDNS1123Label(namespace); len(errs) > 0 { + w.RespondWithError(errors.NewBadRequestError( + fmt.Sprintf("invalid namespace %q: %s", namespace, strings.Join(errs, ", ")), + nil, + )) + return + } + } + + namespaces, err := h.substrateNamespaces(namespace) + if err != nil { + w.RespondWithError(err) + return + } + + resp := api.SubstrateStatusResponse{ + Enabled: h.AteClient != nil, + WorkerPools: []api.SubstrateWorkerPoolEntry{}, + ActorTemplates: []api.SubstrateActorTemplateEntry{}, + Actors: []api.SubstrateActorEntry{}, + Workers: []api.SubstrateWorkerEntry{}, + } + + for _, ns := range namespaces { + wpEntries, tmplEntries, err := h.listSubstrateCRs(r.Context(), ns) + if err != nil { + log.Error(err, "list substrate CRs", "namespace", ns) + w.RespondWithError(errors.NewInternalServerError("Failed to list substrate resources from Kubernetes", err)) + return + } + resp.WorkerPools = append(resp.WorkerPools, wpEntries...) + resp.ActorTemplates = append(resp.ActorTemplates, tmplEntries...) 
+ } + + if h.AteClient != nil { + actors, workers, ateErr := h.listAteAPIState(r.Context(), namespaces) + resp.Actors = actors + resp.Workers = workers + if ateErr != nil { + resp.AteAPIError = ateErr.Error() + log.Error(ateErr, "list ate-api state") + } + } + + slices.SortStableFunc(resp.WorkerPools, compareWorkerPool) + slices.SortStableFunc(resp.ActorTemplates, compareActorTemplate) + slices.SortStableFunc(resp.Actors, compareActor) + slices.SortStableFunc(resp.Workers, compareWorker) + + data := api.NewResponse(resp, "Successfully listed substrate status", false) + RespondWithJSON(w, http.StatusOK, data) +} + +func (h *SubstrateHandler) substrateNamespaces(requested string) ([]string, error) { + if requested != "" { + return []string{requested}, nil + } + if len(h.WatchedNamespaces) > 0 { + return slices.Clone(h.WatchedNamespaces), nil + } + return []string{""}, nil +} + +func (h *SubstrateHandler) listSubstrateCRs(ctx context.Context, namespace string) ([]api.SubstrateWorkerPoolEntry, []api.SubstrateActorTemplateEntry, error) { + var listOpts []client.ListOption + if namespace != "" { + listOpts = append(listOpts, client.InNamespace(namespace)) + } + + wpList := &atev1alpha1.WorkerPoolList{} + if err := h.KubeClient.List(ctx, wpList, listOpts...); err != nil { + return nil, nil, err + } + tmplList := &atev1alpha1.ActorTemplateList{} + if err := h.KubeClient.List(ctx, tmplList, listOpts...); err != nil { + return nil, nil, err + } + + workerPools := make([]api.SubstrateWorkerPoolEntry, 0, len(wpList.Items)) + for i := range wpList.Items { + wp := &wpList.Items[i] + workerPools = append(workerPools, api.SubstrateWorkerPoolEntry{ + Namespace: wp.Namespace, + Name: wp.Name, + Replicas: wp.Spec.Replicas, + AteomImage: wp.Spec.AteomImage, + }) + } + + templates := make([]api.SubstrateActorTemplateEntry, 0, len(tmplList.Items)) + for i := range tmplList.Items { + tmpl := &tmplList.Items[i] + entry := api.SubstrateActorTemplateEntry{ + Namespace: tmpl.Namespace, + 
Name: tmpl.Name, + Phase: string(tmpl.Status.Phase), + GoldenActorID: tmpl.Status.GoldenActorID, + GoldenSnapshot: tmpl.Status.GoldenSnapshot, + ManagedByKagent: tmpl.Labels["app.kubernetes.io/managed-by"] == "kagent", + } + if harness := strings.TrimSpace(tmpl.Labels[substrate.HarnessLabelKey]); harness != "" { + entry.HarnessName = harness + } + if ref := tmpl.Spec.WorkerPoolRef; ref.Name != "" { + wpNS := ref.Namespace + if wpNS == "" { + wpNS = tmpl.Namespace + } + entry.WorkerPoolRef = wpNS + "/" + ref.Name + } + templates = append(templates, entry) + } + + return workerPools, templates, nil +} + +func (h *SubstrateHandler) listAteAPIState(ctx context.Context, namespaces []string) ([]api.SubstrateActorEntry, []api.SubstrateWorkerEntry, error) { + allowAll := len(namespaces) == 1 && namespaces[0] == "" + allowed := make(map[string]struct{}, len(namespaces)) + for _, ns := range namespaces { + if ns != "" { + allowed[ns] = struct{}{} + } + } + + actorPB, err := h.AteClient.ListActors(ctx) + if err != nil { + return nil, nil, err + } + workerPB, err := h.AteClient.ListWorkers(ctx) + if err != nil { + return nil, nil, err + } + + actors := make([]api.SubstrateActorEntry, 0, len(actorPB)) + for _, a := range actorPB { + if a == nil { + continue + } + ns := strings.TrimSpace(a.GetActorTemplateNamespace()) + if !allowAll && ns != "" { + if _, ok := allowed[ns]; !ok { + continue + } + } + actors = append(actors, actorEntryFromPB(a)) + } + + workers := make([]api.SubstrateWorkerEntry, 0, len(workerPB)) + for _, w := range workerPB { + if w == nil { + continue + } + ns := strings.TrimSpace(w.GetWorkerNamespace()) + if !allowAll && ns != "" { + if _, ok := allowed[ns]; !ok { + continue + } + } + workers = append(workers, workerEntryFromPB(w)) + } + + return actors, workers, nil +} + +func actorEntryFromPB(a *ateapipb.Actor) api.SubstrateActorEntry { + return api.SubstrateActorEntry{ + ActorID: a.GetActorId(), + Status: substrate.ActorStatusLabel(a.GetStatus()), + 
ActorTemplateNamespace: a.GetActorTemplateNamespace(), + ActorTemplateName: a.GetActorTemplateName(), + AteomPodNamespace: a.GetAteomPodNamespace(), + AteomPodName: a.GetAteomPodName(), + AteomPodIP: a.GetAteomPodIp(), + LastSnapshot: a.GetLastSnapshot(), + InProgressSnapshot: a.GetInProgressSnapshot(), + Version: a.GetVersion(), + } +} + +func workerEntryFromPB(w *ateapipb.Worker) api.SubstrateWorkerEntry { + return api.SubstrateWorkerEntry{ + WorkerNamespace: w.GetWorkerNamespace(), + WorkerPool: w.GetWorkerPool(), + WorkerPod: w.GetWorkerPod(), + ActorNamespace: w.GetActorNamespace(), + ActorTemplate: w.GetActorTemplate(), + ActorID: w.GetActorId(), + IP: w.GetIp(), + Version: w.GetVersion(), + } +} + +func compareWorkerPool(a, b api.SubstrateWorkerPoolEntry) int { + return strings.Compare(a.Namespace+"/"+a.Name, b.Namespace+"/"+b.Name) +} + +func compareActorTemplate(a, b api.SubstrateActorTemplateEntry) int { + return strings.Compare(a.Namespace+"/"+a.Name, b.Namespace+"/"+b.Name) +} + +func compareActor(a, b api.SubstrateActorEntry) int { + return strings.Compare(a.ActorID, b.ActorID) +} + +func compareWorker(a, b api.SubstrateWorkerEntry) int { + return strings.Compare(a.WorkerNamespace+"/"+a.WorkerPool+"/"+a.WorkerPod, b.WorkerNamespace+"/"+b.WorkerPool+"/"+b.WorkerPod) +} diff --git a/go/core/internal/httpserver/handlers/substrate_test.go b/go/core/internal/httpserver/handlers/substrate_test.go new file mode 100644 index 0000000000..6f34f3fe39 --- /dev/null +++ b/go/core/internal/httpserver/handlers/substrate_test.go @@ -0,0 +1,104 @@ +package handlers_test + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/api/v1alpha1" + "github.com/agent-substrate/substrate/proto/ateapipb" + api "github.com/kagent-dev/kagent/go/api/httpapi" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/auth" + "github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers" + 
"github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +type stubAteControl struct { + ateapipb.ControlClient + actors []*ateapipb.Actor + workers []*ateapipb.Worker +} + +func (s *stubAteControl) ListActors(context.Context, *ateapipb.ListActorsRequest, ...grpc.CallOption) (*ateapipb.ListActorsResponse, error) { + return &ateapipb.ListActorsResponse{Actors: s.actors}, nil +} + +func (s *stubAteControl) ListWorkers(context.Context, *ateapipb.ListWorkersRequest, ...grpc.CallOption) (*ateapipb.ListWorkersResponse, error) { + return &ateapipb.ListWorkersResponse{Workers: s.workers}, nil +} + +func TestHandleGetSubstrateStatus(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(atev1alpha1.AddToScheme(scheme)) + + kube := fake.NewClientBuilder().WithScheme(scheme).WithObjects( + &atev1alpha1.WorkerPool{ + ObjectMeta: metav1.ObjectMeta{Name: "default-wp", Namespace: "kagent"}, + Spec: atev1alpha1.WorkerPoolSpec{Replicas: 2, AteomImage: "localhost:5001/ateom:latest"}, + }, + &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-claw", + Namespace: "kagent", + Labels: map[string]string{ + "app.kubernetes.io/managed-by": "kagent", + substrate.HarnessLabelKey: "my-claw", + }, + }, + Spec: atev1alpha1.ActorTemplateSpec{ + WorkerPoolRef: corev1.ObjectReference{Name: "default-wp", Namespace: "kagent"}, + }, + Status: atev1alpha1.ActorTemplateStatus{Phase: atev1alpha1.PhaseReady, GoldenActorID: "golden-1"}, + }, + ).Build() + + ate := &substrate.Client{ControlClient: &stubAteControl{ + actors: []*ateapipb.Actor{{ + ActorId: 
"ahr-kagent-my-claw", + Status: ateapipb.Actor_STATUS_RUNNING, + ActorTemplateNamespace: "kagent", + ActorTemplateName: "my-claw", + }}, + workers: []*ateapipb.Worker{{ + WorkerNamespace: "kagent", + WorkerPool: "default-wp", + WorkerPod: "ateom-0", + ActorId: "ahr-kagent-my-claw", + }}, + }} + + base := &handlers.Base{KubeClient: kube, Authorizer: &auth.NoopAuthorizer{}} + h := handlers.NewSubstrateHandler(base, ate) + + req := httptest.NewRequest(http.MethodGet, "/api/substrate/status?namespace=kagent", nil) + req = setUser(req, "test-user") + rec := httptest.NewRecorder() + h.HandleGetSubstrateStatus(&testErrorResponseWriter{ResponseWriter: rec}, req) + require.Equal(t, http.StatusOK, rec.Code) + + var wrapped api.StandardResponse[api.SubstrateStatusResponse] + require.NoError(t, json.Unmarshal(rec.Body.Bytes(), &wrapped)) + require.True(t, wrapped.Data.Enabled) + require.Len(t, wrapped.Data.WorkerPools, 1) + require.Equal(t, "default-wp", wrapped.Data.WorkerPools[0].Name) + require.Len(t, wrapped.Data.ActorTemplates, 1) + require.Equal(t, "Ready", wrapped.Data.ActorTemplates[0].Phase) + require.True(t, wrapped.Data.ActorTemplates[0].ManagedByKagent) + require.Equal(t, "my-claw", wrapped.Data.ActorTemplates[0].HarnessName) + require.Len(t, wrapped.Data.Actors, 1) + require.Equal(t, "Running", wrapped.Data.Actors[0].Status) + require.Len(t, wrapped.Data.Workers, 1) +} diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index c7da966cb9..eb88dfe0a5 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -17,6 +17,7 @@ import ( "github.com/kagent-dev/kagent/go/core/internal/version" "github.com/kagent-dev/kagent/go/core/pkg/auth" "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend" + "github.com/kagent-dev/kagent/go/core/pkg/sandboxbackend/substrate" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "k8s.io/apimachinery/pkg/types" ctrl_client 
"sigs.k8s.io/controller-runtime/pkg/client" @@ -51,6 +52,7 @@ const ( APIPathCrewAI = "/api/crewai" APIPathSandboxSSH = "/api/sandbox/ssh" APIPathAgentHarnessHarness = "/api/agentharnesses/{namespace}/{name}/" + APIPathSubstrateStatus = "/api/substrate/status" ) var defaultModelConfig = types.NamespacedName{ @@ -73,6 +75,7 @@ type ServerConfig struct { Reconciler reconciler.KagentReconciler SandboxBackend sandboxbackend.Backend AgentHarnessGateway *handlers.AgentHarnessGatewayConfig + SubstrateAteClient *substrate.Client } // HTTPServer is the structure that manages the HTTP server @@ -101,6 +104,7 @@ func NewHTTPServer(config ServerConfig) (*HTTPServer, error) { config.Reconciler, config.SandboxBackend, config.AgentHarnessGateway, + config.SubstrateAteClient, ), authenticator: config.Authenticator, }, nil @@ -288,6 +292,9 @@ func (s *HTTPServer) setupRoutes() { // Namespaces s.router.HandleFunc(APIPathNamespaces, adaptHandler(s.handlers.Namespaces.HandleListNamespaces)).Methods(http.MethodGet) + // Agent Substrate inventory (WorkerPools, ActorTemplates, ate-api actors/workers) + s.router.HandleFunc(APIPathSubstrateStatus, adaptHandler(s.handlers.Substrate.HandleGetSubstrateStatus)).Methods(http.MethodGet) + // Prompt template libraries (ConfigMaps) s.router.HandleFunc(APIPathPromptTemplates, adaptHandler(s.handlers.PromptTemplates.HandleListPromptTemplates)).Methods(http.MethodGet) s.router.HandleFunc(APIPathPromptTemplates, adaptHandler(s.handlers.PromptTemplates.HandleCreatePromptTemplate)).Methods(http.MethodPost) diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go index a505e423de..d9adf663d1 100644 --- a/go/core/pkg/app/app.go +++ b/go/core/pkg/app/app.go @@ -760,6 +760,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne Reconciler: rcnclr, SandboxBackend: extensionCfg.SandboxBackend, AgentHarnessGateway: agentHarnessGateway, + SubstrateAteClient: substrateAteClient, }) if err != nil { setupLog.Error(err, "unable to 
create HTTP server") diff --git a/go/core/pkg/sandboxbackend/substrate/list.go b/go/core/pkg/sandboxbackend/substrate/list.go new file mode 100644 index 0000000000..976da7b27f --- /dev/null +++ b/go/core/pkg/sandboxbackend/substrate/list.go @@ -0,0 +1,53 @@ +package substrate + +import ( + "context" + + "github.com/agent-substrate/substrate/proto/ateapipb" +) + +// ListActors returns all actors reflected in ate-api. +func (c *Client) ListActors(ctx context.Context) ([]*ateapipb.Actor, error) { + if c == nil { + return nil, nil + } + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.ListActors(ctx, &ateapipb.ListActorsRequest{}) + if err != nil { + return nil, err + } + return resp.GetActors(), nil +} + +// ListWorkers returns all workers reflected in ate-api. +func (c *Client) ListWorkers(ctx context.Context) ([]*ateapipb.Worker, error) { + if c == nil { + return nil, nil + } + ctx, cancel := c.callCtx(ctx) + defer cancel() + resp, err := c.ControlClient.ListWorkers(ctx, &ateapipb.ListWorkersRequest{}) + if err != nil { + return nil, err + } + return resp.GetWorkers(), nil +} + +// ActorStatusLabel returns a stable human-readable actor status. 
+func ActorStatusLabel(status ateapipb.Actor_Status) string { + switch status { + case ateapipb.Actor_STATUS_RESUMING: + return "Resuming" + case ateapipb.Actor_STATUS_RUNNING: + return "Running" + case ateapipb.Actor_STATUS_SUSPENDING: + return "Suspending" + case ateapipb.Actor_STATUS_SUSPENDED: + return "Suspended" + case ateapipb.Actor_STATUS_UNSPECIFIED: + return "Unknown" + default: + return status.String() + } +} diff --git a/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl index 184ad91c74..a082584ddf 100644 --- a/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl +++ b/go/core/pkg/sandboxbackend/substrate/templates/openclaw_startup.sh.tmpl @@ -1,9 +1,4 @@ set -e mkdir -p "${HOME}/.openclaw" echo '{{.OpenClawJSONBase64}}' | base64 -d > "${HOME}/.openclaw/openclaw.json" -openclaw gateway run --port {{.GatewayPort}} --allow-unconfigured >>/tmp/openclaw-gateway.log 2>&1 & -for i in $(seq 1 60); do - curl -sf http://127.0.0.1:{{.GatewayPort}}/ >/dev/null 2>&1 && echo "gateway up" && break - sleep 1 -done -tail -f /tmp/openclaw-gateway.log /dev/null +openclaw gateway run --port {{.GatewayPort}} --allow-unconfigured diff --git a/ui/src/app/actions/substrate.ts b/ui/src/app/actions/substrate.ts new file mode 100644 index 0000000000..e638fdc961 --- /dev/null +++ b/ui/src/app/actions/substrate.ts @@ -0,0 +1,22 @@ +"use server"; + +import { fetchApi, createErrorResponse } from "./utils"; +import type { BaseResponse, SubstrateStatusResponse } from "@/types"; + +export async function getSubstrateStatus( + namespace?: string, +): Promise> { + try { + const qs = namespace?.trim() ? `?namespace=${encodeURIComponent(namespace.trim())}` : ""; + const response = await fetchApi>(`/substrate/status${qs}`); + if (!response?.data) { + throw new Error("Failed to load substrate status"); + } + return { + message: response.message ?? 
"Substrate status fetched", + data: response.data, + }; + } catch (error) { + return createErrorResponse(error, "Error loading substrate status"); + } +} diff --git a/ui/src/app/substrate/SubstrateStatusPage.tsx b/ui/src/app/substrate/SubstrateStatusPage.tsx new file mode 100644 index 0000000000..2bfd081dcf --- /dev/null +++ b/ui/src/app/substrate/SubstrateStatusPage.tsx @@ -0,0 +1,72 @@ +"use client"; + +import { useCallback, useEffect, useState } from "react"; +import { useSearchParams, useRouter } from "next/navigation"; +import { AppPageFrame } from "@/components/layout/AppPageFrame"; +import { PageHeader } from "@/components/layout/PageHeader"; +import { SubstrateStatusView } from "@/components/substrate/SubstrateStatusView"; +import { getSubstrateStatus } from "@/app/actions/substrate"; +import type { SubstrateStatusResponse } from "@/types"; + +export function SubstrateStatusPage() { + const router = useRouter(); + const searchParams = useSearchParams(); + const namespace = searchParams.get("namespace") ?? ""; + + const [status, setStatus] = useState(null); + const [loading, setLoading] = useState(true); + const [loadError, setLoadError] = useState(null); + + const load = useCallback(async () => { + setLoading(true); + setLoadError(null); + const result = await getSubstrateStatus(namespace || undefined); + if (result.error || !result.data) { + setLoadError(result.error || "Failed to load substrate status"); + setStatus(null); + } else { + setStatus(result.data); + } + setLoading(false); + }, [namespace]); + + useEffect(() => { + const raf = requestAnimationFrame(() => { + void load(); + }); + return () => cancelAnimationFrame(raf); + }, [load]); + + const handleNamespaceChange = useCallback( + (ns: string) => { + const params = new URLSearchParams(searchParams.toString()); + if (ns) { + params.set("namespace", ns); + } else { + params.delete("namespace"); + } + const q = params.toString(); + router.replace(q ? 
`/substrate?${q}` : "/substrate"); + }, + [router, searchParams], + ); + + return ( + + + + + ); +} diff --git a/ui/src/app/substrate/page.tsx b/ui/src/app/substrate/page.tsx new file mode 100644 index 0000000000..6c966f85af --- /dev/null +++ b/ui/src/app/substrate/page.tsx @@ -0,0 +1,14 @@ +import { Suspense } from "react"; +import { SubstrateStatusPage } from "./SubstrateStatusPage"; + +export default function SubstratePage() { + return ( + Loading substrate status…
+ } + > + + + ); +} diff --git a/ui/src/components/Header.tsx b/ui/src/components/Header.tsx index 4ced9791e7..358dd4ce26 100644 --- a/ui/src/components/Header.tsx +++ b/ui/src/components/Header.tsx @@ -4,7 +4,7 @@ import Link from "next/link"; import { Button } from "./ui/button"; import KAgentLogoWithText from "./kagent-logo-text"; import KagentLogo from "./kagent-logo"; -import { Plus, Menu, X, ChevronDown, Brain, Server, Eye, Hammer, HomeIcon, ScrollText, Cable } from "lucide-react"; +import { Plus, Menu, X, ChevronDown, Brain, Server, Eye, Hammer, HomeIcon, ScrollText, Cable, Layers } from "lucide-react"; import { ThemeToggle } from "./ThemeToggle"; import { UserMenu } from "./UserMenu"; import { @@ -131,6 +131,12 @@ export function Header() { Prompt Library + + + + Substrate + + @@ -194,6 +200,12 @@ export function Header() { Prompt Library + + + + Substrate + + diff --git a/ui/src/components/substrate/SubstrateStatusView.tsx b/ui/src/components/substrate/SubstrateStatusView.tsx new file mode 100644 index 0000000000..1720e80867 --- /dev/null +++ b/ui/src/components/substrate/SubstrateStatusView.tsx @@ -0,0 +1,352 @@ +"use client"; + +import { useCallback, useMemo, type ComponentType } from "react"; +import Link from "next/link"; +import { RefreshCw, AlertCircle, Cpu, FileStack, Users, Boxes } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; +import { NamespaceCombobox } from "@/components/NamespaceCombobox"; +import type { + SubstrateActorEntry, + SubstrateActorTemplateEntry, + SubstrateStatusResponse, + SubstrateWorkerEntry, + SubstrateWorkerPoolEntry, +} from "@/types"; +import { cn } from "@/lib/utils"; + +type SubstrateStatusViewProps = { + status: SubstrateStatusResponse | null; + namespace: string; + onNamespaceChange: (ns: string) => void; + isLoading: boolean; + loadError: string | null; + onRefresh: () => Promise; +}; + +function statusTone(label: 
string): "ok" | "warn" | "idle" | "busy" | "neutral" { + const s = label.toLowerCase(); + if (s === "ready" || s === "running") return "ok"; + if (s === "failed" || s === "suspending") return "warn"; + if (s === "suspended" || s === "unknown" || s === "") return "idle"; + if (s.includes("resum") || s.includes("wait") || s.includes("golden")) return "busy"; // stem "resum" matches the "Resuming" actor status as well as "resume" + return "neutral"; +} + +function StatusChip({ label }: { label: string }) { + const tone = statusTone(label); + return ( + + {label || "—"} + + ); +} + +function SectionHeader({ + icon: Icon, + title, + count, + hint, +}: { + icon: ComponentType<{ className?: string }>; + title: string; + count: number; + hint?: string; +}) { + return ( +
+
+ +

{title}

+ {count} +
+ {hint ?

{hint}

: null} +
+ ); +} + +function EmptyRow({ message }: { message: string }) { + return ( +

{message}

+ ); +} + +function WorkerPoolsTable({ rows }: { rows: SubstrateWorkerPoolEntry[] }) { + if (rows.length === 0) { + return ; + } + return ( +
+ + + + + + + + + + {rows.map((wp) => ( + + + + + + ))} + +
PoolReplicasAteom image
+ {wp.namespace}/ + {wp.name} + {wp.replicas}{wp.ateomImage}
+
+ ); +} + +function ActorTemplatesTable({ rows }: { rows: SubstrateActorTemplateEntry[] }) { + if (rows.length === 0) { + return ; + } + return ( +
+ + + + + + + + + + + {rows.map((t) => ( + + + + + + + ))} + +
TemplatePhaseWorker poolHarness
+
+ {t.namespace}/ + {t.name} +
+ {t.goldenActorId ? ( +
golden: {t.goldenActorId}
+ ) : null} +
+ + {t.workerPoolRef ?? "—"} + {t.harnessName ? ( + + {t.harnessName} + + ) : ( + + )} +
+
+ ); +} + +function ActorsTable({ rows, enabled }: { rows: SubstrateActorEntry[]; enabled: boolean }) { + if (!enabled) { + return ( + + ); + } + if (rows.length === 0) { + return ; + } + return ( +
+ + + + + + + + + + + {rows.map((a) => ( + + + + + + + ))} + +
ActorStatusTemplateWorker pod
{a.actorId} + + + {a.actorTemplateNamespace && a.actorTemplateName + ? `${a.actorTemplateNamespace}/${a.actorTemplateName}` + : "—"} + + {a.ateomPodName ? `${a.ateomPodNamespace ?? ""}/${a.ateomPodName}` : "—"} + {a.ateomPodIp ? ` · ${a.ateomPodIp}` : ""} +
+
+ ); +} + +function WorkersTable({ rows, enabled }: { rows: SubstrateWorkerEntry[]; enabled: boolean }) { + if (!enabled) { + return ; + } + if (rows.length === 0) { + return ; + } + return ( +
+ + + + + + + + + + {rows.map((w) => ( + + + + + + ))} + +
PodPoolActor
+ {w.workerNamespace}/{w.workerPod} + {w.workerPool}{w.actorId || "idle"}
+
+ ); +} + +export function SubstrateStatusView({ + status, + namespace, + onNamespaceChange, + isLoading, + loadError, + onRefresh, +}: SubstrateStatusViewProps) { + const summary = useMemo(() => { + if (!status) return null; + const running = status.actors.filter((a) => a.status.toLowerCase() === "running").length; + const readyTemplates = status.actorTemplates.filter((t) => t.phase?.toLowerCase() === "ready").length; + return { + pools: status.workerPools.length, + templates: status.actorTemplates.length, + readyTemplates, + actors: status.actors.length, + running, + workers: status.workers.length, + busyWorkers: status.workers.filter((w) => w.actorId).length, + }; + }, [status]); + + const handleRefresh = useCallback(() => { + void onRefresh(); + }, [onRefresh]); + + return ( +
+
+
+ + +
+ +
+ + {loadError ? ( + + + Could not load substrate status + {loadError} + + ) : null} + + {status?.ateApiError ? ( + + + ate-api partial data + {status.ateApiError} + + ) : null} + + {summary ? ( +
+ {[ + { label: "Worker pools", value: summary.pools }, + { label: "Templates ready", value: `${summary.readyTemplates}/${summary.templates}` }, + { label: "Actors running", value: `${summary.running}/${summary.actors}` }, + { label: "Workers busy", value: `${summary.busyWorkers}/${summary.workers}` }, + { label: "ate-api", value: status?.enabled ? "connected" : "off" }, + { label: "Scope", value: namespace || "all" }, + ].map((item) => ( +
+
{item.label}
+
{item.value}
+
+ ))} +
+ ) : null} + +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+
+ ); +} diff --git a/ui/src/types/index.ts b/ui/src/types/index.ts index 7f50f04e5b..08de6dd75b 100644 --- a/ui/src/types/index.ts +++ b/ui/src/types/index.ts @@ -439,6 +439,58 @@ export interface SubstrateAgentHarnessListEntry { endpoint?: string; } +/** GET /api/substrate/status — WorkerPools, ActorTemplates, and ate-api actors/workers. */ +export interface SubstrateStatusResponse { + enabled: boolean; + ateApiError?: string; + workerPools: SubstrateWorkerPoolEntry[]; + actorTemplates: SubstrateActorTemplateEntry[]; + actors: SubstrateActorEntry[]; + workers: SubstrateWorkerEntry[]; +} + +export interface SubstrateWorkerPoolEntry { + namespace: string; + name: string; + replicas: number; + ateomImage: string; +} + +export interface SubstrateActorTemplateEntry { + namespace: string; + name: string; + phase?: string; + goldenActorId?: string; + goldenSnapshot?: string; + workerPoolRef?: string; + harnessName?: string; + managedByKagent: boolean; +} + +export interface SubstrateActorEntry { + actorId: string; + status: string; + actorTemplateNamespace?: string; + actorTemplateName?: string; + ateomPodNamespace?: string; + ateomPodName?: string; + ateomPodIp?: string; + lastSnapshot?: string; + inProgressSnapshot?: string; + version?: number; +} + +export interface SubstrateWorkerEntry { + workerNamespace: string; + workerPool: string; + workerPod: string; + actorNamespace?: string; + actorTemplate?: string; + actorId?: string; + ip?: string; + version?: number; +} + export interface AgentResponse { id: number | string; agent: Agent; From 002db3378b9285087465e76e19f5eb0a23d07141 Mon Sep 17 00:00:00 2001 From: Peter Jausovec Date: Tue, 2 Jun 2026 15:00:26 -0700 Subject: [PATCH 32/32] gate the substrate features in the UI Signed-off-by: Peter Jausovec --- ui/src/app/layout.tsx | 3 + ui/src/app/substrate/page.tsx | 19 ++- ui/src/components/Header.stories.tsx | 9 +- ui/src/components/Header.tsx | 29 ++-- ui/src/components/SubstrateFeatureGate.tsx | 32 ++++ 
.../agent-form/OpenClawSandboxFields.tsx | 138 ++++++++++-------- .../substrate/SubstratePageGuard.tsx | 35 +++++ ui/src/contexts/SubstrateFeaturesContext.tsx | 100 +++++++++++++ 8 files changed, 279 insertions(+), 86 deletions(-) create mode 100644 ui/src/components/SubstrateFeatureGate.tsx create mode 100644 ui/src/components/substrate/SubstratePageGuard.tsx create mode 100644 ui/src/contexts/SubstrateFeaturesContext.tsx diff --git a/ui/src/app/layout.tsx b/ui/src/app/layout.tsx index 175d0e4fba..4ce12532bc 100644 --- a/ui/src/app/layout.tsx +++ b/ui/src/app/layout.tsx @@ -4,6 +4,7 @@ import "./globals.css"; import { TooltipProvider } from "@/components/ui/tooltip"; import { AgentsProvider } from "@/components/AgentsProvider"; import { AuthProvider } from "@/contexts/AuthContext"; +import { SubstrateFeaturesProvider } from "@/contexts/SubstrateFeaturesContext"; import { Header } from "@/components/Header"; import { Footer } from "@/components/Footer"; import { ThemeProvider } from "@/components/ThemeProvider"; @@ -23,6 +24,7 @@ export default function RootLayout({ children }: { children: React.ReactNode }) return ( + @@ -37,6 +39,7 @@ export default function RootLayout({ children }: { children: React.ReactNode }) + ); diff --git a/ui/src/app/substrate/page.tsx b/ui/src/app/substrate/page.tsx index 6c966f85af..a51600ca0d 100644 --- a/ui/src/app/substrate/page.tsx +++ b/ui/src/app/substrate/page.tsx @@ -1,14 +1,19 @@ import { Suspense } from "react"; +import { SubstratePageGuard } from "@/components/substrate/SubstratePageGuard"; import { SubstrateStatusPage } from "./SubstrateStatusPage"; export default function SubstratePage() { return ( - Loading substrate status…
- } - > - - + + + Loading substrate status… +
+ } + > + + + ); } diff --git a/ui/src/components/Header.stories.tsx b/ui/src/components/Header.stories.tsx index 7813370811..76b89c92c5 100644 --- a/ui/src/components/Header.stories.tsx +++ b/ui/src/components/Header.stories.tsx @@ -1,6 +1,7 @@ import type { Meta, StoryObj } from "@storybook/nextjs-vite"; import { Header } from "./Header"; import { AuthProvider } from "@/contexts/AuthContext"; +import { SubstrateFeaturesTestProvider } from "@/contexts/SubstrateFeaturesContext"; const meta = { title: "Components/Header", @@ -10,9 +11,11 @@ const meta = { }, decorators: [ (Story) => ( - - - + + + + + ), ], } satisfies Meta; diff --git a/ui/src/components/Header.tsx b/ui/src/components/Header.tsx index 358dd4ce26..7f76bcfd3c 100644 --- a/ui/src/components/Header.tsx +++ b/ui/src/components/Header.tsx @@ -13,6 +13,7 @@ import { DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; +import { SubstrateFeatureGate } from "@/components/SubstrateFeatureGate"; export function Header() { const [isMenuOpen, setIsMenuOpen] = useState(false); @@ -131,12 +132,14 @@ export function Header() { Prompt Library - - - - Substrate - - + + + + + Substrate + + + @@ -200,12 +203,14 @@ export function Header() { Prompt Library - - - - Substrate - - + + + + + Substrate + + + diff --git a/ui/src/components/SubstrateFeatureGate.tsx b/ui/src/components/SubstrateFeatureGate.tsx new file mode 100644 index 0000000000..08e3ae6325 --- /dev/null +++ b/ui/src/components/SubstrateFeatureGate.tsx @@ -0,0 +1,32 @@ +"use client"; + +import type { ReactNode } from "react"; +import { useSubstrateFeatures } from "@/contexts/SubstrateFeaturesContext"; + +type SubstrateFeatureGateProps = { + children: ReactNode; + /** Shown while capabilities are loading. Defaults to nothing. */ + loadingFallback?: ReactNode; + /** Shown when substrate is disabled. Defaults to nothing. */ + fallback?: ReactNode; +}; + +/** + * Renders children only when Agent Substrate is enabled on the controller. 
+ * Use for nav items, form sections, or any UI gated on cluster substrate config. + */ +export function SubstrateFeatureGate({ + children, + loadingFallback = null, + fallback = null, +}: SubstrateFeatureGateProps) { + const { enabled, isLoading } = useSubstrateFeatures(); + + if (isLoading) { + return <>{loadingFallback}; + } + if (!enabled) { + return <>{fallback}; + } + return <>{children}; +} diff --git a/ui/src/components/agent-form/OpenClawSandboxFields.tsx b/ui/src/components/agent-form/OpenClawSandboxFields.tsx index ee830023ad..13e71d94e9 100644 --- a/ui/src/components/agent-form/OpenClawSandboxFields.tsx +++ b/ui/src/components/agent-form/OpenClawSandboxFields.tsx @@ -18,6 +18,7 @@ import type { OpenClawSandboxFormValidationError, } from "@/lib/openClawSandboxForm"; import { isClawHarnessBackend, newOpenClawChannelRow } from "@/lib/openClawSandboxForm"; +import { useSubstrateEnabled } from "@/contexts/SubstrateFeaturesContext"; const OPENCLAW_DOCS_ROOT = "https://docs.openclaw.ai"; @@ -151,81 +152,90 @@ export function OpenClawSandboxFields({ harnessBackend, validationError, }: OpenClawSandboxFieldsProps) { + const substrateEnabled = useSubstrateEnabled(); const clawBackend = isClawHarnessBackend(harnessBackend); const set = (patch: Partial) => onChange({ ...value, ...patch }); const [advancedOpen, setAdvancedOpen] = React.useState(false); const section = validationError?.section ?? null; + React.useEffect(() => { + if (!substrateEnabled && value.runtime === "substrate") { + set({ runtime: "openshell" }); + } + }, [substrateEnabled, value.runtime]); + return (
{section === "general" ? validationError?.message : null} - - - Control plane - - - {value.runtime === "substrate" ? ( -
- - Gateway token - set({ substrateGatewayToken: e.target.value })} - /> -

- Bearer token used by kagent when proxying the generated OpenClaw gateway. -

-
- - Snapshot location (GCS) - set({ substrateSnapshotsLocation: e.target.value })} - /> -

- Substrate stores golden and incremental snapshots at this gs:// prefix (GCS only today). -

-
- - WorkerPool name - set({ substrateWorkerPoolRefName: e.target.value })} - /> -

- Leave empty to use the controller default WorkerPool. -

-
-
- ) : null} -
+ {substrateEnabled ? ( + + + Control plane + + + {value.runtime === "substrate" ? ( +
+ + Gateway token + set({ substrateGatewayToken: e.target.value })} + /> +

+ Bearer token used by kagent when proxying the generated OpenClaw gateway. +

+
+ + Snapshot location (GCS) + set({ substrateSnapshotsLocation: e.target.value })} + /> +

+ Substrate stores golden and incremental snapshots at this gs:// prefix (GCS only today). +

+
+ + WorkerPool name + set({ substrateWorkerPoolRefName: e.target.value })} + /> +

+ Leave empty to use the controller default WorkerPool. +

+
+
+ ) : null} +
+ ) : null} { + if (!isLoading && !enabled) { + router.replace("/"); + } + }, [enabled, isLoading, router]); + + if (isLoading) { + return ( +
+ Loading… +
+ ); + } + + if (!enabled) { + return null; + } + + return <>{children}; +} diff --git a/ui/src/contexts/SubstrateFeaturesContext.tsx b/ui/src/contexts/SubstrateFeaturesContext.tsx new file mode 100644 index 0000000000..8b572d36e3 --- /dev/null +++ b/ui/src/contexts/SubstrateFeaturesContext.tsx @@ -0,0 +1,100 @@ +"use client"; + +import React, { + createContext, + useCallback, + useContext, + useEffect, + useMemo, + useState, + type ReactNode, +} from "react"; +import { getSubstrateStatus } from "@/app/actions/substrate"; + +export interface SubstrateFeaturesContextValue { + /** True when the controller has Agent Substrate configured (ate-api endpoint set). */ + enabled: boolean; + isLoading: boolean; + error: string | null; + refetch: () => Promise; +} + +const SubstrateFeaturesContext = createContext( + undefined, +); + +export function SubstrateFeaturesProvider({ children }: { children: ReactNode }) { + const [enabled, setEnabled] = useState(false); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + + const refetch = useCallback(async () => { + setIsLoading(true); + setError(null); + try { + const result = await getSubstrateStatus(); + if (result.error || !result.data) { + setEnabled(false); + setError(result.error ?? "Failed to load substrate features"); + return; + } + setEnabled(result.data.enabled); + } catch (e) { + setEnabled(false); + setError(e instanceof Error ? 
e.message : "Failed to load substrate features"); + } finally { + setIsLoading(false); + } + }, []); + + useEffect(() => { + void refetch(); + }, [refetch]); + + const value = useMemo( + () => ({ enabled, isLoading, error, refetch }), + [enabled, isLoading, error, refetch], + ); + + return ( + {children} + ); +} + +export function useSubstrateFeatures(): SubstrateFeaturesContextValue { + const context = useContext(SubstrateFeaturesContext); + if (context === undefined) { + throw new Error("useSubstrateFeatures must be used within a SubstrateFeaturesProvider"); + } + return context; +} + +/** True after the initial probe finishes and substrate is enabled on the cluster. */ +export function useSubstrateEnabled(): boolean { + const { enabled, isLoading } = useSubstrateFeatures(); + return !isLoading && enabled; +} + +/** For Storybook/tests: inject feature flags without calling the API. */ +export function SubstrateFeaturesTestProvider({ + children, + enabled, + isLoading = false, +}: { + children: ReactNode; + enabled: boolean; + isLoading?: boolean; +}) { + const value = useMemo( + () => ({ + enabled, + isLoading, + error: null, + refetch: async () => {}, + }), + [enabled, isLoading], + ); + return ( + {children} + ); +}