diff --git a/cmd/main.go b/cmd/main.go index a0b8f36029..b2f43ff2e9 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -4,11 +4,11 @@ import ( "context" "net/http" _ "net/http/pprof" - "net/url" "os" "os/signal" "strings" "syscall" + "time" "github.com/armosec/armoapi-go/armotypes" utilsmetadata "github.com/armosec/utils-k8s-go/armometadata" @@ -38,8 +38,7 @@ import ( "github.com/kubescape/node-agent/pkg/hostsensormanager" "github.com/kubescape/node-agent/pkg/malwaremanager" malwaremanagerv1 "github.com/kubescape/node-agent/pkg/malwaremanager/v1" - "github.com/kubescape/node-agent/pkg/metricsmanager" - metricprometheus "github.com/kubescape/node-agent/pkg/metricsmanager/prometheus" + otelmetrics "github.com/kubescape/node-agent/pkg/metricsmanager/otel" "github.com/kubescape/node-agent/pkg/networkstream" networkstreamv1 "github.com/kubescape/node-agent/pkg/networkstream/v1" "github.com/kubescape/node-agent/pkg/nodeprofilemanager" @@ -49,6 +48,7 @@ import ( "github.com/kubescape/node-agent/pkg/objectcache/dnscache" "github.com/kubescape/node-agent/pkg/objectcache/k8scache" objectcachev1 "github.com/kubescape/node-agent/pkg/objectcache/v1" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/processtree" containerprocesstree "github.com/kubescape/node-agent/pkg/processtree/container" processtreecreator "github.com/kubescape/node-agent/pkg/processtree/creator" @@ -70,6 +70,8 @@ import ( "github.com/kubescape/node-agent/pkg/validator" "github.com/kubescape/node-agent/pkg/watcher/dynamicwatcher" "github.com/kubescape/node-agent/pkg/watcher/seccompprofilewatcher" + goruntime "go.opentelemetry.io/contrib/instrumentation/runtime" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func main() { @@ -99,14 +101,35 @@ func main() { logger.L().Info("credentials loaded", helpers.Int("accountLength", len(credentials.Account))) } - // to enable otel, set OTEL_COLLECTOR_SVC=otel-collector:4317 - if otelHost, present := os.LookupEnv("OTEL_COLLECTOR_SVC"); 
present { - ctx = logger.InitOtel("node-agent", - os.Getenv("RELEASE"), - clusterData.AccountID, - clusterData.ClusterName, - url.URL{Host: otelHost}) - defer logger.ShutdownOtel(ctx) + otelShutdown, err := otelsetup.InitProviders(ctx, otelsetup.ProviderConfig{ + ServiceName: "node-agent", + ServiceVersion: os.Getenv("RELEASE"), + NodeName: cfg.NodeName, + PodName: cfg.PodName, + Namespace: cfg.NamespaceName, + ClusterName: clusterData.ClusterName, + AccountID: clusterData.AccountID, + AccessKey: accessKey, + }) + if err != nil { + logger.L().Warning("OTEL init failed, running without telemetry", helpers.Error(err)) + } + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + if otelShutdown != nil { + _ = otelShutdown(shutdownCtx) + } + }() + + // Emit Go runtime metrics only when metrics collection is configured; + // avoids ~2–3 KB/hr of metric volume for deployments without telemetry. + if os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") != "" || + os.Getenv("OTEL_METRICS_EXPORTER") != "" || + os.Getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") != "" { + if err := goruntime.Start(goruntime.WithMinimumReadMemStatsInterval(30 * time.Second)); err != nil { + logger.L().Warning("node-agent: Go runtime metrics unavailable", helpers.Error(err)) + } } // Check if we need to validate the kernel version. @@ -174,13 +197,11 @@ func main() { logger.L().Ctx(ctx).Fatal("error creating the storage client", helpers.Error(err)) } - // Create Prometheus metrics exporter - var prometheusExporter metricsmanager.MetricsManager - if cfg.EnablePrometheusExporter { - prometheusExporter = metricprometheus.NewPrometheusMetric() - } else { - prometheusExporter = metricsmanager.NewMetricsNoop() - } + // Create metrics provider (OTEL SDK; Prometheus scrape endpoint started by otelsetup + // when OTEL_METRICS_EXPORTER=prometheus; OTLP push when OTEL_EXPORTER_OTLP_ENDPOINT set). 
+ // MUST be constructed after otelsetup.InitProviders() so the global MeterProvider is set. + // Always use the OTEL impl — the SDK's own no-op providers handle the "no endpoint" case. + metricsProvider := otelmetrics.NewOTELMetricsManager(resolveOwnContainerID(ctx, k8sClient)) // Create watchers dWatcher := dynamicwatcher.NewWatchHandler(k8sClient, storageClient.GetStorageClient(), cfg.SkipNamespace) @@ -290,13 +311,13 @@ func main() { if cfg.EnableRuntimeDetection { // create exporter - exporter := exporters.InitExporters(cfg.Exporters, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent) + exporter := exporters.InitExporters(cfg.Exporters, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent, metricsProvider) dWatcher.AddAdaptor(ruleBindingCache) ruleBindingNotify = make(chan rulebinding.RuleBindingNotify, 100) ruleBindingCache.AddNotifier(&ruleBindingNotify) - cpc := containerprofilecache.NewContainerProfileCache(cfg, storageClient, k8sObjectCache, prometheusExporter) + cpc := containerprofilecache.NewContainerProfileCache(cfg, storageClient, k8sObjectCache, metricsProvider) cpc.Start(ctx) if cpm, ok := containerProfileManager.(*containerprofilemanagerv1.ContainerProfileManager); ok { cpm.SetCompletionNotifier(cpc) @@ -312,7 +333,7 @@ func main() { adapterFactory := ruleadapters.NewEventRuleAdapterFactory() - celEvaluator, err := cel.NewCEL(objCache, cfg, prometheusExporter) + celEvaluator, err := cel.NewCEL(objCache, cfg, metricsProvider) if err != nil { logger.L().Ctx(ctx).Fatal("error creating CEL evaluator", helpers.Error(err)) } @@ -321,7 +342,7 @@ func main() { // create runtimeDetection managers agentVersion := os.Getenv("AGENT_VERSION") - ruleManager, err = rulemanager.CreateRuleManager(ctx, cfg, k8sClient, ruleBindingCache, objCache, exporter, prometheusExporter, processTreeManager, dnsResolver, nil, ruleCooldown, adapterFactory, celEvaluator, 
mntnsRegistry, agentVersion) + ruleManager, err = rulemanager.CreateRuleManager(ctx, cfg, k8sClient, ruleBindingCache, objCache, exporter, metricsProvider, processTreeManager, dnsResolver, nil, ruleCooldown, adapterFactory, celEvaluator, mntnsRegistry, agentVersion) if err != nil { logger.L().Ctx(ctx).Fatal("error creating RuleManager", helpers.Error(err)) } @@ -357,8 +378,8 @@ func main() { var malwareManager malwaremanager.MalwareManagerClient if cfg.EnableMalwareDetection { // create exporter - exporter := exporters.InitExporters(cfg.Exporters, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent) - malwareManager, err = malwaremanagerv1.CreateMalwareManager(cfg, k8sClient, cfg.NodeName, clusterData.ClusterName, exporter, prometheusExporter, k8sObjectCache) + exporter := exporters.InitExporters(cfg.Exporters, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent, metricsProvider) + malwareManager, err = malwaremanagerv1.CreateMalwareManager(cfg, k8sClient, cfg.NodeName, clusterData.ClusterName, exporter, metricsProvider, k8sObjectCache) if err != nil { logger.L().Ctx(ctx).Fatal("error creating MalwareManager", helpers.Error(err)) } @@ -406,7 +427,7 @@ func main() { // Create the SBOM manager var sbomManager sbommanager.SbomManagerClient if cfg.EnableSbomGeneration { - sbomManager, err = sbommanagerv1.CreateSbomManager(ctx, cfg, igK8sClient.RuntimeConfig.SocketPath, storageClient, k8sObjectCache, scannerClient, failureReporter) + sbomManager, err = sbommanagerv1.CreateSbomManager(ctx, cfg, igK8sClient.RuntimeConfig.SocketPath, storageClient, k8sObjectCache, scannerClient, failureReporter, metricsProvider) if err != nil { logger.L().Ctx(ctx).Fatal("error creating SbomManager", helpers.Error(err)) } @@ -419,7 +440,7 @@ func main() { if cfg.EnableFIM { // Initialize FIM-specific exporters fimExportersConfig := cfg.FIM.GetFIMExportersConfig() - fimExporter := 
exporters.InitExporters(fimExportersConfig, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent) + fimExporter := exporters.InitExporters(fimExportersConfig, clusterData.ClusterName, cfg.NodeName, cloudMetadata, clusterUID, armotypes.AlertSourcePlatformK8sAgent, metricsProvider) fimManager, err = fimmanager.NewFIMManager(cfg, clusterData.ClusterName, fimExporter, cloudMetadata) if err != nil { @@ -434,7 +455,7 @@ func main() { // Create the container handler mainHandler, err := containerwatcherv2.CreateIGContainerWatcher(cfg, containerProfileManager, k8sClient, - igK8sClient, dnsManagerClient, prometheusExporter, ruleManager, + igK8sClient, dnsManagerClient, metricsProvider, ruleManager, malwareManager, sbomManager, &ruleBindingNotify, igK8sClient.RuntimeConfig, nil, processTreeManager, clusterData.ClusterName, objCache, networkStreamClient, containerProcessTree, thirdPartyTracers) if err != nil { @@ -448,8 +469,8 @@ func main() { // Start the networkStream networkStreamClient.Start() - // Start the prometheusExporter - prometheusExporter.Start() + // Start the metrics provider + metricsProvider.Start() // Start the host sensor manager if err = hostSensorManager.Start(ctx); err != nil { @@ -503,16 +524,42 @@ func main() { signal.Notify(shutdown, os.Interrupt, syscall.SIGTERM) sig := <-shutdown - // Exit with success switch sig { case os.Interrupt: logger.L().Info("Received interrupt signal") - os.Exit(utils.ExitCodeSuccess) case syscall.SIGTERM: logger.L().Info("Received SIGTERM signal") - os.Exit(utils.ExitCodeSuccess) default: logger.L().Info("Received unknown signal") - os.Exit(utils.ExitCodeError) } + // Return normally so deferred OTEL shutdown flushes traces/metrics/logs. +} + +// resolveOwnContainerID returns node-agent's own container ID via the k8s API +// (using the Downward-API POD_NAME / NAMESPACE_NAME env), or "" if it cannot be +// determined. 
The cgroup memory gauges use it to locate the correct container +// scope in the host-mounted cgroup tree; "" makes them fall back to /proc-based +// resolution. Best-effort: a failure here only degrades those gauges. +func resolveOwnContainerID(ctx context.Context, k8sClient *k8sinterface.KubernetesApi) string { + const containerName = "node-agent" + podName, namespace := os.Getenv("POD_NAME"), os.Getenv("NAMESPACE_NAME") + if podName == "" || namespace == "" { + return "" + } + pod, err := k8sClient.GetKubernetesClient().CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) + if err != nil { + logger.L().Warning("resolveOwnContainerID - failed to get own pod, cgroup memory gauges will fall back", + helpers.Error(err), helpers.String("pod", podName), helpers.String("namespace", namespace)) + return "" + } + for _, cs := range pod.Status.ContainerStatuses { + if cs.Name == containerName { + // ContainerID is "<runtime>://<id>", e.g. "containerd://9103ee5f..." + if i := strings.LastIndex(cs.ContainerID, "/"); i >= 0 { + return cs.ContainerID[i+1:] + } + return cs.ContainerID + } + } + return "" } diff --git a/cmd/sbom-scanner/main.go b/cmd/sbom-scanner/main.go index 06ff53f066..0dcdb6fba5 100644 --- a/cmd/sbom-scanner/main.go +++ b/cmd/sbom-scanner/main.go @@ -1,48 +1,30 @@ package main import ( - "net" + "context" "os" - "os/signal" - "syscall" - "github.com/kubescape/go-logger" - "github.com/kubescape/go-logger/helpers" + beUtils "github.com/kubescape/backend/pkg/utils" sbomscanner "github.com/kubescape/node-agent/pkg/sbomscanner/v1" - pb "github.com/kubescape/node-agent/pkg/sbomscanner/v1/proto" - "google.golang.org/grpc" _ "modernc.org/sqlite" ) func main() { - socketPath := os.Getenv("SOCKET_PATH") - if socketPath == "" { - socketPath = "/sbom-comm/scanner.sock" + ctx := context.Background() + + // Load ARMO credentials from /etc/credentials (same source as the main agent). + // Fall back to env vars so the binary stays functional in non-ARMO deployments.
+ accountID := os.Getenv("ACCOUNT_ID") + accessKey := os.Getenv("ACCESS_KEY") + if creds, err := beUtils.LoadCredentialsFromFile("/etc/credentials"); err == nil { + if creds.Account != "" { + accountID = creds.Account + } + if creds.AccessKey != "" { + accessKey = creds.AccessKey + } } - // Remove stale socket file from a previous run - os.Remove(socketPath) - - lis, err := net.Listen("unix", socketPath) - if err != nil { - logger.L().Fatal("failed to listen on socket", helpers.Error(err), helpers.String("path", socketPath)) - } - - srv := grpc.NewServer() - pb.RegisterSBOMScannerServer(srv, sbomscanner.NewScannerServer()) - - sigCh := make(chan os.Signal, 1) - signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT) - - go func() { - sig := <-sigCh - logger.L().Info("received signal, shutting down", helpers.String("signal", sig.String())) - srv.GracefulStop() - os.Remove(socketPath) - }() - - logger.L().Info("SBOM scanner sidecar started", helpers.String("socket", socketPath)) - if err := srv.Serve(lis); err != nil { - logger.L().Fatal("gRPC server failed", helpers.Error(err)) - } + // Run the reusable SBOM scanner server + sbomscanner.RunServer(ctx, accountID, accessKey) } diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index b4b1d26d31..1020fd61b3 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -86,15 +86,49 @@ These environment variables are read directly (not through config file): | `CONFIG_DIR` | Configuration directory path | No (default: `/etc/config`) | | `SKIP_KERNEL_VERSION_CHECK` | Skip kernel validation | No | | `ENABLE_PROFILER` | Enable pprof on port 6060 | No | -| `OTEL_COLLECTOR_SVC` | OpenTelemetry collector (e.g., `otel-collector:4317`) | No | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP collector endpoint (e.g., `otel.armosec.io:4317`). When unset, all telemetry is silently discarded (no-op). | No | +| `OTEL_METRICS_EXPORTER` | Metrics exporter: `otlp` (push to collector) or `prometheus` (expose `:8080/metrics`). 
Defaults to `otlp` when endpoint is set, `none` otherwise. | No | +| `OTEL_TRACES_EXPORTER` | Traces exporter: `otlp` or `none`. Defaults to `otlp` when endpoint is set, `none` otherwise. | No | +| `OTEL_SLOW_EVAL_THRESHOLD_MS` | Rule evaluations exceeding this threshold (ms) emit a trace span. Default: `5`. | No | +| `OTEL_DEBUG_PORT` | Port for the debug listener when `KS_LOGGER_LEVEL=debug`. Default: `6062`. | No | +| `OTEL_COLLECTOR_SVC` | **Deprecated** — alias for `OTEL_EXPORTER_OTLP_ENDPOINT`. Will be removed in a future release. | No | | `PYROSCOPE_SERVER_SVC` | Pyroscope server address | No | | `APPLICATION_NAME` | Application name for Pyroscope | No (default: `node-agent`) | | `RELEASE` | Release version for telemetry | No | +| `KS_LOGGER_LEVEL` | Log level: `debug`, `info`, `warning`, `error`. Default: `info`. When `debug`, enables the ring-buffer flush endpoint. | No | +| `KS_LOGGER_NAME` | Logger output format: `zap` (structured JSON) or `pretty` (human-readable). Default: `zap`. | No | | `MULTIPLY` | Enable pod multiplication (testing) | No | | `QUEUE_DIR` | Directory for persistent queue | No | | `MAX_QUEUE_SIZE` | Maximum queue size | No | | `TEST_NAMESPACE` | Override namespace in tests | No | +### OTEL Notes + +**Authentication headers:** When credentials are present in `/etc/credentials` (or via `ACCOUNT_ID` / +`ACCESS_KEY` env vars), `X-API-Key` and `X-Customer-GUID` gRPC metadata headers are injected for every +outbound OTLP RPC, regardless of endpoint hostname. This applies to any collector — ARMO back office, +self-hosted, or otherwise. Credentials are read once at startup; agent restart is required if credentials +are rotated at runtime (known v1 limitation). + +**Ring buffer (retroactive log export):** When `KS_LOGGER_LEVEL=debug`, the agent keeps the last +7,500 log records in memory and activates a flush endpoint at `localhost:6062/debug/flush-ring-buffer` +(port configurable via `OTEL_DEBUG_PORT`). 
A `POST` to that endpoint re-emits all buffered records +through the OTLP log pipeline — useful for recovering startup logs that were emitted before the OTLP +exporter finished connecting. The ring buffer is cleared after flushing; a second call emits nothing. + +**Kubernetes event correlation — cross-repo dependency (operator PR, not yet implemented)** + +K8s events (OOMKilled, Evicted, CrashLoopBackOff, NodeMemoryPressure, pod rescheduling) are collected +by the **operator**, which runs once per cluster and pushes OTLP logs to `otel.armosec.io:4317` using +the same API credentials. Correlation is automatic via shared `k8s.node.name`, `k8s.pod.name`, +`k8s.namespace.name` resource attributes — no node-agent configuration change required. + +Until the operator PR ships, node-agent ACs 1–9 are independent and all pass without K8s event +correlation. The ARMO back office dashboard renders correctly using node-agent signals only; K8s event +correlation is an upgrade, not a dependency. + +--- + ## Configuration File Options ### Core Settings diff --git a/docs/metrics-migration.md b/docs/metrics-migration.md new file mode 100644 index 0000000000..6de401e01a --- /dev/null +++ b/docs/metrics-migration.md @@ -0,0 +1,143 @@ +# Metrics Migration Guide: Prometheus → OTEL SDK + +This document maps every old Prometheus metric name to its new OTEL SDK name. +Required for Phase 2 merge per the instrumentation plan (AC7). + +## Background + +Phase 2 replaces `pkg/metricsmanager/prometheus/` with `pkg/metricsmanager/otel/`. +Metric names change from legacy Prometheus conventions (`_counter` suffix, flat names) +to OTEL semantic conventions (dot-separated namespaces). When +`OTEL_METRICS_EXPORTER=prometheus` the OTEL→Prometheus bridge converts `.` → `_`, +so `node_agent.ebpf.exec.total` is exposed as `node_agent_ebpf_exec_total`. + +**This is a breaking rename.** Existing Prometheus dashboards and alerting rules that +reference the old names must be updated. 
See the mapping table below. + +## Metric Name Mapping + +### eBPF Event Counters (17 → 2) + +The 16 individual per-event-type counters are collapsed into one counter with an +`event_type` label (the 17th, the failure counter, is renamed but stays a separate metric). This eliminates metric proliferation and adds coverage for +previously missing event types (`exit`, `fork`). + +| Old Prometheus name | New OTEL name (Prometheus: `_` replaces `.`) | Label change | +|---|---|---| +| `node_agent_exec_counter` | `node_agent_ebpf_events_total{event_type="execve"}` | new `event_type` label | +| `node_agent_open_counter` | `node_agent_ebpf_events_total{event_type="open"}` | new `event_type` label | +| `node_agent_network_counter` | `node_agent_ebpf_events_total{event_type="network"}` | new `event_type` label | +| `node_agent_dns_counter` | `node_agent_ebpf_events_total{event_type="dns"}` | new `event_type` label | +| `node_agent_syscall_counter` | `node_agent_ebpf_events_total{event_type="syscall"}` | new `event_type` label | +| `node_agent_capability_counter` | `node_agent_ebpf_events_total{event_type="capabilities"}` | new `event_type` label | +| `node_agent_randomx_counter` | `node_agent_ebpf_events_total{event_type="randomx"}` | new `event_type` label | +| `node_agent_symlink_counter` | `node_agent_ebpf_events_total{event_type="symlink"}` | new `event_type` label | +| `node_agent_hardlink_counter` | `node_agent_ebpf_events_total{event_type="hardlink"}` | new `event_type` label | +| `node_agent_ssh_counter` | `node_agent_ebpf_events_total{event_type="ssh"}` | new `event_type` label | +| `node_agent_http_counter` | `node_agent_ebpf_events_total{event_type="http"}` | new `event_type` label | +| `node_agent_ptrace_counter` | `node_agent_ebpf_events_total{event_type="ptrace"}` | new `event_type` label | +| `node_agent_iouring_counter` | `node_agent_ebpf_events_total{event_type="iouring"}` | new `event_type` label | +| `node_agent_kmod_counter` | `node_agent_ebpf_events_total{event_type="kmod"}` | new `event_type` label | +|
`node_agent_unshare_counter` | `node_agent_ebpf_events_total{event_type="unshare"}` | new `event_type` label | +| `node_agent_bpf_counter` | `node_agent_ebpf_events_total{event_type="bpf"}` | new `event_type` label | +| `node_agent_ebpf_event_failure_counter` | `node_agent_ebpf_events_failed_total` | no label | + +### Rule Metrics + +| Old Prometheus name | New OTEL name | Label change | +|---|---|---| +| `node_agent_rule_counter{rule_id}` | `node_agent_rule_processed_total{rule_id}` | `rule_id` now uses `rule.ID` (stable ID) instead of `rule.Name` | +| `node_agent_rule_prefiltered_total{rule_id}` | `node_agent_rule_prefiltered_total{rule_id}` | `rule_id` now uses `rule.ID` | +| `node_agent_alert_counter{rule_id}` | `node_agent_alert_total{rule_id}` | `rule_id` now uses `rule.ID` | +| `node_agent_rule_evaluation_time_seconds{rule_id,event_type}` | `node_agent_rule_evaluation_duration{rule_id,event_type}` | `rule_id` now uses `rule.ID`; bucket boundaries updated | + +**Note:** `rule_id` label values change from rule display names to stable rule IDs +(e.g. `R1001` instead of `Unexpected process launched`). Update alert queries accordingly. 
+ +### Container Metrics + +| Old Prometheus name | New OTEL name | +|---|---| +| `node_agent_container_start_counter` | `node_agent_container_start_total` | +| `node_agent_container_stop_counter` | `node_agent_container_stop_total` | +| `node_agent_dedup_events_total{event_type,result}` | `node_agent_ebpf_dedup_total{event_type,result}` | + +### ContainerProfile Cache Metrics + +| Old Prometheus name | New OTEL name | +|---|---| +| `node_agent_user_profile_legacy_loads_total{kind,completeness}` | `node_agent_profile_legacy_load_total{kind,completeness}` | +| `node_agent_containerprofile_cache_entries{kind}` | `node_agent_profile_cache_entries{kind}` | +| `node_agent_containerprofile_cache_hit_total{result}` | `node_agent_profile_cache_hit_total{result}` | +| `node_agent_containerprofile_reconciler_duration_seconds{phase}` | `node_agent_profile_reconciler_duration{phase}` | +| `node_agent_containerprofile_reconciler_evictions_total{reason}` | `node_agent_profile_reconciler_evictions_total{reason}` | + +### Rule Projection Metrics + +| Old Prometheus name | New OTEL name | +|---|---| +| `rule_load_rejected_missing_declaration_total{rule_id}` | `node_agent_rule_projection_missing_decl_total{rule_id}` | +| `rule_projection_undeclared_literal_total{helper}` | `node_agent_rule_projection_undeclared_literal_total{helper}` | +| `rule_projection_stale_entries` | `node_agent_rule_projection_stale_entries` | +| `rule_projection_undeclared_rules` | `node_agent_rule_projection_undeclared_rules` | +| `rule_projection_spec_compile_total` | `node_agent_rule_projection_spec_compile_total` | +| `rule_projection_spec_hash_changes_total` | `node_agent_rule_projection_spec_hash_change_total` | +| `rule_projection_spec_patterns{field,kind}` | `node_agent_rule_projection_spec_patterns{field,kind}` | +| `rule_projection_spec_all_fields{field}` | `node_agent_rule_projection_spec_all_field{field}` | +| `rule_projection_apply_duration_seconds` | `node_agent_rule_projection_apply_duration` | 
+| `rule_projection_reconcile_triggered_total{trigger}` | `node_agent_rule_projection_reconcile_triggered_total{trigger}` | +| `rule_helper_call_total{helper}` | `node_agent_rule_projection_helper_call_total{helper}` | +| `rule_projection_undeclared_rules_list{rule_id}` | `node_agent_rule_projection_undeclared_rules_detail{rule_id}` | + +### Memory-Savings Metrics (dev-only) + +| Old Prometheus name | New OTEL name | +|---|---| +| `profile_raw_size_bytes` | `node_agent_profile_raw_size` | +| `profile_projected_size_bytes` | `node_agent_profile_projected_size` | +| `profile_entries_raw_total{field}` | `node_agent_profile_entries_raw{field}` | +| `profile_entries_retained_total{field}` | `node_agent_profile_entries_retained{field}` | +| `profile_retention_ratio{field}` | `node_agent_profile_retention_ratio{field}` | + +### New Metrics (no old equivalent) + +| New OTEL name | Description | +|---|---| +| `node_agent_ebpf_events_dropped_total{reason}` | eBPF events dropped due to backpressure (`reason=worker_channel_full`) or profile drops | + +### Removed Metrics (not migrated) + +The following Prometheus metrics are not present in the OTEL implementation. +See Appendix A of the instrumentation plan for rationale. 
+ +| Old Prometheus name | Reason removed | +|---|---| +| `node_agent_program_current_runtime` | Dead code — `ReportEbpfStats` commented out since initial implementation | +| `node_agent_program_current_run_count` | Dead code | +| `node_agent_program_total_runtime` | Dead code | +| `node_agent_program_total_run_count` | Dead code | +| `node_agent_program_map_memory` | Dead code | +| `node_agent_program_map_count` | Dead code | +| `node_agent_program_total_cpu_usage` | Dead code | +| `node_agent_program_per_cpu_usage` | Dead code | + +## Histogram Bucket Changes + +`node_agent_rule_evaluation_duration` uses new focused buckets covering P99 in the +1–10ms range: + +**Old:** `prometheus.ExponentialBuckets(0.001, 2, 10)` → 1ms … 512ms (ten doubling buckets; no sub-millisecond resolution) + +**New:** `0.5ms, 1ms, 2ms, 5ms, 10ms, 50ms, 500ms, 2s` (covers realistic rule eval latency) + +## Update Checklist + +When upgrading from Phase 1 to Phase 2: + +- [ ] Update Prometheus recording rules referencing old metric names +- [ ] Update Prometheus alerting rules (especially those querying `node_agent_exec_counter`, + `node_agent_alert_counter`, `node_agent_rule_evaluation_time_seconds`) +- [ ] Update Grafana dashboard panels: replace old metric names with new ones; + add `event_type` label selector to panels that previously used individual event counters +- [ ] Verify `curl -s :8080/metrics | grep node_agent` returns new names +- [ ] Note that `rule_id` label values now use stable rule IDs, not display names diff --git a/go.mod b/go.mod index d2bc031002..b50989530e 100644 --- a/go.mod +++ b/go.mod @@ -33,8 +33,8 @@ require ( github.com/inspektor-gadget/inspektor-gadget v0.45.1-0.20251020222545-c91c23581ebf github.com/joncrlsn/dque v0.0.0-20241024143830-7723fd131a64 github.com/kubescape/backend v0.0.39 - github.com/kubescape/go-logger v0.0.28 - github.com/kubescape/k8s-interface v0.0.210 + github.com/kubescape/go-logger v0.0.32 + github.com/kubescape/k8s-interface v0.0.213 github.com/kubescape/storage
v0.0.258 github.com/kubescape/workerpool v0.0.0-20250526074519-0e4a4e7f44cf github.com/moby/sys/mountinfo v0.7.2 @@ -46,12 +46,21 @@ require ( github.com/picatz/xcel v0.0.0-20260226001349-6958ffac5706 github.com/prometheus/alertmanager v0.27.0 github.com/prometheus/client_golang v1.23.2 - github.com/prometheus/procfs v0.19.2 + github.com/prometheus/procfs v0.20.1 github.com/sirupsen/logrus v1.9.4-0.20230606125235-dd1b4c2e81af github.com/spf13/afero v1.15.0 github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 github.com/weaveworks/procspy v0.0.0-20150706124340-cb970aa190c3 + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 + go.opentelemetry.io/contrib/instrumentation/runtime v0.68.0 + go.opentelemetry.io/otel v1.43.0 + go.opentelemetry.io/otel/exporters/prometheus v0.65.0 + go.opentelemetry.io/otel/log v0.19.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 go.uber.org/multierr v1.11.0 golang.org/x/net v0.53.0 golang.org/x/sys v0.43.0 @@ -350,7 +359,8 @@ require ( github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.67.4 // indirect + github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/otlptranslator v1.0.0 // indirect github.com/puzpuzpuz/xsync/v2 v2.4.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rivo/uniseg v0.4.7 // indirect @@ -408,25 +418,22 @@ require ( go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/bridges/otelslog v0.18.0 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect - go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect - go.opentelemetry.io/contrib/instrumentation/runtime v0.68.0 // indirect go.opentelemetry.io/contrib/processors/minsev v0.16.0 // indirect - go.opentelemetry.io/otel v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.19.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 // indirect go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect - go.opentelemetry.io/otel/log v0.19.0 // indirect - go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/sdk v1.43.0 // indirect + go.opentelemetry.io/otel/log/logtest v0.19.0 // indirect go.opentelemetry.io/otel/sdk/log v0.19.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect - go.opentelemetry.io/otel/trace v1.43.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/zap v1.27.1 // indirect - go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v2 v2.4.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect go4.org v0.0.0-20230225012048-214862532bf5 // indirect go4.org/netipx v0.0.0-20231129151722-fdeea329fbba // indirect diff --git a/go.sum b/go.sum index 3538ee00ab..502c3f92a6 100644 --- a/go.sum +++ b/go.sum @@ -885,10 +885,10 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kubescape/backend 
v0.0.39 h1:B1QRfKCSFlzuE+jWOnk/l7EpH71/Q3n14KKq0QSnZwg= github.com/kubescape/backend v0.0.39/go.mod h1:cMEGP8cXUZgY89YU4GRBGIla9HZW7grZsUtlCwvZgAE= -github.com/kubescape/go-logger v0.0.28 h1:xulKTp9kOg3rD98sopFELQ6yZCHQoQXMDzteoSHDFKI= -github.com/kubescape/go-logger v0.0.28/go.mod h1:YZHFjwGCDar1hP9OyBLE46oR7a0Y/Z/0FperDo8+9D0= -github.com/kubescape/k8s-interface v0.0.210 h1:3TiO3lYxdIHncoBRAMAMFdwanHmllUpYKFy5cG0h97o= -github.com/kubescape/k8s-interface v0.0.210/go.mod h1:WNYUG93aZ5kDmuaRKFLtVhp18Yc6EfaHdD1gLYtVTN4= +github.com/kubescape/go-logger v0.0.32 h1:4mI+XJOV8VFCMewrEE9VIFEIOhzXokYT3nFpNfXf4fM= +github.com/kubescape/go-logger v0.0.32/go.mod h1:Alj7JBQ8/WCxbXe8Ura6ZheSRK45E0p21M3xeqedX90= +github.com/kubescape/k8s-interface v0.0.213 h1:JaEVzgE5qwQ3rEjQ8tBMp48YX4yveitLfYNaCIk8j/A= +github.com/kubescape/k8s-interface v0.0.213/go.mod h1:WNYUG93aZ5kDmuaRKFLtVhp18Yc6EfaHdD1gLYtVTN4= github.com/kubescape/storage v0.0.258 h1:0mL0z3dAmtP1qup7VgoEgwLgbBSROu5oOusBAPeMmus= github.com/kubescape/storage v0.0.258/go.mod h1:VHs+xQzvZKE2lJDN8rR1sFmTa43N6XJAcatZ249gviU= github.com/kubescape/syft v1.32.0-ks.2 h1:xdUksUmKEyyVKsTfJDYW8Z5HawVJtelsUolPOsWtDx0= @@ -1149,14 +1149,16 @@ github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvM github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= -github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= -github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/otlptranslator v1.0.0 
h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos= +github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM= github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= -github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= -github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/puzpuzpuz/xsync/v2 v2.4.1 h1:aGdE1C/HaR/QC6YAFdtZXi60Df8/qBIrs8PKrzkItcM= github.com/puzpuzpuz/xsync/v2 v2.4.1/go.mod h1:gD2H2krq/w52MfPLE+Uy64TzJDVY7lP2znR9qmR35kU= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= @@ -1400,20 +1402,30 @@ go.opentelemetry.io/contrib/processors/minsev v0.16.0 h1:bjTZkvAKnG1mqWgCjU7RkOk go.opentelemetry.io/contrib/processors/minsev v0.16.0/go.mod h1:R2mmaDsqsWb+Y0mQkPifiCwifdotrG4fFoD4z0tim+g= go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.19.0 h1:Dn8rkudDzY6KV9dr/D/bTUuWgqDf9xe0rr4G2elrn0Y= +go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.19.0/go.mod h1:gMk9F0xDgyN9M/3Ed5Y1wKcx/9mlU91NXY2SNq7RQuU= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0 h1:HIBTQ3VO5aupLKjC90JgMqpezVXwFuq6Ryjn0/izoag= 
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.19.0/go.mod h1:ji9vId85hMxqfvICA0Jt8JqEdrXaAkcpkI9HPXya0ro= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0 h1:RAE+JPfvEmvy+0LzyUA25/SGawPwIUbZ6u0Wug54sLc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.43.0/go.mod h1:AGmbycVGEsRx9mXMZ75CsOyhSP6MFIcj/6dnG+vhVjk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak= -go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.40.0 h1:ZrPRak/kS4xI3AVXy8F7pipuDXmDsrO8Lg+yQjBLjw0= -go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.40.0/go.mod h1:3y6kQCWztq6hyW8Z9YxQDDm0Je9AJoFar2G0yDcmhRk= +go.opentelemetry.io/otel/exporters/prometheus v0.65.0 h1:jOveH/b4lU9HT7y+Gfamf18BqlOuz2PWEvs8yM7Q6XE= +go.opentelemetry.io/otel/exporters/prometheus v0.65.0/go.mod h1:i1P8pcumauPtUI4YNopea1dhzEMuEqWP1xoUZDylLHo= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0 h1:TC+BewnDpeiAmcscXbGMfxkO+mwYUwE/VySwvw88PfA= +go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.43.0/go.mod 
h1:J/ZyF4vfPwsSr9xJSPyQ4LqtcTPULFR64KwTikGLe+A= go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 h1:mS47AX77OtFfKG4vtp+84kuGSFZHTyxtXIN269vChY0= go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0/go.mod h1:PJnsC41lAGncJlPUniSwM81gc80GkgWJWr3cu2nKEtU= go.opentelemetry.io/otel/log v0.19.0 h1:KUZs/GOsw79TBBMfDWsXS+KZ4g2Ckzksd1ymzsIEbo4= go.opentelemetry.io/otel/log v0.19.0/go.mod h1:5DQYeGmxVIr4n0/BcJvF4upsraHjg6vudJJpnkL6Ipk= +go.opentelemetry.io/otel/log/logtest v0.19.0 h1:HdSsl4ndTK15LtJGLWBfMsSlLrCgSeE3VMzwOrLYiYs= +go.opentelemetry.io/otel/log/logtest v0.19.0/go.mod h1:c1sH1nOHTwfMCWhhQTdWGqxgDjZhtkbkzAqGGyj0Ijs= go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= @@ -1438,8 +1450,8 @@ go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN8 go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= -go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= go4.org v0.0.0-20180809161055-417644f6feb5/go.mod h1:MkTOUMDaeVYJUOUsaDXIhWPZYa1yOyC1qaOBpL57BhE= diff --git a/pkg/config/config.go b/pkg/config/config.go index 0291d79386..f203ad1100 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -81,7 +81,7 @@ type Config struct 
{ EnableNetworkTracing bool `mapstructure:"networkServiceEnabled"` EnableNodeProfile bool `mapstructure:"nodeProfileServiceEnabled"` EnablePartialProfileGeneration bool `mapstructure:"partialProfileGenerationEnabled"` - EnablePrometheusExporter bool `mapstructure:"prometheusExporterEnabled"` + EnableMetricsExporter bool `mapstructure:"prometheusExporterEnabled"` EnableRuntimeDetection bool `mapstructure:"runtimeDetectionEnabled"` EnableSbomGeneration bool `mapstructure:"sbomGenerationEnabled"` EnableSeccomp bool `mapstructure:"seccompServiceEnabled"` diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index acb00bd7e2..74f322147e 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -54,7 +54,7 @@ func TestLoadConfig(t *testing.T) { MaxTsProfileSize: 2 * 1024 * 1024, ProfilesCacheRefreshRate: 1 * time.Minute, ProcfsPidScanInterval: 5 * time.Second, - EnablePrometheusExporter: false, + EnableMetricsExporter: false, EnableRuntimeDetection: false, EnableSeccomp: false, SeccompProfileBackend: "storage", diff --git a/pkg/containerprofilemanager/v1/containerprofile_manager.go b/pkg/containerprofilemanager/v1/containerprofile_manager.go index 7b20f44719..f4a5fafb3d 100644 --- a/pkg/containerprofilemanager/v1/containerprofile_manager.go +++ b/pkg/containerprofilemanager/v1/containerprofile_manager.go @@ -21,6 +21,7 @@ import ( "github.com/kubescape/node-agent/pkg/dnsmanager" "github.com/kubescape/node-agent/pkg/k8sclient" "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/rulebindingmanager" "github.com/kubescape/node-agent/pkg/seccompmanager" "github.com/kubescape/node-agent/pkg/storage" @@ -94,6 +95,8 @@ type ContainerProfileManager struct { hostID string completionNotifier objectcache.CompletionNotifier + + lifecycleTracker *otelsetup.ProfileLifecycleTracker } func (cpm *ContainerProfileManager) SetCompletionNotifier(n objectcache.CompletionNotifier) 
{ @@ -132,6 +135,7 @@ func NewContainerProfileManager( containers: make(map[string]*ContainerEntry), maxSniffTimeNotificationChan: make([]chan *containercollection.Container, 0), cloudMetadata: cloudMetadata, + lifecycleTracker: otelsetup.NewProfileLifecycleTracker(), } // Initialize queue diff --git a/pkg/containerprofilemanager/v1/monitoring.go b/pkg/containerprofilemanager/v1/monitoring.go index c7e7f9f051..f579cbc894 100644 --- a/pkg/containerprofilemanager/v1/monitoring.go +++ b/pkg/containerprofilemanager/v1/monitoring.go @@ -18,6 +18,12 @@ import ( // monitorContainer monitors a container and saves its profile periodically func (cpm *ContainerProfileManager) monitorContainer(container *containercollection.Container, watchedContainer *objectcache.WatchedContainerData) error { + cpm.lifecycleTracker.OnLearningStarted( + watchedContainer.ContainerID, + container.K8s.Namespace, + container.K8s.PodName, + container.Runtime.ContainerImageName, + ) for { select { case <-watchedContainer.UpdateDataTicker.C: @@ -40,7 +46,7 @@ func (cpm *ContainerProfileManager) monitorContainer(container *containercollect switch { case errors.Is(err, ContainerHasTerminatedError): if err := cpm.saveProfile(watchedContainer, container, true); err != nil { - logger.L().Error("failed to save container profile on termination", helpers.Error(err), + logger.L().Ctx(cpm.lifecycleTracker.LearningCtx(watchedContainer.ContainerID)).Error("failed to save container profile on termination", helpers.Error(err), helpers.String("containerID", watchedContainer.ContainerID), helpers.String("containerName", container.Runtime.ContainerName), helpers.String("workloadID", watchedContainer.Wlid), @@ -50,6 +56,7 @@ func (cpm *ContainerProfileManager) monitorContainer(container *containercollect if watchedContainer.GetStatus() == objectcache.WatchedContainerStatusCompleted { cpm.notifyCompleted(watchedContainer.ContainerID) } + cpm.lifecycleTracker.OnLearningEnded(watchedContainer.ContainerID, "terminated") 
// Signal ack to lifecycle goroutine if watchedContainer.AckChan != nil { watchedContainer.AckChan <- struct{}{} @@ -59,7 +66,7 @@ func (cpm *ContainerProfileManager) monitorContainer(container *containercollect case errors.Is(err, ContainerReachedMaxTime): watchedContainer.SetStatus(objectcache.WatchedContainerStatusCompleted) if err := cpm.saveProfile(watchedContainer, container, true); err != nil { - logger.L().Error("failed to save container profile on max time", helpers.Error(err), + logger.L().Ctx(cpm.lifecycleTracker.LearningCtx(watchedContainer.ContainerID)).Error("failed to save container profile on max time", helpers.Error(err), helpers.String("containerID", watchedContainer.ContainerID), helpers.String("containerName", container.Runtime.ContainerName), helpers.String("workloadID", watchedContainer.Wlid), @@ -67,6 +74,7 @@ func (cpm *ContainerProfileManager) monitorContainer(container *containercollect helpers.String("completionStatus", string(watchedContainer.GetCompletionStatus()))) } cpm.notifyCompleted(watchedContainer.ContainerID) + cpm.lifecycleTracker.OnLearningEnded(watchedContainer.ContainerID, "completed") // Signal ack to lifecycle goroutine if watchedContainer.AckChan != nil { watchedContainer.AckChan <- struct{}{} @@ -97,15 +105,17 @@ func (cpm *ContainerProfileManager) handleSaveProfileError(err error, watchedCon cpm.deleteContainer(container) cpm.notifyContainerEndOfLife(container) cpm.notifyCompleted(watchedContainer.ContainerID) + cpm.lifecycleTracker.OnLearningEnded(watchedContainer.ContainerID, "too_large") return file.ObjectTooLargeError } else if err.Error() == file.ObjectCompletedError.Error() { watchedContainer.SetStatus(objectcache.WatchedContainerStatusCompleted) cpm.deleteContainer(container) cpm.notifyContainerEndOfLife(container) cpm.notifyCompleted(watchedContainer.ContainerID) + cpm.lifecycleTracker.OnLearningEnded(watchedContainer.ContainerID, "completed") return file.ObjectCompletedError } else { - logger.L().Error("failed 
to save container profile", helpers.Error(err), + logger.L().Ctx(cpm.lifecycleTracker.LearningCtx(watchedContainer.ContainerID)).Error("failed to save container profile", helpers.Error(err), helpers.String("containerID", watchedContainer.ContainerID), helpers.String("containerName", container.Runtime.ContainerName), helpers.String("workloadID", watchedContainer.Wlid), @@ -132,7 +142,7 @@ func (cpm *ContainerProfileManager) saveContainerProfile(watchedContainer *objec slug, err := watchedContainer.InstanceID.GetOneTimeSlug(false) if err != nil { - logger.L().Error("failed to get slug for container profile", helpers.Error(err)) + logger.L().Ctx(cpm.lifecycleTracker.LearningCtx(watchedContainer.ContainerID)).Error("failed to get slug for container profile", helpers.Error(err)) return err } @@ -173,6 +183,10 @@ func (cpm *ContainerProfileManager) saveContainerProfile(watchedContainer *objec helpersv1.ReportSeriesIdMetadataKey: watchedContainer.SeriesID, helpersv1.PreviousReportTimestampMetadataKey: watchedContainer.PreviousReportTimestamp.String(), helpersv1.ReportTimestampMetadataKey: watchedContainer.CurrentReportTimestamp.String(), + helpersv1.OtelSpanIDMetadataKey: cpm.lifecycleTracker.LearningSpanID(watchedContainer.ContainerID), + // Full W3C traceparent so kubescape/storage can create a properly + // parented child span for the aggregation step. 
+ helpersv1.OtelTraceparentMetadataKey: cpm.lifecycleTracker.LearningTraceparent(watchedContainer.ContainerID), }, Labels: objectcache.GetLabels(cpm.cloudMetadata, watchedContainer, false), }, @@ -203,6 +217,8 @@ func (cpm *ContainerProfileManager) saveContainerProfile(watchedContainer *objec return err } + cpm.lifecycleTracker.OnEntrySaved(watchedContainer.ContainerID, containerData.droppedEvents) + logger.L().Debug("container profile saved successfully", helpers.String("containerID", watchedContainer.ContainerID), helpers.String("containerName", container.Runtime.ContainerName), diff --git a/pkg/containerwatcher/v2/container_watcher.go b/pkg/containerwatcher/v2/container_watcher.go index 2fa79594fa..d67e02c90c 100644 --- a/pkg/containerwatcher/v2/container_watcher.go +++ b/pkg/containerwatcher/v2/container_watcher.go @@ -8,6 +8,9 @@ import ( mapset "github.com/deckarep/golang-set/v2" containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + "github.com/kubescape/node-agent/pkg/otelsetup" containerutilsTypes "github.com/inspektor-gadget/inspektor-gadget/pkg/container-utils/types" "github.com/inspektor-gadget/inspektor-gadget/pkg/operators/socketenricher" "github.com/inspektor-gadget/inspektor-gadget/pkg/runtime" @@ -73,8 +76,9 @@ type ContainerWatcher struct { tracerManagerV2 *TracerManager // Worker pool for processing events - workerPool *ants.PoolWithFunc - workerChan chan *events.EnrichedEvent // Channel for worker pool invocation + workerPool *ants.PoolWithFunc + workerChan chan *events.EnrichedEvent // Channel for worker pool invocation + ebpfDropCounter metric.Int64Counter // Third party components thirdPartyTracersInitializers mapset.Set[containerwatcher.CustomTracerInitializer] @@ -172,6 +176,11 @@ func CreateContainerWatcher( return nil, fmt.Errorf("creating worker pool: %w", err) } + ebpfDropCounter, _ := otelsetup.Meter().Int64Counter( + 
"node_agent.ebpf.events_dropped.total", + metric.WithDescription("Total eBPF events dropped due to backpressure or profile drops"), + ) + return &ContainerWatcher{ // Configuration cfg: cfg, @@ -201,6 +210,7 @@ func CreateContainerWatcher( eventEnricher: eventEnricher, workerPool: workerPool, workerChan: make(chan *events.EnrichedEvent, cfg.WorkerChannelSize), + ebpfDropCounter: ebpfDropCounter, // Third party components thirdPartyTracersInitializers: thirdPartyTracers.ThirdPartyTracersInitializers, @@ -473,14 +483,23 @@ func (cw *ContainerWatcher) enrichAndProcess(entry EventEntry) { case cw.workerChan <- enrichedEvent: default: if cw.cfg.BlockEvents { - logger.L().Warning("ContainerWatcher - Worker channel full, blocking until space available", + logger.L().Ctx(context.Background()).Warning("ContainerWatcher - Worker channel full, blocking until space available", helpers.String("eventType", string(entry.EventType)), helpers.String("containerID", entry.ContainerID)) cw.workerChan <- enrichedEvent } else { - logger.L().Warning("ContainerWatcher - Worker channel full, dropping event", + logger.L().Ctx(context.Background()).Warning("ContainerWatcher - Worker channel full, dropping event", helpers.String("eventType", string(entry.EventType)), helpers.String("containerID", entry.ContainerID)) + cw.ebpfDropCounter.Add(context.Background(), + 1, + metric.WithAttributes( + attribute.String("event_type", string(entry.EventType)), + attribute.String("reason", "worker_channel_full"), + ), + ) + cw.containerProfileManager.ReportDroppedEvent(entry.ContainerID) + enrichedEvent.Event.Release() } } } diff --git a/pkg/containerwatcher/v2/event_handler_factory.go b/pkg/containerwatcher/v2/event_handler_factory.go index 06a1878d4c..561532a8a0 100644 --- a/pkg/containerwatcher/v2/event_handler_factory.go +++ b/pkg/containerwatcher/v2/event_handler_factory.go @@ -5,6 +5,9 @@ import ( "runtime/pprof" mapset "github.com/deckarep/golang-set/v2" + "go.opentelemetry.io/otel/attribute" + 
"go.opentelemetry.io/otel/metric" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/goradd/maps" containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" "github.com/kubescape/node-agent/pkg/config" @@ -70,6 +73,7 @@ type EventHandlerFactory struct { dedupCache *dedupcache.DedupCache metrics metricsmanager.MetricsManager dedupSkipSet map[Manager]struct{} // Managers to skip when event is duplicate + ebpfDropCounter metric.Int64Counter } // NewEventHandlerFactory creates a new event handler factory @@ -87,6 +91,11 @@ func NewEventHandlerFactory( rulePolicyReporter *rulepolicy.RulePolicyReporter, dedupCache *dedupcache.DedupCache, ) *EventHandlerFactory { + ebpfDropCounter, _ := otelsetup.Meter().Int64Counter( + "node_agent.ebpf.events_dropped.total", + metric.WithDescription("Total eBPF events dropped due to backpressure or profile drops"), + ) + factory := &EventHandlerFactory{ handlers: make(map[utils.EventType][]Manager), thirdPartyEventReceivers: thirdPartyEventReceivers, @@ -98,6 +107,7 @@ func NewEventHandlerFactory( dedupCache: dedupCache, metrics: metrics, dedupSkipSet: make(map[Manager]struct{}), + ebpfDropCounter: ebpfDropCounter, } // Create adapters for managers that don't implement the Manager interface directly @@ -322,6 +332,13 @@ func (ehf *EventHandlerFactory) ProcessEvent(enrichedEvent *events.EnrichedEvent // Always report dropped events regardless of dedup status if enrichedEvent.Event.HasDroppedEvents() { ehf.containerProfileManager.ReportDroppedEvent(enrichedEvent.Event.GetContainerID()) + ehf.ebpfDropCounter.Add(context.Background(), + 1, + metric.WithAttributes( + attribute.String("event_type", string(enrichedEvent.Event.GetEventType())), + attribute.String("reason", "profile_drop"), + ), + ) } // Get handlers for this event type diff --git a/pkg/containerwatcher/v2/tracers/top.go b/pkg/containerwatcher/v2/tracers/top.go index b72ac77d2d..d0cc2c8f67 100644 --- 
a/pkg/containerwatcher/v2/tracers/top.go +++ b/pkg/containerwatcher/v2/tracers/top.go @@ -86,7 +86,7 @@ func (tt *TopTracer) GetEventType() utils.EventType { // IsEnabled checks if this tracer should be enabled based on configuration func (tt *TopTracer) IsEnabled(cfg config.Config) bool { - return !cfg.DTop && cfg.EnablePrometheusExporter + return !cfg.DTop && cfg.EnableMetricsExporter } // topEventCallback handles top events from the tracer diff --git a/pkg/exporters/alert_bulk_manager.go b/pkg/exporters/alert_bulk_manager.go index 8510ad505c..2d2821e37f 100644 --- a/pkg/exporters/alert_bulk_manager.go +++ b/pkg/exporters/alert_bulk_manager.go @@ -424,7 +424,7 @@ func (abm *AlertBulkManager) processSendQueueItem(item *bulkQueueItem) { // Failed - check if should retry if item.retryCount >= abm.maxRetries { - logger.L().Error("Bulk send failed after max retries", + logger.L().Ctx(context.Background()).Error("Bulk send failed after max retries", helpers.String("containerID", item.containerID), helpers.Int("alertCount", len(item.alerts)), helpers.Int("retries", item.retryCount), @@ -493,7 +493,7 @@ func (abm *AlertBulkManager) drainSendQueue() { case abm.sendQueue <- item: // Enqueued successfully default: - logger.L().Warning("Queue full during drain, dropping bulk", + logger.L().Ctx(context.Background()).Warning("Queue full during drain, dropping bulk", helpers.String("containerID", containerID), helpers.Int("alertCount", len(alerts))) } @@ -516,7 +516,7 @@ func (abm *AlertBulkManager) drainSendQueue() { case <-timeout: remaining := len(abm.sendQueue) if remaining > 0 { - logger.L().Warning("Timeout draining send queue", + logger.L().Ctx(context.Background()).Warning("Timeout draining send queue", helpers.Int("remainingItems", remaining)) } return diff --git a/pkg/exporters/alert_manager.go b/pkg/exporters/alert_manager.go index d87c3be25b..e028ae6036 100644 --- a/pkg/exporters/alert_manager.go +++ b/pkg/exporters/alert_manager.go @@ -133,11 +133,11 @@ func (ame 
*AlertManagerExporter) SendRuleAlert(failedRule types.RuleFailure) { params := alert.NewPostAlertsParams().WithContext(context.Background()).WithAlerts(models.PostableAlerts{&myAlert}) isOK, err := ame.client.Alert.PostAlerts(params) if err != nil { - logger.L().Warning("AlertManagerExporter.SendRuleAlert - error sending alert", helpers.Error(err)) + logger.L().Ctx(context.Background()).Warning("AlertManagerExporter.SendRuleAlert - error sending alert", helpers.Error(err)) return } if isOK == nil { - logger.L().Warning("AlertManagerExporter.SendRuleAlert - alert was not sent successfully") + logger.L().Ctx(context.Background()).Warning("AlertManagerExporter.SendRuleAlert - alert was not sent successfully") return } } @@ -182,11 +182,11 @@ func (ame *AlertManagerExporter) SendMalwareAlert(malwareResult malwaremanager.M params := alert.NewPostAlertsParams().WithContext(context.Background()).WithAlerts(models.PostableAlerts{&myAlert}) isOK, err := ame.client.Alert.PostAlerts(params) if err != nil { - logger.L().Warning("AlertManagerExporter.SendMalwareAlert - error sending alert", helpers.Error(err)) + logger.L().Ctx(context.Background()).Warning("AlertManagerExporter.SendMalwareAlert - error sending alert", helpers.Error(err)) return } if isOK == nil { - logger.L().Warning("AlertManagerExporter.SendMalwareAlert - alert was not sent successfully") + logger.L().Ctx(context.Background()).Warning("AlertManagerExporter.SendMalwareAlert - alert was not sent successfully") return } } diff --git a/pkg/exporters/exporters_bus.go b/pkg/exporters/exporters_bus.go index f6f346bc5a..a73b0db2bc 100644 --- a/pkg/exporters/exporters_bus.go +++ b/pkg/exporters/exporters_bus.go @@ -6,6 +6,7 @@ import ( "github.com/armosec/armoapi-go/armotypes" "github.com/kubescape/node-agent/pkg/hostfimsensor" "github.com/kubescape/node-agent/pkg/malwaremanager" + "github.com/kubescape/node-agent/pkg/metricsmanager" "github.com/kubescape/node-agent/pkg/rulemanager/types" 
"github.com/kubescape/go-logger" @@ -29,7 +30,7 @@ type ExporterBus struct { } // InitExporters initializes all exporters. -func InitExporters(exportersConfig ExportersConfig, clusterName string, nodeName string, cloudMetadata *armotypes.CloudMetadata, clusterUID string, alertSourcePlatform armotypes.AlertSourcePlatform) *ExporterBus { +func InitExporters(exportersConfig ExportersConfig, clusterName string, nodeName string, cloudMetadata *armotypes.CloudMetadata, clusterUID string, alertSourcePlatform armotypes.AlertSourcePlatform, metrics metricsmanager.MetricsManager) *ExporterBus { var exporters []Exporter for _, url := range exportersConfig.AlertManagerExporterUrls { alertMan := InitAlertManagerExporter(url) @@ -56,7 +57,7 @@ func InitExporters(exportersConfig ExportersConfig, clusterName string, nodeName } } if exportersConfig.HTTPExporterConfig != nil { - httpExporter, err := NewHTTPExporter(*exportersConfig.HTTPExporterConfig, clusterName, nodeName, cloudMetadata, clusterUID, alertSourcePlatform) + httpExporter, err := NewHTTPExporter(*exportersConfig.HTTPExporterConfig, clusterName, nodeName, cloudMetadata, clusterUID, alertSourcePlatform, metrics) if err == nil { exporters = append(exporters, httpExporter) } else { diff --git a/pkg/exporters/http_exporter.go b/pkg/exporters/http_exporter.go index 416bfd912f..8ecc9c751f 100644 --- a/pkg/exporters/http_exporter.go +++ b/pkg/exporters/http_exporter.go @@ -14,6 +14,7 @@ import ( "github.com/kubescape/node-agent/pkg/hostfimsensor" "github.com/kubescape/node-agent/pkg/malwaremanager" + "github.com/kubescape/node-agent/pkg/metricsmanager" "github.com/kubescape/node-agent/pkg/rulemanager/types" "github.com/kubescape/go-logger" @@ -73,6 +74,7 @@ type HTTPExporter struct { cloudMetadata *armotypes.CloudMetadata bulkManager *AlertBulkManager alertSourcePlatform armotypes.AlertSourcePlatform + metrics metricsmanager.MetricsManager } type alertMetrics struct { @@ -96,11 +98,16 @@ type HTTPAlertsListSpec struct { } // 
NewHTTPExporter creates a new HTTPExporter instance -func NewHTTPExporter(config HTTPExporterConfig, clusterName, nodeName string, cloudMetadata *armotypes.CloudMetadata, clusterUID string, alertSourcePlatform armotypes.AlertSourcePlatform) (*HTTPExporter, error) { +func NewHTTPExporter(config HTTPExporterConfig, clusterName, nodeName string, cloudMetadata *armotypes.CloudMetadata, clusterUID string, alertSourcePlatform armotypes.AlertSourcePlatform, metricsOpt ...metricsmanager.MetricsManager) (*HTTPExporter, error) { if err := config.Validate(); err != nil { return nil, fmt.Errorf("invalid config: %w", err) } + var metrics metricsmanager.MetricsManager = metricsmanager.NewMetricsNoop() + if len(metricsOpt) > 0 && metricsOpt[0] != nil { + metrics = metricsOpt[0] + } + exporter := &HTTPExporter{ config: config, nodeName: nodeName, @@ -112,6 +119,7 @@ func NewHTTPExporter(config HTTPExporterConfig, clusterName, nodeName string, cl alertMetrics: &alertMetrics{}, cloudMetadata: cloudMetadata, alertSourcePlatform: alertSourcePlatform, + metrics: metrics, } // Initialize bulk manager if bulking is enabled @@ -171,6 +179,7 @@ func (config *HTTPExporterConfig) Validate() error { func (e *HTTPExporter) SendRuleAlert(failedRule types.RuleFailure) { // Check if alert limit is reached first if e.shouldSendLimitAlert() { + e.metrics.ReportAlertSuppressed(failedRule.GetRuleId(), "rate_limit") ctx, cancel := context.WithTimeout(context.Background(), time.Duration(e.config.TimeoutSeconds)*time.Second) defer cancel() if err := e.sendAlertLimitReached(ctx); err != nil { @@ -191,7 +200,7 @@ func (e *HTTPExporter) SendRuleAlert(failedRule types.RuleFailure) { defer cancel() if err := e.sendRuleAlertWithContext(ctx, failedRule); err != nil { - logger.L().Warning("HTTPExporter.SendRuleAlert - failed to send rule alert", helpers.Error(err)) + logger.L().Ctx(ctx).Warning("HTTPExporter.SendRuleAlert - failed to send rule alert", helpers.Error(err)) } } @@ -199,6 +208,7 @@ func (e 
*HTTPExporter) SendRuleAlert(failedRule types.RuleFailure) { func (e *HTTPExporter) SendMalwareAlert(malwareResult malwaremanager.MalwareResult) { // Check if alert limit is reached first if e.shouldSendLimitAlert() { + e.metrics.ReportAlertSuppressed(malwareRuleID, "rate_limit") ctx, cancel := context.WithTimeout(context.Background(), time.Duration(e.config.TimeoutSeconds)*time.Second) defer cancel() if err := e.sendAlertLimitReached(ctx); err != nil { @@ -229,7 +239,7 @@ func (e *HTTPExporter) SendFimAlerts(fimEvents []hostfimsensor.FimEvent) { defer cancel() if err := e.sendFimAlertsWithContext(ctx, fimEvents); err != nil { - logger.L().Warning("HTTPExporter.SendFimAlerts - failed to send FIM alerts", helpers.Error(err)) + logger.L().Ctx(ctx).Warning("HTTPExporter.SendFimAlerts - failed to send FIM alerts", helpers.Error(err)) } } @@ -506,7 +516,7 @@ func (e *HTTPExporter) sendAlertLimitReached(ctx context.Context) error { }, } - logger.L().Warning("Alert limit reached", + logger.L().Ctx(ctx).Warning("Alert limit reached", helpers.Int("alerts", e.alertMetrics.count), helpers.String("since", e.alertMetrics.startTime.Format(time.RFC3339))) diff --git a/pkg/malwaremanager/v1/clamav/clamav.go b/pkg/malwaremanager/v1/clamav/clamav.go index 70ef58f568..5d81394fa9 100644 --- a/pkg/malwaremanager/v1/clamav/clamav.go +++ b/pkg/malwaremanager/v1/clamav/clamav.go @@ -1,6 +1,8 @@ package malwaremanager import ( + "context" + "github.com/cenkalti/backoff/v4" "github.com/dutchcoders/go-clamd" "github.com/kubescape/go-logger" @@ -31,7 +33,7 @@ func CreateClamAVClient(clamavSocket string) (*ClamAVClient, error) { if err := backoff.Retry(func() error { return clamavClient.Ping() }, backoff.NewExponentialBackOff()); err != nil { - logger.L().Error("Error pinging ClamAV", helpers.Error(err)) + logger.L().Ctx(context.Background()).Error("Error pinging ClamAV", helpers.Error(err)) return nil, err } diff --git a/pkg/malwaremanager/v1/clamav/exec.go 
b/pkg/malwaremanager/v1/clamav/exec.go index f10726ba2e..f088bfbfbb 100644 --- a/pkg/malwaremanager/v1/clamav/exec.go +++ b/pkg/malwaremanager/v1/clamav/exec.go @@ -1,6 +1,7 @@ package malwaremanager import ( + "context" "fmt" "os" "strings" @@ -30,7 +31,7 @@ func (c *ClamAVClient) handleExecEvent(event utils.ExecEvent, containerPid uint3 response, err := c.clamd.ScanFile(hostFilePath) if err != nil { - logger.L().Warning("ClamAVClient.handleExecEvent - scanning file", helpers.Error(err)) + logger.L().Ctx(context.Background()).Warning("ClamAVClient.handleExecEvent - scanning file", helpers.Error(err)) return nil } diff --git a/pkg/malwaremanager/v1/clamav/open.go b/pkg/malwaremanager/v1/clamav/open.go index cda0d8e6bf..b46a450656 100644 --- a/pkg/malwaremanager/v1/clamav/open.go +++ b/pkg/malwaremanager/v1/clamav/open.go @@ -1,6 +1,7 @@ package malwaremanager import ( + "context" "os" "slices" "strings" @@ -35,7 +36,7 @@ func (c *ClamAVClient) handleOpenEvent(event utils.OpenEvent, containerPid uint3 response, err := c.clamd.ScanFile(hostFilePath) if err != nil { - logger.L().Warning("ClamAVClient.handleOpenEvent - scanning file", helpers.Error(err)) + logger.L().Ctx(context.Background()).Warning("ClamAVClient.handleOpenEvent - scanning file", helpers.Error(err)) return nil } diff --git a/pkg/malwaremanager/v1/malware_manager.go b/pkg/malwaremanager/v1/malware_manager.go index c5462d9358..7783dde715 100644 --- a/pkg/malwaremanager/v1/malware_manager.go +++ b/pkg/malwaremanager/v1/malware_manager.go @@ -21,7 +21,10 @@ import ( clamavv1 "github.com/kubescape/node-agent/pkg/malwaremanager/v1/clamav" "github.com/kubescape/node-agent/pkg/metricsmanager" "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/utils" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" ) const ( @@ -166,6 +169,26 @@ func (mm *MalwareManager) reportFileExec(event utils.ExecEvent) { result 
= mm.enrichMalwareResult(result) result.SetWorkloadDetails(mm.podToWlid.Get(utils.CreateK8sPodID(event.GetNamespace(), event.GetPod()))) mm.exporter.SendMalwareAlert(result) + mm.metrics.ReportRuleAlert("malware") // constant — AlertName is unbounded cardinality + alertCtx, alertSpan := otelsetup.Tracer().Start(context.Background(), "malware.alert", + trace.WithAttributes( + attribute.String("container.id", containerID), + attribute.String("k8s.namespace.name", result.GetRuntimeAlertK8sDetails().Namespace), + attribute.String("k8s.pod.name", result.GetRuntimeAlertK8sDetails().PodName), + attribute.String("malware.signature", result.GetBasicRuntimeAlert().AlertName), + )) + otelsetup.EmitAlertLogRecord(alertCtx, otelsetup.AlertLogAttrs{ + RuleID: "malware", + AlertType: result.GetBasicRuntimeAlert().AlertName, + ContainerID: containerID, + ContainerName: result.GetRuntimeAlertK8sDetails().ContainerName, + Namespace: result.GetRuntimeAlertK8sDetails().Namespace, + PodName: result.GetRuntimeAlertK8sDetails().PodName, + Image: result.GetRuntimeAlertK8sDetails().Image, + EventType: "malware", + MalwareSignature: result.GetBasicRuntimeAlert().AlertName, + }) + alertSpan.End() } } @@ -210,7 +233,26 @@ func (mm *MalwareManager) reportFileOpen(event utils.OpenEvent) { result = mm.enrichMalwareResult(result) result.SetWorkloadDetails(mm.podToWlid.Get(utils.CreateK8sPodID(event.GetNamespace(), event.GetPod()))) mm.exporter.SendMalwareAlert(result) - mm.metrics.ReportRuleAlert(result.GetBasicRuntimeAlert().AlertName) + mm.metrics.ReportRuleAlert("malware") // constant — AlertName is unbounded cardinality + alertCtx, alertSpan := otelsetup.Tracer().Start(context.Background(), "malware.alert", + trace.WithAttributes( + attribute.String("container.id", containerID), + attribute.String("k8s.namespace.name", result.GetRuntimeAlertK8sDetails().Namespace), + attribute.String("k8s.pod.name", result.GetRuntimeAlertK8sDetails().PodName), + attribute.String("malware.signature", 
result.GetBasicRuntimeAlert().AlertName), + )) + otelsetup.EmitAlertLogRecord(alertCtx, otelsetup.AlertLogAttrs{ + RuleID: "malware", + AlertType: result.GetBasicRuntimeAlert().AlertName, + ContainerID: containerID, + ContainerName: result.GetRuntimeAlertK8sDetails().ContainerName, + Namespace: result.GetRuntimeAlertK8sDetails().Namespace, + PodName: result.GetRuntimeAlertK8sDetails().PodName, + Image: result.GetRuntimeAlertK8sDetails().Image, + EventType: "malware", + MalwareSignature: result.GetBasicRuntimeAlert().AlertName, + }) + alertSpan.End() } } } diff --git a/pkg/metricsmanager/metrics_manager_interface.go b/pkg/metricsmanager/metrics_manager_interface.go index 8762d6ad58..fa090fc699 100644 --- a/pkg/metricsmanager/metrics_manager_interface.go +++ b/pkg/metricsmanager/metrics_manager_interface.go @@ -1,6 +1,7 @@ package metricsmanager import ( + "context" "time" "github.com/kubescape/node-agent/pkg/utils" @@ -15,7 +16,7 @@ type MetricsManager interface { ReportRuleProcessed(ruleID string) ReportRulePrefiltered(ruleName string) ReportRuleAlert(ruleID string) - ReportRuleEvaluationTime(ruleID string, eventType utils.EventType, duration time.Duration) + ReportRuleEvaluationTime(ctx context.Context, ruleID string, eventType utils.EventType, duration time.Duration) //ReportEbpfStats(stats *top.Event[toptypes.Stats]) ReportContainerStart() ReportContainerStop() @@ -48,4 +49,13 @@ type MetricsManager interface { ObserveProfileEntriesRaw(field string, count float64) ObserveProfileEntriesRetained(field string, count float64) ObserveProfileRetentionRatio(field string, ratio float64) + + // SBOM scan metrics. + ReportSBOMScan(status string) + ObserveSBOMScanDuration(status string, d time.Duration) + ReportSBOMScannerRestart() + SetSBOMScannerReady(ready bool) + + // Alert suppression funnel — counts how many alerts were dropped and why. 
+ ReportAlertSuppressed(ruleID, reason string) } diff --git a/pkg/metricsmanager/metrics_manager_mock.go b/pkg/metricsmanager/metrics_manager_mock.go index 02e541aacb..68261f5b26 100644 --- a/pkg/metricsmanager/metrics_manager_mock.go +++ b/pkg/metricsmanager/metrics_manager_mock.go @@ -1,6 +1,7 @@ package metricsmanager import ( + "context" "sync/atomic" "time" @@ -52,7 +53,7 @@ func (m *MetricsMock) ReportRuleAlert(ruleID string) { m.RuleAlertCounter.Set(ruleID, m.RuleAlertCounter.Get(ruleID)+1) } -func (m *MetricsMock) ReportRuleEvaluationTime(ruleID string, eventType utils.EventType, duration time.Duration) { +func (m *MetricsMock) ReportRuleEvaluationTime(_ context.Context, ruleID string, eventType utils.EventType, duration time.Duration) { key := ruleID + ":" + string(eventType) m.RuleEvaluationTime.Set(key, duration) } @@ -89,3 +90,8 @@ func (m *MetricsMock) ObserveProfileProjectedSize(_ float64) {} func (m *MetricsMock) ObserveProfileEntriesRaw(_ string, _ float64) {} func (m *MetricsMock) ObserveProfileEntriesRetained(_ string, _ float64) {} func (m *MetricsMock) ObserveProfileRetentionRatio(_ string, _ float64) {} +func (m *MetricsMock) ReportSBOMScan(_ string) {} +func (m *MetricsMock) ObserveSBOMScanDuration(_ string, _ time.Duration) {} +func (m *MetricsMock) ReportSBOMScannerRestart() {} +func (m *MetricsMock) SetSBOMScannerReady(_ bool) {} +func (m *MetricsMock) ReportAlertSuppressed(_, _ string) {} diff --git a/pkg/metricsmanager/metrics_manager_noop.go b/pkg/metricsmanager/metrics_manager_noop.go index 1216c0fea6..ffa9bac6ae 100644 --- a/pkg/metricsmanager/metrics_manager_noop.go +++ b/pkg/metricsmanager/metrics_manager_noop.go @@ -1,6 +1,7 @@ package metricsmanager import ( + "context" "time" "github.com/kubescape/node-agent/pkg/utils" @@ -18,7 +19,7 @@ func (m *MetricsNoop) ReportFailedEvent() func (m *MetricsNoop) ReportRuleProcessed(_ string) {} func (m *MetricsNoop) ReportRulePrefiltered(_ string) {} func (m *MetricsNoop) ReportRuleAlert(_ 
string) {} -func (m *MetricsNoop) ReportRuleEvaluationTime(_ string, _ utils.EventType, _ time.Duration) {} +func (m *MetricsNoop) ReportRuleEvaluationTime(_ context.Context, _ string, _ utils.EventType, _ time.Duration) {} func (m *MetricsNoop) ReportContainerStart() {} func (m *MetricsNoop) ReportContainerStop() {} func (m *MetricsNoop) ReportDedupEvent(_ utils.EventType, _ bool) {} @@ -44,3 +45,8 @@ func (m *MetricsNoop) ObserveProfileProjectedSize(_ float64) {} func (m *MetricsNoop) ObserveProfileEntriesRaw(_ string, _ float64) {} func (m *MetricsNoop) ObserveProfileEntriesRetained(_ string, _ float64) {} func (m *MetricsNoop) ObserveProfileRetentionRatio(_ string, _ float64) {} +func (m *MetricsNoop) ReportSBOMScan(_ string) {} +func (m *MetricsNoop) ObserveSBOMScanDuration(_ string, _ time.Duration) {} +func (m *MetricsNoop) ReportSBOMScannerRestart() {} +func (m *MetricsNoop) SetSBOMScannerReady(_ bool) {} +func (m *MetricsNoop) ReportAlertSuppressed(_, _ string) {} diff --git a/pkg/metricsmanager/otel/bench_test.go b/pkg/metricsmanager/otel/bench_test.go new file mode 100644 index 0000000000..ac190f2329 --- /dev/null +++ b/pkg/metricsmanager/otel/bench_test.go @@ -0,0 +1,69 @@ +package otelmetrics + +import ( + "context" + "testing" + "time" + + "go.opentelemetry.io/otel" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + + "github.com/kubescape/node-agent/pkg/utils" +) + +// setupBenchmarkMeterProvider installs a real MeterProvider backed by a +// ManualReader (synchronous, no goroutines). This exercises the full SDK +// instrument → aggregation → reader pipeline without network or disk I/O, +// giving stable alloc numbers for comparison with the Prometheus impl. 
+func setupBenchmarkMeterProvider() { + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + otel.SetMeterProvider(mp) +} + +// BenchmarkReportRuleEvaluationTime measures the hot-path cost of recording a +// rule evaluation histogram observation with pre-cached attribute sets. +// Pass criteria (per Phase 2 plan): allocs/op ≤ Prometheus impl AND +// ns/op ≤ 1.1× Prometheus impl (run with -benchmem to verify). +func BenchmarkReportRuleEvaluationTime(b *testing.B) { + setupBenchmarkMeterProvider() + m := NewOTELMetricsManager("") + + // Warm the cache for the key under test so the benchmark measures the + // cached fast path, not the first-call allocation. + m.ReportRuleEvaluationTime(context.Background(), "R1001", utils.ExecveEventType, 3*time.Millisecond) + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + m.ReportRuleEvaluationTime(context.Background(), "R1001", utils.ExecveEventType, 3*time.Millisecond) + } +} + +// BenchmarkReportEvent measures the hot-path cost of incrementing the collapsed +// eBPF events counter with a cached event_type attribute set. +func BenchmarkReportEvent(b *testing.B) { + setupBenchmarkMeterProvider() + m := NewOTELMetricsManager("") + m.ReportEvent(utils.ExecveEventType) // warm cache + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + m.ReportEvent(utils.ExecveEventType) + } +} + +// BenchmarkReportRuleAlert measures the hot-path cost of incrementing the alert +// counter with a cached rule_id attribute set. 
+func BenchmarkReportRuleAlert(b *testing.B) { + setupBenchmarkMeterProvider() + m := NewOTELMetricsManager("") + m.ReportRuleAlert("R1001") // warm cache + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + m.ReportRuleAlert("R1001") + } +} diff --git a/pkg/metricsmanager/otel/otel_metrics_manager.go b/pkg/metricsmanager/otel/otel_metrics_manager.go new file mode 100644 index 0000000000..f0771546e5 --- /dev/null +++ b/pkg/metricsmanager/otel/otel_metrics_manager.go @@ -0,0 +1,511 @@ +package otelmetrics + +import ( + "context" + "fmt" + "sync" + "sync/atomic" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" + + "github.com/kubescape/node-agent/pkg/metricsmanager" + "github.com/kubescape/node-agent/pkg/otelsetup" + "github.com/kubescape/node-agent/pkg/utils" +) + +var _ metricsmanager.MetricsManager = (*OTELMetricsManager)(nil) + +type OTELMetricsManager struct { + // eBPF events — collapsed from 17 individual Prometheus counters into one + // with an event_type attribute (OTEL convention, avoids metric explosion). 
+ ebpfEventsTotal metric.Int64Counter + ebpfFailedTotal metric.Int64Counter + + // Rule metrics + ruleProcTotal metric.Int64Counter + rulePrefiltTotal metric.Int64Counter + alertTotal metric.Int64Counter + ruleEvalDuration metric.Float64Histogram + + // Container lifecycle + containerStartTotal metric.Int64Counter + containerStopTotal metric.Int64Counter + dedupEventsTotal metric.Int64Counter + + // ContainerProfile cache + profileLegacyLoadTotal metric.Int64Counter + profileCacheEntries metric.Float64Gauge + profileCacheHitTotal metric.Int64Counter + reconcilerDuration metric.Float64Histogram + reconcilerEvictionsTotal metric.Int64Counter + + // Rule projection — always-on + projMissingDeclTotal metric.Int64Counter + projUndeclaredLitTotal metric.Int64Counter + projStaleEntries metric.Float64Gauge + projUndeclaredRules metric.Float64Gauge + + // Rule projection — detailed (gated by caller) + projSpecCompileTotal metric.Int64Counter + projSpecHashChangeTotal metric.Int64Counter + projSpecPatterns metric.Float64Gauge + projSpecAllField metric.Float64Gauge + projApplyDuration metric.Float64Histogram + projReconcileTriggeredTotal metric.Int64Counter + projHelperCallTotal metric.Int64Counter + projUndeclaredRulesDetail metric.Float64Gauge + + // Memory-savings metrics (dev-only, kept for interface compat; candidates for removal) + profileRawSize metric.Float64Histogram + profileProjectedSize metric.Float64Histogram + profileEntriesRaw metric.Float64Histogram + profileEntriesRetained metric.Float64Histogram + profileRetentionRatio metric.Float64Histogram + + // SBOM scan metrics + sbomScanTotal metric.Int64Counter + sbomScanDuration metric.Float64Histogram + sbomRestarts metric.Int64Counter + sbomReady metric.Float64Gauge + + // Alert suppression funnel + alertSuppressedTotal metric.Int64Counter + + // Live container count — incremented on start, decremented on stop. + // Exposed as node_agent.container.count observable gauge. 
+ containerCount atomic.Int64 + + // Attribute-set caches: mandatory on the hot path to avoid per-call allocations. + // Each cache maps a string key → metric.MeasurementOption (pre-built attribute set). + ruleIDCache sync.Map // ruleID → MeasurementOption (rule_id attribute) + ruleEvalCache sync.Map // ruleID+"\x00"+eventType → MeasurementOption (rule_id + event_type) + eventTypeCache sync.Map // eventType string → MeasurementOption (event_type attribute) + dedupCache sync.Map // eventType+"\x00"+result → MeasurementOption (event_type + result) + suppressedCache sync.Map // ruleID+"\x00"+reason → MeasurementOption (rule_id + reason) + + // SetProjectionUndeclaredRulesDetail tracks the current rule ID set so that + // removed rules can be zeroed out on the next call (no Reset() in OTEL gauges). + undeclaredRulesMu sync.Mutex + undeclaredRulesSet map[string]struct{} +} + +// NewOTELMetricsManager constructs a fully-initialised OTELMetricsManager. +// MUST be called after otelsetup.InitProviders() so that otelsetup.Meter() +// returns the real MeterProvider, not the SDK no-op. +// +// ownContainerID is this agent's own container ID (from the k8s API); it lets +// the cgroup memory gauges resolve the correct container scope under the +// host-mounted cgroup tree. Pass "" when unknown — the gauges then fall back to +// /proc-based resolution and report 0 if that fails. 
+func NewOTELMetricsManager(ownContainerID string) *OTELMetricsManager { + meter := otelsetup.Meter() + m := &OTELMetricsManager{ + undeclaredRulesSet: make(map[string]struct{}), + } + + mustCounter := func(name, desc string) metric.Int64Counter { + c, err := meter.Int64Counter(name, metric.WithDescription(desc)) + if err != nil { + panic(fmt.Sprintf("otelmetrics: counter %q: %v", name, err)) + } + return c + } + mustGauge := func(name, desc string) metric.Float64Gauge { + g, err := meter.Float64Gauge(name, metric.WithDescription(desc)) + if err != nil { + panic(fmt.Sprintf("otelmetrics: gauge %q: %v", name, err)) + } + return g + } + mustHistogram := func(name, desc, unit string, boundaries []float64) metric.Float64Histogram { + h, err := meter.Float64Histogram(name, + metric.WithDescription(desc), + metric.WithUnit(unit), + metric.WithExplicitBucketBoundaries(boundaries...), + ) + if err != nil { + panic(fmt.Sprintf("otelmetrics: histogram %q: %v", name, err)) + } + return h + } + + // Rule-evaluation buckets: covers P99 in 1–10ms range with a 2s tail bucket. 
+ evalBuckets := []float64{0.0005, 0.001, 0.002, 0.005, 0.010, 0.050, 0.500, 2.0} + defBuckets := []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0} + sizeBuckets := []float64{0, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304} + entryBuckets := []float64{0, 1, 5, 10, 50, 100, 500, 1000, 5000} + ratioBuckets := []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0} + + m.ebpfEventsTotal = mustCounter("node_agent.ebpf.events.total", + "Total eBPF events received, labeled by event_type (collapses 17 Prometheus counters)") + m.ebpfFailedTotal = mustCounter("node_agent.ebpf.events.failed.total", + "Total failed eBPF event processing attempts") + + m.ruleProcTotal = mustCounter("node_agent.rule.processed.total", + "Total rule evaluations by rule_id") + m.rulePrefiltTotal = mustCounter("node_agent.rule.prefiltered.total", + "Total rule evaluations skipped by pre-filter") + m.alertTotal = mustCounter("node_agent.alert.total", + "Total security alerts fired, labeled by rule_id") + m.ruleEvalDuration = mustHistogram("node_agent.rule.evaluation.duration", + "Rule evaluation latency by rule_id and event_type", "s", evalBuckets) + + m.containerStartTotal = mustCounter("node_agent.container.start.total", + "Total container start events") + m.containerStopTotal = mustCounter("node_agent.container.stop.total", + "Total container stop events") + m.dedupEventsTotal = mustCounter("node_agent.ebpf.dedup.total", + "Total events processed by the dedup layer") + + m.profileLegacyLoadTotal = mustCounter("node_agent.profile.legacy_load.total", + "Legacy ApplicationProfile/NetworkNeighborhood loads (deprecated, will be removed)") + m.profileCacheEntries = mustGauge("node_agent.profile.cache.entries", + "Current ContainerProfile cache entries per kind") + m.profileCacheHitTotal = mustCounter("node_agent.profile.cache.hit.total", + "ContainerProfile cache lookups by result (hit/miss)") + m.reconcilerDuration = 
mustHistogram("node_agent.profile.reconciler.duration", + "ContainerProfile reconciler phase duration", "s", defBuckets) + m.reconcilerEvictionsTotal = mustCounter("node_agent.profile.reconciler.evictions.total", + "ContainerProfile cache evictions by reason") + + m.projMissingDeclTotal = mustCounter("node_agent.rule.projection.missing_decl.total", + "Rules with profileDependency>0 but no profileDataRequired declaration") + m.projUndeclaredLitTotal = mustCounter("node_agent.rule.projection.undeclared_literal.total", + "Literals evaluated against an undeclared projected field") + m.projStaleEntries = mustGauge("node_agent.rule.projection.stale_entries", + "Projected cache entries whose spec hash is stale") + m.projUndeclaredRules = mustGauge("node_agent.rule.projection.undeclared_rules", + "Rules currently loaded with no profileDataRequired field") + + m.projSpecCompileTotal = mustCounter("node_agent.rule.projection.spec_compile.total", + "Total projection spec compilations") + m.projSpecHashChangeTotal = mustCounter("node_agent.rule.projection.spec_hash_change.total", + "Total projection spec hash changes") + m.projSpecPatterns = mustGauge("node_agent.rule.projection.spec_patterns", + "Projection spec pattern counts per field and kind") + m.projSpecAllField = mustGauge("node_agent.rule.projection.spec_all_field", + "Whether a projection spec field has All=true (1) or not (0)") + m.projApplyDuration = mustHistogram("node_agent.rule.projection.apply.duration", + "Profile projection Apply call duration", "s", defBuckets) + m.projReconcileTriggeredTotal = mustCounter("node_agent.rule.projection.reconcile_triggered.total", + "Projection reconcile triggers by type") + m.projHelperCallTotal = mustCounter("node_agent.rule.projection.helper_call.total", + "Profile-helper CEL function calls by helper name") + // program runtime gauges intentionally omitted — dead code since initial implementation + m.projUndeclaredRulesDetail = 
mustGauge("node_agent.rule.projection.undeclared_rules_detail", + "Per-rule gauge for undeclared rules (high-cardinality; candidate for removal in Phase 3)") + + m.profileRawSize = mustHistogram("node_agent.profile.raw_size", + "Raw ContainerProfile data size before projection (dev-only)", "By", sizeBuckets) + m.profileProjectedSize = mustHistogram("node_agent.profile.projected_size", + "Projected ContainerProfile data size after projection (dev-only)", "By", sizeBuckets) + m.profileEntriesRaw = mustHistogram("node_agent.profile.entries_raw", + "Entries per field before projection (dev-only)", "{entry}", entryBuckets) + m.profileEntriesRetained = mustHistogram("node_agent.profile.entries_retained", + "Entries per field after projection (dev-only)", "{entry}", entryBuckets) + m.profileRetentionRatio = mustHistogram("node_agent.profile.retention_ratio", + "Entry retention ratio per field after projection (dev-only)", "1", ratioBuckets) + + // SBOM scan buckets: covers 1s–15min (scans can take several minutes for large images). 
+ sbomBuckets := []float64{1, 2, 5, 10, 30, 60, 120, 300, 600, 900} + m.sbomScanTotal = mustCounter("node_agent.sbom.scan.total", + "Total SBOM scan attempts by status (success/error/oom_killed)") + m.sbomScanDuration = mustHistogram("node_agent.sbom.scan.duration", + "SBOM scan duration by status", "s", sbomBuckets) + m.sbomRestarts = mustCounter("node_agent.sbom.scanner.restarts.total", + "Total SBOM scanner sidecar restarts detected via connection loss") + m.sbomReady = mustGauge("node_agent.sbom.scanner.ready", + "Whether the SBOM scanner sidecar is ready (1=ready, 0=not ready)") + + m.alertSuppressedTotal = mustCounter("node_agent.alert.suppressed.total", + "Total alerts suppressed before delivery, labeled by rule_id and reason") + + registerResourceMetrics(meter, &m.containerCount, ownContainerID) + + return m +} + +// Start is a no-op: the Prometheus HTTP listener (if configured) is started +// inside otelsetup.InitProviders when OTEL_METRICS_EXPORTER=prometheus. +func (m *OTELMetricsManager) Start() {} + +// Destroy is a no-op: provider shutdown is handled by the otelsetup shutdown func. 
+func (m *OTELMetricsManager) Destroy() {} + +// ── Attribute-set cache helpers ───────────────────────────────────────────── + +func (m *OTELMetricsManager) ruleIDOption(ruleID string) metric.MeasurementOption { + if v, ok := m.ruleIDCache.Load(ruleID); ok { + return v.(metric.MeasurementOption) + } + opt := metric.WithAttributeSet(attribute.NewSet(attribute.String("rule_id", ruleID))) + m.ruleIDCache.Store(ruleID, opt) + return opt +} + +func (m *OTELMetricsManager) ruleEvalOption(ruleID string, et utils.EventType) metric.MeasurementOption { + key := ruleID + "\x00" + string(et) + if v, ok := m.ruleEvalCache.Load(key); ok { + return v.(metric.MeasurementOption) + } + opt := metric.WithAttributeSet(attribute.NewSet( + attribute.String("rule_id", ruleID), + attribute.String("event_type", string(et)), + )) + m.ruleEvalCache.Store(key, opt) + return opt +} + +func (m *OTELMetricsManager) eventTypeOption(et utils.EventType) metric.MeasurementOption { + key := string(et) + if v, ok := m.eventTypeCache.Load(key); ok { + return v.(metric.MeasurementOption) + } + opt := metric.WithAttributeSet(attribute.NewSet(attribute.String("event_type", key))) + m.eventTypeCache.Store(key, opt) + return opt +} + +func (m *OTELMetricsManager) dedupOption(et utils.EventType, duplicate bool) metric.MeasurementOption { + result := "passed" + if duplicate { + result = "deduplicated" + } + key := string(et) + "\x00" + result + if v, ok := m.dedupCache.Load(key); ok { + return v.(metric.MeasurementOption) + } + opt := metric.WithAttributeSet(attribute.NewSet( + attribute.String("event_type", string(et)), + attribute.String("result", result), + )) + m.dedupCache.Store(key, opt) + return opt +} + +// ── Interface implementation ───────────────────────────────────────────────── + +func (m *OTELMetricsManager) ReportEvent(eventType utils.EventType) { + m.ebpfEventsTotal.Add(context.Background(), 1, m.eventTypeOption(eventType)) +} + +func (m *OTELMetricsManager) ReportFailedEvent() { + 
m.ebpfFailedTotal.Add(context.Background(), 1) +} + +func (m *OTELMetricsManager) ReportRuleProcessed(ruleID string) { + m.ruleProcTotal.Add(context.Background(), 1, m.ruleIDOption(ruleID)) +} + +func (m *OTELMetricsManager) ReportRulePrefiltered(ruleID string) { + m.rulePrefiltTotal.Add(context.Background(), 1, m.ruleIDOption(ruleID)) +} + +func (m *OTELMetricsManager) ReportRuleAlert(ruleID string) { + m.alertTotal.Add(context.Background(), 1, m.ruleIDOption(ruleID)) +} + +func (m *OTELMetricsManager) ReportRuleEvaluationTime(ctx context.Context, ruleID string, eventType utils.EventType, duration time.Duration) { + m.ruleEvalDuration.Record(ctx, duration.Seconds(), m.ruleEvalOption(ruleID, eventType)) +} + +func (m *OTELMetricsManager) ReportContainerStart() { + m.containerStartTotal.Add(context.Background(), 1) + m.containerCount.Add(1) +} + +func (m *OTELMetricsManager) ReportContainerStop() { + m.containerStopTotal.Add(context.Background(), 1) + m.containerCount.Add(-1) +} + +func (m *OTELMetricsManager) ReportDedupEvent(eventType utils.EventType, duplicate bool) { + m.dedupEventsTotal.Add(context.Background(), 1, m.dedupOption(eventType, duplicate)) +} + +func (m *OTELMetricsManager) ReportContainerProfileLegacyLoad(kind, completeness string) { + m.profileLegacyLoadTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("kind", kind), + attribute.String("completeness", completeness), + )) +} + +func (m *OTELMetricsManager) SetContainerProfileCacheEntries(kind string, count float64) { + m.profileCacheEntries.Record(context.Background(), count, metric.WithAttributes( + attribute.String("kind", kind), + )) +} + +func (m *OTELMetricsManager) ReportContainerProfileCacheHit(hit bool) { + result := "hit" + if !hit { + result = "miss" + } + m.profileCacheHitTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("result", result), + )) +} + +func (m *OTELMetricsManager) ReportContainerProfileReconcilerDuration(phase string, 
duration time.Duration) { + m.reconcilerDuration.Record(context.Background(), duration.Seconds(), metric.WithAttributes( + attribute.String("phase", phase), + )) +} + +func (m *OTELMetricsManager) ReportContainerProfileReconcilerEviction(reason string) { + m.reconcilerEvictionsTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("reason", reason), + )) +} + +func (m *OTELMetricsManager) IncMissingProfileDataRequired(ruleID string) { + m.projMissingDeclTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("rule_id", ruleID), + )) +} + +func (m *OTELMetricsManager) IncProjectionUndeclaredLiteral(helper string) { + m.projUndeclaredLitTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("helper", helper), + )) +} + +func (m *OTELMetricsManager) SetProjectionStaleEntries(count float64) { + m.projStaleEntries.Record(context.Background(), count) +} + +func (m *OTELMetricsManager) SetProjectionUndeclaredRules(count float64) { + m.projUndeclaredRules.Record(context.Background(), count) +} + +func (m *OTELMetricsManager) IncProjectionSpecCompile() { + m.projSpecCompileTotal.Add(context.Background(), 1) +} + +func (m *OTELMetricsManager) IncProjectionSpecHashChange() { + m.projSpecHashChangeTotal.Add(context.Background(), 1) +} + +func (m *OTELMetricsManager) SetProjectionSpecPatterns(field, kind string, count float64) { + m.projSpecPatterns.Record(context.Background(), count, metric.WithAttributes( + attribute.String("field", field), + attribute.String("kind", kind), + )) +} + +func (m *OTELMetricsManager) SetProjectionSpecAllField(field string, isAll bool) { + v := float64(0) + if isAll { + v = 1 + } + m.projSpecAllField.Record(context.Background(), v, metric.WithAttributes( + attribute.String("field", field), + )) +} + +func (m *OTELMetricsManager) ObserveProjectionApplyDuration(d time.Duration) { + m.projApplyDuration.Record(context.Background(), d.Seconds()) +} + +func (m *OTELMetricsManager) 
IncProjectionReconcileTriggered(trigger string) { + m.projReconcileTriggeredTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("trigger", trigger), + )) +} + +func (m *OTELMetricsManager) IncHelperCall(helper string) { + m.projHelperCallTotal.Add(context.Background(), 1, metric.WithAttributes( + attribute.String("helper", helper), + )) +} + +// SetProjectionUndeclaredRulesDetail records 1 for each rule currently undeclared +// and 0 for rules that were in the previous call but are no longer undeclared. +// OTEL synchronous gauges have no Reset(); zeroing removed entries is the equivalent. +func (m *OTELMetricsManager) SetProjectionUndeclaredRulesDetail(ruleIDs []string) { + m.undeclaredRulesMu.Lock() + defer m.undeclaredRulesMu.Unlock() + + newSet := make(map[string]struct{}, len(ruleIDs)) + for _, id := range ruleIDs { + newSet[id] = struct{}{} + } + for id := range m.undeclaredRulesSet { + if _, still := newSet[id]; !still { + m.projUndeclaredRulesDetail.Record(context.Background(), 0, metric.WithAttributes( + attribute.String("rule_id", id), + )) + } + } + for _, id := range ruleIDs { + m.projUndeclaredRulesDetail.Record(context.Background(), 1, metric.WithAttributes( + attribute.String("rule_id", id), + )) + } + m.undeclaredRulesSet = newSet +} + +func (m *OTELMetricsManager) ObserveProfileRawSize(bytes float64) { + m.profileRawSize.Record(context.Background(), bytes) +} + +func (m *OTELMetricsManager) ObserveProfileProjectedSize(bytes float64) { + m.profileProjectedSize.Record(context.Background(), bytes) +} + +func (m *OTELMetricsManager) ObserveProfileEntriesRaw(field string, count float64) { + m.profileEntriesRaw.Record(context.Background(), count, metric.WithAttributes( + attribute.String("field", field), + )) +} + +func (m *OTELMetricsManager) ObserveProfileEntriesRetained(field string, count float64) { + m.profileEntriesRetained.Record(context.Background(), count, metric.WithAttributes( + attribute.String("field", field), + )) 
+} + +func (m *OTELMetricsManager) ObserveProfileRetentionRatio(field string, ratio float64) { + m.profileRetentionRatio.Record(context.Background(), ratio, metric.WithAttributes( + attribute.String("field", field), + )) +} + +func (m *OTELMetricsManager) ReportSBOMScan(status string) { + m.sbomScanTotal.Add(context.Background(), 1, metric.WithAttributes(attribute.String("status", status))) +} + +func (m *OTELMetricsManager) ObserveSBOMScanDuration(status string, d time.Duration) { + m.sbomScanDuration.Record(context.Background(), d.Seconds(), metric.WithAttributes(attribute.String("status", status))) +} + +func (m *OTELMetricsManager) ReportSBOMScannerRestart() { + m.sbomRestarts.Add(context.Background(), 1) +} + +func (m *OTELMetricsManager) SetSBOMScannerReady(ready bool) { + v := 0.0 + if ready { + v = 1.0 + } + m.sbomReady.Record(context.Background(), v) +} + +func (m *OTELMetricsManager) suppressedOption(ruleID, reason string) metric.MeasurementOption { + key := ruleID + "\x00" + reason + if v, ok := m.suppressedCache.Load(key); ok { + return v.(metric.MeasurementOption) + } + opt := metric.WithAttributeSet(attribute.NewSet( + attribute.String("rule_id", ruleID), + attribute.String("reason", reason), + )) + m.suppressedCache.Store(key, opt) + return opt +} + +func (m *OTELMetricsManager) ReportAlertSuppressed(ruleID, reason string) { + m.alertSuppressedTotal.Add(context.Background(), 1, m.suppressedOption(ruleID, reason)) +} diff --git a/pkg/metricsmanager/otel/resource_metrics.go b/pkg/metricsmanager/otel/resource_metrics.go new file mode 100644 index 0000000000..89437494a6 --- /dev/null +++ b/pkg/metricsmanager/otel/resource_metrics.go @@ -0,0 +1,262 @@ +package otelmetrics + +import ( + "bufio" + "context" + "io/fs" + "os" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync" + "sync/atomic" + + "go.opentelemetry.io/otel/metric" +) + +const cgroupRoot = "/sys/fs/cgroup" + +// registerResourceMetrics wires up observable gauges for process-level and 
+// host-level resource. Called once from NewOTELMetricsManager; panics on +// instrument creation failure (same policy as mustCounter/mustGauge). +func registerResourceMetrics(meter metric.Meter, containerCount *atomic.Int64, ownContainerID string) { + // Per-process memory gauges (rss + cgroup usage/limit) — shared with the + // sbom-scanner sidecar so both containers report the same memory signals. + RegisterProcessMemoryMetrics(meter, ownContainerID) + + hostMemTotal := readHostMemTotalBytes() + hostCPUCount := int64(runtime.NumCPU()) + + hostMemGauge, err := meter.Int64ObservableGauge("node_agent.host.memory.total_bytes", + metric.WithDescription("Host total physical memory from /proc/meminfo MemTotal"), + metric.WithUnit("By"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.host.memory.total_bytes: " + err.Error()) + } + hostCPUGauge, err := meter.Int64ObservableGauge("node_agent.host.cpu.count", + metric.WithDescription("Host logical CPU count"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.host.cpu.count: " + err.Error()) + } + containerCountGauge, err := meter.Int64ObservableGauge("node_agent.container.count", + metric.WithDescription("Currently observed container count (start − stop events)"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.container.count: " + err.Error()) + } + + _, _ = meter.RegisterCallback(func(_ context.Context, o metric.Observer) error { + o.ObserveInt64(hostMemGauge, hostMemTotal) + o.ObserveInt64(hostCPUGauge, hostCPUCount) + o.ObserveInt64(containerCountGauge, containerCount.Load()) + return nil + }, hostMemGauge, hostCPUGauge, containerCountGauge) +} + +// RegisterProcessMemoryMetrics registers the per-process/per-container memory +// gauges — rss_bytes, cgroup_bytes, cgroup_limit_bytes — on the given meter. +// Both the main agent and the sbom-scanner sidecar call this so each container +// reports its own memory usage and limit (distinguished downstream by +// service.name). 
ownContainerID, when non-empty, lets the cgroup resolver find +// the correct scope under a host-mounted cgroup tree (the main-agent topology); +// pass "" for containers that mount their own namespaced /sys/fs/cgroup (the +// sidecar), where a direct read of the namespace root works. +// +// MUST be called after otelsetup.InitProviders so the real MeterProvider is set. +func RegisterProcessMemoryMetrics(meter metric.Meter, ownContainerID string) { + rssGauge, err := meter.Int64ObservableGauge("node_agent.process.memory.rss_bytes", + metric.WithDescription("Process RSS (resident set size) from /proc/self/status"), + metric.WithUnit("By"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.process.memory.rss_bytes: " + err.Error()) + } + cgroupMemGauge, err := meter.Int64ObservableGauge("node_agent.process.memory.cgroup_bytes", + metric.WithDescription("Container memory usage from cgroupv2 memory.current or cgroupv1 memory.usage_in_bytes"), + metric.WithUnit("By"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.process.memory.cgroup_bytes: " + err.Error()) + } + cgroupLimitGauge, err := meter.Int64ObservableGauge("node_agent.process.memory.cgroup_limit_bytes", + metric.WithDescription("Container memory limit from cgroupv2 memory.max or cgroupv1 memory.limit_in_bytes (0 = unlimited). 
Pair with cgroup_bytes for OOM headroom."), + metric.WithUnit("By"), + ) + if err != nil { + panic("otelmetrics: gauge node_agent.process.memory.cgroup_limit_bytes: " + err.Error()) + } + + _, _ = meter.RegisterCallback(func(_ context.Context, o metric.Observer) error { + o.ObserveInt64(rssGauge, readProcessRSSBytes()) + cur, lim := readCgroupMem(ownContainerID) + o.ObserveInt64(cgroupMemGauge, cur) + o.ObserveInt64(cgroupLimitGauge, lim) + return nil + }, rssGauge, cgroupMemGauge, cgroupLimitGauge) +} + +func readProcessRSSBytes() int64 { + f, err := os.Open("/proc/self/status") + if err != nil { + return 0 + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + line := s.Text() + if strings.HasPrefix(line, "VmRSS:") { + fields := strings.Fields(line) + if len(fields) >= 2 { + kb, _ := strconv.ParseInt(fields[1], 10, 64) + return kb * 1024 + } + } + } + return 0 +} + +var ( + cgroupResolveOnce sync.Once + cgroupCurrentPath string // path to memory.current / memory.usage_in_bytes ("" if unresolved) + cgroupMaxPath string // path to memory.max / memory.limit_in_bytes ("" if unresolved) +) + +// readCgroupMem returns the process's cgroup memory usage and limit in bytes +// (limit 0 = unlimited / unresolved). Paths are resolved once and cached since +// a process never changes cgroup. +func readCgroupMem(ownContainerID string) (current, limit int64) { + cgroupResolveOnce.Do(func() { + cgroupCurrentPath, cgroupMaxPath = resolveCgroupMemoryPaths(ownContainerID) + }) + if cgroupCurrentPath != "" { + if data, err := os.ReadFile(cgroupCurrentPath); err == nil { + current = parseCgroupMemValue(string(data)) + } + } + if cgroupMaxPath != "" { + if data, err := os.ReadFile(cgroupMaxPath); err == nil { + limit = parseCgroupMemValue(string(data)) + } + } + return current, limit +} + +// resolveCgroupMemoryPaths locates this process's cgroup memory files. 
+// +// node-agent runs with a private cgroup namespace (so /proc/self/cgroup +// reports "0::/") while bind-mounting the host's /sys/fs/cgroup over its own, +// so the namespaced path cannot be joined with the host tree — reading the +// fixed root path yields 0. Strategies, in order: +// 1. If our own container ID is known (resolved from the k8s API at startup), +// find the matching *.scope directory in the host cgroup tree. This is the +// node-agent topology. Self-discovery from /proc is unreliable here: +// /proc/self/cgroup is "0::/", and /proc/self/mountinfo is polluted with +// every other container's ID via shared mount propagation of /host. +// 2. Join /proc/self/cgroup with the cgroup root; use it if memory.current +// exists there. This covers both the host cgroup namespace (rel is the full +// path) and a container's own namespaced /sys/fs/cgroup mount, where +// /proc/self/cgroup is "0::/" and the namespace root (cgroupRoot itself) is +// the container's own cgroup — the sbom-scanner sidecar topology. The main +// agent overrides /sys/fs/cgroup with the host tree, whose root has no +// memory.current, so this path no-ops there and strategy 1 wins. +// 3. cgroupv1 fixed mount layout. +func resolveCgroupMemoryPaths(ownContainerID string) (current, max string) { + if ownContainerID != "" { + if dir := findCgroupScopeDir(cgroupRoot, ownContainerID); dir != "" { + return filepath.Join(dir, "memory.current"), filepath.Join(dir, "memory.max") + } + } + // cgroupv2: join /proc/self/cgroup with the root. filepath.Join collapses + // the "0::/" namespace-root case to cgroupRoot itself. + if rel, ok := parseSelfCgroupV2(readFileString("/proc/self/cgroup")); ok { + dir := filepath.Join(cgroupRoot, rel) + if fileExists(filepath.Join(dir, "memory.current")) { + return filepath.Join(dir, "memory.current"), filepath.Join(dir, "memory.max") + } + } + // cgroupv1 fallback (fixed mount layout). 
+ if fileExists("/sys/fs/cgroup/memory/memory.usage_in_bytes") { + return "/sys/fs/cgroup/memory/memory.usage_in_bytes", "/sys/fs/cgroup/memory/memory.limit_in_bytes" + } + return "", "" +} + +// parseSelfCgroupV2 returns the cgroup v2 path from the "0::" line of +// /proc/self/cgroup and ok=true when that line is present (path may be "/", the +// namespace root). ok=false means no cgroupv2 line (e.g. cgroupv1-only). +func parseSelfCgroupV2(content string) (string, bool) { + for _, line := range strings.Split(content, "\n") { + if strings.HasPrefix(line, "0::") { + return strings.TrimPrefix(line, "0::"), true + } + } + return "", false +} + +// findCgroupScopeDir walks the cgroup tree for a "**.scope" directory. +// Returns the first match, or "" if none. Bounded one-time cost (cached caller). +func findCgroupScopeDir(root, id string) string { + var found string + _ = filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { + if err != nil || !d.IsDir() { + return nil //nolint:nilerr // skip unreadable subtrees, keep walking + } + name := d.Name() + if strings.HasSuffix(name, ".scope") && strings.Contains(name, id) { + found = path + return filepath.SkipAll + } + return nil + }) + return found +} + +// parseCgroupMemValue parses a cgroup memory file value; the literal "max" +// (cgroupv2 unlimited) and unparseable input both yield 0. 
+func parseCgroupMemValue(s string) int64 { + s = strings.TrimSpace(s) + if s == "" || s == "max" { + return 0 + } + v, err := strconv.ParseInt(s, 10, 64) + if err != nil { + return 0 + } + return v +} + +func readFileString(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + return string(data) +} + +func fileExists(path string) bool { + _, err := os.Stat(path) + return err == nil +} + +func readHostMemTotalBytes() int64 { + f, err := os.Open("/proc/meminfo") + if err != nil { + return 0 + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + line := s.Text() + if strings.HasPrefix(line, "MemTotal:") { + fields := strings.Fields(line) + if len(fields) >= 2 { + kb, _ := strconv.ParseInt(fields[1], 10, 64) + return kb * 1024 + } + } + } + return 0 +} diff --git a/pkg/metricsmanager/otel/resource_metrics_test.go b/pkg/metricsmanager/otel/resource_metrics_test.go new file mode 100644 index 0000000000..9a0d857127 --- /dev/null +++ b/pkg/metricsmanager/otel/resource_metrics_test.go @@ -0,0 +1,87 @@ +package otelmetrics + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseSelfCgroupV2(t *testing.T) { + tests := []struct { + name string + content string + want string + wantOK bool + }{ + {"host ns full path", "0::/kubepods.slice/kubepods-burstable.slice/pod.slice/cri-containerd-abc.scope", "/kubepods.slice/kubepods-burstable.slice/pod.slice/cri-containerd-abc.scope", true}, + {"private ns root", "0::/", "/", true}, + {"v1 lines only", "12:memory:/kubepods/pod\n11:cpu:/kubepods/pod", "", false}, + {"empty", "", "", false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, ok := parseSelfCgroupV2(tt.content) + assert.Equal(t, tt.want, got) + assert.Equal(t, tt.wantOK, ok) + }) + } +} + +// TestNamespaceRootJoin documents that the "0::/" namespace-root case collapses +// to cgroupRoot itself — the 
sidecar's own-namespaced-mount read path. +func TestNamespaceRootJoin(t *testing.T) { + rel, ok := parseSelfCgroupV2("0::/") + require.True(t, ok) + assert.Equal(t, "/sys/fs/cgroup", filepath.Join(cgroupRoot, rel)) +} + +func TestParseCgroupMemValue(t *testing.T) { + assert.Equal(t, int64(295608320), parseCgroupMemValue("295608320\n")) + assert.Equal(t, int64(0), parseCgroupMemValue("max"), "cgroupv2 unlimited sentinel → 0") + assert.Equal(t, int64(0), parseCgroupMemValue("")) + assert.Equal(t, int64(0), parseCgroupMemValue("garbage")) + assert.Equal(t, int64(766509056), parseCgroupMemValue(" 766509056 ")) +} + +// TestFindCgroupScopeDir builds a fake cgroup tree mirroring the real EKS +// layout and asserts we locate the scope dir by container ID. +func TestFindCgroupScopeDir(t *testing.T) { + const id = "e75962bca00d51fae3534887fbbd77b012464637c93b3be3f397dfa30a2eb8be" + root := t.TempDir() + scope := filepath.Join(root, "kubepods.slice", "kubepods-besteffort.slice", + "kubepods-besteffort-poduid.slice", "cri-containerd-"+id+".scope") + require.NoError(t, os.MkdirAll(scope, 0o755)) + // A sibling scope for a different container must not match. + other := filepath.Join(root, "kubepods.slice", "cri-containerd-"+ + "1111111111111111111111111111111111111111111111111111111111111111.scope") + require.NoError(t, os.MkdirAll(other, 0o755)) + + got := findCgroupScopeDir(root, id) + assert.Equal(t, scope, got) + + assert.Empty(t, findCgroupScopeDir(root, "deadbeef"), "unknown id → no match") +} + +// TestResolveCgroupMemoryPaths_FastPath verifies the /proc/self/cgroup join +// path is preferred when memory.current exists there. We can't override +// /proc/self/cgroup, so this exercises the helper composition indirectly via +// the fallback resolution against a fake tree. 
+func TestReadCgroupMem_EndToEndFakeTree(t *testing.T) { + const id = "abc1230000000000000000000000000000000000000000000000000000000000" + root := t.TempDir() + scope := filepath.Join(root, "kubepods.slice", "cri-containerd-"+id+".scope") + require.NoError(t, os.MkdirAll(scope, 0o755)) + require.NoError(t, os.WriteFile(filepath.Join(scope, "memory.current"), []byte("123456\n"), 0o644)) + require.NoError(t, os.WriteFile(filepath.Join(scope, "memory.max"), []byte("999999\n"), 0o644)) + + dir := findCgroupScopeDir(root, id) + require.NotEmpty(t, dir) + + cur := parseCgroupMemValue(readFileString(filepath.Join(dir, "memory.current"))) + max := parseCgroupMemValue(readFileString(filepath.Join(dir, "memory.max"))) + assert.Equal(t, int64(123456), cur) + assert.Equal(t, int64(999999), max) +} diff --git a/pkg/metricsmanager/prometheus/bench_test.go b/pkg/metricsmanager/prometheus/bench_test.go new file mode 100644 index 0000000000..8e8e2d3768 --- /dev/null +++ b/pkg/metricsmanager/prometheus/bench_test.go @@ -0,0 +1,58 @@ +package metricsmanager + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/kubescape/node-agent/pkg/utils" +) + +// benchPM is a singleton for benchmarks: promauto registers with the global +// Prometheus DefaultRegisterer, which panics on duplicate registration. +var ( + benchPM *PrometheusMetric + benchOnce sync.Once +) + +func getBenchPM() *PrometheusMetric { + benchOnce.Do(func() { benchPM = NewPrometheusMetric() }) + return benchPM +} + +// BenchmarkReportRuleEvaluationTime is the Prometheus baseline for the Phase 2 +// A/B comparison. The OTEL implementation must not exceed this in allocs/op or +// ns/op × 1.1 (per Phase 2 plan hard gate). 
+func BenchmarkReportRuleEvaluationTime(b *testing.B) { + pm := getBenchPM() + pm.ReportRuleEvaluationTime(context.Background(), "R1001", utils.ExecveEventType, 3*time.Millisecond) // warm cache + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + pm.ReportRuleEvaluationTime(context.Background(), "R1001", utils.ExecveEventType, 3*time.Millisecond) + } +} + +func BenchmarkReportEvent(b *testing.B) { + pm := getBenchPM() + pm.ReportEvent(utils.ExecveEventType) + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + pm.ReportEvent(utils.ExecveEventType) + } +} + +func BenchmarkReportRuleAlert(b *testing.B) { + pm := getBenchPM() + pm.ReportRuleAlert("R1001") // warm cache + + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + pm.ReportRuleAlert("R1001") + } +} diff --git a/pkg/metricsmanager/prometheus/prometheus.go b/pkg/metricsmanager/prometheus/prometheus.go index d48a6ea270..ff0656414b 100644 --- a/pkg/metricsmanager/prometheus/prometheus.go +++ b/pkg/metricsmanager/prometheus/prometheus.go @@ -1,6 +1,7 @@ package metricsmanager import ( + "context" "net/http" "sync" "time" @@ -93,6 +94,15 @@ type PrometheusMetric struct { cpProfileEntriesRetainedHistogram *prometheus.HistogramVec cpProfileRetentionRatioHistogram *prometheus.HistogramVec + // SBOM scan metrics + sbomScanCounter *prometheus.CounterVec + sbomScanDuration *prometheus.HistogramVec + sbomRestarts prometheus.Counter + sbomReady prometheus.Gauge + + // Alert suppression funnel + alertSuppressedCounter *prometheus.CounterVec + // Cache to avoid allocating Labels maps on every call ruleCounterCache map[string]prometheus.Counter rulePrefilteredCounterCache map[string]prometheus.Counter @@ -348,6 +358,29 @@ func NewPrometheusMetric() *PrometheusMetric { Buckets: []float64{0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, }, []string{"field"}), + // SBOM scan metrics + sbomScanCounter: promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "sbom_scan_total", + 
Help: "Total SBOM scan attempts", + }, []string{"status"}), + alertSuppressedCounter: promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "node_agent_alert_suppressed_total", + Help: "Total alerts suppressed before delivery, labeled by rule_id and reason", + }, []string{prometheusRuleIdLabel, "reason"}), + sbomScanDuration: promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "sbom_scan_duration_seconds", + Help: "SBOM scan duration in seconds", + Buckets: prometheus.ExponentialBuckets(1, 2, 12), + }, []string{"status"}), + sbomRestarts: promauto.NewCounter(prometheus.CounterOpts{ + Name: "sbom_scanner_restarts_total", + Help: "Total number of SBOM scanner sidecar restarts detected via connection loss", + }), + sbomReady: promauto.NewGauge(prometheus.GaugeOpts{ + Name: "sbom_scanner_ready", + Help: "Whether the SBOM scanner sidecar is connected and healthy (1=ready, 0=not ready)", + }), + // Initialize counter caches ruleCounterCache: make(map[string]prometheus.Counter), rulePrefilteredCounterCache: make(map[string]prometheus.Counter), @@ -420,6 +453,11 @@ func (p *PrometheusMetric) Destroy() { prometheus.Unregister(p.programMapCountGauge) prometheus.Unregister(p.programCpuUsageGauge) prometheus.Unregister(p.programPerCpuUsageGauge) + prometheus.Unregister(p.sbomScanCounter) + prometheus.Unregister(p.alertSuppressedCounter) + prometheus.Unregister(p.sbomScanDuration) + prometheus.Unregister(p.sbomRestarts) + prometheus.Unregister(p.sbomReady) } func (p *PrometheusMetric) ReportEvent(eventType utils.EventType) { @@ -544,7 +582,7 @@ func (p *PrometheusMetric) ReportRuleAlert(ruleID string) { p.getCachedAlertCounter(ruleID).Inc() } -func (p *PrometheusMetric) ReportRuleEvaluationTime(ruleID string, eventType utils.EventType, duration time.Duration) { +func (p *PrometheusMetric) ReportRuleEvaluationTime(_ context.Context, ruleID string, eventType utils.EventType, duration time.Duration) { labels := prometheus.Labels{ prometheusRuleIdLabel: ruleID, 
eventTypeLabel: string(eventType), @@ -670,3 +708,27 @@ func (p *PrometheusMetric) ObserveProfileEntriesRetained(field string, count flo func (p *PrometheusMetric) ObserveProfileRetentionRatio(field string, ratio float64) { p.cpProfileRetentionRatioHistogram.WithLabelValues(field).Observe(ratio) } + +func (p *PrometheusMetric) ReportSBOMScan(status string) { + p.sbomScanCounter.WithLabelValues(status).Inc() +} + +func (p *PrometheusMetric) ObserveSBOMScanDuration(status string, d time.Duration) { + p.sbomScanDuration.WithLabelValues(status).Observe(d.Seconds()) +} + +func (p *PrometheusMetric) ReportSBOMScannerRestart() { + p.sbomRestarts.Inc() +} + +func (p *PrometheusMetric) SetSBOMScannerReady(ready bool) { + if ready { + p.sbomReady.Set(1) + } else { + p.sbomReady.Set(0) + } +} + +func (p *PrometheusMetric) ReportAlertSuppressed(ruleID, reason string) { + p.alertSuppressedCounter.WithLabelValues(ruleID, reason).Inc() +} diff --git a/pkg/nodeprofilemanager/v1/nodeprofile_manager.go b/pkg/nodeprofilemanager/v1/nodeprofile_manager.go index d7415a320e..7782c9945c 100644 --- a/pkg/nodeprofilemanager/v1/nodeprofile_manager.go +++ b/pkg/nodeprofilemanager/v1/nodeprofile_manager.go @@ -11,6 +11,7 @@ import ( "github.com/kubescape/node-agent/pkg/config" "github.com/kubescape/node-agent/pkg/nodeprofilemanager" "github.com/kubescape/node-agent/pkg/objectcache" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/rulemanager" "github.com/kubescape/node-agent/pkg/utils" @@ -18,6 +19,9 @@ import ( "github.com/armosec/utils-k8s-go/armometadata" "github.com/kubescape/go-logger" "github.com/kubescape/go-logger/helpers" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" "golang.org/x/net/context" v1 "k8s.io/api/core/v1" ) @@ -33,8 +37,11 @@ type NodeProfileManager struct { } func NewNodeProfileManager(config config.Config, clusterData armometadata.ClusterConfig, nodeName string, 
k8sObjectCache objectcache.K8sObjectCache, ruleManager rulemanager.RuleManagerClient, cloudMetadata *armotypes.CloudMetadata) *NodeProfileManager { + // Keep the 5s default when timeoutSeconds is unset (0): a zero http.Client + // Timeout means no timeout, which lets a hung POST block the sequential + // send loop for hours (observed via the nodeprofile.send span). timeoutSeconds := 5 - if config.Exporters.HTTPExporterConfig != nil { + if config.Exporters.HTTPExporterConfig != nil && config.Exporters.HTTPExporterConfig.TimeoutSeconds > 0 { timeoutSeconds = config.Exporters.HTTPExporterConfig.TimeoutSeconds } return &NodeProfileManager{ @@ -60,12 +67,21 @@ func (n *NodeProfileManager) Start(ctx context.Context) { profile, err := n.getProfile() if err != nil { logger.L().Ctx(ctx).Warning("NodeProfileManager - get profile", helpers.Error(err)) - } else { - err := n.sendProfile(profile) - if err != nil { - logger.L().Ctx(ctx).Warning("NodeProfileManager - send profile", helpers.Error(err)) - } + continue + } + // Wrap the send in a span so the failure warning below inherits + // trace_id/span_id (span↔log correlation). One span per + // NodeProfileInterval per node-agent pod. 
+ sendCtx, span := otelsetup.Tracer().Start(ctx, "nodeprofile.send", + trace.WithAttributes( + attribute.String("http.url", n.config.Exporters.HTTPExporterConfig.URL+"/v1/nodeprofiles"), + attribute.Int("pod.count", len(profile.PodStatuses)), + )) + if err := n.sendProfile(profile); err != nil { + span.SetStatus(codes.Error, err.Error()) + logger.L().Ctx(sendCtx).Warning("NodeProfileManager - send profile", helpers.Error(err)) } + span.End() } }() } diff --git a/pkg/otelsetup/lifecycle.go b/pkg/otelsetup/lifecycle.go new file mode 100644 index 0000000000..a3d43ff851 --- /dev/null +++ b/pkg/otelsetup/lifecycle.go @@ -0,0 +1,171 @@ +package otelsetup + +import ( + "context" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" +) + +const maxTrackedProfiles = 10_000 + +// ProfileLifecycleTracker manages one long-running trace span per container +// learning period. State transitions are recorded as span events. +type ProfileLifecycleTracker struct { + spans map[string]trace.Span + ctxs map[string]context.Context // span contexts, used to parent child spans + counts map[string]int // checkpoint snapshot count per container (M2 throttle) + startTimes map[string]time.Time + mu sync.Mutex +} + +func NewProfileLifecycleTracker() *ProfileLifecycleTracker { + return &ProfileLifecycleTracker{ + spans: make(map[string]trace.Span), + ctxs: make(map[string]context.Context), + counts: make(map[string]int), + startTimes: make(map[string]time.Time), + } +} + +// OnLearningStarted begins a lifecycle span for the container. 
+func (t *ProfileLifecycleTracker) OnLearningStarted(containerID, namespace, pod, image string) { + t.mu.Lock() + defer t.mu.Unlock() + if len(t.spans) >= maxTrackedProfiles { + t.evictOldest() + } + if existing, ok := t.spans[containerID]; ok { + existing.AddEvent("learning.replaced") + existing.End() + } + spanCtx, span := Tracer().Start(context.Background(), "container.profile.learning", + trace.WithAttributes( + attribute.String("container.id", containerID), + attribute.String("k8s.namespace.name", namespace), + attribute.String("k8s.pod.name", pod), + attribute.String("container.image.name", image), + )) + t.spans[containerID] = span + t.ctxs[containerID] = spanCtx + t.counts[containerID] = 0 + t.startTimes[containerID] = time.Now() +} + +// LearningSpanID returns the hex span ID of the active learning span for the +// given container, or an empty string if no span is tracked. Used by backend +// processors to link their own spans back into this trace. +func (t *ProfileLifecycleTracker) LearningSpanID(containerID string) string { + t.mu.Lock() + span, ok := t.spans[containerID] + t.mu.Unlock() + if !ok { + return "" + } + sc := span.SpanContext() + if !sc.IsValid() { + return "" + } + return sc.SpanID().String() +} + +// LearningTraceparent returns the W3C traceparent header value for the active +// learning span, or an empty string if no span is tracked. Stamp this onto +// storage objects so downstream components (kubescape/storage aggregation) +// can extract the remote span context and create properly parented child spans. 
+func (t *ProfileLifecycleTracker) LearningTraceparent(containerID string) string { + t.mu.Lock() + ctx, ok := t.ctxs[containerID] + t.mu.Unlock() + if !ok { + return "" + } + carrier := propagation.MapCarrier{} + otel.GetTextMapPropagator().Inject(ctx, carrier) + return carrier["traceparent"] +} + +// LearningCtx returns the context carrying the active learning span for the +// given container, or context.Background() if no span is tracked. Pass this +// to logger.L().Ctx(...) at error sites so the log record inherits the +// learning span's trace_id/span_id for span↔log correlation. +func (t *ProfileLifecycleTracker) LearningCtx(containerID string) context.Context { + t.mu.Lock() + ctx, ok := t.ctxs[containerID] + t.mu.Unlock() + if !ok { + return context.Background() + } + return ctx +} + +// OnEntrySaved emits an immediate child span when a checkpoint profile is +// shipped, subject to M2 throttling: spans are emitted on the first snapshot, +// every 10th, and any snapshot that had dropped events. This keeps span +// volume within the per-agent budget while preserving visibility on errors. +func (t *ProfileLifecycleTracker) OnEntrySaved(containerID string, hasDropped bool) { + t.mu.Lock() + ctx, ok := t.ctxs[containerID] + if !ok { + t.mu.Unlock() + return + } + t.counts[containerID]++ + count := t.counts[containerID] + t.mu.Unlock() + if count != 1 && count%10 != 0 && !hasDropped { + return + } + _, child := Tracer().Start(ctx, "container.profile.cp.saved", + trace.WithAttributes( + attribute.String("container.id", containerID), + attribute.Int("snapshot.number", count), + attribute.Bool("has.dropped.events", hasDropped), + ), + ) + child.End() +} + +// OnLearningEnded ends the lifecycle span with the given reason +// ("completed", "evicted", "too_large", "terminated"). 
+func (t *ProfileLifecycleTracker) OnLearningEnded(containerID, reason string) { + t.mu.Lock() + span, ok := t.spans[containerID] + delete(t.spans, containerID) + delete(t.ctxs, containerID) + delete(t.counts, containerID) + delete(t.startTimes, containerID) + t.mu.Unlock() + if ok { + span.AddEvent("learning." + reason) + span.End() + } +} + +// evictOldest force-ends the span with the earliest start time. Must be called with mu held. +func (t *ProfileLifecycleTracker) evictOldest() { + if len(t.spans) == 0 { + return + } + var oldest string + var oldestTime time.Time + first := true + for id, ts := range t.startTimes { + if first || ts.Before(oldestTime) { + oldest, oldestTime = id, ts + first = false + } + } + if span, ok := t.spans[oldest]; ok { + span.AddEvent("learning.evicted_cap_exceeded") + span.End() + } + delete(t.spans, oldest) + delete(t.ctxs, oldest) + delete(t.counts, oldest) + delete(t.startTimes, oldest) +} diff --git a/pkg/otelsetup/otelsetup_test.go b/pkg/otelsetup/otelsetup_test.go new file mode 100644 index 0000000000..5a45b78c98 --- /dev/null +++ b/pkg/otelsetup/otelsetup_test.go @@ -0,0 +1,228 @@ +package otelsetup + +import ( + "context" + "testing" + "time" + + "github.com/kubescape/go-logger" + "github.com/kubescape/go-logger/helpers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/log/global" + "go.opentelemetry.io/otel/log/logtest" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/sdk/trace/tracetest" + "go.opentelemetry.io/otel/trace" +) + +// newTestTracerProvider installs an in-memory tracer provider and returns the +// span recorder and a cleanup func that restores the global provider. 
+func newTestTracerProvider(t *testing.T) (*tracetest.SpanRecorder, func()) { + t.Helper() + rec := tracetest.NewSpanRecorder() + tp := sdktrace.NewTracerProvider(sdktrace.WithSpanProcessor(rec)) + prev := otel.GetTracerProvider() + otel.SetTracerProvider(tp) + return rec, func() { otel.SetTracerProvider(prev) } +} + +// --- SlowEvalThreshold --- + +func TestSlowEvalThreshold_Default(t *testing.T) { + prev := slowEvalThresholdNs.Load() + t.Cleanup(func() { slowEvalThresholdNs.Store(prev) }) + slowEvalThresholdNs.Store(5 * int64(time.Millisecond)) + assert.Equal(t, 5*time.Millisecond, SlowEvalThreshold()) +} + +// --- ProfileLifecycleTracker --- + +func TestProfileLifecycleTracker_StartEndCompleted(t *testing.T) { + rec, cleanup := newTestTracerProvider(t) + defer cleanup() + + tracker := NewProfileLifecycleTracker() + tracker.OnLearningStarted("cid-1", "ns", "pod", "img:latest") + tracker.OnEntrySaved("cid-1", false) // count=1: emitted (first) + tracker.OnEntrySaved("cid-1", false) // count=2: suppressed by M2 throttle + tracker.OnLearningEnded("cid-1", "completed") + + // 1 child CP span (only the first; second is throttled) + 1 parent learning span + spans := rec.Ended() + require.Len(t, spans, 2, "expected 1 cp.saved child span (M2 throttled) + 1 parent learning span") + + assert.Equal(t, "container.profile.cp.saved", spans[0].Name()) + + // Verify snapshot.number attribute on the child + childAttrs := make(map[string]interface{}) + for _, a := range spans[0].Attributes() { + childAttrs[string(a.Key)] = a.Value.AsInterface() + } + assert.Equal(t, "cid-1", childAttrs["container.id"]) + assert.Equal(t, int64(1), childAttrs["snapshot.number"]) + + // Last span is the parent + parent := spans[1] + assert.Equal(t, "container.profile.learning", parent.Name()) + + // Parent has the container attributes + attrs := make(map[string]string) + for _, a := range parent.Attributes() { + attrs[string(a.Key)] = a.Value.AsString() + } + assert.Equal(t, "cid-1", 
attrs["container.id"]) + assert.Equal(t, "ns", attrs["k8s.namespace.name"]) + assert.Equal(t, "pod", attrs["k8s.pod.name"]) + assert.Equal(t, "img:latest", attrs["container.image.name"]) + + // Parent has only the terminal event + events := parent.Events() + require.Len(t, events, 1, "expected only learning.completed on parent") + assert.Equal(t, "learning.completed", events[0].Name) + + // Child span is parented under the learning span + parentSpanID := parent.SpanContext().SpanID() + assert.Equal(t, parentSpanID, spans[0].Parent().SpanID()) +} + +func TestProfileLifecycleTracker_Terminated(t *testing.T) { + rec, cleanup := newTestTracerProvider(t) + defer cleanup() + + tracker := NewProfileLifecycleTracker() + tracker.OnLearningStarted("cid-2", "ns", "pod", "") + tracker.OnLearningEnded("cid-2", "terminated") + + spans := rec.Ended() + require.Len(t, spans, 1) + events := spans[0].Events() + require.Len(t, events, 1) + assert.Equal(t, "learning.terminated", events[0].Name) +} + +func TestProfileLifecycleTracker_EndWithoutStart(t *testing.T) { + rec, cleanup := newTestTracerProvider(t) + defer cleanup() + + tracker := NewProfileLifecycleTracker() + assert.NotPanics(t, func() { + tracker.OnLearningEnded("nonexistent", "completed") + }) + assert.Empty(t, rec.Ended(), "no spans should be emitted for unknown container") +} + +// TestProfileLifecycleTracker_LearningCtx asserts that LearningCtx returns +// the same SpanContext as the active learning span, so logs emitted via +// logger.L().Ctx(LearningCtx(cid)) inherit the learning trace's IDs. 
+func TestProfileLifecycleTracker_LearningCtx(t *testing.T) { + rec, cleanup := newTestTracerProvider(t) + defer cleanup() + + tracker := NewProfileLifecycleTracker() + tracker.OnLearningStarted("cid-x", "ns", "pod", "img:latest") + defer tracker.OnLearningEnded("cid-x", "completed") + + ctx := tracker.LearningCtx("cid-x") + sc := trace.SpanContextFromContext(ctx) + require.True(t, sc.IsValid(), "LearningCtx must carry a valid SpanContext") + + // Cross-check against the recorded learning span's SpanID via a child span. + _, child := otel.Tracer("test").Start(ctx, "child") + child.End() + tracker.OnLearningEnded("cid-x", "completed") + + spans := rec.Ended() + require.Len(t, spans, 2) + // Last is the parent learning span; child should reference it. + var parent, kid trace.SpanContext + for _, s := range spans { + if s.Name() == "container.profile.learning" { + parent = s.SpanContext() + } + if s.Name() == "child" { + kid = s.Parent() + } + } + assert.Equal(t, parent.SpanID(), kid.SpanID(), "child span's parent must be the learning span") + assert.Equal(t, parent.TraceID(), sc.TraceID(), "LearningCtx TraceID must match learning span") +} + +// TestProfileLifecycleTracker_LearningCtx_UnknownContainer asserts that an +// unknown containerID yields a background ctx, not a nil one — keeps log call +// sites safe. +func TestProfileLifecycleTracker_LearningCtx_UnknownContainer(t *testing.T) { + tracker := NewProfileLifecycleTracker() + ctx := tracker.LearningCtx("nonexistent") + require.NotNil(t, ctx) + assert.False(t, trace.SpanContextFromContext(ctx).IsValid()) +} + +// TestZapWarning_AttachesSpanContext is the end-to-end proof: a Warning +// emitted via logger.L().Ctx(spanCtx) reaches the OTEL Logs SDK with the +// span's trace_id/span_id propagated through ctx — which is what ClickHouse +// stamps onto trace_id / span_id columns. +func TestZapWarning_AttachesSpanContext(t *testing.T) { + // 1. Install a span recorder so we have a real span to link against. 
+ _, cleanup := newTestTracerProvider(t) + defer cleanup() + + // 2. Install an in-memory log recorder as the global LoggerProvider + // BEFORE constructing the zap logger — otelzap captures the provider + // at construction time. + logRec := logtest.NewRecorder() + prevLogProvider := global.GetLoggerProvider() + global.SetLoggerProvider(logRec) + defer global.SetLoggerProvider(prevLogProvider) + + // 3. Switch go-logger to zap so logger.L() routes warnings through + // otelzap → the global LoggerProvider we just installed. + logger.InitLogger("zap") + defer logger.InitLogger("none") + + // 4. Start a span and emit a warning bound to its ctx. + ctx, span := otel.Tracer("test").Start(context.Background(), "outer") + logger.L().Ctx(ctx).Warning("correlated warning", helpers.String("k", "v")) + span.End() + + // 5. Find the emitted record and confirm its ctx carries the span. + var found bool + for _, records := range logRec.Result() { + for _, r := range records { + emittedSC := trace.SpanContextFromContext(r.Context) + if emittedSC.SpanID() == span.SpanContext().SpanID() { + found = true + assert.Equal(t, span.SpanContext().TraceID(), emittedSC.TraceID()) + } + } + } + assert.True(t, found, "no log record carried the span's SpanID — span↔log correlation broken") +} + +func TestProfileLifecycleTracker_CapEviction(t *testing.T) { + rec, cleanup := newTestTracerProvider(t) + defer cleanup() + + tracker := NewProfileLifecycleTracker() + tracker.OnLearningStarted("old", "ns", "pod", "") + time.Sleep(time.Millisecond) + tracker.OnLearningStarted("new", "ns", "pod", "") + + tracker.mu.Lock() + tracker.evictOldest() + tracker.mu.Unlock() + + spans := rec.Ended() + require.Len(t, spans, 1) + events := spans[0].Events() + require.Len(t, events, 1) + assert.Equal(t, "learning.evicted_cap_exceeded", events[0].Name) + + tracker.mu.Lock() + _, newExists := tracker.spans["new"] + _, oldExists := tracker.spans["old"] + tracker.mu.Unlock() + assert.True(t, newExists) + 
assert.False(t, oldExists) +} diff --git a/pkg/otelsetup/setup.go b/pkg/otelsetup/setup.go new file mode 100644 index 0000000000..4acb1b5c36 --- /dev/null +++ b/pkg/otelsetup/setup.go @@ -0,0 +1,194 @@ +// Package otelsetup is a node-agent-specific wrapper around +// github.com/kubescape/go-logger/otelsetup. It delegates provider +// initialisation to the shared package and adds the node-agent-specific +// slow-evaluation threshold, named accessors, and structured alert log +// emission. +package otelsetup + +import ( + "context" + "errors" + "fmt" + "net" + "net/http" + "os" + "strconv" + "sync/atomic" + "time" + + "github.com/kubescape/go-logger" + "github.com/kubescape/go-logger/helpers" + gotelsetup "github.com/kubescape/go-logger/otelsetup" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.opentelemetry.io/otel" + promexporter "go.opentelemetry.io/otel/exporters/prometheus" + otellog "go.opentelemetry.io/otel/log" + "go.opentelemetry.io/otel/log/global" + "go.opentelemetry.io/otel/metric" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "go.opentelemetry.io/otel/trace" +) + +// ProviderConfig is a type alias for the shared config so callers in this +// module need not import go-logger/otelsetup directly. +type ProviderConfig = gotelsetup.ProviderConfig + +// slowEvalThresholdNs is the configured slow-evaluation threshold in +// nanoseconds. Set inside InitProviders from OTEL_SLOW_EVAL_THRESHOLD_MS +// (default 5ms) and read by callers via SlowEvalThreshold(). +var slowEvalThresholdNs atomic.Int64 + +// SlowEvalThreshold returns the threshold above which rule evaluations should +// emit a trace span. +func SlowEvalThreshold() time.Duration { + return time.Duration(slowEvalThresholdNs.Load()) +} + +// Tracer returns the global node-agent Tracer. 
+func Tracer() trace.Tracer {
+	return otel.GetTracerProvider().Tracer("node-agent")
+}
+
+// Logger returns the global node-agent Logger.
+func Logger() otellog.Logger {
+	return global.GetLoggerProvider().Logger("node-agent")
+}
+
+// Meter returns the global node-agent Meter.
+func Meter() metric.Meter {
+	return otel.GetMeterProvider().Meter("node-agent")
+}
+
+// InitProviders initialises OTEL providers via the shared go-logger package
+// and resolves the slow-evaluation threshold from OTEL_SLOW_EVAL_THRESHOLD_MS
+// (positive integer milliseconds, default 5; other values keep the default).
+// When OTEL_METRICS_EXPORTER=prometheus, a Prometheus scrape endpoint on
+// :8080/metrics replaces OTLP metric export; traces and logs are unaffected.
+func InitProviders(ctx context.Context, cfg ProviderConfig) (shutdown func(context.Context) error, err error) {
+	// Clamp: a huge setting would overflow the ms→ns conversion and could wrap
+	// negative, making every rule evaluation count as slow.
+	const maxThresholdMs = int64(1<<63-1) / int64(time.Millisecond)
+	thresholdMs := int64(5)
+	if v := os.Getenv("OTEL_SLOW_EVAL_THRESHOLD_MS"); v != "" {
+		if parsed, perr := strconv.ParseInt(v, 10, 64); perr == nil && parsed > 0 {
+			thresholdMs = min(parsed, maxThresholdMs)
+		}
+	}
+	slowEvalThresholdNs.Store(thresholdMs * int64(time.Millisecond))
+
+	baseShutdown, err := gotelsetup.InitProviders(ctx, cfg)
+	if err != nil {
+		return nil, err
+	}
+
+	// Prometheus metrics mode replaces whatever OTLP metric exporter go-logger
+	// configured with a scrape endpoint; the two are mutually exclusive.
+	if os.Getenv("OTEL_METRICS_EXPORTER") != "prometheus" {
+		return baseShutdown, nil
+	}
+	promShutdown, perr := initPrometheusMeterProvider(cfg)
+	if perr != nil {
+		// Soft-fail: port conflict (e.g. sidecar process when main agent owns
+		// :8080) or other listener issue; traces and logs remain available.
+		logger.L().Warning("otelsetup: Prometheus metrics listener unavailable, continuing without metrics", helpers.Error(perr))
+		return baseShutdown, nil
+	}
+	return func(ctx context.Context) error {
+		return errors.Join(baseShutdown(ctx), promShutdown(ctx))
+	}, nil
+}
+
+// initPrometheusMeterProvider creates a prometheus.Exporter-backed MeterProvider,
+// registers it as the global provider, and starts an HTTP server on :8080/metrics.
+// Returns a shutdown func that stops the HTTP server and flushes the provider.
+func initPrometheusMeterProvider(cfg ProviderConfig) (func(context.Context) error, error) {
+	res, err := resource.Merge(resource.Default(), resource.NewSchemaless(
+		semconv.ServiceName(cfg.ServiceName),
+		semconv.ServiceVersion(cfg.ServiceVersion),
+		semconv.K8SClusterName(cfg.ClusterName),
+		semconv.K8SNodeName(cfg.NodeName),
+		semconv.K8SPodName(cfg.PodName),
+		semconv.K8SNamespaceName(cfg.Namespace),
+	))
+	if err != nil {
+		return nil, fmt.Errorf("otelsetup: prometheus resource: %w", err)
+	}
+
+	// Bind the port before touching the global MeterProvider so a port-conflict
+	// error leaves the existing provider intact (the sidecar soft-fail path).
+	ln, err := net.Listen("tcp", ":8080")
+	if err != nil {
+		return nil, fmt.Errorf("otelsetup: prometheus metrics listener: %w", err)
+	}
+
+	promExp, err := promexporter.New()
+	if err != nil {
+		_ = ln.Close()
+		return nil, fmt.Errorf("otelsetup: prometheus exporter: %w", err)
+	}
+
+	mp := sdkmetric.NewMeterProvider(
+		sdkmetric.WithReader(promExp),
+		sdkmetric.WithResource(res),
+	)
+	otel.SetMeterProvider(mp)
+
+	mux := http.NewServeMux()
+	mux.Handle("/metrics", promhttp.Handler())
+	srv := &http.Server{
+		Handler:           mux,
+		ReadHeaderTimeout: 5 * time.Second,
+	}
+	go func() {
+		if serr := srv.Serve(ln); serr != nil && !errors.Is(serr, http.ErrServerClosed) {
+			logger.L().Warning("otelsetup: prometheus metrics server stopped", helpers.Error(serr))
+		}
+	}()
+
+	return func(ctx context.Context) error {
+		return errors.Join(srv.Shutdown(ctx), mp.Shutdown(ctx))
+	}, nil
+}
+
+// AlertLogAttrs is the structured attribute payload for EmitAlertLogRecord.
+// MalwareSignature is optional — leave empty for non-malware alerts.
+type AlertLogAttrs struct {
+	RuleID           string
+	AlertType        string
+	ContainerID      string
+	ContainerName    string
+	Namespace        string
+	PodName          string
+	Image            string
+	EventType        string
+	MalwareSignature string
+}
+
+// EmitAlertLogRecord emits a structured "SecurityAlert" log record carrying
+// all alert dimensions as record attributes so the back-office can
+// index/filter without parsing the body.
+func EmitAlertLogRecord(ctx context.Context, attrs AlertLogAttrs) {
+	var rec otellog.Record
+	rec.SetBody(otellog.StringValue("SecurityAlert"))
+	rec.SetSeverityText("WARN")
+	rec.SetSeverity(otellog.SeverityWarn1)
+	stamp := time.Now() // event time and observed time are the same instant
+	rec.SetObservedTimestamp(stamp)
+	rec.SetTimestamp(stamp)
+	rec.AddAttributes(
+		otellog.String("rule_id", attrs.RuleID),
+		otellog.String("alert_type", attrs.AlertType),
+		otellog.String("container.id", attrs.ContainerID),
+		otellog.String("container_name", attrs.ContainerName),
+		otellog.String("namespace", attrs.Namespace),
+		otellog.String("pod_name", attrs.PodName),
+		otellog.String("image", attrs.Image),
+		otellog.String("event_type", attrs.EventType),
+	)
+	if sig := attrs.MalwareSignature; sig != "" {
+		rec.AddAttributes(otellog.String("malware.signature", sig))
+	}
+	Logger().Emit(ctx, rec)
+}
diff --git a/pkg/rulemanager/rule_manager.go b/pkg/rulemanager/rule_manager.go index e000771702..a25e5889db 100644 --- a/pkg/rulemanager/rule_manager.go +++ b/pkg/rulemanager/rule_manager.go @@ -6,11 +6,13 @@ import ( "fmt" "runtime/pprof" "strconv" + "sync" "time" "github.com/armosec/armoapi-go/armotypes" mapset "github.com/deckarep/golang-set/v2" "github.com/goradd/maps" + "github.com/hashicorp/golang-lru/v2/expirable" containercollection "github.com/inspektor-gadget/inspektor-gadget/pkg/container-collection" "github.com/kubescape/go-logger" "github.com/kubescape/go-logger/helpers" @@ -32,11 +34,15 @@ import ( "github.com/kubescape/node-agent/pkg/rulemanager/profilehelper" "github.com/kubescape/node-agent/pkg/rulemanager/ruleadapters" "github.com/kubescape/node-agent/pkg/rulemanager/rulecooldown" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/rulemanager/types" typesv1 "github.com/kubescape/node-agent/pkg/rulemanager/types/v1" "github.com/kubescape/node-agent/pkg/utils" corev1 "k8s.io/api/core/v1" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/trace" )
const ( @@ -64,6 +70,8 @@ type RuleManager struct { rulePolicyValidator *RulePolicyValidator mntnsRegistry contextdetection.Registry detectorManager *detectors.DetectorManager + alertLogDedup *expirable.LRU[string, struct{}] + alertLogDedupMu sync.Mutex } var _ RuleManagerClient = (*RuleManager)(nil) @@ -107,6 +115,7 @@ func CreateRuleManager( rulePolicyValidator: rulePolicyValidator, mntnsRegistry: mntnsRegistry, detectorManager: detectorManager, + alertLogDedup: expirable.NewLRU[string, struct{}](1000, nil, 60*time.Second), } // Compile the initial projection spec and start a goroutine that @@ -293,6 +302,7 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) } if len(rules) == 0 { + rm.metrics.ReportAlertSuppressed("", "no_rules_for_pod") return } @@ -324,6 +334,7 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) // Skip profile dependency checks for non-K8s contexts (profiles are K8s-specific) // Only K8s contexts should enforce profile dependencies if isK8sContext && !profileExists && rule.ProfileDependency == armotypes.Required { + rm.metrics.ReportAlertSuppressed(rule.ID, "profile_incomplete") continue } @@ -338,12 +349,13 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) eventFields = extractEventFields(enrichedEvent.Event) } if rule.Prefilter.ShouldSkip(&eventFields) { - rm.metrics.ReportRulePrefiltered(rule.Name) + rm.metrics.ReportRulePrefiltered(rule.ID) continue } } if rule.SupportPolicy && rm.validateRulePolicy(rule, enrichedEvent.Event, enrichedEvent.ContainerID) { + rm.metrics.ReportAlertSuppressed(rule.ID, "policy") continue } @@ -358,10 +370,33 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) shouldAlert, err = rm.celEvaluator.EvaluateRuleWithContext(evalContext, eventType, ruleExpressions) }) evaluationTime := time.Since(startTime) - rm.metrics.ReportRuleEvaluationTime(rule.Name, eventType, evaluationTime) + // 
Slow-path tracing: only emit a span when evaluation exceeded the threshold. + // This protects the hot path from unconditional tracing overhead on millions of events/sec. + // errCtx tracks the spanned context (when a rule.evaluate span fires) so the + // failure log below inherits its trace_id/span_id — otherwise falls back to rm.ctx. + errCtx := rm.ctx + if evaluationTime >= otelsetup.SlowEvalThreshold() { + evalCtx, span := otelsetup.Tracer().Start(rm.ctx, "rule.evaluate", + trace.WithAttributes( + attribute.String("rule.id", rule.ID), + attribute.String("event.type", string(eventType)), + attribute.String("container.id", enrichedEvent.ContainerID), + attribute.Float64("eval.duration_ms", float64(evaluationTime.Milliseconds())), + attribute.Bool("alert_fired", shouldAlert), + )) + if err != nil { + span.SetStatus(codes.Error, err.Error()) + } + rm.metrics.ReportRuleEvaluationTime(evalCtx, rule.ID, eventType, evaluationTime) + span.End() + errCtx = evalCtx + } else { + rm.metrics.ReportRuleEvaluationTime(rm.ctx, rule.ID, eventType, evaluationTime) + } if err != nil { - logger.L().Error("RuleManager.ReportEnrichedEvent - failed to evaluate rule", helpers.Error(err), helpers.String("rule", rule.ID), helpers.String("eventType", string(eventType))) + logger.L().Ctx(errCtx).Error("RuleManager.ReportEnrichedEvent - failed to evaluate rule", helpers.Error(err), helpers.String("rule", rule.ID), helpers.String("eventType", string(eventType))) + rm.metrics.ReportAlertSuppressed(rule.ID, "eval_error") continue } @@ -370,7 +405,7 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) if eventType == utils.HTTPEventType { // TODO: Manage state evaluation in a better way (this is abuse of the state map, we need a better way to pass payloads from rules.) 
state = rm.evaluateHTTPPayloadState(rule.State, enrichedEvent) } - rm.metrics.ReportRuleAlert(rule.Name) + rm.metrics.ReportRuleAlert(rule.ID) message, uniqueID, err := rm.getUniqueIdAndMessage(enrichedEvent, rule) if err != nil { logger.L().Error("RuleManager - failed to get unique ID and message", helpers.Error(err)) @@ -378,9 +413,47 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) } if shouldCooldown, _ := rm.ruleCooldown.ShouldCooldown(uniqueID, enrichedEvent.ContainerID, rule.ID); shouldCooldown { + rm.metrics.ReportAlertSuppressed(rule.ID, "cooldown") continue } + // Emit OTEL log after cooldown so suppressed alerts are not recorded. + // Dedup key includes eventType to avoid collapsing distinct alert types. + dedupKey := rule.ID + "|" + enrichedEvent.ContainerID + "|" + string(eventType) + rm.alertLogDedupMu.Lock() + alreadySeen := rm.alertLogDedup.Contains(dedupKey) + if !alreadySeen { + rm.alertLogDedup.Add(dedupKey, struct{}{}) + } + rm.alertLogDedupMu.Unlock() + if !alreadySeen { + var image, containerName string + if enrichable, ok := enrichedEvent.Event.(utils.EnrichEvent); ok { + image = enrichable.GetContainerImage() + containerName = enrichable.GetContainer() + } + alertCtx, alertSpan := otelsetup.Tracer().Start(rm.ctx, "rule.alert", + trace.WithAttributes( + attribute.String("rule.id", rule.ID), + attribute.String("rule.name", rule.Name), + attribute.String("k8s.namespace.name", namespace), + attribute.String("k8s.pod.name", pod), + attribute.String("container.id", enrichedEvent.ContainerID), + attribute.String("event.type", string(eventType)), + )) + otelsetup.EmitAlertLogRecord(alertCtx, otelsetup.AlertLogAttrs{ + RuleID: rule.ID, + AlertType: rule.Name, + ContainerID: enrichedEvent.ContainerID, + ContainerName: containerName, + Namespace: namespace, + PodName: pod, + Image: image, + EventType: string(eventType), + }) + alertSpan.End() + } + ruleFailure := rm.ruleFailureCreator.CreateRuleFailure(rule, 
enrichedEvent, rm.objectCache, message, uniqueID, apChecksum, state) if ruleFailure == nil { logger.L().Error("RuleManager - failed to create rule failure", helpers.String("rule", rule.Name), @@ -394,7 +467,7 @@ func (rm *RuleManager) ReportEnrichedEvent(enrichedEvent *events.EnrichedEvent) ruleFailure.SetWorkloadDetails(details) rm.exporter.SendRuleAlert(ruleFailure) } - rm.metrics.ReportRuleProcessed(rule.Name) + rm.metrics.ReportRuleProcessed(rule.ID) } } @@ -491,10 +564,10 @@ func (rm *RuleManager) EvaluatePolicyRulesForEvent(eventType utils.EventType, ev shouldAlert, err = rm.celEvaluator.EvaluateRuleWithContext(evalContext, eventType, ruleExpressions) }) evaluationTime := time.Since(startTime) - rm.metrics.ReportRuleEvaluationTime(rule.ID, eventType, evaluationTime) + rm.metrics.ReportRuleEvaluationTime(rm.ctx, rule.ID, eventType, evaluationTime) if err != nil { - logger.L().Error("RuleManager.EvaluatePolicyRulesForEvent - failed to evaluate rule", helpers.Error(err), helpers.String("rule", rule.ID), helpers.String("eventType", string(eventType))) + logger.L().Ctx(rm.ctx).Error("RuleManager.EvaluatePolicyRulesForEvent - failed to evaluate rule", helpers.Error(err), helpers.String("rule", rule.ID), helpers.String("eventType", string(eventType))) continue } @@ -534,11 +607,11 @@ func (rm *RuleManager) getRuleExpressions(rule typesv1.Rule, eventType utils.Eve func (rm *RuleManager) getUniqueIdAndMessage(enrichedEvent *events.EnrichedEvent, rule typesv1.Rule) (string, string, error) { message, err := rm.celEvaluator.EvaluateExpression(enrichedEvent, rule.Expressions.Message) if err != nil { - logger.L().Error("RuleManager - failed to evaluate message", helpers.Error(err)) + logger.L().Ctx(rm.ctx).Error("RuleManager - failed to evaluate message", helpers.Error(err)) } uniqueID, err := rm.celEvaluator.EvaluateExpression(enrichedEvent, rule.Expressions.UniqueID) if err != nil { - logger.L().Error("RuleManager - failed to evaluate unique ID", helpers.Error(err)) + 
logger.L().Ctx(rm.ctx).Error("RuleManager - failed to evaluate unique ID", helpers.Error(err)) } uniqueID = hashStringToMD5(uniqueID) diff --git a/pkg/sbommanager/v1/metrics.go b/pkg/sbommanager/v1/metrics.go deleted file mode 100644 index 6a553b65a9..0000000000 --- a/pkg/sbommanager/v1/metrics.go +++ /dev/null @@ -1,29 +0,0 @@ -package v1 - -import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var ( - sbomScanTotal = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "sbom_scan_total", - Help: "Total SBOM scan attempts", - }, []string{"status"}) - - sbomScanDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "sbom_scan_duration_seconds", - Help: "SBOM scan duration in seconds", - Buckets: prometheus.ExponentialBuckets(1, 2, 12), - }, []string{"status"}) - - sbomScannerRestartsTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "sbom_scanner_restarts_total", - Help: "Total number of SBOM scanner sidecar restarts detected via connection loss", - }) - - sbomScannerReady = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "sbom_scanner_ready", - Help: "Whether the SBOM scanner sidecar is connected and healthy (1=ready, 0=not ready)", - }) -) diff --git a/pkg/sbommanager/v1/sbom_manager.go b/pkg/sbommanager/v1/sbom_manager.go index 2f6d059b93..1d0e426826 100644 --- a/pkg/sbommanager/v1/sbom_manager.go +++ b/pkg/sbommanager/v1/sbom_manager.go @@ -33,6 +33,7 @@ import ( helpersv1 "github.com/kubescape/k8s-interface/instanceidhandler/v1/helpers" "github.com/kubescape/k8s-interface/names" "github.com/kubescape/node-agent/pkg/config" + "github.com/kubescape/node-agent/pkg/metricsmanager" "github.com/kubescape/node-agent/pkg/objectcache" "github.com/kubescape/node-agent/pkg/sbommanager" "github.com/kubescape/node-agent/pkg/sbommanager/v1/syftutil" @@ -44,6 +45,7 @@ import ( "github.com/moby/sys/mountinfo" "github.com/opencontainers/go-digest" "github.com/spf13/afero" + 
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -90,11 +92,15 @@ type SbomManager struct { pendingOrder []string pendingMu sync.Mutex failureReporter sbommanager.SbomFailureReporter + metrics metricsmanager.MetricsManager } var _ sbommanager.SbomManagerClient = (*SbomManager)(nil) -func CreateSbomManager(ctx context.Context, cfg config.Config, socketPath string, storageClient storage.SbomClient, k8sObjectCache objectcache.K8sObjectCache, scannerClient sbomscanner.SBOMScannerClient, failureReporter sbommanager.SbomFailureReporter) (*SbomManager, error) { +func CreateSbomManager(ctx context.Context, cfg config.Config, socketPath string, storageClient storage.SbomClient, k8sObjectCache objectcache.K8sObjectCache, scannerClient sbomscanner.SBOMScannerClient, failureReporter sbommanager.SbomFailureReporter, metrics metricsmanager.MetricsManager) (*SbomManager, error) { + if metrics == nil { + metrics = &metricsmanager.MetricsNoop{} + } // read HOST_ROOT from env hostRoot, exists := os.LookupEnv("HOST_ROOT") if !exists { @@ -113,6 +119,7 @@ func CreateSbomManager(ctx context.Context, cfg config.Config, socketPath string d := net.Dialer{Timeout: 2 * time.Second} return d.DialContext(ctx, "unix", socketPath) }), + grpc.WithStatsHandler(otelgrpc.NewClientHandler()), ) var scannerMemLimit int64 if memStr, ok := os.LookupEnv("SCANNER_MEMORY_LIMIT"); ok { @@ -136,6 +143,7 @@ func CreateSbomManager(ctx context.Context, cfg config.Config, socketPath string scanRetries: make(map[string]int), pendingScans: make(map[string]pendingScan), failureReporter: failureReporter, + metrics: metrics, } if scannerClient != nil { sm.startScannerReadinessWatcher() @@ -373,7 +381,7 @@ func (s *SbomManager) processContainerWithMetadata(notif containercollection.Pub scanStart := time.Now() if s.scannerClient != nil && s.scannerClient.Ready() { - 
sbomScannerReady.Set(1) + s.metrics.SetSBOMScannerReady(true) // sidecar path: delegate SBOM creation to the scanner sidecar imageStatusBytes, marshalErr := json.Marshal(imageStatus) if marshalErr != nil { @@ -396,17 +404,17 @@ func (s *SbomManager) processContainerWithMetadata(notif containercollection.Pub Timeout: 15 * time.Minute, }) if scanErr != nil { - scanDuration := time.Since(scanStart).Seconds() + scanDuration := time.Since(scanStart) if errors.Is(scanErr, sbomscanner.ErrScannerCrashed) { - sbomScanTotal.WithLabelValues("oom_killed").Inc() - sbomScanDuration.WithLabelValues("oom_killed").Observe(scanDuration) - sbomScannerRestartsTotal.Inc() - sbomScannerReady.Set(0) + s.metrics.ReportSBOMScan("oom_killed") + s.metrics.ObserveSBOMScanDuration("oom_killed", scanDuration) + s.metrics.ReportSBOMScannerRestart() + s.metrics.SetSBOMScannerReady(false) s.handleScannerCrash(sbomName, wipSbom, notif, scanErr, imageTag, imageID) return } - sbomScanTotal.WithLabelValues("error").Inc() - sbomScanDuration.WithLabelValues("error").Observe(scanDuration) + s.metrics.ReportSBOMScan("error") + s.metrics.ObserveSBOMScanDuration("error", scanDuration) logger.L().Ctx(s.ctx).Error("SbomManager - sidecar scan failed", helpers.Error(scanErr), helpers.String("namespace", notif.Container.K8s.Namespace), @@ -416,12 +424,12 @@ func (s *SbomManager) processContainerWithMetadata(notif containercollection.Pub s.reportFailure(notif, imageTag, imageID, scanfailure.ReasonSBOMGenerationFailed, scanErr) return } - sbomScanTotal.WithLabelValues("success").Inc() - sbomScanDuration.WithLabelValues("success").Observe(time.Since(scanStart).Seconds()) + s.metrics.ReportSBOMScan("success") + s.metrics.ObserveSBOMScanDuration("success", time.Since(scanStart)) delete(s.scanRetries, sbomName) syftDoc = result.SyftDocument } else if s.scannerClient != nil { - sbomScannerReady.Set(0) + s.metrics.SetSBOMScannerReady(false) // sidecar configured but not ready — queue for retry when it becomes ready 
logger.L().Debug("SbomManager - scanner sidecar not ready, queuing scan for retry", helpers.String("sbomName", sbomName)) diff --git a/pkg/sbomscanner/v1/client.go b/pkg/sbomscanner/v1/client.go index c772cf5de9..25506011c4 100644 --- a/pkg/sbomscanner/v1/client.go +++ b/pkg/sbomscanner/v1/client.go @@ -11,7 +11,9 @@ import ( "github.com/kubescape/go-logger/helpers" pb "github.com/kubescape/node-agent/pkg/sbomscanner/v1/proto" "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" + grpcstats "google.golang.org/grpc/stats" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials/insecure" "google.golang.org/grpc/status" @@ -30,6 +32,11 @@ func NewSBOMScannerClient(socketPath string) (SBOMScannerClient, error) { target := fmt.Sprintf("unix://%s", socketPath) conn, err := grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithStatsHandler(otelgrpc.NewClientHandler( + otelgrpc.WithFilter(func(info *grpcstats.RPCTagInfo) bool { + return info.FullMethodName != pb.SBOMScanner_Health_FullMethodName + }), + )), ) if err != nil { return nil, fmt.Errorf("failed to create gRPC client: %w", err)
diff --git a/pkg/sbomscanner/v1/run.go b/pkg/sbomscanner/v1/run.go
new file mode 100644
index 0000000000..f573f1ea0a
--- /dev/null
+++ b/pkg/sbomscanner/v1/run.go
@@ -0,0 +1,118 @@
+package v1
+
+import (
+	"context"
+	"errors"
+	"net"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"github.com/kubescape/go-logger"
+	"github.com/kubescape/go-logger/helpers"
+	otelmetrics "github.com/kubescape/node-agent/pkg/metricsmanager/otel"
+	"github.com/kubescape/node-agent/pkg/otelsetup"
+	pb "github.com/kubescape/node-agent/pkg/sbomscanner/v1/proto"
+	"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
+	goruntime "go.opentelemetry.io/contrib/instrumentation/runtime"
+	"google.golang.org/grpc"
+	grpcstats "google.golang.org/grpc/stats"
+)
+
+// RunServer runs the SBOM scanner sidecar: it initialises OTEL telemetry,
+// serves the SBOMScanner gRPC API on the unix socket named by SOCKET_PATH
+// (default /sbom-comm/scanner.sock) and blocks until the server stops.
+// The server stops gracefully on SIGTERM/SIGINT or when ctx is cancelled.
+// accountID and accessKey are passed through to the OTEL provider config.
+func RunServer(ctx context.Context, accountID, accessKey string) {
+	// Initialize OTEL providers from standard env vars (OTEL_EXPORTER_OTLP_ENDPOINT etc.).
+	// Gracefully degrades to no-op when endpoint is not configured.
+	otelShutdown, err := otelsetup.InitProviders(ctx, otelsetup.ProviderConfig{
+		ServiceName:    "sbom-scanner",
+		ServiceVersion: os.Getenv("RELEASE"),
+		NodeName:       os.Getenv("NODE_NAME"),
+		PodName:        os.Getenv("POD_NAME"),
+		Namespace:      os.Getenv("NAMESPACE"),
+		ClusterName:    os.Getenv("CLUSTER_NAME"),
+		AccountID:      accountID,
+		AccessKey:      accessKey,
+	})
+	if err != nil {
+		logger.L().Warning("sbom-scanner: OTEL init failed, running without telemetry", helpers.Error(err))
+	}
+	if otelShutdown != nil {
+		defer func() {
+			// Fresh context: ctx may already be cancelled when this runs.
+			shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+			defer cancel()
+			_ = otelShutdown(shutdownCtx)
+		}()
+	}
+
+	// Emit Go runtime metrics only when metrics collection is configured;
+	// avoids ~2–3 KB/hr of metric volume for deployments without telemetry.
+	if os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") != "" ||
+		os.Getenv("OTEL_METRICS_EXPORTER") != "" ||
+		os.Getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") != "" {
+		if err := goruntime.Start(goruntime.WithMinimumReadMemStatsInterval(30 * time.Second)); err != nil {
+			logger.L().Warning("sbom-scanner: Go runtime metrics unavailable", helpers.Error(err))
+		}
+		// Per-process memory gauges (rss + cgroup usage/limit), same as the main
+		// agent. The sidecar mounts its own namespaced /sys/fs/cgroup (no host
+		// override), so the cgroup resolver reads the namespace root directly —
+		// no container ID needed.
+		otelmetrics.RegisterProcessMemoryMetrics(otelsetup.Meter(), "")
+	}
+
+	socketPath := os.Getenv("SOCKET_PATH")
+	if socketPath == "" {
+		socketPath = "/sbom-comm/scanner.sock"
+	}
+
+	// Remove stale socket file from a previous run. A missing file is the
+	// normal case; any other failure is logged before Listen is attempted.
+	if rmErr := os.Remove(socketPath); rmErr != nil && !errors.Is(rmErr, os.ErrNotExist) {
+		logger.L().Warning("failed to remove stale socket", helpers.Error(rmErr), helpers.String("path", socketPath))
+	}
+
+	lis, err := net.Listen("unix", socketPath)
+	if err != nil {
+		logger.L().Fatal("failed to listen on socket", helpers.Error(err), helpers.String("path", socketPath))
+	}
+
+	// Health RPCs are filtered out of the gRPC telemetry handler.
+	srv := grpc.NewServer(grpc.StatsHandler(otelgrpc.NewServerHandler(
+		otelgrpc.WithFilter(func(info *grpcstats.RPCTagInfo) bool {
+			return info.FullMethodName != pb.SBOMScanner_Health_FullMethodName
+		}),
+	)))
+	pb.RegisterSBOMScannerServer(srv, NewScannerServer())
+
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT)
+	defer signal.Stop(sigCh)
+
+	// served is closed once Serve has returned, so the watcher goroutine below
+	// never outlives RunServer.
+	served := make(chan struct{})
+	defer close(served)
+
+	go func() {
+		select {
+		case sig := <-sigCh:
+			logger.L().Info("received signal, shutting down", helpers.String("signal", sig.String()))
+		case <-ctx.Done():
+			logger.L().Info("context cancelled, shutting down", helpers.Error(ctx.Err()))
+		case <-served:
+			return
+		}
+		srv.GracefulStop()
+		_ = os.Remove(socketPath) // best-effort cleanup of the socket file
+	}()
+
+	logger.L().Info("SBOM scanner sidecar started", helpers.String("socket", socketPath))
+	if err := srv.Serve(lis); err != nil {
+		logger.L().Fatal("gRPC server failed", helpers.Error(err))
+	}
+}
diff --git a/pkg/sbomscanner/v1/server.go b/pkg/sbomscanner/v1/server.go index 360d67c70d..d50331221c 100644 --- a/pkg/sbomscanner/v1/server.go +++ b/pkg/sbomscanner/v1/server.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "errors" + gort "runtime" "runtime/debug" "sync" "time" @@ -14,9 +15,13 @@ import ( sbomcataloger "github.com/anchore/syft/syft/pkg/cataloger/sbom" "github.com/kubescape/go-logger" "github.com/kubescape/go-logger/helpers" + "github.com/kubescape/node-agent/pkg/otelsetup" "github.com/kubescape/node-agent/pkg/sbommanager/v1/syftutil" pb "github.com/kubescape/node-agent/pkg/sbomscanner/v1/proto" "github.com/kubescape/storage/pkg/apis/softwarecomposition/v1beta1" + otelcodes "go.opentelemetry.io/otel/codes" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace"
"google.golang.org/grpc/codes" "google.golang.org/grpc/status" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -71,7 +76,29 @@ func (s *scannerServer) CreateSBOM(ctx context.Context, req *pb.CreateSBOMReques cfg.WithCatalogers(pkgcataloging.NewCatalogerReference(sbomcataloger.NewCataloger(), []string{pkgcataloging.ImageTag})) } - syftSBOM, err := syft.CreateSBOM(ctx, src, cfg) + var memBefore, memAfter gort.MemStats + gort.ReadMemStats(&memBefore) + + scanCtx, scanSpan := otelsetup.Tracer().Start(ctx, "sbom.scan", + trace.WithAttributes( + attribute.String("image.tag", req.ImageTag), + attribute.String("image.id", req.ImageId), + )) + syftSBOM, err := syft.CreateSBOM(scanCtx, src, cfg) + gort.ReadMemStats(&memAfter) + // TotalAlloc is monotonically increasing (cumulative bytes allocated), + // so the delta is always ≥ 0 even when GC runs mid-scan. + totalBefore := float64(memBefore.TotalAlloc) / (1024 * 1024) + totalAfter := float64(memAfter.TotalAlloc) / (1024 * 1024) + scanSpan.SetAttributes( + attribute.Float64("alloc.total.before_mb", totalBefore), + attribute.Float64("alloc.total.after_mb", totalAfter), + attribute.Float64("alloc.total.delta_mb", totalAfter-totalBefore), + ) + if err != nil { + scanSpan.SetStatus(otelcodes.Error, err.Error()) + } + scanSpan.End() if err != nil { if ctx.Err() == context.Canceled { return nil, status.Error(codes.Canceled, "scan canceled")