From d31bf9473100edba6fd8ed5c602b922483981478 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 1 Jul 2026 23:36:37 +0000 Subject: [PATCH 1/2] fix(grpc): self-terminate backend workers when LocalAI dies non-gracefully MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: a backend model-worker subprocess (the per-model gRPC server LocalAI spawns) can be orphaned and linger — holding VRAM and its listen port — if the LocalAI process is killed non-gracefully (e.g. a supervisor's graceful-shutdown grace period elapses and LocalAI is SIGKILLed) before its own teardown runs. Root cause: LocalAI's graceful teardown (pkg/signals/handler.go installs the SIGINT/SIGTERM handler; core/cli/run.go registers app.Shutdown -> ModelLoader.StopAllGRPC -> process.Stop in pkg/model/process.go) only runs when LocalAI receives a catchable signal and survives long enough to run its handlers. Backends are spawned via github.com/mudler/go-processmanager v0.1.1, whose getSysProcAttr() sets Setpgid:true (own process group, so the group can be signalled) but never PR_SET_PDEATHSIG/Pdeathsig, and exposes no Config field or option for a caller to inject/extend SysProcAttr. LocalAI fully delegates spawning to that library (it never builds the exec.Cmd itself), so it cannot set a kernel parent-death signal at the spawn site. If LocalAI is SIGKILLed, nothing tells the backend to exit and it is reparented to init. Fix: add a best-effort, backend-side safety net at the one shared choke point every out-of-process Go backend routes through — grpc.StartServer / RunServer in pkg/grpc. On startup it captures getppid() and polls; when the process is reparented (getppid changes / becomes 1 — the standard POSIX signal the original parent died) it logs and self-terminates. getppid() reparent detection is portable (Linux + macOS), unlike Linux-only PR_SET_PDEATHSIG. Toggle via LOCALAI_BACKEND_PARENT_WATCH (default on; off on Windows) and LOCALAI_BACKEND_PARENT_WATCH_INTERVAL. This is strictly a backstop alongside the existing graceful SIGTERM->grace->SIGKILL teardown, which is unchanged. Scope/limitations: covers Go-based backends (everything using pkg/grpc). The C++ backends (e.g. llama-cpp) and Python backends do not route through pkg/grpc and are not covered by this mechanism — they would each need an equivalent parent-death check (follow-up). The fully general fix is for go-processmanager to expose SysProcAttr injection so LocalAI can set Pdeathsig at spawn for every backend regardless of language (suggested upstream follow-up; out of scope for this LocalAI-only PR). Test: pkg/grpc/parentwatch_test.go builds a real test -> middle -> grandchild process tree, lets the middle process exit to orphan the grandchild running the real watchParentDeath, and asserts it detects the reparent and self-terminates. Unix-only (build-tagged), runs in CI (Linux). Co-Authored-By: Claude Sonnet 5 Signed-off-by: Ettore Di Giacinto --- pkg/grpc/parentwatch.go | 105 ++++++++++++++++++++++ pkg/grpc/parentwatch_test.go | 163 +++++++++++++++++++++++++++++++++++ pkg/grpc/server.go | 6 ++ 3 files changed, 274 insertions(+) create mode 100644 pkg/grpc/parentwatch.go create mode 100644 pkg/grpc/parentwatch_test.go diff --git a/pkg/grpc/parentwatch.go b/pkg/grpc/parentwatch.go new file mode 100644 index 000000000000..f6a8e20094d5 --- /dev/null +++ b/pkg/grpc/parentwatch.go @@ -0,0 +1,105 @@ +package grpc + +import ( + "log" + "os" + "runtime" + "strings" + "time" +) + +// Backend worker processes (the per-model gRPC servers LocalAI spawns) are +// deliberately placed in their own process group by the process manager so +// LocalAI's graceful shutdown can signal the whole group. That graceful path +// (SIGTERM -> grace -> SIGKILL, driven by pkg/signals + pkg/model) only runs +// when LocalAI itself receives a catchable signal and lives long enough to run +// its handlers. If LocalAI is SIGKILLed (e.g. a supervising process's +// graceful-shutdown grace period elapses first), that teardown never runs and +// this backend would be reparented to init and linger, holding VRAM and its +// listen port. +// +// The watcher below is a best-effort backstop for exactly that case: it does +// NOT replace the graceful teardown, it only covers the "parent vanished +// without cleaning up" path. It works by detecting reparenting: when the +// process that spawned this backend dies, the kernel reparents us to the +// nearest sub-reaper or to init (PID 1), so getppid() stops matching the value +// we captured at startup. This getppid() approach is portable across +// Linux/macOS (unlike Linux-only PR_SET_PDEATHSIG), which is why it's used +// here rather than a kernel parent-death signal. +const ( + // EnvBackendParentWatch toggles the parent-death watcher. It is enabled by + // default; set it to a falsey value ("false", "0", "no", "off") to disable + // (e.g. when running a backend standalone for debugging under a shell whose + // lifetime shouldn't govern the backend). + EnvBackendParentWatch = "LOCALAI_BACKEND_PARENT_WATCH" + // EnvBackendParentWatchInterval overrides the poll interval as a Go + // duration string (e.g. "500ms"). Defaults to defaultParentWatchInterval. + EnvBackendParentWatchInterval = "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL" + + defaultParentWatchInterval = 2 * time.Second +) + +// parentWatchEnabled reports whether the watcher should run in this process. +func parentWatchEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv(EnvBackendParentWatch))) { + case "false", "0", "no", "off": + return false + } + // Windows does not reparent orphans to a well-known init PID, so the + // getppid() heuristic used here doesn't apply there. + return runtime.GOOS != "windows" +} + +// parentWatchInterval returns the configured poll interval, or the default. +func parentWatchInterval() time.Duration { + if v := os.Getenv(EnvBackendParentWatchInterval); v != "" { + if d, err := time.ParseDuration(v); err == nil && d > 0 { + return d + } + } + return defaultParentWatchInterval +} + +// parentDied reports whether this process has been reparented away from the +// parent it had when the watcher started. Reparenting is the standard POSIX +// signal that the original parent (here, the LocalAI process that spawned this +// backend) has exited: the orphan is handed to the nearest sub-reaper or to +// init (PID 1), so getppid() no longer matches the value captured at startup. +func parentDied(origPPID int) bool { + ppid := os.Getppid() + return ppid != origPPID || ppid == 1 +} + +// watchParentDeath polls until parentDied reports the original parent is gone, +// then invokes onDeath. It blocks, so run it in its own goroutine. +func watchParentDeath(origPPID int, interval time.Duration, onDeath func()) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for range ticker.C { + if parentDied(origPPID) { + onDeath() + return + } + } +} + +// startParentDeathWatcher installs the best-effort safety net described above +// on the calling backend process. It is a no-op when disabled or on platforms +// where the mechanism doesn't apply. This is a backstop alongside — never a +// replacement for — LocalAI's graceful SIGTERM->grace->SIGKILL teardown. +func startParentDeathWatcher() { + if !parentWatchEnabled() { + return + } + origPPID := os.Getppid() + // A parent of 1 at startup means we were already orphaned (or launched + // directly under init) — there's no original parent to watch for. + if origPPID <= 1 { + return + } + interval := parentWatchInterval() + go watchParentDeath(origPPID, interval, func() { + log.Printf("backend parent process (pid %d) exited without stopping this backend; self-terminating to avoid orphaning", origPPID) + os.Exit(1) + }) +} diff --git a/pkg/grpc/parentwatch_test.go b/pkg/grpc/parentwatch_test.go new file mode 100644 index 000000000000..b3af0c79daf5 --- /dev/null +++ b/pkg/grpc/parentwatch_test.go @@ -0,0 +1,163 @@ +//go:build !windows + +package grpc + +import ( + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "syscall" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// These env vars drive the helper roles this test binary re-executes itself as +// (see the init() dispatcher). They are only set for the spawned child/ +// grandchild processes, never for the normal `go test` invocation. +const ( + envRole = "LOCALAI_PARENTWATCH_TEST_ROLE" + envReady = "LOCALAI_PARENTWATCH_TEST_READY" // grandchild writes its PID here once the watcher is armed + envExited = "LOCALAI_PARENTWATCH_TEST_EXITED" // grandchild writes here when it detects reparenting +) + +// init dispatches the helper roles when this test binary is re-executed with a +// role set. It runs before the testing/Ginkgo machinery, and is a no-op during +// a normal test run (role unset). +func init() { + switch os.Getenv(envRole) { + case "middle": + runMiddleRole() + case "grandchild": + runGrandchildRole() + } +} + +// childEnv returns the current environment with the parentwatch test role set +// to the given value (replacing any inherited role), leaving the ready/exited +// file paths inherited. +func childEnv(role string) []string { + out := make([]string, 0, len(os.Environ())+1) + for _, kv := range os.Environ() { + if len(kv) > len(envRole) && kv[:len(envRole)+1] == envRole+"=" { + continue + } + out = append(out, kv) + } + return append(out, envRole+"="+role) +} + +// runGrandchildRole arms the REAL watchParentDeath against its current parent +// (the "middle" process), signals readiness, then blocks. When middle exits and +// we are reparented, the watcher fires and we record it before exiting. +func runGrandchildRole() { + exitedFile := os.Getenv(envExited) + readyFile := os.Getenv(envReady) + + origPPID := os.Getppid() + go watchParentDeath(origPPID, 50*time.Millisecond, func() { + _ = os.WriteFile(exitedFile, []byte("1"), 0o644) + os.Exit(7) + }) + + // Safety valve: never linger if something goes wrong with the test. + go func() { + time.Sleep(30 * time.Second) + os.Exit(2) + }() + + // Signal readiness only after the watcher captured origPPID, so middle + // won't exit before we've recorded it as our original parent. + _ = os.WriteFile(readyFile, []byte(strconv.Itoa(os.Getpid())), 0o644) + + select {} // block until the watcher terminates us +} + +// runMiddleRole spawns the grandchild (which arms the watcher against us), +// waits until it is ready, then exits — orphaning the grandchild so it gets +// reparented, which is what the watcher must detect. +func runMiddleRole() { + readyFile := os.Getenv(envReady) + + self, err := os.Executable() + if err != nil { + os.Exit(3) + } + cmd := exec.Command(self) + cmd.Env = childEnv("grandchild") + // Own process group, mirroring how real backends are spawned, and discard + // std streams so the grandchild doesn't keep any parent pipe open. + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + if err := cmd.Start(); err != nil { + os.Exit(4) + } + + if !waitForFile(readyFile, 10*time.Second) { + os.Exit(5) + } + os.Exit(0) // orphan the grandchild +} + +func waitForFile(path string, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if _, err := os.Stat(path); err == nil { + return true + } + time.Sleep(20 * time.Millisecond) + } + return false +} + +// This spec builds a genuine two-level process tree (test -> middle -> +// grandchild), lets the middle process die, and asserts the grandchild's +// watchParentDeath detects the reparenting and self-terminates. +var _ = Describe("watchParentDeath", func() { + It("detects reparenting and self-terminates the orphaned process", func() { + if runtime.GOOS == "windows" { + Skip("parent-death watcher is not supported on windows") + } + + dir := GinkgoT().TempDir() + readyFile := filepath.Join(dir, "ready") + exitedFile := filepath.Join(dir, "exited") + + self, err := os.Executable() + Expect(err).NotTo(HaveOccurred(), "cannot resolve test executable") + + middle := exec.Command(self) + middle.Env = append(childEnv("middle"), + envReady+"="+readyFile, + envExited+"="+exitedFile, + ) + // Discard the helpers' output; keep the test log clean. + middle.Stdout = nil + middle.Stderr = nil + + Expect(middle.Start()).To(Succeed(), "failed to start middle helper") + // Wait only for the middle process; the grandchild is intentionally left + // orphaned. No pipes are shared, so this returns as soon as middle exits. + Expect(middle.Wait()).To(Succeed(), "middle helper exited with error") + + // The grandchild must have armed the watcher (and thus captured middle as + // its parent) before middle exited. + _, err = os.Stat(readyFile) + Expect(err).NotTo(HaveOccurred(), "grandchild never signaled readiness") + + // Best-effort cleanup in case the watcher somehow doesn't fire. + DeferCleanup(func() { + if b, err := os.ReadFile(readyFile); err == nil { + if pid, err := strconv.Atoi(string(b)); err == nil { + _ = syscall.Kill(pid, syscall.SIGKILL) + } + } + }) + + // Now that middle is gone, the grandchild has been reparented; the watcher + // must notice and write the exited marker. + Expect(waitForFile(exitedFile, 10*time.Second)).To(BeTrue(), "watcher did not detect parent death within timeout") + }) +}) diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go index c4c2785c8085..0ed50360f14b 100644 --- a/pkg/grpc/server.go +++ b/pkg/grpc/server.go @@ -939,6 +939,9 @@ func StartServer(address string, model AIModel) error { s := grpc.NewServer(serverOpts()...) pb.RegisterBackendServer(s, &server{llm: model}) log.Printf("gRPC Server listening at %v", lis.Addr()) + // Safety net: self-terminate if the LocalAI process that spawned this + // backend dies without running its graceful teardown (see parentwatch.go). + startParentDeathWatcher() if err := s.Serve(lis); err != nil { return err } @@ -954,6 +957,9 @@ func RunServer(address string, model AIModel) (func() error, error) { s := grpc.NewServer(serverOpts()...) pb.RegisterBackendServer(s, &server{llm: model}) log.Printf("gRPC Server listening at %v", lis.Addr()) + // Safety net: self-terminate if the LocalAI process that spawned this + // backend dies without running its graceful teardown (see parentwatch.go). + startParentDeathWatcher() if err = s.Serve(lis); err != nil { return func() error { return lis.Close() From 94e3e06b8bed8ad27e05153103911be74d1c62f1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 2 Jul 2026 06:58:55 +0000 Subject: [PATCH 2/2] fix(process): extend parent-death backstop to C++ and Python backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Go parent-death watcher (pkg/grpc/parentwatch.go, commit 772b435d5) only protects backends that route through pkg/grpc. C++ and Python backends don't, so the originally-reported case — the llama.cpp gRPC worker surviving a non-graceful LocalAI death — was still uncovered. Extend the same best-effort backstop to both languages, reusing the exact mechanism and semantics: - capture getppid() at startup, skip if already orphaned (<=1) - a background thread polls getppid() and self-exits on reparenting (getppid() != orig || == 1), portable across Linux/macOS, no-op on Windows - same env vars: LOCALAI_BACKEND_PARENT_WATCH (default on; falsy false/0/no/off disable) and LOCALAI_BACKEND_PARENT_WATCH_INTERVAL (default 2s; accepts Go-style durations like 500ms/2s/1m) C++: implemented in backend/cpp/llama-cpp (the reported, most-used C++ backend) as a dependency-free header parent_watch.h, wired into grpc-server.cpp's main() and copied at build time via prepare.sh. C++ backends have no shared server scaffolding, so other C++ backends (ds4, ik-llama-cpp, privacy-filter, ...) are not yet covered and would each need the same one-line include+call as follow-ups. Python: implemented once in the shared common/parent_watch.py and armed from common/grpc_auth.py's get_auth_interceptors() — the single helper every one of the 35 Python backends invokes while building its gRPC server — so all Python backends (and future ones) are covered with no per-backend edits and no duplicated implementation. Tests (real process-tree reparent detection, mirroring the Go test): - backend/cpp/llama-cpp/parent_watch_test.cpp (via run-unit-tests.sh) - backend/python/common/parent_watch_test.py (python -m unittest) Co-Authored-By: Claude Sonnet 5 Signed-off-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/CMakeLists.txt | 9 + backend/cpp/llama-cpp/grpc-server.cpp | 6 + backend/cpp/llama-cpp/parent_watch.h | 179 ++++++++++++++++++ backend/cpp/llama-cpp/parent_watch_test.cpp | 197 ++++++++++++++++++++ backend/cpp/llama-cpp/prepare.sh | 4 + backend/cpp/run-unit-tests.sh | 2 +- backend/python/common/grpc_auth.py | 9 + backend/python/common/parent_watch.py | 149 +++++++++++++++ backend/python/common/parent_watch_test.py | 150 +++++++++++++++ 9 files changed, 704 insertions(+), 1 deletion(-) create mode 100644 backend/cpp/llama-cpp/parent_watch.h create mode 100644 backend/cpp/llama-cpp/parent_watch_test.cpp create mode 100644 backend/python/common/parent_watch.py create mode 100644 backend/python/common/parent_watch_test.py diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt index 8b8d2e2d5e54..47852d4009f6 100644 --- a/backend/cpp/llama-cpp/CMakeLists.txt +++ b/backend/cpp/llama-cpp/CMakeLists.txt @@ -101,4 +101,13 @@ if(LLAMA_GRPC_BUILD_TESTS) target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET}) target_compile_features(message_content_test PRIVATE cxx_std_17) add_test(NAME message_content_test COMMAND message_content_test) + + # Parent-death watcher test (parent_watch.h) — standard library only, but + # needs a threading runtime for std::thread. + find_package(Threads REQUIRED) + add_executable(parent_watch_test parent_watch_test.cpp parent_watch.h) + target_include_directories(parent_watch_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(parent_watch_test PRIVATE Threads::Threads) + target_compile_features(parent_watch_test PRIVATE cxx_std_17) + add_test(NAME parent_watch_test COMMAND parent_watch_test) endif() diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index a02d461f46dd..660c4367e579 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -75,6 +75,8 @@ #include #endif +#include "parent_watch.h" // best-effort parent-death backstop (see header) + using grpc::Server; using grpc::ServerBuilder; @@ -3442,6 +3444,10 @@ int main(int argc, char** argv) { } } + // Best-effort backstop: self-terminate if the LocalAI process that spawned + // us dies without cleaning us up (see parent_watch.h). + llama_grpc::start_parent_death_watcher(); + server_context ctx_server; BackendServiceImpl service(ctx_server); diff --git a/backend/cpp/llama-cpp/parent_watch.h b/backend/cpp/llama-cpp/parent_watch.h new file mode 100644 index 000000000000..b2eb34e74438 --- /dev/null +++ b/backend/cpp/llama-cpp/parent_watch.h @@ -0,0 +1,179 @@ +// Parent-death watcher (best-effort backstop) for the llama.cpp gRPC backend. +// +// LocalAI spawns this backend as a child process and, on a clean shutdown, +// tears it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only +// runs when LocalAI receives a catchable signal and lives long enough to run +// its handlers. If LocalAI is SIGKILLed (e.g. a supervising process's grace +// period elapses first), that teardown never runs and this backend would be +// reparented to init and linger, holding VRAM and its listen port. +// +// The watcher here is a best-effort backstop for exactly that case: it does +// NOT replace the graceful teardown, it only covers the "parent vanished +// without cleaning up" path. It detects reparenting: when the process that +// spawned this backend dies, the kernel reparents us to the nearest sub-reaper +// or to init (PID 1), so getppid() stops matching the value captured at +// startup. This getppid() approach is portable across Linux/macOS (unlike the +// Linux-only PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go +// backends' pkg/grpc/parentwatch.go. It is disabled on Windows, which has no +// equivalent orphan-reparenting semantics. +// +// This header is intentionally dependency-free (C++ standard library only) so +// it can be exercised by a standalone unit test (parent_watch_test.cpp) without +// building the full llama.cpp + gRPC backend. +#ifndef LLAMA_GRPC_PARENT_WATCH_H +#define LLAMA_GRPC_PARENT_WATCH_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(_WIN32) +#include // getppid(2), _exit(2) +#endif + +namespace llama_grpc { + +// Env var names are shared verbatim with the Go and Python backends for +// consistency across languages. +inline const char *kEnvParentWatch() { return "LOCALAI_BACKEND_PARENT_WATCH"; } +inline const char *kEnvParentWatchInterval() { return "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"; } + +// Default poll interval in milliseconds. Matches the Go side's 2 * time.Second. +inline long parent_watch_default_interval_ms() { return 2000; } + +namespace detail { +inline std::string trim_lower(const std::string &in, bool lower) { + size_t a = in.find_first_not_of(" \t\r\n"); + size_t b = in.find_last_not_of(" \t\r\n"); + if (a == std::string::npos) { + return ""; + } + std::string s = in.substr(a, b - a + 1); + if (lower) { + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return std::tolower(c); }); + } + return s; +} +} // namespace detail + +// parent_watch_enabled reports whether the watcher should run. Enabled by +// default; a falsey value ("false"/"0"/"no"/"off", case-insensitive) disables +// it, matching the Go implementation's exact semantics. +inline bool parent_watch_enabled() { +#if defined(_WIN32) + return false; +#else + const char *v = std::getenv(kEnvParentWatch()); + if (v == nullptr || v[0] == '\0') { + return true; + } + const std::string s = detail::trim_lower(v, true); + return !(s == "false" || s == "0" || s == "no" || s == "off"); +#endif +} + +// parent_watch_interval_ms returns the poll interval in milliseconds. Accepts +// Go-style duration strings ("500ms", "2s", "1m") for cross-language parity, or +// a bare number interpreted as seconds. Defaults to +// parent_watch_default_interval_ms(). +inline long parent_watch_interval_ms() { + const long def = parent_watch_default_interval_ms(); + const char *v = std::getenv(kEnvParentWatchInterval()); + if (v == nullptr || v[0] == '\0') { + return def; + } + const std::string s = detail::trim_lower(v, false); + if (s.empty()) { + return def; + } + size_t i = 0; + while (i < s.size() && (std::isdigit((unsigned char)s[i]) || s[i] == '.')) { + i++; + } + if (i == 0) { + return def; + } + double num = 0.0; + try { + num = std::stod(s.substr(0, i)); + } catch (...) { + return def; + } + const std::string unit = s.substr(i); + long ms; + if (unit == "ms") { + ms = (long)num; + } else if (unit == "s" || unit.empty()) { + ms = (long)(num * 1000.0); + } else if (unit == "m") { + ms = (long)(num * 60000.0); + } else { + return def; // unrecognized unit + } + return ms > 0 ? ms : def; +} + +#if !defined(_WIN32) +// parent_died reports whether this process has been reparented away from the +// parent it had when the watcher started. Reparenting is the standard POSIX +// signal that the original parent (here, the LocalAI process that spawned this +// backend) has exited: the orphan is handed to the nearest sub-reaper or to +// init (PID 1), so getppid() no longer matches the value captured at startup. +inline bool parent_died(pid_t orig_ppid) { + const pid_t ppid = getppid(); + return ppid != orig_ppid || ppid == 1; +} + +// watch_parent_death polls until parent_died reports the original parent is +// gone, then invokes on_death. It blocks, so run it on its own thread. +inline void watch_parent_death(pid_t orig_ppid, long interval_ms, + const std::function &on_death) { + for (;;) { + std::this_thread::sleep_for(std::chrono::milliseconds(interval_ms)); + if (parent_died(orig_ppid)) { + on_death(); + return; + } + } +} +#endif + +// start_parent_death_watcher installs the best-effort safety net described in +// the file header on the calling backend process. It is a no-op when disabled, +// on Windows, or when the process is already orphaned at startup +// (getppid() <= 1). This is a backstop alongside — never a replacement for — +// LocalAI's graceful teardown. +inline void start_parent_death_watcher() { +#if !defined(_WIN32) + if (!parent_watch_enabled()) { + return; + } + const pid_t orig_ppid = getppid(); + // A parent of 1 (or less) at startup means we were already orphaned (or + // launched directly under init) — there is no original parent to watch for. + if (orig_ppid <= 1) { + return; + } + const long interval_ms = parent_watch_interval_ms(); + std::thread([orig_ppid, interval_ms]() { + watch_parent_death(orig_ppid, interval_ms, [orig_ppid]() { + fprintf(stderr, + "backend parent process (pid %d) exited without stopping " + "this backend; self-terminating to avoid orphaning\n", + (int)orig_ppid); + fflush(stderr); + _exit(1); + }); + }).detach(); +#endif +} + +} // namespace llama_grpc + +#endif // LLAMA_GRPC_PARENT_WATCH_H diff --git a/backend/cpp/llama-cpp/parent_watch_test.cpp b/backend/cpp/llama-cpp/parent_watch_test.cpp new file mode 100644 index 000000000000..9a7fd8074330 --- /dev/null +++ b/backend/cpp/llama-cpp/parent_watch_test.cpp @@ -0,0 +1,197 @@ +// Unit tests for the parent-death watcher (parent_watch.h). +// +// Build & run standalone (C++ standard library only, no nlohmann/json needed): +// g++ -std=c++17 -pthread parent_watch_test.cpp -o t && ./t +// +// The core test (TestDetectsReparent) builds a genuine two-level process tree +// (test -> middle -> grandchild), lets the middle process die, and asserts the +// grandchild's watch_parent_death detects the reparenting and self-terminates — +// mirroring the Go test in pkg/grpc/parentwatch_test.go, but with fork(2). +// +// On Windows this file compiles to a no-op success (the watcher is unsupported +// there), matching parent_watch.h's platform gating. + +#include +#include +#include + +#include "parent_watch.h" + +static int failures = 0; + +static void check(bool ok, const std::string &name) { + if (!ok) { + failures++; + fprintf(stderr, "FAIL: %s\n", name.c_str()); + } else { + fprintf(stderr, "ok: %s\n", name.c_str()); + } +} + +// Env-parsing tests are platform-independent and always run. +static void test_env_parsing() { + using namespace llama_grpc; + + // Interval: default when unset. + unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"); + check(parent_watch_interval_ms() == 2000, "interval default 2000ms"); + + setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "500ms", 1); + check(parent_watch_interval_ms() == 500, "interval 500ms"); + + setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "2s", 1); + check(parent_watch_interval_ms() == 2000, "interval 2s"); + + setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "1m", 1); + check(parent_watch_interval_ms() == 60000, "interval 1m"); + + setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "3", 1); // bare number -> seconds + check(parent_watch_interval_ms() == 3000, "interval bare 3 -> 3000ms"); + + setenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL", "garbage", 1); + check(parent_watch_interval_ms() == 2000, "interval garbage -> default"); + unsetenv("LOCALAI_BACKEND_PARENT_WATCH_INTERVAL"); + +#if !defined(_WIN32) + // Enabled semantics (POSIX only; always false on Windows). + unsetenv("LOCALAI_BACKEND_PARENT_WATCH"); + check(parent_watch_enabled(), "enabled by default"); + + for (const char *falsey : {"false", "0", "no", "off", "OFF", " False "}) { + setenv("LOCALAI_BACKEND_PARENT_WATCH", falsey, 1); + check(!parent_watch_enabled(), std::string("disabled by '") + falsey + "'"); + } + setenv("LOCALAI_BACKEND_PARENT_WATCH", "true", 1); + check(parent_watch_enabled(), "enabled by 'true'"); + setenv("LOCALAI_BACKEND_PARENT_WATCH", "1", 1); + check(parent_watch_enabled(), "enabled by '1'"); + unsetenv("LOCALAI_BACKEND_PARENT_WATCH"); +#endif +} + +#if !defined(_WIN32) + +#include +#include +#include +#include +#include + +static bool file_exists(const std::string &p) { + struct stat st; + return ::stat(p.c_str(), &st) == 0; +} + +static bool wait_for_file(const std::string &p, int timeout_ms) { + int waited = 0; + while (waited < timeout_ms) { + if (file_exists(p)) { + return true; + } + usleep(20 * 1000); + waited += 20; + } + return false; +} + +static void write_file(const std::string &p, const std::string &content) { + FILE *f = fopen(p.c_str(), "w"); + if (f) { + fwrite(content.data(), 1, content.size(), f); + fclose(f); + } +} + +// Builds test -> middle -> grandchild via fork(2). The grandchild arms the REAL +// watch_parent_death against middle; middle exits, orphaning the grandchild; +// the watcher must detect the reparenting and self-terminate. +static void test_detects_reparent() { + char tmpl[] = "/tmp/parentwatch_test_XXXXXX"; + char *dir = mkdtemp(tmpl); + if (dir == nullptr) { + check(false, "mkdtemp"); + return; + } + const std::string ready_file = std::string(dir) + "/ready"; + const std::string exited_file = std::string(dir) + "/exited"; + + pid_t middle = fork(); + if (middle < 0) { + check(false, "fork middle"); + return; + } + + if (middle == 0) { + // ---- middle process ---- + pid_t grandchild = fork(); + if (grandchild < 0) { + _exit(4); + } + if (grandchild == 0) { + // ---- grandchild process ---- + pid_t orig_ppid = getppid(); // == middle + std::thread([&]() { + llama_grpc::watch_parent_death(orig_ppid, 50 /*ms*/, [&]() { + write_file(exited_file, "1"); + _exit(7); + }); + }).detach(); + + // Safety valve: never linger if something goes wrong. + std::thread([]() { + usleep(30 * 1000 * 1000); + _exit(2); + }).detach(); + + // Signal readiness only after the watcher captured orig_ppid. + write_file(ready_file, std::to_string(getpid())); + for (;;) { + pause(); + } + } + // middle: wait until grandchild is ready, then exit to orphan it. + if (!wait_for_file(ready_file, 10000)) { + _exit(5); + } + _exit(0); + } + + // ---- test (top) process ---- + int status = 0; + waitpid(middle, &status, 0); // reap middle only; grandchild is orphaned + + check(file_exists(ready_file), "grandchild signaled readiness"); + + bool detected = wait_for_file(exited_file, 10000); + check(detected, "watcher detected parent death and self-terminated"); + + // Best-effort cleanup: kill the grandchild if it somehow survived. + if (file_exists(ready_file)) { + FILE *f = fopen(ready_file.c_str(), "r"); + if (f) { + int pid = 0; + if (fscanf(f, "%d", &pid) == 1 && pid > 1) { + kill(pid, SIGKILL); + } + fclose(f); + } + } + unlink(ready_file.c_str()); + unlink(exited_file.c_str()); + rmdir(dir); +} + +#endif // !_WIN32 + +int main() { + test_env_parsing(); +#if !defined(_WIN32) + test_detects_reparent(); +#endif + if (failures == 0) { + fprintf(stderr, "\nAll parent_watch tests passed.\n"); + return 0; + } + fprintf(stderr, "\n%d parent_watch test(s) failed.\n", failures); + return 1; +} diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index 4da45ea9d8df..27c664e8c7d4 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -22,6 +22,10 @@ cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ # unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON). cp -r message_content.h llama.cpp/tools/grpc-server/ cp -r message_content_test.cpp llama.cpp/tools/grpc-server/ +# Parent-death watcher (included by grpc-server.cpp) and its standalone unit +# test (run via backend/cpp/run-unit-tests.sh; also buildable under ctest). +cp -r parent_watch.h llama.cpp/tools/grpc-server/ +cp -r parent_watch_test.cpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/ diff --git a/backend/cpp/run-unit-tests.sh b/backend/cpp/run-unit-tests.sh index 3f63faa402d8..603d13a91c86 100755 --- a/backend/cpp/run-unit-tests.sh +++ b/backend/cpp/run-unit-tests.sh @@ -54,7 +54,7 @@ for test_src in "${tests[@]}"; do name="$(basename "$test_src" .cpp)" bin="$(mktemp -d)/$name" echo "==> $test_src" - if ! "$CXX" -std=c++17 -Wall -Wextra \ + if ! "$CXX" -std=c++17 -Wall -Wextra -pthread \ -I"$JSON_INC" -I"$(dirname "$test_src")" \ "$test_src" -o "$bin"; then echo "COMPILE FAILED: $test_src" >&2 diff --git a/backend/python/common/grpc_auth.py b/backend/python/common/grpc_auth.py index eda138ab4714..9ed866abb502 100644 --- a/backend/python/common/grpc_auth.py +++ b/backend/python/common/grpc_auth.py @@ -11,6 +11,8 @@ import grpc +from parent_watch import start_parent_death_watcher + class _AbortHandler(grpc.RpcMethodHandler): """A method handler that immediately aborts with UNAUTHENTICATED.""" @@ -70,6 +72,13 @@ def get_auth_interceptors(*, aio: bool = False): Returns an empty list when LOCALAI_GRPC_AUTH_TOKEN is not set. """ + # Arm the best-effort parent-death backstop here: this is the single helper + # every LocalAI Python backend invokes exactly once while building its gRPC + # server (mirroring how the Go watcher arms in pkg/grpc's shared serve path). + # start_parent_death_watcher() is idempotent and a no-op when disabled or on + # unsupported platforms — see parent_watch.py. + start_parent_death_watcher() + token = os.environ.get("LOCALAI_GRPC_AUTH_TOKEN", "") if not token: return [] diff --git a/backend/python/common/parent_watch.py b/backend/python/common/parent_watch.py new file mode 100644 index 000000000000..c2f7f6a7ab91 --- /dev/null +++ b/backend/python/common/parent_watch.py @@ -0,0 +1,149 @@ +"""Parent-death watcher (best-effort backstop) for LocalAI Python backends. + +LocalAI spawns each backend as a child process and, on a clean shutdown, tears +it down itself (SIGTERM -> grace -> SIGKILL). That graceful path only runs when +LocalAI receives a catchable signal and lives long enough to run its handlers. +If LocalAI is SIGKILLed (e.g. a supervising process's grace period elapses +first), that teardown never runs and this backend would be reparented to init +and linger, holding GPU/VRAM and its listen port. + +The watcher here is a best-effort backstop for exactly that case: it does NOT +replace the graceful teardown, it only covers the "parent vanished without +cleaning up" path. It detects reparenting: when the process that spawned this +backend dies, the kernel reparents us to the nearest sub-reaper or to init +(PID 1), so os.getppid() stops matching the value captured at startup. This +getppid() approach is portable across Linux/macOS (unlike the Linux-only +PR_SET_PDEATHSIG), which is why it is used here, mirroring the Go backends' +pkg/grpc/parentwatch.go and the C++ backends' parent_watch.h. It is disabled on +Windows, which has no equivalent orphan-reparenting semantics. + +Env vars (shared verbatim across the Go, C++ and Python backends): + LOCALAI_BACKEND_PARENT_WATCH enabled by default; a falsey value + ("false"/"0"/"no"/"off", case-insensitive) + disables it. + LOCALAI_BACKEND_PARENT_WATCH_INTERVAL poll interval as a Go-style duration + string ("500ms", "2s", "1m") or a bare + number of seconds. Defaults to 2s. +""" + +import os +import sys +import threading + +ENV_PARENT_WATCH = "LOCALAI_BACKEND_PARENT_WATCH" +ENV_PARENT_WATCH_INTERVAL = "LOCALAI_BACKEND_PARENT_WATCH_INTERVAL" + +_DEFAULT_INTERVAL_SECONDS = 2.0 + +# Guard so repeated calls (e.g. get_auth_interceptors invoked more than once) +# only ever arm a single watcher thread per process. +_started = False +_started_lock = threading.Lock() + + +def _enabled(): + """Report whether the watcher should run in this process.""" + # Windows does not reparent orphans to a well-known init PID, so the + # getppid() heuristic used here doesn't apply there. + if os.name == "nt" or sys.platform.startswith("win"): + return False + val = os.environ.get(ENV_PARENT_WATCH, "").strip().lower() + if val in ("false", "0", "no", "off"): + return False + return True + + +def _interval_seconds(): + """Return the configured poll interval in seconds, or the default. + + Accepts Go-style duration strings ("500ms", "2s", "1m") for cross-language + parity, or a bare number interpreted as seconds. + """ + raw = os.environ.get(ENV_PARENT_WATCH_INTERVAL, "").strip() + if not raw: + return _DEFAULT_INTERVAL_SECONDS + # Split numeric prefix from unit suffix. + i = 0 + while i < len(raw) and (raw[i].isdigit() or raw[i] == "." or (i == 0 and raw[i] in "+-")): + i += 1 + if i == 0: + return _DEFAULT_INTERVAL_SECONDS + try: + num = float(raw[:i]) + except ValueError: + return _DEFAULT_INTERVAL_SECONDS + unit = raw[i:].lower() + if unit == "ms": + seconds = num / 1000.0 + elif unit in ("s", ""): + seconds = num + elif unit == "m": + seconds = num * 60.0 + else: + return _DEFAULT_INTERVAL_SECONDS + return seconds if seconds > 0 else _DEFAULT_INTERVAL_SECONDS + + +def _parent_died(orig_ppid): + """Report whether this process has been reparented away from orig_ppid. + + Reparenting is the standard POSIX signal that the original parent (here, the + LocalAI process that spawned this backend) has exited: the orphan is handed + to the nearest sub-reaper or to init (PID 1), so os.getppid() no longer + matches the value captured at startup. + """ + ppid = os.getppid() + return ppid != orig_ppid or ppid == 1 + + +def _watch(orig_ppid, interval, on_death): + """Poll until _parent_died reports the original parent is gone, then call + on_death. Blocks, so run it on its own (daemon) thread.""" + import time + + while True: + time.sleep(interval) + if _parent_died(orig_ppid): + on_death() + return + + +def start_parent_death_watcher(): + """Install the best-effort safety net described in this module's docstring. + + No-op when disabled, on Windows, when already orphaned at startup + (os.getppid() <= 1), or if already started. This is a backstop alongside — + never a replacement for — LocalAI's graceful teardown. + """ + global _started + if not _enabled(): + return + with _started_lock: + if _started: + return + orig_ppid = os.getppid() + # A parent of 1 (or less) at startup means we were already orphaned (or + # launched directly under init) — there is no original parent to watch. + if orig_ppid <= 1: + return + interval = _interval_seconds() + + def on_death(): + print( + "backend parent process (pid {}) exited without stopping this " + "backend; self-terminating to avoid orphaning".format(orig_ppid), + file=sys.stderr, + flush=True, + ) + # Immediate, non-cleanup exit: this is a shutdown safety net and the + # normal graceful path is already gone. + os._exit(1) + + thread = threading.Thread( + target=_watch, + args=(orig_ppid, interval, on_death), + name="parent-death-watcher", + daemon=True, + ) + thread.start() + _started = True diff --git a/backend/python/common/parent_watch_test.py b/backend/python/common/parent_watch_test.py new file mode 100644 index 000000000000..da37eb3ec78a --- /dev/null +++ b/backend/python/common/parent_watch_test.py @@ -0,0 +1,150 @@ +"""Unit tests for the parent-death watcher (parent_watch.py). + +Run standalone (Python standard library only, no backend venv needed): + python3 -m unittest parent_watch_test + +The core test (test_detects_reparent) builds a genuine two-level process tree +(test -> middle -> grandchild) with os.fork, lets the middle process die, and +asserts the grandchild's parent_watch._watch detects the reparenting and +self-terminates — mirroring the Go test in pkg/grpc/parentwatch_test.go and the +C++ test in backend/cpp/llama-cpp/parent_watch_test.cpp. +""" + +import os +import sys +import tempfile +import threading +import time +import unittest + +import parent_watch + + +class TestParentWatchEnvParsing(unittest.TestCase): + def setUp(self): + self._saved = { + k: os.environ.get(k) + for k in (parent_watch.ENV_PARENT_WATCH, parent_watch.ENV_PARENT_WATCH_INTERVAL) + } + for k in self._saved: + os.environ.pop(k, None) + + def tearDown(self): + for k, v in self._saved.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v + + def test_interval_default(self): + self.assertEqual(parent_watch._interval_seconds(), 2.0) + + def test_interval_units(self): + cases = {"500ms": 0.5, "2s": 2.0, "1m": 60.0, "3": 3.0, "0.5s": 0.5} + for raw, expected in cases.items(): + os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = raw + self.assertAlmostEqual(parent_watch._interval_seconds(), expected, msg=raw) + + def test_interval_garbage_falls_back(self): + os.environ[parent_watch.ENV_PARENT_WATCH_INTERVAL] = "garbage" + self.assertEqual(parent_watch._interval_seconds(), 2.0) + + @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only") + def test_enabled_default(self): + self.assertTrue(parent_watch._enabled()) + + @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only") + def test_disabled_by_falsey(self): + for val in ("false", "0", "no", "off", "OFF", " False "): + os.environ[parent_watch.ENV_PARENT_WATCH] = val + self.assertFalse(parent_watch._enabled(), msg=val) + + @unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "POSIX only") + def test_enabled_by_truthy(self): + for val in ("true", "1", "yes", "on"): + os.environ[parent_watch.ENV_PARENT_WATCH] = val + self.assertTrue(parent_watch._enabled(), msg=val) + + +@unittest.skipIf(os.name == "nt" or sys.platform.startswith("win"), "fork/reparent is POSIX only") +class TestParentWatchReparent(unittest.TestCase): + def _wait_for_file(self, path, timeout=10.0): + deadline = time.time() + timeout + while time.time() < deadline: + if os.path.exists(path): + return True + time.sleep(0.02) + return False + + def test_detects_reparent(self): + tmpdir = tempfile.mkdtemp(prefix="parentwatch_test_") + ready_file = os.path.join(tmpdir, "ready") + exited_file = os.path.join(tmpdir, "exited") + + middle = os.fork() + if middle == 0: + # ---- middle process ---- + grandchild = os.fork() + if grandchild == 0: + # ---- grandchild process: arm the REAL watcher against middle ---- + orig_ppid = os.getppid() + + def on_death(): + with open(exited_file, "w") as f: + f.write("1") + os._exit(7) + + threading.Thread( + target=parent_watch._watch, + args=(orig_ppid, 0.05, on_death), + daemon=True, + ).start() + + # Safety valve: never linger if something goes wrong. + def bail(): + time.sleep(30) + os._exit(2) + + threading.Thread(target=bail, daemon=True).start() + + # Signal readiness only after the watcher captured orig_ppid. + with open(ready_file, "w") as f: + f.write(str(os.getpid())) + while True: + time.sleep(1) + else: + # middle: wait until grandchild is ready, then exit to orphan it. + if not self._wait_for_file(ready_file): + os._exit(5) + os._exit(0) + + # ---- test (top) process ---- + os.waitpid(middle, 0) # reap middle only; grandchild is orphaned + + self.assertTrue(os.path.exists(ready_file), "grandchild never signaled readiness") + self.assertTrue( + self._wait_for_file(exited_file), + "watcher did not detect parent death within timeout", + ) + + # Best-effort cleanup: kill the grandchild if it somehow survived. + try: + with open(ready_file) as f: + pid = int(f.read().strip()) + if pid > 1: + os.kill(pid, 9) + except (OSError, ValueError): + pass + for p in (ready_file, exited_file): + try: + os.remove(p) + except OSError: + pass + try: + os.rmdir(tmpdir) + except OSError: + pass + + +if __name__ == "__main__": + unittest.main()