diff --git a/docs/bridge-integration/ALERTING.md b/docs/bridge-integration/ALERTING.md
new file mode 100644
index 000000000..712e8f18a
--- /dev/null
+++ b/docs/bridge-integration/ALERTING.md
@@ -0,0 +1,69 @@
+# Bridge Alerting (ENG-361)
+
+Operational alerting for the Bridge integration. When a Bridge signal fails
+(webhook processing, ERPNext audit write, or a Bridge API outage), the
+`alertBridge` function (`src/services/alerts`) fans the alert out to the configured
+destinations.
+
+## Routing
+
+| Severity | PagerDuty (page) | Slack / Mattermost (inform) | Discord (inform) |
+| ------------ | :--------------: | :-------------------------: | :--------------: |
+| **critical** | ✅ | ✅ | ✅ |
+| **warning** | — | ✅ | ✅ |
+
+Delivery is best-effort and fire-and-forget — a failing or unconfigured
+destination never blocks or fails the webhook/request path. **A destination
+with no configured credential is silently skipped**, so channels can be enabled
+incrementally.
+
+## Alert sources
+
+| Source | Severity | Where |
+| ------------------------------------------------- | -------- | ----------------------------------------------------------- |
+| ERPNext audit-write failure (deposit + transfer) | critical | `services/bridge/webhook-server/routes/{deposit,transfer}.ts` |
+| Bridge webhook processing exception | critical | same routes (catch block) |
+| Bridge API outage — 5xx / timeout / network | critical | `services/bridge/client.ts` |
+| IBEX error on a Bridge↔IBEX movement | warning | _follow-up — not yet wired_ |
+
+`4xx` responses from Bridge are normal API rejections and are **not** alerted.
+ +## Configuration + +Three optional env vars, each gating one destination: + +| Env var | Destination | Value | +| ----------------------------- | ------------------- | ------------------------------------------- | +| `ALERT_PAGERDUTY_ROUTING_KEY` | PagerDuty | Events API v2 **integration / routing key** | +| `ALERT_SLACK_WEBHOOK_URL` | Slack or Mattermost | Incoming-webhook URL | +| `ALERT_DISCORD_WEBHOOK_URL` | Discord | Channel webhook URL | + +### How to get each value + +**PagerDuty** — `ALERT_PAGERDUTY_ROUTING_KEY` +1. PagerDuty → **Services** → pick (or create) the service that should page for Bridge. +2. **Integrations** → **Add integration** → **Events API v2**. +3. Copy the **Integration Key** — that is the routing key. + +**Slack** — `ALERT_SLACK_WEBHOOK_URL` +1. Create/choose a Slack app → **Incoming Webhooks** → **Activate**. +2. **Add New Webhook to Workspace** → choose the target channel. +3. Copy the URL (`https://hooks.slack.com/services/...`). + _Mattermost works too_ — it accepts the same `{ text }` payload; use its incoming-webhook URL. + +**Discord** — `ALERT_DISCORD_WEBHOOK_URL` +1. Discord → target channel → **Edit Channel** → **Integrations** → **Webhooks**. +2. **New Webhook** → name it → **Copy Webhook URL**. + +### Where to set them + +- **Local dev:** add to `.env` (and `.env.ci` for CI). +- **Staging / production:** set as environment variables / secrets in the deployment — the same place `MATTERMOST_WEBHOOK_URL` is configured. Treat all three as **secrets**. + +> If none are set, alerting is a no-op (no errors, no delivery) — useful until the channels are provisioned. + +## Verifying in staging (ENG-361 acceptance) + +1. Set at least `ALERT_PAGERDUTY_ROUTING_KEY` and `ALERT_SLACK_WEBHOOK_URL` in staging. +2. Simulate a Bridge webhook failure (e.g. force an ERPNext audit-write error, or replay a malformed transfer webhook). +3. Confirm on-call is paged via PagerDuty **and** a message posts to Slack within ~1 minute. 
diff --git a/src/config/env.ts b/src/config/env.ts index 1216528c0..fe2397f8a 100644 --- a/src/config/env.ts +++ b/src/config/env.ts @@ -124,6 +124,10 @@ export const env = createEnv({ MATTERMOST_WEBHOOK_URL: z.string().min(1).optional(), + ALERT_PAGERDUTY_ROUTING_KEY: z.string().min(1).optional(), + ALERT_SLACK_WEBHOOK_URL: z.string().url().optional(), + ALERT_DISCORD_WEBHOOK_URL: z.string().url().optional(), + PROXY_CHECK_APIKEY: z.string().min(1).optional(), SVIX_SECRET: z.string().optional(), @@ -231,6 +235,10 @@ export const env = createEnv({ MATTERMOST_WEBHOOK_URL: process.env.MATTERMOST_WEBHOOK_URL, + ALERT_PAGERDUTY_ROUTING_KEY: process.env.ALERT_PAGERDUTY_ROUTING_KEY, + ALERT_SLACK_WEBHOOK_URL: process.env.ALERT_SLACK_WEBHOOK_URL, + ALERT_DISCORD_WEBHOOK_URL: process.env.ALERT_DISCORD_WEBHOOK_URL, + PROXY_CHECK_APIKEY: process.env.PROXY_CHECK_APIKEY, SVIX_SECRET: process.env.SVIX_SECRET, diff --git a/src/config/index.ts b/src/config/index.ts index 2b070e1a0..749a0933f 100644 --- a/src/config/index.ts +++ b/src/config/index.ts @@ -188,6 +188,9 @@ export const NEXTCLOUD_URL = env.NEXTCLOUD_URL export const NEXTCLOUD_USER = env.NEXTCLOUD_USER export const NEXTCLOUD_PASSWORD = env.NEXTCLOUD_PASSWORD export const MATTERMOST_WEBHOOK_URL = env.MATTERMOST_WEBHOOK_URL +export const ALERT_PAGERDUTY_ROUTING_KEY = env.ALERT_PAGERDUTY_ROUTING_KEY +export const ALERT_SLACK_WEBHOOK_URL = env.ALERT_SLACK_WEBHOOK_URL +export const ALERT_DISCORD_WEBHOOK_URL = env.ALERT_DISCORD_WEBHOOK_URL export const PROXY_CHECK_APIKEY = env.PROXY_CHECK_APIKEY export const NOSTR_PRIVATE_KEY = env.NOSTR_PRIVATE_KEY
diff --git a/src/services/alerts/discord.ts b/src/services/alerts/discord.ts
new file mode 100644
index 000000000..2dedb6c96
--- /dev/null
+++ b/src/services/alerts/discord.ts
@@ -0,0 +1,47 @@
+import { ALERT_DISCORD_WEBHOOK_URL } from "@config"
+import { ErrorLevel } from "@domain/shared"
+import { recordExceptionInCurrentSpan } from "@services/tracing"
+import axios from "axios"
+
+import { BridgeAlert } from "./index.types"
+
+// Discord caps message content at 2000 chars; leave headroom.
+const DISCORD_CONTENT_MAX = 1900
+
+/**
+ * Posts a Bridge alert to the Discord incoming webhook ({ content }).
+ *
+ * No-op when ALERT_DISCORD_WEBHOOK_URL is unset. Never rejects: a failure to
+ * build or deliver the message is recorded on the current span as a warning.
+ *
+ * @param alert - alert to render; `detail` and `context` are appended when present
+ */
+export const sendDiscord = async (alert: BridgeAlert): Promise<void> => {
+  if (!ALERT_DISCORD_WEBHOOK_URL) return
+
+  try {
+    // Built inside the try: JSON.stringify throws on circular/BigInt context,
+    // and that exception would otherwise escape this function's catch.
+    const icon = alert.severity === "critical" ? "🚨" : "⚠️"
+    let content = `${icon} **Bridge alert** — ${alert.title}\nsource: \`${alert.source}\` · severity: \`${alert.severity}\``
+    if (alert.detail) content += `\n${alert.detail}`
+    if (alert.context) {
+      content += "\n```json\n" + JSON.stringify(alert.context, null, 2) + "\n```"
+    }
+    if (content.length > DISCORD_CONTENT_MAX) {
+      content = content.slice(0, DISCORD_CONTENT_MAX) + "…"
+      // Truncation may cut off the closing fence; an odd fence count means a
+      // code block is still open, so close it to keep the message readable.
+      const fenceCount = content.split("```").length - 1
+      if (fenceCount % 2 === 1) content += "\n```"
+    }
+
+    await axios.post(
+      ALERT_DISCORD_WEBHOOK_URL,
+      { content },
+      { timeout: 5000, headers: { "Content-Type": "application/json" } },
+    )
+  } catch (error) {
+    recordExceptionInCurrentSpan({ error, level: ErrorLevel.Warn })
+  }
+}
diff --git a/src/services/alerts/index.ts b/src/services/alerts/index.ts
new file mode 100644
index 000000000..733dc7e7f
--- /dev/null
+++ b/src/services/alerts/index.ts
@@ -0,0 +1,27 @@
+import { sendPagerDuty } from "./pagerduty"
+import { sendSlack } from "./slack"
+import { sendDiscord } from "./discord"
+import { BridgeAlert } from "./index.types"
+
+export * from "./index.types"
+
+/**
+ * Fire-and-forget fan-out of a Bridge alert to the configured destinations
+ * (ENG-361). Returns immediately; delivery is best-effort — each sender catches
+ * its own errors and no-ops when its credential/URL is unset, so it never throws
+ * or rejects into the caller (no need to await or handle it).
+ *
+ * Routing:
+ * - critical → page on-call (PagerDuty) + inform (Slack/Mattermost, Discord)
+ * - warning → inform (Slack/Mattermost, Discord) only
+ */
+export const alertBridge = (alert: BridgeAlert): void => {
+  const deliver = async () => {
+    const senders = [sendSlack(alert), sendDiscord(alert)]
+    if (alert.severity === "critical") {
+      senders.push(sendPagerDuty(alert))
+    }
+    await Promise.allSettled(senders)
+  }
+  deliver().catch(() => undefined)
+}
diff --git a/src/services/alerts/index.types.ts b/src/services/alerts/index.types.ts
new file mode 100644
index 000000000..1c6c5d651
--- /dev/null
+++ b/src/services/alerts/index.types.ts
@@ -0,0 +1,19 @@
+// Ops alerting for Bridge integration signals (ENG-361).
+
+/** "critical" pages on-call via PagerDuty in addition to chat; "warning" goes to chat only. */
+export type AlertSeverity = "critical" | "warning"
+
+/** Subsystem that raised the alert. */
+export type AlertSource = "bridge-webhook" | "bridge-api" | "ibex" | "erpnext-audit"
+
+/** Payload handed to `alertBridge` and rendered by each destination sender. */
+export interface BridgeAlert {
+  source: AlertSource
+  severity: AlertSeverity
+  /** One-line summary used as the headline of the alert. */
+  title: string
+  /** Optional free-text detail, e.g. the underlying error message. */
+  detail?: string
+  /** Optional structured data; senders JSON-encode it, so keep it JSON-serializable. */
+  context?: Record<string, unknown>
+}
diff --git a/src/services/alerts/pagerduty.ts b/src/services/alerts/pagerduty.ts
new file mode 100644
index 000000000..da6acffcc
--- /dev/null
+++ b/src/services/alerts/pagerduty.ts
@@ -0,0 +1,33 @@
+import { ALERT_PAGERDUTY_ROUTING_KEY } from "@config"
+import { ErrorLevel } from "@domain/shared"
+import { recordExceptionInCurrentSpan } from "@services/tracing"
+import axios from "axios"
+
+import { BridgeAlert } from "./index.types"
+
+const PAGERDUTY_EVENTS_URL = "https://events.pagerduty.com/v2/enqueue"
+
+// PagerDuty Events API v2 — triggers a paging incident. "critical" and
+// "warning" are both valid PD payload severities, so we pass them through.
+export const sendPagerDuty = async (alert: BridgeAlert): Promise<void> => {
+  if (!ALERT_PAGERDUTY_ROUTING_KEY) return // destination not configured: skip
+
+  try {
+    await axios.post(
+      PAGERDUTY_EVENTS_URL,
+      {
+        routing_key: ALERT_PAGERDUTY_ROUTING_KEY,
+        event_action: "trigger",
+        payload: {
+          summary: `[bridge:${alert.source}] ${alert.title}`,
+          severity: alert.severity,
+          source: "flash-bridge",
+          custom_details: { ...alert.context, detail: alert.detail }, // `detail` overrides a same-named context key
+        },
+      },
+      { timeout: 5000, headers: { "Content-Type": "application/json" } },
+    )
+  } catch (error) {
+    recordExceptionInCurrentSpan({ error, level: ErrorLevel.Warn })
+  }
+}
diff --git a/src/services/alerts/slack.ts b/src/services/alerts/slack.ts
new file mode 100644
index 000000000..a4c9456f4
--- /dev/null
+++ b/src/services/alerts/slack.ts
@@ -0,0 +1,41 @@
+import { ALERT_SLACK_WEBHOOK_URL } from "@config"
+import { ErrorLevel } from "@domain/shared"
+import { recordExceptionInCurrentSpan } from "@services/tracing"
+import axios from "axios"
+
+import { BridgeAlert } from "./index.types"
+
+/**
+ * Posts a Bridge alert to a Slack / Mattermost-compatible incoming webhook
+ * ({ text }).
+ *
+ * No-op when ALERT_SLACK_WEBHOOK_URL is unset. Never rejects: a failure to
+ * build or deliver the message is recorded on the current span as a warning.
+ *
+ * @param alert - alert to render; `detail` and `context` are appended when present
+ */
+export const sendSlack = async (alert: BridgeAlert): Promise<void> => {
+  if (!ALERT_SLACK_WEBHOOK_URL) return
+
+  try {
+    // Built inside the try: JSON.stringify throws on circular/BigInt context,
+    // and that exception would otherwise escape this function's catch.
+    const icon = alert.severity === "critical" ? ":rotating_light:" : ":warning:"
+    const lines = [
+      `${icon} *Bridge alert* — ${alert.title}`,
+      `*source:* \`${alert.source}\` *severity:* \`${alert.severity}\``,
+    ]
+    if (alert.detail) lines.push(alert.detail)
+    if (alert.context) {
+      lines.push("```" + JSON.stringify(alert.context, null, 2) + "```")
+    }
+
+    await axios.post(
+      ALERT_SLACK_WEBHOOK_URL,
+      { text: lines.join("\n") },
+      { timeout: 5000, headers: { "Content-Type": "application/json" } },
+    )
+  } catch (error) {
+    recordExceptionInCurrentSpan({ error, level: ErrorLevel.Warn })
+  }
+}
diff --git a/src/services/bridge/client.ts b/src/services/bridge/client.ts index 363e535bf..2494b2a51 100644 --- a/src/services/bridge/client.ts +++ b/src/services/bridge/client.ts @@ -8,6 +8,7 @@ import crypto from "crypto" import { BridgeConfig } from "@config" import { BridgeCustomerId, BridgeTransferId, BridgeVirtualAccountId } from "@domain/primitives/bridge" +import { alertBridge } from "@services/alerts" import { BridgeTimeoutError } from "./errors" // ============ Error Handling ============ @@ -379,6 +380,16 @@ export class BridgeClient { const responseData = await response.json().catch(() => null) if (!response.ok) { + // Only 5xx indicates a Bridge-side outage; 4xx are normal API rejections.
+ if (response.status >= 500) { + alertBridge({ + source: "bridge-api", + severity: "critical", + title: `Bridge API ${response.status} on ${method} ${path}`, + detail: response.statusText, + context: { method, path, status: response.status }, + }) + } throw new BridgeApiError( `Bridge API error: ${response.status} ${response.statusText}`, response.status, @@ -389,8 +400,24 @@ export class BridgeClient { return responseData as T } catch (err) { if (err instanceof Error && err.name === "AbortError") { + alertBridge({ + source: "bridge-api", + severity: "critical", + title: `Bridge API timeout on ${method} ${path}`, + context: { method, path, timeoutMs }, + }) throw new BridgeTimeoutError() } + // Network/connectivity failures (5xx already alerted above). + if (!(err instanceof BridgeApiError)) { + alertBridge({ + source: "bridge-api", + severity: "critical", + title: `Bridge API request failed on ${method} ${path}`, + detail: err instanceof Error ? err.message : String(err), + context: { method, path }, + }) + } throw err } finally { clearTimeout(timeoutId) diff --git a/src/services/bridge/webhook-server/routes/deposit.ts b/src/services/bridge/webhook-server/routes/deposit.ts index 1105ac2be..36f6aed55 100644 --- a/src/services/bridge/webhook-server/routes/deposit.ts +++ b/src/services/bridge/webhook-server/routes/deposit.ts @@ -12,6 +12,7 @@ import { baseLogger } from "@services/logger" import { createBridgeDeposit } from "@services/mongoose/bridge-deposit-log" import { reconcileByTxHash } from "@services/bridge/reconciliation" import { writeBridgeDepositRequest } from "@services/frappe/BridgeTransferRequestWriter" +import { alertBridge } from "@services/alerts" export const depositHandler = async (req: Request, res: Response) => { const { event_id, event_object } = req.body @@ -87,6 +88,13 @@ export const depositHandler = async (req: Request, res: Response) => { { error: auditResult, event_id, id }, "Failed to persist Bridge deposit ERPNext audit row", ) + 
alertBridge({ + source: "erpnext-audit", + severity: "critical", + title: "Bridge deposit ERPNext audit write failed", + detail: auditResult.message, + context: { event_id, transfer_id: id }, + }) return res.status(500).json({ error: "Failed to persist ERPNext audit row" }) } @@ -102,6 +110,13 @@ export const depositHandler = async (req: Request, res: Response) => { return res.status(200).json({ status: "success" }) } catch (error) { baseLogger.error({ error, id, event_id }, "Error processing Bridge deposit webhook") + alertBridge({ + source: "bridge-webhook", + severity: "critical", + title: "Bridge deposit webhook processing error", + detail: error instanceof Error ? error.message : String(error), + context: { event_id, transfer_id: id }, + }) return res.status(500).json({ error: "Internal server error" }) } } diff --git a/src/services/bridge/webhook-server/routes/transfer.ts b/src/services/bridge/webhook-server/routes/transfer.ts index 88f00fad8..3ea8d3d2a 100644 --- a/src/services/bridge/webhook-server/routes/transfer.ts +++ b/src/services/bridge/webhook-server/routes/transfer.ts @@ -13,6 +13,7 @@ import { writeBridgeCashoutCompleted, writeBridgeCashoutFailed, } from "@services/frappe/BridgeTransferRequestWriter" +import { alertBridge } from "@services/alerts" const TERMINAL_FAILURE_STATES = new Set([ "undeliverable", @@ -121,6 +122,13 @@ export const transferHandler = async (req: Request, res: Response) => { { transfer_id, error: auditResult }, "Failed to persist Bridge transfer ERPNext audit row", ) + alertBridge({ + source: "erpnext-audit", + severity: "critical", + title: "Bridge transfer ERPNext audit write failed", + detail: auditResult.message, + context: { transfer_id, event }, + }) return res.status(500).json({ error: "Failed to persist ERPNext audit row" }) } @@ -196,6 +204,13 @@ export const transferHandler = async (req: Request, res: Response) => { { transfer_id, error: auditResult }, "Failed to persist Bridge transfer failure ERPNext audit row", ) 
+ alertBridge({ + source: "erpnext-audit", + severity: "critical", + title: "Bridge transfer-failure ERPNext audit write failed", + detail: auditResult.message, + context: { transfer_id, event }, + }) return res.status(500).json({ error: "Failed to persist ERPNext audit row" }) } @@ -216,6 +231,13 @@ export const transferHandler = async (req: Request, res: Response) => { return res.status(200).json({ status: "success" }) } catch (error) { baseLogger.error({ error, transfer_id }, "Error processing Bridge transfer webhook") + alertBridge({ + source: "bridge-webhook", + severity: "critical", + title: "Bridge transfer webhook processing error", + detail: error instanceof Error ? error.message : String(error), + context: { transfer_id, event }, + }) return res.status(500).json({ error: "Internal server error" }) } }