diff --git a/.changeset/mcp-search-denoise.md b/.changeset/mcp-search-denoise.md new file mode 100644 index 0000000000..53b5495980 --- /dev/null +++ b/.changeset/mcp-search-denoise.md @@ -0,0 +1,20 @@ +--- +'@hyperdx/api': patch +'@hyperdx/app': patch +'@hyperdx/common-utils': patch +--- + +feat(mcp): add denoise option to clickstack_search tool + +Add a `denoise` boolean parameter to the MCP `clickstack_search` tool that +automatically filters out high-frequency repetitive event patterns from +search results, mirroring the web app's "Denoise Results" feature. + +When enabled, the tool samples 10k random events, mines patterns using +the Drain algorithm, identifies noisy patterns (>10% of sample), and +filters them out of result rows. Returns filtered rows plus metadata +listing removed patterns with estimated counts. + +Extracts shared denoise constants (`DENOISE_SAMPLE_SIZE`, +`DENOISE_NOISE_THRESHOLD`) into `@hyperdx/common-utils` so the web app +and MCP server use the same values. diff --git a/packages/api/src/mcp/__tests__/queryTool.test.ts b/packages/api/src/mcp/__tests__/queryTool.test.ts index 4cdfd6563a..a63571f1d7 100644 --- a/packages/api/src/mcp/__tests__/queryTool.test.ts +++ b/packages/api/src/mcp/__tests__/queryTool.test.ts @@ -357,6 +357,110 @@ describe('MCP Query Tools', () => { expect(result.isError).toBe(true); expect(getFirstText(result)).toMatch(/sourceId/i); }); + + it('should expose denoise property in schema', async () => { + const { tools } = await client.listTools(); + const tool = tools.find(t => t.name === 'clickstack_search'); + expect(tool).toBeDefined(); + const props = Object.keys(tool!.inputSchema.properties ?? 
{}); + expect(props).toContain('denoise'); + }); + + it('should return a result without error when denoise=true on empty results', async () => { + const result = await callTool(client, 'clickstack_search', { + sourceId: logSource._id.toString(), + denoise: true, + startTime: new Date(Date.now() - 60 * 60 * 1000).toISOString(), + endTime: new Date().toISOString(), + }); + + expect(result.isError).toBeFalsy(); + // With no data, the denoised block should not appear because the + // search result itself has no rows to process (early return path). + const output = JSON.parse(getFirstText(result)); + expect(output).toHaveProperty('result'); + }); + + describe('denoise with seeded data', () => { + const now = new Date(); + const fiveMinAgo = new Date(now.getTime() - 5 * 60 * 1000); + + beforeEach(async () => { + const logs: Parameters[0] = []; + + // Noisy pattern: "Health check OK from " — 80 rows (>10% threshold) + for (let i = 0; i < 80; i++) { + logs.push({ + Body: `Health check OK from 10.0.${Math.floor(i / 256)}.${i % 256}`, + ServiceName: 'loadbalancer', + SeverityText: 'INFO', + Timestamp: new Date(fiveMinAgo.getTime() + i * 100), + }); + } + + // Unique/rare events — 5 rows (well below 10% threshold) + for (let i = 0; i < 5; i++) { + logs.push({ + Body: `Rare event type ${String.fromCharCode(65 + i)} occurred in subsystem`, + ServiceName: 'worker', + SeverityText: 'WARN', + Timestamp: new Date(fiveMinAgo.getTime() + (80 + i) * 1000), + }); + } + + await bulkInsertLogs(logs); + }); + + it('should filter noisy patterns and emit denoised metadata', async () => { + const result = await callTool(client, 'clickstack_search', { + sourceId: logSource._id.toString(), + denoise: true, + maxResults: 200, + startTime: new Date(now.getTime() - 10 * 60 * 1000).toISOString(), + endTime: new Date(now.getTime() + 60 * 1000).toISOString(), + }); + + expect(result.isError).toBeFalsy(); + const output = JSON.parse(getFirstText(result)); + + // Must have a denoised block + 
expect(output).toHaveProperty('denoised'); + expect(output.denoised).toHaveProperty('removedPatterns'); + expect(output.denoised).toHaveProperty('returnedRowCountBeforeDenoise'); + expect(output.denoised).toHaveProperty('filteredRowCount'); + + // Should not have a skipped reason + expect(output.denoised.skipped).toBeUndefined(); + + // The noisy health check pattern should be in removedPatterns + expect(output.denoised.removedPatterns.length).toBeGreaterThanOrEqual( + 1, + ); + const healthPattern = output.denoised.removedPatterns.find( + (p: { pattern: string }) => p.pattern.includes('Health check'), + ); + expect(healthPattern).toBeDefined(); + + // Filtered count should be less than original + expect(output.denoised.filteredRowCount).toBeLessThan( + output.denoised.returnedRowCountBeforeDenoise, + ); + }); + + it('should return results without denoised block when denoise=false', async () => { + const result = await callTool(client, 'clickstack_search', { + sourceId: logSource._id.toString(), + denoise: false, + maxResults: 200, + startTime: new Date(now.getTime() - 10 * 60 * 1000).toISOString(), + endTime: new Date(now.getTime() + 60 * 1000).toISOString(), + }); + + expect(result.isError).toBeFalsy(); + const output = JSON.parse(getFirstText(result)); + expect(output).not.toHaveProperty('denoised'); + }); + }); }); // ─── clickstack_event_patterns ───────────────────────────────────────────────── diff --git a/packages/api/src/mcp/tools/query/denoise.ts b/packages/api/src/mcp/tools/query/denoise.ts new file mode 100644 index 0000000000..00d5f1bff2 --- /dev/null +++ b/packages/api/src/mcp/tools/query/denoise.ts @@ -0,0 +1,274 @@ +import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node'; +import { getMetadata } from '@hyperdx/common-utils/dist/core/metadata'; +import { getFirstTimestampValueExpression } from '@hyperdx/common-utils/dist/core/utils'; +import { + DENOISE_NOISE_THRESHOLD, + DENOISE_SAMPLE_SIZE, + flattenBody, + minePatterns, + 
TemplateMiner, + TemplateMinerConfig, +} from '@hyperdx/common-utils/dist/drain'; +import type { ChartConfigWithDateRange } from '@hyperdx/common-utils/dist/types'; +import { DisplayType } from '@hyperdx/common-utils/dist/types'; + +import { getConnectionById } from '@/controllers/connection'; +import { getSource } from '@/controllers/sources'; + +import { resolveBodyExpression } from './helpers'; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +export interface DenoiseResult { + /** Filtered rows with noisy patterns removed. */ + rows: Record[]; + /** Patterns that were identified as noisy and removed. */ + removedPatterns: Array<{ + pattern: string; + estimatedCount: number; + sampleCount: number; + }>; + /** + * When present, denoising was skipped and rows are returned unmodified (the field is omitted when denoising ran). + * The value names the reason (e.g. "body_column_not_in_results"). + */ + skipped?: string; +} + +// ─── Core denoising function ───────────────────────────────────────────────── + +/** + * Denoise search results by mining patterns from a random sample, identifying + * "noisy" patterns (those above DENOISE_NOISE_THRESHOLD, currently >10% of the sample), and filtering + * them out of the result rows. + * + * This mirrors the web app's "Denoise Results" feature + * (packages/app/src/components/DBRowTable.tsx) but runs server-side using + * the shared TypeScript Drain implementation. 
+ */ +export async function denoiseSearchResults( + teamId: string, + sourceId: string, + startDate: Date, + endDate: Date, + rows: Record[], + options?: { + where?: string; + whereLanguage?: 'lucene' | 'sql'; + }, +): Promise { + if (rows.length === 0) { + return { rows, removedPatterns: [], skipped: 'no_rows' }; + } + + // ── Resolve source & connection ── + const source = await getSource(teamId, sourceId); + if (!source) { + return { rows, removedPatterns: [], skipped: 'source_not_found' }; + } + + const bodyColumn = resolveBodyExpression(source); + if (!bodyColumn) { + return { rows, removedPatterns: [], skipped: 'no_body_column' }; + } + + const connection = await getConnectionById( + teamId, + source.connection.toString(), + true, + ); + if (!connection) { + return { rows, removedPatterns: [], skipped: 'connection_not_found' }; + } + + const clickhouseClient = new ClickhouseClient({ + host: connection.host, + username: connection.username, + password: connection.password, + }); + const metadata = getMetadata(clickhouseClient); + + const tsExpr = getFirstTimestampValueExpression( + source.timestampValueExpression, + ); + const implicitColumn = + 'implicitColumnExpression' in source + ? source.implicitColumnExpression + : undefined; + const useTextIndexForImplicitColumn = + 'useTextIndexForImplicitColumn' in source + ? source.useTextIndexForImplicitColumn + : undefined; + + // ── Query: Random sample of events for pattern learning ── + const sampleConfig = { + displayType: DisplayType.Search, + source: source._id.toString(), + select: `${bodyColumn} as __hdx_pattern_body, ${tsExpr} as __hdx_pattern_ts`, + from: { + databaseName: source.from.databaseName, + tableName: source.from.tableName, + }, + where: options?.where ?? '', + whereLanguage: options?.whereLanguage ?? 
('lucene' as const), + connection: source.connection.toString(), + timestampValueExpression: source.timestampValueExpression, + implicitColumnExpression: implicitColumn, + useTextIndexForImplicitColumn, + orderBy: [{ ordering: 'DESC' as const, valueExpression: 'rand()' }], + limit: { limit: DENOISE_SAMPLE_SIZE, offset: 0 }, + dateRange: [startDate, endDate] as [Date, Date], + } satisfies ChartConfigWithDateRange; + + // ── Query: Total count for sample multiplier ── + const countConfig = { + displayType: DisplayType.Table, + source: source._id.toString(), + select: 'count() as total', + from: { + databaseName: source.from.databaseName, + tableName: source.from.tableName, + }, + where: options?.where ?? '', + whereLanguage: options?.whereLanguage ?? ('lucene' as const), + connection: source.connection.toString(), + timestampValueExpression: source.timestampValueExpression, + implicitColumnExpression: implicitColumn, + useTextIndexForImplicitColumn, + limit: { limit: 1, offset: 0 }, + dateRange: [startDate, endDate] as [Date, Date], + } satisfies ChartConfigWithDateRange; + + let sampleResult: Awaited< + ReturnType + >; + let countResult: Awaited< + ReturnType + >; + try { + [sampleResult, countResult] = await Promise.all([ + clickhouseClient.queryChartConfig({ + config: sampleConfig, + metadata, + querySettings: source.querySettings, + opts: { clickhouse_settings: { max_execution_time: 30 } }, + }), + clickhouseClient.queryChartConfig({ + config: countConfig, + metadata, + querySettings: source.querySettings, + opts: { clickhouse_settings: { max_execution_time: 30 } }, + }), + ]); + } catch { + // If sampling fails, return rows unmodified rather than failing the search + return { rows, removedPatterns: [], skipped: 'sampling_failed' }; + } + + const sampleRows = sampleResult.data; + const totalCount = Number(countResult.data?.[0]?.total ?? 
0); + + if (!sampleRows || sampleRows.length === 0) { + return { rows, removedPatterns: [], skipped: 'no_sample_data' }; + } + + // ── Mine patterns from the sample ── + // Note: maxSamples: 1 — minePatterns always keeps at least one sample per + // cluster internally; we just minimize memory overhead. + const { patterns } = minePatterns(sampleRows, { + totalCount, + startDate, + endDate, + maxSamples: 1, + getBody: row => { + const raw = row.__hdx_pattern_body; + return raw != null ? String(raw) : ''; + }, + getTimestamp: row => { + const tsRaw = row.__hdx_pattern_ts; + return tsRaw != null ? new Date(String(tsRaw)).getTime() : null; + }, + }); + + if (patterns.length === 0) { + return { rows, removedPatterns: [] }; + } + + // ── Identify noisy patterns (>10% of sampled events) ── + // Key by template string rather than cluster ID so we are not coupled to + // the auto-incrementing IDs generated inside minePatterns(). The matching + // miner below produces its own IDs; comparing template strings is stable. + const sampledRowCount = sampleRows.length; + const noisyTemplates = new Set(); + const removedPatterns: DenoiseResult['removedPatterns'] = []; + + for (const p of patterns) { + if (p.sampleCount / sampledRowCount > DENOISE_NOISE_THRESHOLD) { + noisyTemplates.add(p.pattern); + removedPatterns.push({ + pattern: p.pattern, + estimatedCount: p.estimatedCount, + sampleCount: p.sampleCount, + }); + } + } + + if (noisyTemplates.size === 0) { + return { rows, removedPatterns: [] }; + } + + // ── Build a miner trained on the same sample for row matching ── + const drainConfig = new TemplateMinerConfig(); + const miner = new TemplateMiner(drainConfig); + for (const row of sampleRows) { + const raw = row.__hdx_pattern_body; + const bodyText = flattenBody(raw != null ? 
String(raw) : ''); + miner.addLogMessage(bodyText); + } + + // ── Match each result row and filter out noisy ones ── + const bodyColumnKey = findBodyColumnKey(rows[0], bodyColumn); + if (!bodyColumnKey) { + return { + rows, + removedPatterns: [], + skipped: 'body_column_not_in_results', + }; + } + + const filteredRows = rows.filter(row => { + const bodyValue = row[bodyColumnKey]; + if (bodyValue == null) return true; // Keep rows with no body + const bodyText = flattenBody(String(bodyValue)); + const match = miner.match(bodyText, 'fallback'); + if (!match) return true; // No pattern match — keep the row + return !noisyTemplates.has(match.getTemplate()); + }); + + return { + rows: filteredRows, + removedPatterns, + }; +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +/** + * Find the key in a result row that corresponds to the body column expression. + * Tries an exact key lookup first, then a case-insensitive one; returns null when neither matches (no alias resolution is attempted). + */ +function findBodyColumnKey( + row: Record, + bodyColumn: string, +): string | null { + // Direct match (e.g. 
"Body", "SpanName") + if (bodyColumn in row) return bodyColumn; + + // Case-insensitive match + const lowerBody = bodyColumn.toLowerCase(); + for (const key of Object.keys(row)) { + if (key.toLowerCase() === lowerBody) return key; + } + + return null; +} diff --git a/packages/api/src/mcp/tools/query/helpers.ts b/packages/api/src/mcp/tools/query/helpers.ts index c87fc82066..c6782a4227 100644 --- a/packages/api/src/mcp/tools/query/helpers.ts +++ b/packages/api/src/mcp/tools/query/helpers.ts @@ -1,6 +1,9 @@ import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node'; import { getMetadata } from '@hyperdx/common-utils/dist/core/metadata'; -import { getFirstTimestampValueExpression } from '@hyperdx/common-utils/dist/core/utils'; +import { + getFirstTimestampValueExpression, + splitAndTrimWithBracket, +} from '@hyperdx/common-utils/dist/core/utils'; import { isBuilderSavedChartConfig, isRawSqlSavedChartConfig, @@ -27,6 +30,37 @@ import { trimToolResponse } from '@/utils/trimToolResponse'; import type { ExternalDashboardTileWithId } from '@/utils/zod'; import { externalDashboardTileSchemaWithId } from '@/utils/zod'; +// ─── Source body expression helpers ────────────────────────────────────────── + +export interface SourceBodyFields { + kind: string; + spanNameExpression?: string; + bodyExpression?: string; + implicitColumnExpression?: string; +} + +/** + * Resolve the body column expression for pattern mining from a source. + * Mirrors the web app's getEventBody() logic (packages/app/src/source.ts). + */ +export function resolveBodyExpression( + source: SourceBodyFields, +): string | undefined { + let expression: string | undefined; + if (source.kind === SourceKind.Trace) { + expression = source.spanNameExpression; + } else if (source.kind === SourceKind.Log) { + expression = source.bodyExpression ?? 
source.implicitColumnExpression; + } + if (!expression) return undefined; + const multiExpr = splitAndTrimWithBracket(expression); + return multiExpr.length === 1 ? expression : multiExpr[0]; +} + +/** Reject bodyExpression values containing SQL-unsafe characters. */ +// eslint-disable-next-line no-useless-escape +export const SAFE_BODY_EXPR_CHARS = /^[\w.':\[\]\-]+$/; + // ─── Safety limits ─────────────────────────────────────────────────────────── /** ClickHouse settings applied to all MCP query-tool executions. */ diff --git a/packages/api/src/mcp/tools/query/runEventPatterns.ts b/packages/api/src/mcp/tools/query/runEventPatterns.ts index 47f757cea6..7fb9a7617d 100644 --- a/packages/api/src/mcp/tools/query/runEventPatterns.ts +++ b/packages/api/src/mcp/tools/query/runEventPatterns.ts @@ -6,42 +6,17 @@ import { } from '@hyperdx/common-utils/dist/core/utils'; import { minePatterns } from '@hyperdx/common-utils/dist/drain'; import type { ChartConfigWithDateRange } from '@hyperdx/common-utils/dist/types'; -import { DisplayType, SourceKind } from '@hyperdx/common-utils/dist/types'; +import { DisplayType } from '@hyperdx/common-utils/dist/types'; import { getConnectionById } from '@/controllers/connection'; import { getSource } from '@/controllers/sources'; import { trimToolResponse } from '@/utils/trimToolResponse'; -import { clickHouseErrorResult } from './helpers'; - -// ─── Source helpers ────────────────────────────────────────────────────────── - -interface SourceBodyFields { - kind: string; - spanNameExpression?: string; - bodyExpression?: string; - implicitColumnExpression?: string; -} - -/** - * Resolve the body column expression for pattern mining from a source. - * Mirrors the web app's getEventBody() logic (packages/app/src/source.ts). 
- */ -function resolveBodyExpression(source: SourceBodyFields): string | undefined { - let expression: string | undefined; - if (source.kind === SourceKind.Trace) { - expression = source.spanNameExpression; - } else if (source.kind === SourceKind.Log) { - expression = source.bodyExpression ?? source.implicitColumnExpression; - } - if (!expression) return undefined; - const multiExpr = splitAndTrimWithBracket(expression); - return multiExpr.length === 1 ? expression : multiExpr[0]; -} - -/** Reject bodyExpression values containing SQL-unsafe characters. */ -// eslint-disable-next-line no-useless-escape -const SAFE_BODY_EXPR_CHARS = /^[\w.':\[\]\-]+$/; +import { + clickHouseErrorResult, + resolveBodyExpression, + SAFE_BODY_EXPR_CHARS, +} from './helpers'; // ─── Event pattern mining ──────────────────────────────────────────────────── diff --git a/packages/api/src/mcp/tools/query/search.ts b/packages/api/src/mcp/tools/query/search.ts index 624639ffd7..6625b883dd 100644 --- a/packages/api/src/mcp/tools/query/search.ts +++ b/packages/api/src/mcp/tools/query/search.ts @@ -1,8 +1,12 @@ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; import { z } from 'zod'; +import logger from '@/utils/logger'; +import { trimToolResponse } from '@/utils/trimToolResponse'; + import { withToolTracing } from '../../utils/tracing'; import type { McpContext } from '../types'; +import { denoiseSearchResults } from './denoise'; import { buildTile, parseTimeRange, runConfigTile } from './helpers'; import { endTimeSchema, @@ -36,6 +40,16 @@ const searchSchema = z.object({ 'Maximum number of rows to return (1-200). Default: 50. ' + 'Use smaller values to reduce response size.', ), + denoise: z + .boolean() + .optional() + .default(false) + .describe( + 'When true, automatically removes events matching high-frequency patterns ' + + '(those accounting for >10% of sampled events) from the results. 
' + + 'Useful for cutting through log noise to find unusual or interesting events. ' + + 'Adds ~1-2s of latency for the pattern sampling queries.', + ), startTime: startTimeSchema, endTime: endTimeSchema, }); @@ -56,6 +70,8 @@ export function registerSearch(server: McpServer, context: McpContext) { 'Requires sourceId — call clickstack_list_sources then clickstack_describe_source first.\n\n' + 'For aggregated metrics, use clickstack_table instead. ' + 'For pattern discovery, use clickstack_event_patterns instead.\n\n' + + 'Set denoise=true to automatically filter out high-frequency repetitive patterns, ' + + 'surfacing only unusual or interesting events.\n\n' + 'Column naming: top-level columns are PascalCase (Duration, StatusCode). ' + "Map attributes use bracket syntax: SpanAttributes['http.method'].", inputSchema: searchSchema, @@ -78,9 +94,132 @@ export function registerSearch(server: McpServer, context: McpContext) { whereLanguage: input.whereLanguage, }); - return runConfigTile(teamId.toString(), tile, startDate, endDate, { - maxResults: input.maxResults, - }); + const result = await runConfigTile( + teamId.toString(), + tile, + startDate, + endDate, + { + maxResults: input.maxResults, + }, + ); + + // ── Denoising post-processing ── + if (!input.denoise || ('isError' in result && result.isError)) { + return result; + } + + // Extract the raw result data from the formatted response. + // runConfigTile returns { content: [{ type: "text", text: JSON }] }. 
+ const resultText = result.content?.[0]?.text; + if (!resultText) return result; + + let parsed: { result?: { data?: Record[] } }; + try { + parsed = JSON.parse(resultText); + } catch { + return result; + } + + const resultData = parsed.result; + const rows = (resultData as Record | undefined)?.data as + | Record[] + | undefined; + if (!rows || !Array.isArray(rows) || rows.length === 0) { + return result; + } + + let denoised; + try { + denoised = await denoiseSearchResults( + teamId.toString(), + input.sourceId, + startDate, + endDate, + rows, + { + where: input.where, + whereLanguage: input.whereLanguage, + }, + ); + } catch (err) { + // Denoise is a post-processing enhancement — a failure here must + // never discard the already-successful search result. + logger.warn( + { err, sourceId: input.sourceId }, + 'denoiseSearchResults failed; returning raw results', + ); + + const { data: trimmedResult, isTrimmed } = trimToolResponse(resultData); + return { + content: [ + { + type: 'text' as const, + text: JSON.stringify( + { + result: trimmedResult, + denoised: { + removedPatterns: [], + returnedRowCountBeforeDenoise: rows.length, + filteredRowCount: rows.length, + skipped: 'denoise_failed', + }, + ...(isTrimmed + ? { + note: 'Result was trimmed for context size. Narrow the time range or add filters to reduce data.', + } + : {}), + }, + null, + 2, + ), + }, + ], + }; + } + + // Replace rows in the result with denoised rows and add metadata. + // Always emit a `denoised` block when denoise=true so callers can + // distinguish "no noisy patterns" from "denoise was not requested". 
+ const denoisedResult = { + ...resultData, + data: denoised.rows, + }; + const { data: trimmedResult, isTrimmed } = + trimToolResponse(denoisedResult); + + return { + content: [ + { + type: 'text' as const, + text: JSON.stringify( + { + result: trimmedResult, + denoised: { + removedPatterns: denoised.removedPatterns, + // rows.length is the count returned by runConfigTile + // (already subject to maxResults and trim limits). + returnedRowCountBeforeDenoise: rows.length, + filteredRowCount: denoised.rows.length, + ...(denoised.skipped ? { skipped: denoised.skipped } : {}), + }, + ...(isTrimmed + ? { + note: 'Result was trimmed for context size. Narrow the time range or add filters to reduce data.', + } + : {}), + ...(denoised.rows.length === 0 && !denoised.skipped + ? { + hint: 'All events matched noisy patterns and were removed. Try narrowing filters or disabling denoise to see all events.', + } + : {}), + }, + null, + 2, + ), + }, + ], + }; }), ); } diff --git a/packages/app/src/components/DBRowTable.tsx b/packages/app/src/components/DBRowTable.tsx index 54106c3210..23f5877a6a 100644 --- a/packages/app/src/components/DBRowTable.tsx +++ b/packages/app/src/components/DBRowTable.tsx @@ -30,6 +30,10 @@ import { JSDataType, } from '@hyperdx/common-utils/dist/clickhouse'; import { splitAndTrimWithBracket } from '@hyperdx/common-utils/dist/core/utils'; +import { + DENOISE_NOISE_THRESHOLD, + DENOISE_SAMPLE_SIZE, +} from '@hyperdx/common-utils/dist/drain'; import { BuilderChartConfigWithDateRange, SelectList, @@ -1716,7 +1720,7 @@ function DBSqlRowTableComponent({ const patternColumn = columns[columns.length - 1]; const groupedPatterns = useGroupedPatterns({ config, - samples: 10_000, + samples: DENOISE_SAMPLE_SIZE, bodyValueExpression: patternColumn ?? 
'', severityTextExpression: (source?.kind === SourceKind.Log @@ -1729,7 +1733,9 @@ function DBSqlRowTableComponent({ queryKey: ['noisy-patterns', config], queryFn: async () => { return Object.values(groupedPatterns.data).filter( - p => p.count / (groupedPatterns.sampledRowCount ?? 1) > 0.1, + p => + p.count / (groupedPatterns.sampledRowCount ?? 1) > + DENOISE_NOISE_THRESHOLD, ); }, enabled: diff --git a/packages/common-utils/src/drain/index.ts b/packages/common-utils/src/drain/index.ts index 937aaa82da..f735377f06 100644 --- a/packages/common-utils/src/drain/index.ts +++ b/packages/common-utils/src/drain/index.ts @@ -10,7 +10,12 @@ export type { PatternGroup, TrendBucket, } from './mine-patterns'; -export { flattenBody, minePatterns } from './mine-patterns'; +export { + DENOISE_NOISE_THRESHOLD, + DENOISE_SAMPLE_SIZE, + flattenBody, + minePatterns, +} from './mine-patterns'; export { Node } from './node'; export type { AddLogMessageResult, ExtractedParameter } from './template-miner'; export { TemplateMiner } from './template-miner'; diff --git a/packages/common-utils/src/drain/mine-patterns.ts b/packages/common-utils/src/drain/mine-patterns.ts index 82c4abd9ce..05583db7fd 100644 --- a/packages/common-utils/src/drain/mine-patterns.ts +++ b/packages/common-utils/src/drain/mine-patterns.ts @@ -6,6 +6,15 @@ import { import { TemplateMinerConfig } from './config'; import { TemplateMiner } from './template-miner'; +// ─── Denoise constants ─────────────────────────────────────────────────────── +// Shared between the web app and MCP server denoise paths. + +/** Number of random rows to sample for pattern learning. */ +export const DENOISE_SAMPLE_SIZE = 10_000; + +/** Patterns matching more than this fraction of sampled events are "noisy". */ +export const DENOISE_NOISE_THRESHOLD = 0.1; + // ─── Body normalization ────────────────────────────────────────────────────── /** Collapse newlines and runs of whitespace into single spaces. */