diff --git a/.changeset/rare-spoons-change.md b/.changeset/rare-spoons-change.md new file mode 100644 index 0000000000..e68e316b5a --- /dev/null +++ b/.changeset/rare-spoons-change.md @@ -0,0 +1,9 @@ +--- +"@hyperdx/common-utils": minor +"@hyperdx/api": minor +"@hyperdx/app": minor +--- + +Adding consecutive-window configuration to alerts, so that you can specify a condition like "only fire this alert after some condition is met for N consecutive windows." This helps prevent flaky alerts (and pages), and cut down on alert noise in many cases. + +Also adds a `PENDING` alert state for alarms that _will_ fire if current trends continue. diff --git a/packages/api/src/controllers/alertHistory.ts b/packages/api/src/controllers/alertHistory.ts index 93ed9c1468..86f5701d5b 100644 --- a/packages/api/src/controllers/alertHistory.ts +++ b/packages/api/src/controllers/alertHistory.ts @@ -18,14 +18,24 @@ type GroupedAlertHistory = { lastValues: IAlertHistory['lastValues'][]; }; +function groupStateToOverallState(states: string[]): AlertState { + if (states.includes(AlertState.ALERT)) { + return AlertState.ALERT; + } + + if (states.includes(AlertState.PENDING)) { + return AlertState.PENDING; + } + + return AlertState.OK; +} + function mapGroupedHistories( groupedHistories: GroupedAlertHistory[], ): Omit[] { return groupedHistories.map(group => ({ createdAt: group._id, - state: group.states.includes(AlertState.ALERT) - ? AlertState.ALERT - : AlertState.OK, + state: groupStateToOverallState(group.states), counts: group.counts, lastValues: group.lastValues .flat() diff --git a/packages/api/src/controllers/alerts.ts b/packages/api/src/controllers/alerts.ts index 095d317d3b..238b069946 100644 --- a/packages/api/src/controllers/alerts.ts +++ b/packages/api/src/controllers/alerts.ts @@ -155,6 +155,9 @@ const makeAlert = (alert: AlertInput, userId?: ObjectId): Partial => { // Chart alerts dashboard: alert.dashboardId as unknown as ObjectId, tileId: alert.tileId, + + // Multi-window alerting + numConsecutiveWindows: alert.numConsecutiveWindows ?? null, }; }; diff --git a/packages/api/src/models/alert.ts b/packages/api/src/models/alert.ts index bea7653d7b..e607629f78 100644 --- a/packages/api/src/models/alert.ts +++ b/packages/api/src/models/alert.ts @@ -14,6 +14,7 @@ export enum AlertState { DISABLED = 'DISABLED', INSUFFICIENT_DATA = 'INSUFFICIENT_DATA', OK = 'OK', + PENDING = 'PENDING', } export interface IAlertError { @@ -84,6 +85,9 @@ export interface IAlert { until: Date; }; + // Multi-window alerting: fire only after N violations in M consecutive windows + numConsecutiveWindows?: number | null; + // Errors recorded during the most recent execution executionErrors?: IAlertError[]; createdAt: Date; @@ -190,6 +194,11 @@ const AlertSchema = new Schema( type: String, required: false, }, + numConsecutiveWindows: { + type: Number, + required: false, + min: 1, + }, silenced: { required: false, type: { diff --git a/packages/api/src/models/alertHistory.ts b/packages/api/src/models/alertHistory.ts index 93cd4ca254..5cf8630f81 100644 --- a/packages/api/src/models/alertHistory.ts +++ b/packages/api/src/models/alertHistory.ts @@ -12,6 +12,7 @@ export interface IAlertHistory { state: AlertState; lastValues: { startTime: Date; count: number }[]; group?: string; // For group-by alerts, stores the group identifier + fired?: boolean; } const AlertHistorySchema = new Schema({ @@ -45,6 +46,10 @@ const AlertHistorySchema = new Schema({ type: String, required: false, }, + fired: { + type: Boolean, + required: false, + }, }); AlertHistorySchema.index( diff --git a/packages/api/src/routers/api/alerts.ts b/packages/api/src/routers/api/alerts.ts index e3fdde4e42..9da8c63b31 100644 --- a/packages/api/src/routers/api/alerts.ts +++ b/packages/api/src/routers/api/alerts.ts @@ -84,6 +84,7 @@ const formatAlertResponse = ( 'createdAt', 'updatedAt', 'executionErrors', + 'numConsecutiveWindows', ]), }; }; diff --git a/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts b/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts index 16c0726065..374ff1cbc2 100644 --- a/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts +++ b/packages/api/src/tasks/checkAlerts/__tests__/checkAlerts.test.ts @@ -25,7 +25,7 @@ import { RAW_SQL_ALERT_TEMPLATE, RAW_SQL_NUMBER_ALERT_TEMPLATE, } from '@/fixtures'; -import Alert, { AlertSource } from '@/models/alert'; +import Alert, { AlertSource, IAlert } from '@/models/alert'; import AlertHistory from '@/models/alertHistory'; import Connection, { IConnection } from '@/models/connection'; import Dashboard, { IDashboard } from '@/models/dashboard'; @@ -37,6 +37,7 @@ import * as checkAlert from '@/tasks/checkAlerts'; import { alertHasGroupBy, doesExceedThreshold, + getConsecutiveWindowHistories, getPreviousAlertHistories, getScheduledWindowStart, parseAlertData, @@ -2114,15 +2115,16 @@ describe('checkAlerts', () => { alertProvider: AlertProvider, teamWebhooksById: Map, ) => { - const previousMap = await getPreviousAlertHistories( - [details.alert.id], - now, - ); + const [previousMap, recentHistoryMap] = await Promise.all([ + getPreviousAlertHistories([details.alert.id], now), + getConsecutiveWindowHistories([details.alert], now), + ]); await processAlert( now, { ...details, previousMap, + recentHistoryMap, }, clickhouseClient, connection.id, @@ -8533,6 +8535,351 @@ describe('checkAlerts', () => { ); expect((await Alert.findById(details.alert.id))!.state).toBe('OK'); }); + + describe('multi-window alerting (numConsecutiveWindows)', () => { + it('fires on the first violation when numConsecutiveWindows=1', async () => { + const { + team, + webhook, + connection, + source, + savedSearch, + teamWebhooksById, + clickhouseClient, + } = await setupSavedSearchAlertTest(); + + const details = await createAlertDetails( + team, + source, + { + source: AlertSource.SAVED_SEARCH, + channel: { type: 'webhook', webhookId: webhook._id.toString() }, + interval: '5m', + thresholdType: AlertThresholdType.ABOVE, + threshold: 0, + savedSearchId: savedSearch.id, + numConsecutiveWindows: 1, + }, + { taskType: AlertTaskType.SAVED_SEARCH, savedSearch }, + ); + + await bulkInsertLogs([ + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:05:00Z'), + SeverityText: 'error', + Body: 'err', + }, + ]); + + await processAlertAtTime( + new Date('2024-01-01T00:12:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + + const histories = await AlertHistory.find({ alert: details.alert.id }); + expect(histories).toHaveLength(1); + expect(histories[0].state).toBe('ALERT'); + expect(histories[0].fired).toBe(true); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(1); + }); + + it('suppresses notification until all M windows have violated', async () => { + const { + team, + webhook, + connection, + source, + savedSearch, + teamWebhooksById, + clickhouseClient, + } = await setupSavedSearchAlertTest(); + + jest + .spyOn(slack, 'postMessageToWebhook') + .mockResolvedValue(null as any); + + const details = await createAlertDetails( + team, + source, + { + source: AlertSource.SAVED_SEARCH, + channel: { type: 'webhook', webhookId: webhook._id.toString() }, + interval: '5m', + thresholdType: AlertThresholdType.ABOVE, + threshold: 0, + savedSearchId: savedSearch.id, + numConsecutiveWindows: 3, + }, + { taskType: AlertTaskType.SAVED_SEARCH, savedSearch }, + ); + + await bulkInsertLogs([ + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:05:00Z'), + SeverityText: 'error', + Body: 'err', + }, + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:10:00Z'), + SeverityText: 'error', + Body: 'err', + }, + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:15:00Z'), + SeverityText: 'error', + Body: 'err', + }, + ]); + + // don't fire on windows 1 or 2 since we need 3 consecutive violations to fire + await processAlertAtTime( + new Date('2024-01-01T00:12:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); + let histories = await AlertHistory.find({ + alert: details.alert.id, + }).sort({ createdAt: 1 }); + expect(histories).toHaveLength(1); + expect(histories[0].state).toBe('PENDING'); + expect(histories[0].fired).toBeFalsy(); // shouldn't fire yet + + await processAlertAtTime( + new Date('2024-01-01T00:17:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); + histories = await AlertHistory.find({ alert: details.alert.id }).sort({ + createdAt: 1, + }); + expect(histories).toHaveLength(2); + expect(histories[1].state).toBe('PENDING'); + expect(histories[1].fired).toBeFalsy(); // shouldn't fire yet + + await processAlertAtTime( + new Date('2024-01-01T00:22:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(1); + histories = await AlertHistory.find({ alert: details.alert.id }).sort({ + createdAt: 1, + }); + expect(histories).toHaveLength(3); + expect(histories[2].state).toBe('ALERT'); + expect(histories[2].fired).toBe(true); // fires here b/c it's the third consecutive violation + }); + + it('does not send a resolution notification when no alert had previously fired', async () => { + const { + team, + webhook, + connection, + source, + savedSearch, + teamWebhooksById, + clickhouseClient, + } = await setupSavedSearchAlertTest(); + + jest + .spyOn(slack, 'postMessageToWebhook') + .mockResolvedValue(null as any); + + const details = await createAlertDetails( + team, + source, + { + source: AlertSource.SAVED_SEARCH, + channel: { type: 'webhook', webhookId: webhook._id.toString() }, + interval: '5m', + thresholdType: AlertThresholdType.ABOVE, + threshold: 1, // threshold=1 so zero-fill (no data) produces OK, not ALERT + savedSearchId: savedSearch.id, + numConsecutiveWindows: 3, + }, + { taskType: AlertTaskType.SAVED_SEARCH, savedSearch }, + ); + + await bulkInsertLogs([ + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:05:00Z'), + SeverityText: 'error', + Body: 'err', + }, + { + ServiceName: 'api', + Timestamp: new Date('2024-01-01T00:10:00Z'), + SeverityText: 'error', + Body: 'err', + }, + ]); + + await processAlertAtTime( + new Date('2024-01-01T00:12:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); + + await processAlertAtTime( + new Date('2024-01-01T00:17:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); + + // window 3 is ok (no violation) + await processAlertAtTime( + new Date('2024-01-01T00:22:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); // webhook should never fire in this case + + const histories = await AlertHistory.find({ + alert: details.alert.id, + }).sort({ createdAt: 1 }); + expect(histories).toHaveLength(3); + expect(histories[0].state).toBe('PENDING'); + expect(histories[0].fired).toBeFalsy(); + expect(histories[1].state).toBe('PENDING'); + expect(histories[1].fired).toBeFalsy(); + expect(histories[2].state).toBe('OK'); + }); + + it('tracks consecutive windows independently per group', async () => { + const { + team, + webhook, + connection, + source, + savedSearch, + teamWebhooksById, + clickhouseClient, + } = await setupSavedSearchAlertTest(); + + jest + .spyOn(slack, 'postMessageToWebhook') + .mockResolvedValue(null as any); + + const details = await createAlertDetails( + team, + source, + { + source: AlertSource.SAVED_SEARCH, + channel: { type: 'webhook', webhookId: webhook._id.toString() }, + interval: '5m', + thresholdType: AlertThresholdType.ABOVE, + threshold: 0, + savedSearchId: savedSearch.id, + groupBy: 'ServiceName', + numConsecutiveWindows: 2, + }, + { taskType: AlertTaskType.SAVED_SEARCH, savedSearch }, + ); + + // Window 1: only service-a violates. + // Window 2: both service-a (2nd consecutive) and service-b (1st) violate. + await bulkInsertLogs([ + { + ServiceName: 'service-a', + Timestamp: new Date('2024-01-01T00:05:00Z'), + SeverityText: 'error', + Body: 'err', + }, + { + ServiceName: 'service-a', + Timestamp: new Date('2024-01-01T00:10:00Z'), + SeverityText: 'error', + Body: 'err', + }, + { + ServiceName: 'service-b', + Timestamp: new Date('2024-01-01T00:10:00Z'), + SeverityText: 'error', + Body: 'err', + }, + ]); + + // Window 1: service-a has its first violation -> PENDING, no notification. + await processAlertAtTime( + new Date('2024-01-01T00:12:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(0); + + // Window 2: service-a hits 2 consecutive violations -> ALERT (fires); + // service-b is on its first violation -> PENDING (does not fire). + await processAlertAtTime( + new Date('2024-01-01T00:17:00Z'), + details, + clickhouseClient, + connection, + alertProvider, + teamWebhooksById, + ); + + // Exactly one notification: service-a's transition to ALERT. + expect(slack.postMessageToWebhook).toHaveBeenCalledTimes(1); + + const histories = await AlertHistory.find({ + alert: details.alert.id, + }).sort({ createdAt: 1 }); + + const serviceAHistories = histories + .filter(h => h.group === 'ServiceName:service-a') + .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime()); + const serviceBHistories = histories.filter( + h => h.group === 'ServiceName:service-b', + ); + + // service-a: first window PENDING, second window ALERT (fired). + expect(serviceAHistories).toHaveLength(2); + expect(serviceAHistories[0].state).toBe('PENDING'); + expect(serviceAHistories[0].fired).toBeFalsy(); + expect(serviceAHistories[1].state).toBe('ALERT'); + expect(serviceAHistories[1].fired).toBe(true); + + // service-b: only one violation so far, so it must still be PENDING even + // though service-a fired in the same run. + expect(serviceBHistories).toHaveLength(1); + expect(serviceBHistories[0].state).toBe('PENDING'); + expect(serviceBHistories[0].fired).toBeFalsy(); + }); + }); }); describe('processAlert with materialized views', () => { @@ -9090,4 +9437,115 @@ describe('checkAlerts', () => { ); }); }); + + describe('getConsecutiveWindowHistories', () => { + const server = getServer(); + + beforeAll(async () => { + await server.start(); + }); + + afterEach(async () => { + await server.clearDBs(); + jest.clearAllMocks(); + }); + + afterAll(async () => { + await server.stop(); + }); + + // getConsecutiveWindowHistories only reads scheduling/config fields off the + // alert (id, interval, numConsecutiveWindows, schedule*) and never loads the + // alert from the DB, so a lightweight stub is sufficient. + const makeAlert = ( + id: mongoose.Types.ObjectId, + numConsecutiveWindows?: number, + interval = '5m', + ): IAlert => + // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion + ({ + id: id.toString(), + interval, + numConsecutiveWindows, + }) as unknown as IAlert; + + const saveHistory = ( + alertId: mongoose.Types.ObjectId, + createdAt: Date, + opts: { group?: string; state?: AlertState } = {}, + ) => + new AlertHistory({ + alert: alertId, + createdAt, + state: opts.state ?? AlertState.ALERT, + group: opts.group, + }).save(); + + it('skips alerts with numConsecutiveWindows <= 1 (no query, empty map)', async () => { + const alertId = new mongoose.Types.ObjectId(); + await saveHistory(alertId, new Date('2025-01-01T00:10:00Z')); + + const aggregateSpy = jest.spyOn(AlertHistory, 'aggregate'); + + const result = await getConsecutiveWindowHistories( + [makeAlert(alertId, 1), makeAlert(new mongoose.Types.ObjectId())], + new Date('2025-01-01T00:17:00Z'), + ); + + expect(aggregateSpy).not.toHaveBeenCalled(); + expect(result.size).toBe(0); + }); + + it('buckets recent histories per group, newest-first, within the lookback window', async () => { + const alertId = new mongoose.Types.ObjectId(); + // now=00:17 -> windowStart=00:15; lookback = (3-1)*5m -> [00:05, 00:15) + await saveHistory(alertId, new Date('2025-01-01T00:00:00Z'), { + group: 'ServiceName:a', + }); // excluded: before earliestAllowedTime + await saveHistory(alertId, new Date('2025-01-01T00:05:00Z'), { + group: 'ServiceName:a', + }); + await saveHistory(alertId, new Date('2025-01-01T00:10:00Z'), { + group: 'ServiceName:a', + }); + await saveHistory(alertId, new Date('2025-01-01T00:15:00Z'), { + group: 'ServiceName:a', + }); // excluded: == windowStart (the current window being evaluated) + await saveHistory(alertId, new Date('2025-01-01T00:10:00Z'), { + group: 'ServiceName:b', + }); + + const result = await getConsecutiveWindowHistories( + [makeAlert(alertId, 3)], + new Date('2025-01-01T00:17:00Z'), + ); + + const aKey = `${alertId.toString()}||ServiceName:a`; + const bKey = `${alertId.toString()}||ServiceName:b`; + + expect(result.get(aKey)!.map(h => h.createdAt)).toEqual([ + new Date('2025-01-01T00:10:00Z'), + new Date('2025-01-01T00:05:00Z'), + ]); + expect(result.get(bKey)!.map(h => h.createdAt)).toEqual([ + new Date('2025-01-01T00:10:00Z'), + ]); + }); + + it('keys ungrouped histories by the bare alert id', async () => { + const alertId = new mongoose.Types.ObjectId(); + // now=00:17 -> windowStart=00:15; lookback = (2-1)*5m -> [00:10, 00:15) + await saveHistory(alertId, new Date('2025-01-01T00:10:00Z')); + + const result = await getConsecutiveWindowHistories( + [makeAlert(alertId, 2)], + new Date('2025-01-01T00:17:00Z'), + ); + + expect(result.size).toBe(1); + expect(result.get(alertId.toString())!.map(h => h.state)).toEqual([ + AlertState.ALERT, + ]); + }); + }); }); diff --git a/packages/api/src/tasks/checkAlerts/index.ts b/packages/api/src/tasks/checkAlerts/index.ts index 3c0199f796..82f15525df 100644 --- a/packages/api/src/tasks/checkAlerts/index.ts +++ b/packages/api/src/tasks/checkAlerts/index.ts @@ -324,6 +324,31 @@ export const getScheduledWindowStart = ( return fns.addMinutes(roundedShiftedNow, scheduleOffsetMinutes); }; +/** + * Compute the scheduled window start ("now rounded down to the window") for an + * alert at the given time. This mirrors the computation inside processAlert so + * that history fetched up-front (see getConsecutiveWindowHistories) lines up + * exactly with the window processAlert evaluates. + */ +export const getAlertWindowStart = (alert: IAlert, now: Date): Date => { + const windowSizeInMins = ms(alert.interval) / 60000; + const scheduleStartAt = normalizeScheduleStartAt({ + alertId: alert.id, + scheduleStartAt: alert.scheduleStartAt, + }); + const scheduleOffsetMinutes = normalizeScheduleOffsetMinutes({ + alertId: alert.id, + scheduleOffsetMinutes: alert.scheduleOffsetMinutes, + windowSizeInMins, + }); + return getScheduledWindowStart( + now, + windowSizeInMins, + scheduleOffsetMinutes, + scheduleStartAt, + ); +}; + const fireChannelEvent = async ({ alert, alertProvider, @@ -738,7 +763,7 @@ export const processAlert = async ( alertProvider: AlertProvider, teamWebhooksById: Map, ) => { - const { alert, previousMap } = details; + const { alert, previousMap, recentHistoryMap } = details; const source = 'source' in details ? details.source : undefined; // Errors collected during this execution. Webhook errors accumulate here; query // and validation errors are recorded via recordAlertErrors before returning. @@ -1057,13 +1082,38 @@ export const processAlert = async ( } }; + const numWindowsToLookBack = alert.numConsecutiveWindows ?? 1; + + const shouldFireBasedOnConsecutiveWindows = ( + groupKey?: string, + ): boolean => { + if (numWindowsToLookBack <= 1) { + return true; + } + + // recentHistoryMap entries are pre-filtered to the lookback window and + // sorted newest-first, so take the most recent M-1 for this group. + const key = computeHistoryMapKey(alert.id, groupKey || ''); + const groupHistories = recentHistoryMap?.get(key) ?? []; + const relevant = groupHistories.slice(0, numWindowsToLookBack - 1); + + return ( + relevant.length === numWindowsToLookBack - 1 && + relevant.every( + h => h.state === AlertState.ALERT || h.state === AlertState.PENDING, + ) + ); + }; + const sendNotificationIfResolved = async ( previousHistory: AggregatedAlertHistory | undefined, currentHistory: IAlertHistory, groupKey: string, ) => { if ( - previousHistory?.state === AlertState.ALERT && + (previousHistory?.state === AlertState.ALERT || + previousHistory?.state === AlertState.PENDING) && + previousHistory?.fired !== false && currentHistory.state === AlertState.OK ) { const lastValue = @@ -1099,19 +1149,26 @@ export const processAlert = async ( : 0; history.lastValues.push({ count: value, startTime: alertTimestamp }); + const previous = previousMap.get(computeHistoryMapKey(alert.id, '')); if (doesExceedThreshold(alert, value)) { - history.state = AlertState.ALERT; history.counts += 1; - await trySendNotification({ - state: AlertState.ALERT, - group: '', - totalCount: value, - startTime: alertTimestamp, - }); + if (shouldFireBasedOnConsecutiveWindows()) { + history.state = AlertState.ALERT; + history.fired = true; + await trySendNotification({ + state: AlertState.ALERT, + group: '', + totalCount: value, + startTime: alertTimestamp, + }); + } else { + history.state = AlertState.PENDING; + // Carry forward fired=true if a notification was previously sent and not yet resolved. + history.fired = previous?.fired === true; + } } // Auto-resolve - const previous = previousMap.get(computeHistoryMapKey(alert.id, '')); await sendNotificationIfResolved(previous, history, ''); const historyRecords = Array.from(histories.values()); @@ -1161,19 +1218,32 @@ export const processAlert = async ( const hasAlertsInPreviousMap = previousMap .values() - .some(history => history.state === AlertState.ALERT); + .some( + history => + history.state === AlertState.ALERT || + history.state === AlertState.PENDING, + ); if (zeroValueIsAlert) { const history = getOrCreateHistory(''); history.lastValues.push({ count: 0, startTime: bucketStart }); - history.state = AlertState.ALERT; history.counts += 1; - await trySendNotification({ - state: AlertState.ALERT, - group: '', - totalCount: 0, - startTime: bucketStart, - }); + if (shouldFireBasedOnConsecutiveWindows()) { + history.state = AlertState.ALERT; + history.fired = true; + await trySendNotification({ + state: AlertState.ALERT, + group: '', + totalCount: 0, + startTime: bucketStart, + }); + } else { + history.state = AlertState.PENDING; + // Carry forward fired=true if a notification was previously sent and not yet resolved. + history.fired = + previousMap.get(computeHistoryMapKey(alert.id, ''))?.fired === + true; + } } else if (!hasGroupBy || !hasAlertsInPreviousMap) { // For grouped alerts, if there are alerts in the previous map, // we will handle creating a history as part of auto-resolve later @@ -1201,16 +1271,24 @@ export const processAlert = async ( const history = getOrCreateHistory(groupKey); if (doesExceedThreshold(alert, value)) { - history.state = AlertState.ALERT; - await trySendNotification({ - state: AlertState.ALERT, - group: groupKey, - totalCount: value, - startTime: bucketStart, - attributes, - }); - history.counts += 1; + if (shouldFireBasedOnConsecutiveWindows(groupKey)) { + history.state = AlertState.ALERT; + history.fired = true; + await trySendNotification({ + state: AlertState.ALERT, + group: groupKey, + totalCount: value, + startTime: bucketStart, + attributes, + }); + } else { + history.state = AlertState.PENDING; + // Carry forward fired=true if a notification was previously sent and not yet resolved. + history.fired = + previousMap.get(computeHistoryMapKey(alert.id, groupKey)) + ?.fired === true; + } } else { // TODO: if the alert was previously alerting (different bucket), should we set state to OK (plus auto-resolve)? } @@ -1218,16 +1296,17 @@ export const processAlert = async ( } } - // Handle missing groups: If current check found no data, check if any previously alerting groups need to be resolved - // For group-by alerts, check if any previously alerting groups are missing from current data + // Handle missing groups: If current check found no data, check if any previously alerting/pending groups need to be resolved + // For group-by alerts, check if any previously alerting or pending groups are missing from current data if (hasGroupBy && previousMap && previousMap.size > 0) { for (const [previousKey, previousHistory] of previousMap.entries()) { const groupKey = extractGroupKeyFromMapKey(previousKey, alert.id); - // If this group was previously ALERT but is missing from current data and would be resolved by a 0 value, + // If this group was previously ALERT or PENDING but is missing from current data and would be resolved by a 0 value, // create an OK history for the group if ( - previousHistory.state === AlertState.ALERT && + (previousHistory.state === AlertState.ALERT || + previousHistory.state === AlertState.PENDING) && !histories.has(groupKey) && !doesExceedThreshold(alert, 0) ) { @@ -1236,7 +1315,7 @@ export const processAlert = async ( alertId: alert.id, group: groupKey, }, - `Group "${groupKey}" is missing from current data but was previously alerting - creating OK history`, + `Group "${groupKey}" is missing from current data but was previously ${previousHistory.state} - creating OK history`, ); const history = getOrCreateHistory(groupKey); history.lastValues.push({ count: 0, startTime: expectedBuckets[0] }); @@ -1317,6 +1396,7 @@ export interface AggregatedAlertHistory { createdAt: Date; state: AlertState; group?: string; + fired?: boolean; } /** @@ -1337,12 +1417,14 @@ export interface AggregatedAlertHistory { export const getPreviousAlertHistories = async ( alertIds: string[], now: Date, + sharedQueue?: PQueue, ) => { const lookbackDate = new Date(now.getTime() - ms('7d')); - // Use a concurrency-limited queue to avoid overwhelming the connection pool - // when there are many alerts (e.g., 200+ alert IDs). - const queue = new PQueue({ concurrency: ALERT_HISTORY_QUERY_CONCURRENCY }); + // Concurrency-limited per-alert queries to avoid overwhelming the connection + // pool when there are many alerts (e.g., 200+ alert IDs). + const queue = + sharedQueue ?? new PQueue({ concurrency: ALERT_HISTORY_QUERY_CONCURRENCY }); const results = await Promise.all( alertIds.map(alertId => @@ -1371,6 +1453,7 @@ export const getPreviousAlertHistories = async ( }, createdAt: { $first: '$createdAt' }, state: { $first: '$state' }, + fired: { $first: '$fired' }, }, }, { @@ -1379,6 +1462,7 @@ export const getPreviousAlertHistories = async ( createdAt: 1, state: 1, group: '$_id.group', + fired: 1, }, }, ]); @@ -1401,6 +1485,90 @@ export const getPreviousAlertHistories = async ( ); }; +/** + * For alerts that use multi-window lookback (numConsecutiveWindows > 1), + * batch-fetch the per-group history needed to decide whether the alert condition + * has been met in M consecutive windows. + * + * Alerts with numConsecutiveWindows <= 1 are skipped entirely (no query is run). + * + * For each multi-window alert we fetch the AlertHistory records whose createdAt + * falls in [windowStart - (M-1)*window, windowStart), sorted newest-first, then + * bucket them by group. processAlert takes the most recent M-1 per group and + * requires every one of them to be ALERT/PENDING to fire. The window start is + * computed with getAlertWindowStart so it matches the window processAlert + * evaluates for the same `now`. + */ +export const getConsecutiveWindowHistories = async ( + alerts: IAlert[], + now: Date, + sharedQueue?: PQueue, +): Promise> => { + const map = new Map(); + + const multiWindowAlerts = alerts.filter( + alert => (alert.numConsecutiveWindows ?? 1) > 1, + ); + if (multiWindowAlerts.length === 0) { + return map; + } + + // Concurrency-limited per-alert queries (same approach as getPreviousAlertHistories) + const queue = + sharedQueue ?? new PQueue({ concurrency: ALERT_HISTORY_QUERY_CONCURRENCY }); + + const results = await Promise.all( + multiWindowAlerts.map(alert => + queue.add(async () => { + const numWindowsToLookBack = alert.numConsecutiveWindows ?? 1; + const windowSizeInMins = ms(alert.interval) / 60000; + const windowStart = getAlertWindowStart(alert, now); + const earliestAllowedTime = new Date( + windowStart.getTime() - + (numWindowsToLookBack - 1) * windowSizeInMins * 60_000, + ); + const id = new mongoose.Types.ObjectId(alert.id); + const histories = await AlertHistory.aggregate([ + { + $match: { + alert: id, + createdAt: { $gte: earliestAllowedTime, $lt: windowStart }, + }, + }, + { $sort: { alert: 1, group: 1, createdAt: -1 } }, + { + $project: { + _id: '$alert', + createdAt: 1, + state: 1, + group: 1, + fired: 1, + }, + }, + ]); + return { alertId: alert.id, histories }; + }), + ), + ); + + for (const result of results) { + if (!result) { + continue; + } + for (const history of result.histories) { + const key = computeHistoryMapKey(result.alertId, history.group || ''); + const bucket = map.get(key); + if (bucket) { + bucket.push(history); + } else { + map.set(key, [history]); + } + } + } + + return map; +}; + export default class CheckAlertTask implements HdxTask { private provider!: AlertProvider; private task_queue: PQueue; diff --git a/packages/api/src/tasks/checkAlerts/providers/default.ts b/packages/api/src/tasks/checkAlerts/providers/default.ts index 7a255bff76..079cdcc89d 100644 --- a/packages/api/src/tasks/checkAlerts/providers/default.ts +++ b/packages/api/src/tasks/checkAlerts/providers/default.ts @@ -1,3 +1,4 @@ +import PQueue from '@esm2cjs/p-queue'; import { ClickhouseClient } from '@hyperdx/common-utils/dist/clickhouse/node'; import { displayTypeSupportsRawSqlAlerts } from '@hyperdx/common-utils/dist/core/utils'; import { isRawSqlSavedChartConfig } from '@hyperdx/common-utils/dist/guards'; @@ -7,6 +8,7 @@ import ms from 'ms'; import { URLSearchParams } from 'url'; import * as config from '@/config'; +import { ALERT_HISTORY_QUERY_CONCURRENCY } from '@/controllers/alertHistory'; import { LOCAL_APP_TEAM } from '@/controllers/team'; import { connectDB, mongooseConnection, ObjectId } from '@/models'; import Alert, { @@ -23,6 +25,7 @@ import { type ISource, Source } from '@/models/source'; import Webhook, { IWebhook } from '@/models/webhook'; import { AggregatedAlertHistory, + getConsecutiveWindowHistories, getPreviousAlertHistories, } from '@/tasks/checkAlerts'; import { @@ -216,6 +219,7 @@ async function loadAlert( alert: IAlert, groupedTasks: Map, previousAlerts: Map, + recentHistoryMap: Map, now: Date, ) { if (!alert.source) { @@ -258,7 +262,11 @@ async function loadAlert( if (!v) { throw new Error(`provider did not set key ${conn.id} before appending`); } - v.alerts.push({ ...details, previousMap: previousAlerts }); + v.alerts.push({ + ...details, + previousMap: previousAlerts, + recentHistoryMap, + }); } export default class DefaultAlertProvider implements AlertProvider { @@ -276,11 +284,25 @@ export default class DefaultAlertProvider implements AlertProvider { const now = new Date(); const alertIds = alerts.map(({ id }) => id); - const previousAlerts = await getPreviousAlertHistories(alertIds, now); + // Share a single queue across both history fetches so their combined + // in-flight per-alert queries stay within one global cap. + const historyQueryQueue = new PQueue({ + concurrency: ALERT_HISTORY_QUERY_CONCURRENCY, + }); + const [previousAlerts, recentHistoryMap] = await Promise.all([ + getPreviousAlertHistories(alertIds, now, historyQueryQueue), + getConsecutiveWindowHistories(alerts, now, historyQueryQueue), + ]); for (const alert of alerts) { try { - await loadAlert(alert, groupedTasks, previousAlerts, now); + await loadAlert( + alert, + groupedTasks, + previousAlerts, + recentHistoryMap, + now, + ); } catch (e) { logger.error({ message: `failed to load alert: ${e}`, @@ -382,7 +404,9 @@ export default class DefaultAlertProvider implements AlertProvider { const finalState = historiesToCheck.some(h => h.state === AlertState.ALERT) ? AlertState.ALERT - : AlertState.OK; + : historiesToCheck.some(h => h.state === AlertState.PENDING) + ? AlertState.PENDING + : AlertState.OK; // Update alert state + errors based on this execution await Alert.updateOne( diff --git a/packages/api/src/tasks/checkAlerts/providers/index.ts b/packages/api/src/tasks/checkAlerts/providers/index.ts index 116be8c588..aeed3e5407 100644 --- a/packages/api/src/tasks/checkAlerts/providers/index.ts +++ b/packages/api/src/tasks/checkAlerts/providers/index.ts @@ -32,6 +32,9 @@ export type PopulatedAlertChannel = { type: 'webhook' } & { channel: IWebhook }; export type AlertDetails = { alert: IAlert; previousMap: Map; // Map of alertId||group -> history for group-by alerts + // For multi-window alerts (numConsecutiveWindows > 1): the recent per-group + // history (alertId||group -> histories, newest-first). + recentHistoryMap?: Map; } & ( | { taskType: AlertTaskType.SAVED_SEARCH; diff --git a/packages/api/src/utils/zod.ts b/packages/api/src/utils/zod.ts index 3675f0dac1..5dd1798299 100644 --- a/packages/api/src/utils/zod.ts +++ b/packages/api/src/utils/zod.ts @@ -636,6 +636,7 @@ export const alertSchema = z name: z.string().min(1).max(512).nullish(), message: z.string().min(1).max(4096).nullish(), note: alertNoteSchema, + numConsecutiveWindows: z.number().int().min(1).nullish(), }) .and(zSavedSearchAlert.or(zTileAlert)) .superRefine(validateAlertScheduleOffsetMinutes) diff --git a/packages/app/src/AlertsPage.tsx b/packages/app/src/AlertsPage.tsx index 5ddf11828e..1d1345c0f3 100644 --- a/packages/app/src/AlertsPage.tsx +++ b/packages/app/src/AlertsPage.tsx @@ -30,6 +30,7 @@ import { IconChevronDown, IconChevronRight, IconHelpCircle, + IconHourglass, IconInfoCircleFilled, IconNote, IconSearch, @@ -213,6 +214,11 @@ function AlertDetails({ alert }: { alert: AlertsPageItem }) { Alert )} + {alert.state === AlertState.PENDING && ( + + Pending + + )} {alert.state === AlertState.OK && Ok} {alert.state === AlertState.DISABLED && ( @@ -269,6 +275,9 @@ function AlertDetails({ alert }: { alert: AlertsPageItem }) { function AlertCardList({ alerts }: { alerts: AlertsPageItem[] }) { const alarmAlerts = alerts.filter(alert => alert.state === AlertState.ALERT); + const pendingAlerts = alerts.filter( + alert => alert.state === AlertState.PENDING, + ); const okData = alerts.filter(alert => alert.state === AlertState.OK); return ( @@ -283,6 +292,16 @@ function AlertCardList({ alerts }: { alerts: AlertsPageItem[] }) { ))} )} + {pendingAlerts.length > 0 && ( +
+ + Pending + + {pendingAlerts.map(alert => ( + + ))} +
+ )}
OK diff --git a/packages/app/src/DBDashboardPage.tsx b/packages/app/src/DBDashboardPage.tsx index a35ab57c7c..75d10d1e7f 100644 --- a/packages/app/src/DBDashboardPage.tsx +++ b/packages/app/src/DBDashboardPage.tsx @@ -593,6 +593,9 @@ const Tile = forwardRef( if (alert.silenced?.at) { return 'yellow'; } + if (alert.state === AlertState.PENDING) { + return 'orange'; + } return 'red'; }, [alert]); diff --git a/packages/app/src/DBSearchPageAlertModal.tsx b/packages/app/src/DBSearchPageAlertModal.tsx index 9f2cd43098..3af7e81fe3 100644 --- a/packages/app/src/DBSearchPageAlertModal.tsx +++ b/packages/app/src/DBSearchPageAlertModal.tsx @@ -77,6 +77,7 @@ const SavedSearchAlertFormSchema = z scheduleStartAt: scheduleStartAtSchema, thresholdType: z.nativeEnum(AlertThresholdType), channel: zAlertChannel, + numConsecutiveWindows: z.number().int().min(1).optional(), }) .passthrough() .superRefine(validateAlertScheduleOffsetMinutes) @@ -150,6 +151,10 @@ const AlertForm = ({ const groupByValue = useWatch({ control, name: 'groupBy' }); const threshold = useWatch({ control, name: 'threshold' }); const thresholdMax = useWatch({ control, name: 'thresholdMax' }); + const numConsecutiveWindows = useWatch({ + control, + name: 'numConsecutiveWindows', + }); const maxScheduleOffsetMinutes = Math.max( intervalToMinutes(interval ?? '5m') - 1, 0, @@ -264,6 +269,8 @@ const AlertForm = ({ scheduleOffsetMinutes={scheduleOffsetMinutes} maxScheduleOffsetMinutes={maxScheduleOffsetMinutes} offsetWindowLabel={`from each ${intervalLabel} window`} + numConsecutiveWindowsName="numConsecutiveWindows" + numConsecutiveWindows={numConsecutiveWindows ?? undefined} /> grouped by diff --git a/packages/app/src/components/AlertScheduleFields.tsx b/packages/app/src/components/AlertScheduleFields.tsx index ccdba37367..774baca58a 100644 --- a/packages/app/src/components/AlertScheduleFields.tsx +++ b/packages/app/src/components/AlertScheduleFields.tsx @@ -36,6 +36,8 @@ type AlertScheduleFieldsProps = { scheduleOffsetMinutes: number | null | undefined; maxScheduleOffsetMinutes: number; offsetWindowLabel: string; + numConsecutiveWindowsName?: FieldPath; + numConsecutiveWindows?: number; }; export function AlertScheduleFields({ @@ -46,6 +48,8 @@ export function AlertScheduleFields({ scheduleOffsetMinutes, maxScheduleOffsetMinutes, offsetWindowLabel, + numConsecutiveWindowsName, + numConsecutiveWindows, }: AlertScheduleFieldsProps) { const showScheduleOffsetInput = maxScheduleOffsetMinutes > 0; const scheduleStartAtValue = useWatch({ @@ -54,7 +58,9 @@ export function AlertScheduleFields({ }) as string | null | undefined; const hasScheduleStartAtAnchor = scheduleStartAtValue != null; const hasAdvancedScheduleValues = - (scheduleOffsetMinutes ?? 0) > 0 || hasScheduleStartAtAnchor; + (scheduleOffsetMinutes ?? 0) > 0 || + hasScheduleStartAtAnchor || + (numConsecutiveWindows ?? 1) > 1; const [opened, setOpened] = useState(hasAdvancedScheduleValues); useEffect(() => { @@ -101,6 +107,48 @@ export function AlertScheduleFields({ Optional schedule controls for aligning alert windows. + {numConsecutiveWindowsName && ( + + + + Consecutive windows + + + + + + + + ( + { + const num = typeof v === 'number' ? v : 1; + field.onChange(num > 1 ? num : undefined); + }} + min={1} + size="xs" + w={70} + /> + )} + /> + + {(numConsecutiveWindows ?? 1) === 1 + ? 'window' + : 'consecutive windows'} + + + )} {showScheduleOffsetInput && ( <> diff --git a/packages/app/src/components/AlertStatusIcon.tsx b/packages/app/src/components/AlertStatusIcon.tsx index 310311155a..74b3bd5f31 100644 --- a/packages/app/src/components/AlertStatusIcon.tsx +++ b/packages/app/src/components/AlertStatusIcon.tsx @@ -9,12 +9,17 @@ export function AlertStatusIcon({ }) { if (!Array.isArray(alerts) || alerts.length === 0) return null; const alertingCount = alerts.filter(a => a.state === AlertState.ALERT).length; + const pendingCount = alerts.filter( + a => a.state === AlertState.PENDING, + ).length; return ( 0 ? `${alertingCount} alert${alertingCount > 1 ? 's' : ''} triggered` - : 'Alerts configured' + : pendingCount > 0 + ? `${pendingCount} alert${pendingCount > 1 ? 's' : ''} pending` + : 'Alerts configured' } > {alertingCount > 0 ? ( @@ -23,6 +28,12 @@ export function AlertStatusIcon({ color="var(--mantine-color-red-filled)" data-testid="alert-status-icon-triggered" /> + ) : pendingCount > 0 ? ( + ) : ( )} diff --git a/packages/app/src/components/DBEditTimeChartForm/TileAlertEditor.tsx b/packages/app/src/components/DBEditTimeChartForm/TileAlertEditor.tsx index 187aaaaa69..8bfb0180fa 100644 --- a/packages/app/src/components/DBEditTimeChartForm/TileAlertEditor.tsx +++ b/packages/app/src/components/DBEditTimeChartForm/TileAlertEditor.tsx @@ -72,6 +72,10 @@ export function TileAlertEditor({ control, name: 'alert.scheduleOffsetMinutes', }); + const alertnumConsecutiveWindows = useWatch({ + control, + name: 'alert.numConsecutiveWindows', + }); const maxAlertScheduleOffsetMinutes = alert?.interval ? Math.max(intervalToMinutes(alert.interval) - 1, 0) : 0; @@ -206,7 +210,7 @@ export function TileAlertEditor({ )} /> - window via + via Send to diff --git a/packages/app/src/components/alerts/AlertHistoryCards.tsx b/packages/app/src/components/alerts/AlertHistoryCards.tsx index 22aad47cfa..a20537308f 100644 --- a/packages/app/src/components/alerts/AlertHistoryCards.tsx +++ b/packages/app/src/components/alerts/AlertHistoryCards.tsx @@ -26,6 +26,17 @@ import styles from '@styles/AlertsPage.module.scss'; const HISTORY_ITEMS = 18; +function stateToBgColorClass(state: AlertState) { + switch (state) { + case AlertState.OK: + return styles.ok; + case AlertState.PENDING: + return styles.pending; + default: + return styles.alarm; + } +} + function AlertHistoryCard({ history, alertUrl, @@ -58,18 +69,20 @@ function AlertHistoryCard({
); + const count = history.counts ?? 0; + const pending = history.state === AlertState.PENDING ? 'pending' : ''; + const alert = `alert${count === 0 || count > 1 ? 's' : ''}`; + const time = formatRelative(start, today); + const label = `${count} ${pending} ${alert} ${time}`; + return ( - + {href ? ( {content} diff --git a/packages/app/styles/AlertsPage.module.scss b/packages/app/styles/AlertsPage.module.scss index cf95722e94..6261985499 100644 --- a/packages/app/styles/AlertsPage.module.scss +++ b/packages/app/styles/AlertsPage.module.scss @@ -39,6 +39,10 @@ background-color: var(--color-bg-danger); } + &.pending { + background-color: var(--color-bg-warning); + } + &.clickable { cursor: pointer; } diff --git a/packages/common-utils/src/types.ts b/packages/common-utils/src/types.ts index e1a2e79974..e8c597fb03 100644 --- a/packages/common-utils/src/types.ts +++ b/packages/common-utils/src/types.ts @@ -431,6 +431,7 @@ export enum AlertState { DISABLED = 'DISABLED', INSUFFICIENT_DATA = 'INSUFFICIENT_DATA', OK = 'OK', + PENDING = 'PENDING', } export enum AlertErrorType { @@ -604,6 +605,7 @@ export const AlertBaseObjectSchema = z.object({ until: z.string(), }) .optional(), + numConsecutiveWindows: z.number().int().min(1).nullish(), }); // Keep AlertBaseSchema as a ZodObject for backwards compatibility with @@ -2077,6 +2079,7 @@ export const AlertsPageItemSchema = z.object({ }) .optional(), executionErrors: z.array(AlertErrorSchema).optional(), + numConsecutiveWindows: z.number().int().min(1).nullish(), }); export type AlertsPageItem = z.infer;