Skip to content

Commit a2fd0ed

Browse files
author
Daniel Duong
committed
Merge branch 'main' of https://github.com/opendatahub-io/odh-dashboard into feat/autorag
2 parents 53b062c + 2984d23 commit a2fd0ed

187 files changed

Lines changed: 10041 additions & 2354 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,5 +83,6 @@ Before performing certain tasks, read and follow the corresponding specialized r
8383
| **Contract Tests** | [docs/agent-rules/contract-tests.md](docs/agent-rules/contract-tests.md) | When working on contract tests or BFF API validation |
8484
| **Cypress E2E Tests** | [docs/agent-rules/cypress-e2e.md](docs/agent-rules/cypress-e2e.md) | When creating or modifying E2E tests, Robot Framework migrations |
8585
| **Cypress Mock Tests**| [docs/agent-rules/cypress-mock.md](docs/agent-rules/cypress-mock.md) | When creating or modifying mock/component tests |
86+
| **Unit Tests** | [docs/agent-rules/unit-tests.md](docs/agent-rules/unit-tests.md) | When creating or modifying Jest unit tests for utilities, hooks, or components |
8687

8788
**Important**: Always read the relevant rule file before starting the task to ensure you follow the project's conventions and patterns.

OWNERS

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ filters:
1616
- workbenches-approvers
1717
- gen-ai-approvers
1818
- nim-serving-approvers
19+
- autorag-approvers
1920

2021
# Catchall
2122
'.*':
@@ -147,3 +148,12 @@ filters:
147148
- notebooks-reviewers
148149
labels:
149150
- 'area/notebooks'
151+
152+
# AutoRAG
153+
'packages/autorag/.*':
154+
approvers:
155+
- autorag-approvers
156+
reviewers:
157+
- autorag-reviewers
158+
labels:
159+
- 'area/autorag'

OWNERS_ALIASES

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ aliases:
150150
- rhdedgar
151151
- iamemilio
152152
- s-akhtar-baig
153-
153+
154154
# Notebooks
155155
notebooks-approvers:
156156
- andyatmiami
@@ -181,3 +181,17 @@ aliases:
181181
- mtalvi
182182
- xieshenzh
183183
- swati-kale
184+
185+
# AutoRAG
186+
autorag-approvers:
187+
- nickmazzi
188+
- chrjones-rh
189+
autorag-reviewers:
190+
- nickmazzi
191+
- chrjones-rh
192+
- srtanish1992
193+
- daniduong
194+
- GAUNSD
195+
- jefho-rh
196+
- MatthewAThompson
197+
- jkyaw

backend/src/__tests__/websocket-proxy.spec.ts

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,36 @@ describe('WebSocket K8s Proxy', () => {
586586
expect.stringContaining('Unexpected response from K8s API'),
587587
);
588588
});
589+
590+
it('should convert reserved close code 1006 to 1011 when forwarding to client', async () => {
591+
// Code 1006 (Abnormal Closure) cannot be sent in a close frame per RFC 6455
592+
// The ws module enforces this. We convert it to 1011 (Internal Error).
593+
await routeHandler(mockConnection, mockRequest);
594+
595+
const targetCloseHandler = mockTargetSocket.on.mock.calls.find(
596+
(call: any) => call[0] === 'close',
597+
)?.[1];
598+
599+
// K8s API closes with code 1006 (abnormal closure - e.g., network drop)
600+
targetCloseHandler(1006, 'Abnormal closure');
601+
602+
// Client socket should receive 1011 instead of 1006
603+
expect(mockSourceSocket.close).toHaveBeenCalledWith(1011, 'Abnormal closure');
604+
});
605+
606+
it('should pass through normal close codes unchanged', async () => {
607+
await routeHandler(mockConnection, mockRequest);
608+
609+
const targetCloseHandler = mockTargetSocket.on.mock.calls.find(
610+
(call: any) => call[0] === 'close',
611+
)?.[1];
612+
613+
// K8s API closes with normal code 1000
614+
targetCloseHandler(1000, 'Normal closure');
615+
616+
// Client socket should receive 1000 unchanged
617+
expect(mockSourceSocket.close).toHaveBeenCalledWith(1000, 'Normal closure');
618+
});
589619
});
590620

591621
describe('Stale Connection Cleanup', () => {
@@ -602,7 +632,7 @@ describe('WebSocket K8s Proxy', () => {
602632
// Connection is still active, so no cleanup warning yet
603633
expect(mockLog.warn).not.toHaveBeenCalledWith(
604634
expect.anything(),
605-
expect.stringContaining('Removing stale connection'),
635+
expect.stringContaining('Closing stale websocket connection'),
606636
);
607637

608638
// Advance time beyond stale threshold
@@ -614,9 +644,78 @@ describe('WebSocket K8s Proxy', () => {
614644
expect.objectContaining({
615645
inactivityDuration: expect.any(Number),
616646
}),
617-
expect.stringContaining('Removing stale connection from tracking'),
647+
expect.stringContaining('Closing stale websocket connection'),
648+
);
649+
});
650+
651+
/**
652+
* This test verifies the fix for the zombie connection bug:
653+
*
654+
* When a connection becomes stale, both websockets are now properly closed
655+
* so that the client receives the close event and can reconnect.
656+
*/
657+
it('should close websockets when cleaning up stale connections', async () => {
658+
await wssK8sRoutes(mockFastify);
659+
routeHandler = (mockFastify.get as jest.Mock).mock.calls[0][2];
660+
661+
// Create a connection
662+
await routeHandler(mockConnection, mockRequest);
663+
664+
// Verify sockets haven't been closed yet
665+
expect(mockSourceSocket.close).not.toHaveBeenCalled();
666+
expect(mockTargetSocket.close).not.toHaveBeenCalled();
667+
668+
// Let connection become stale (advance past threshold + cleanup cycle)
669+
jest.advanceTimersByTime(STALE_CONNECTION_MS + 60000);
670+
671+
// Verify stale cleanup ran
672+
expect(mockLog.warn).toHaveBeenCalledWith(
673+
expect.objectContaining({
674+
inactivityDuration: expect.any(Number),
675+
}),
676+
expect.stringContaining('Closing stale websocket connection'),
677+
);
678+
679+
// Both websockets should be closed with code 1001 (Going Away)
680+
expect(mockSourceSocket.close).toHaveBeenCalledWith(
681+
1001,
682+
'Connection stale due to inactivity',
683+
);
684+
expect(mockTargetSocket.close).toHaveBeenCalledWith(
685+
1001,
686+
'Connection stale due to inactivity',
618687
);
619688
});
689+
690+
/**
691+
* This test verifies that heartbeat intervals are properly cleared
692+
* when cleaning up stale connections.
693+
*/
694+
it('should clear heartbeat interval when cleaning up stale connections', async () => {
695+
await wssK8sRoutes(mockFastify);
696+
routeHandler = (mockFastify.get as jest.Mock).mock.calls[0][2];
697+
698+
// Create a connection
699+
await routeHandler(mockConnection, mockRequest);
700+
701+
// Simulate connection opening to start heartbeat
702+
const openHandler = mockTargetSocket.on.mock.calls.find(
703+
(call: any) => call[0] === 'open',
704+
)?.[1];
705+
openHandler();
706+
707+
// Verify heartbeat is running
708+
jest.advanceTimersByTime(HEARTBEAT_INTERVAL_MS);
709+
expect(mockTargetSocket.ping).toHaveBeenCalledTimes(1);
710+
711+
// Advance time beyond stale threshold + cleanup cycle
712+
jest.advanceTimersByTime(STALE_CONNECTION_MS + 60000);
713+
714+
// After cleanup, heartbeat should be stopped - no more pings
715+
const pingCountAfterCleanup = mockTargetSocket.ping.mock.calls.length;
716+
jest.advanceTimersByTime(HEARTBEAT_INTERVAL_MS * 3);
717+
expect(mockTargetSocket.ping.mock.calls.length).toBe(pingCountAfterCleanup);
718+
});
620719
});
621720

622721
describe('Configuration Constants', () => {

backend/src/routes/wss/k8s/index.ts

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,23 @@ const base64 = (data: string): string =>
1010
// eslint-disable-next-line no-restricted-properties
1111
Buffer.from(data).toString('base64').replace(/\+/g, '-').replace(/\//g, '_').split('=', 1)[0];
1212

13+
/**
14+
* Convert reserved WebSocket close codes to valid codes that can be sent in a close frame.
15+
*
16+
* Per RFC 6455, certain codes cannot be set by applications:
17+
* - 1004: Reserved
18+
* - 1005: No Status Received (MUST NOT be set in close frame)
19+
* - 1006: Abnormal Closure (MUST NOT be set in close frame, indicates no close frame received)
20+
*
21+
* The `ws` module enforces this and throws if you try to close with these codes.
22+
*
23+
* We convert these to 1011 (Internal Error) which is semantically appropriate:
24+
* "The server is terminating the connection because it encountered an unexpected condition"
25+
*/
1326
const liftErrorCode = (code: number) => {
1427
if (code === 1004 || code === 1005 || code === 1006) {
15-
// ws module forbid those error codes usage, lift to "application level" (4xxx)
16-
return 3000 + code;
28+
// Use 1011 (Internal Error) as a valid substitute for reserved codes
29+
return 1011;
1730
}
1831
return code;
1932
};
@@ -43,8 +56,17 @@ type ConnectionMetrics = {
4356
lastResourceVersion?: string;
4457
};
4558

59+
// Connection tracking includes metrics plus socket references for proper cleanup
60+
type TrackedConnection = {
61+
metrics: ConnectionMetrics;
62+
source: WebSocket;
63+
target: WebSocket;
64+
kubeUri: string;
65+
heartbeatInterval: NodeJS.Timeout | null;
66+
};
67+
4668
// Global connection tracking for monitoring and cleanup
47-
const activeConnections = new Map<string, ConnectionMetrics>();
69+
const activeConnections = new Map<string, TrackedConnection>();
4870

4971
// Constants for connection management (exported for testing)
5072
export const CONNECTION_TIMEOUT_MS = 10000; // 10 seconds to establish connection
@@ -56,7 +78,8 @@ const cleanupStaleConnections = (fastify: KubeFastifyInstance) => {
5678
const now = Date.now();
5779
let cleanedCount = 0;
5880

59-
for (const [connectionId, metrics] of activeConnections.entries()) {
81+
for (const [connectionId, tracked] of activeConnections.entries()) {
82+
const { metrics, source, target, kubeUri, heartbeatInterval } = tracked;
6083
const inactivityDuration = now - Math.max(metrics.lastMessageReceived, metrics.lastMessageSent);
6184

6285
if (inactivityDuration > STALE_CONNECTION_MS) {
@@ -66,9 +89,21 @@ const cleanupStaleConnections = (fastify: KubeFastifyInstance) => {
6689
inactivityDuration,
6790
messagesReceived: metrics.messagesReceived,
6891
messagesSent: metrics.messagesSent,
92+
duration: now - metrics.created,
6993
},
70-
`Removing stale connection from tracking: ${connectionId}`,
94+
`Closing stale websocket connection: ${kubeUri}`,
7195
);
96+
97+
// Clear heartbeat interval if it exists
98+
if (heartbeatInterval) {
99+
clearInterval(heartbeatInterval);
100+
}
101+
102+
// Close both websockets so the client receives the close event and can reconnect
103+
// Use code 1001 (going away) to indicate the server is closing due to inactivity
104+
closeWebSocket(source, 1001, 'Connection stale due to inactivity');
105+
closeWebSocket(target, 1001, 'Connection stale due to inactivity');
106+
72107
activeConnections.delete(connectionId);
73108
cleanedCount++;
74109
}
@@ -121,7 +156,6 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
121156
messagesReceived: 0,
122157
messagesSent: 0,
123158
};
124-
activeConnections.set(connectionId, metrics);
125159

126160
fastify.log.info(
127161
{
@@ -147,6 +181,16 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
147181
ca: https.globalAgent.options.ca as WebSocket.CertMeta,
148182
});
149183

184+
// Track connection with socket references for proper cleanup
185+
const tracked: TrackedConnection = {
186+
metrics,
187+
source,
188+
target,
189+
kubeUri,
190+
heartbeatInterval: null,
191+
};
192+
activeConnections.set(connectionId, tracked);
193+
150194
// Close both connections and log diagnostics
151195
const close = (code: number, reason: string | Buffer) => {
152196
// Make idempotent - only run once per connection
@@ -169,6 +213,12 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
169213
`Closing websocket connection: ${kubeUri}`,
170214
);
171215

216+
// Clear heartbeat interval if it exists
217+
if (tracked.heartbeatInterval) {
218+
clearInterval(tracked.heartbeatInterval);
219+
tracked.heartbeatInterval = null;
220+
}
221+
172222
closeWebSocket(source, code, reason);
173223
closeWebSocket(target, code, reason);
174224
activeConnections.delete(connectionId);
@@ -189,9 +239,6 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
189239
}
190240
}, CONNECTION_TIMEOUT_MS);
191241

192-
// Heartbeat monitoring
193-
let heartbeatInterval: NodeJS.Timeout | null = null;
194-
195242
const onUnexpectedResponse = (_: ClientRequest, response: IncomingMessage) => {
196243
const statusCode = response.statusCode || 'unknown';
197244
const statusMessage = response.statusMessage || 'unknown';
@@ -219,7 +266,7 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
219266
);
220267

221268
// Start heartbeat monitoring
222-
heartbeatInterval = setInterval(() => {
269+
tracked.heartbeatInterval = setInterval(() => {
223270
// Send ping to keep connection alive
224271
if (target.readyState === WebSocket.OPEN) {
225272
try {
@@ -311,8 +358,9 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
311358

312359
// Handle K8s API connection close
313360
target.on('close', (code, reason) => {
314-
if (heartbeatInterval) {
315-
clearInterval(heartbeatInterval);
361+
if (tracked.heartbeatInterval) {
362+
clearInterval(tracked.heartbeatInterval);
363+
tracked.heartbeatInterval = null;
316364
}
317365

318366
const reasonStr = String(reason);
@@ -378,8 +426,9 @@ export default async (fastify: KubeFastifyInstance): Promise<void> => {
378426

379427
// Handle client connection close
380428
source.on('close', (code, reason) => {
381-
if (heartbeatInterval) {
382-
clearInterval(heartbeatInterval);
429+
if (tracked.heartbeatInterval) {
430+
clearInterval(tracked.heartbeatInterval);
431+
tracked.heartbeatInterval = null;
383432
}
384433
clearTimeout(connectionTimeout);
385434

backend/src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ export type DashboardConfig = K8sResourceCommon & {
5151
disableFeatureStore: boolean;
5252
trainingJobs: boolean;
5353
genAiStudio: boolean;
54+
autoRag: boolean;
5455
modelAsService: boolean;
5556
mlflow: boolean;
5657
disableLLMd: boolean;

backend/src/utils/constants.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ export const blankDashboardCR: DashboardConfig = {
8181
disableAdminConnectionTypes: false,
8282
disableFeatureStore: false,
8383
genAiStudio: false,
84+
autoRag: false,
8485
modelAsService: false,
8586
maasApiKeys: false,
8687
disableFineTuning: true,
@@ -89,7 +90,7 @@ export const blankDashboardCR: DashboardConfig = {
8990
mlflow: false,
9091
trainingJobs: true,
9192
disableLLMd: false,
92-
projectRBAC: false,
93+
projectRBAC: true,
9394
autorag: false,
9495
},
9596
notebookController: {

0 commit comments

Comments
 (0)