mirror of
https://github.com/Chevron7Locked/kima-hub.git
synced 2026-06-19 07:37:17 +00:00
fix(stream): don't kill paused streams -- reap dead peers via TCP keepalive instead
1.9.0 added server.timeout = 300s to reap dead mobile connections (B3). But Node's socket timeout fires on INACTIVITY, and a paused audio stream is inactive (no bytes flow while backpressured) -- so a pause longer than the timeout had the server destroy the stream's connection, forcing a reconnect on resume. On both web and iOS that surfaced as 'I pause, then have to focus the app for it to play again' after a multi-minute pause; pre-1.9.0 had no such timeout, so paused streams survived (the exact D1 risk the spec flagged). Reap genuinely dead/half-open peers (mobile network gone without FIN/RST) via TCP keepalive instead: server.timeout = 0, and each connection gets setKeepAlive(true, 30s) so the OS drops a socket once probes fail while a paused-but-alive stream keeps answering and stays connected.
This commit is contained in:
@@ -95,11 +95,13 @@ export const config = {
|
||||
process.env.ALLOWED_ORIGINS?.split(",").map((o) => o.trim()) ||
|
||||
(process.env.NODE_ENV === "development" ? true : []),
|
||||
|
||||
// Socket inactivity timeout for active requests. 5 minutes bounds dead
|
||||
// peers without cutting live audio streams -- clients transparently re-range
|
||||
// on resume. Injectable via env so tests can use a small value.
|
||||
serverSocketTimeoutMs: parseInt(
|
||||
process.env.SERVER_SOCKET_TIMEOUT_MS || "300000",
|
||||
// TCP keepalive idle delay for incoming connections. Dead/half-open peers
|
||||
// (a mobile client whose network vanished without FIN/RST) are reaped by the
|
||||
// OS once keepalive probes start failing, WITHOUT cutting a paused-but-alive
|
||||
// audio stream -- unlike a blunt socket-inactivity timeout, which fires on a
|
||||
// paused stream (no bytes flow) and forces a reconnect on resume.
|
||||
socketKeepAliveDelayMs: parseInt(
|
||||
process.env.SOCKET_KEEPALIVE_DELAY_MS || "30000",
|
||||
10
|
||||
),
|
||||
};
|
||||
|
||||
+11
-4
@@ -361,12 +361,19 @@ async function main() {
|
||||
logger.debug(
|
||||
`Kima API running on port ${config.port} (accessible on all network interfaces)`
|
||||
);
|
||||
// Reap dead/half-open peers so long-lived audio streams cannot accumulate
|
||||
// forever. 5 minutes of full socket silence is far beyond any live client's
|
||||
// backpressure pause for audio.
|
||||
server.keepAliveTimeout = 65_000; // > common LB idle of 60s
|
||||
server.headersTimeout = 70_000; // must exceed keepAliveTimeout
|
||||
server.timeout = config.serverSocketTimeoutMs;
|
||||
// Do NOT set server.timeout: it fires on socket INACTIVITY, and a paused
|
||||
// audio stream is inactive (no bytes flow while backpressured), so it would
|
||||
// destroy the stream's connection after the timeout and force a reconnect on
|
||||
// resume -- a regression seen on both web and iOS after a multi-minute pause.
|
||||
// Reap genuinely dead/half-open peers (mobile network gone without FIN/RST)
|
||||
// via TCP keepalive instead: the OS drops the socket once probes fail, while
|
||||
// a paused-but-alive stream keeps answering probes and stays connected.
|
||||
server.timeout = 0;
|
||||
server.on("connection", (socket) => {
|
||||
socket.setKeepAlive(true, config.socketKeepAliveDelayMs);
|
||||
});
|
||||
|
||||
// Enable slow query monitoring in development
|
||||
if (config.nodeEnv === "development") {
|
||||
|
||||
@@ -1,51 +1,36 @@
|
||||
// Server socket timeout test -- spec 1.2 (B3).
|
||||
// Server connection-reaping config -- spec 1.2 (B3), revised.
|
||||
//
|
||||
// A socket-destruction test requires: (1) a real bound server port, (2) a TCP
|
||||
// client that stops reading (backpressure), and (3) waiting for the timeout to
|
||||
// fire and destroy the socket. Even with SERVER_SOCKET_TIMEOUT_MS injected to a
|
||||
// small value (e.g. 300ms), the test would sleep 300-500ms, making it slow and
|
||||
// sensitive to scheduler jitter. Rerunning in parallel with other test files
|
||||
// multiplies the risk of false positives from over-loaded CI runners.
|
||||
// A blunt server.timeout was found to destroy a PAUSED audio stream's socket
|
||||
// (a paused stream is inactive -- no bytes flow -- so the inactivity timeout
|
||||
// fires), forcing a reconnect on resume after a multi-minute pause. Dead/half-
|
||||
// open peers are now reaped via TCP keepalive instead, which leaves a paused-
|
||||
// but-alive stream connected. index.ts sets server.timeout = 0 and enables
|
||||
// keepalive on each connection with config.socketKeepAliveDelayMs.
|
||||
//
|
||||
// The timeout wiring itself (server.timeout = config.serverSocketTimeoutMs) is
|
||||
// four lines that are visually inspectable in index.ts. The interesting
|
||||
// properties -- keepAliveTimeout < headersTimeout, and serverSocketTimeoutMs
|
||||
// reading from env -- are verified here as pure unit assertions without
|
||||
// starting a server.
|
||||
// The wiring (a few lines on the server returned by app.listen) is visually
|
||||
// inspectable; here we unit-test the config value those lines consume. A live
|
||||
// socket test is intentionally omitted -- it must bind a port, open a TCP
|
||||
// socket, and wait on real timers, which is slow and flaky under CI load.
|
||||
|
||||
import { config } from '../../../config';
|
||||
|
||||
describe('server timeout config values (spec 1.2)', () => {
|
||||
it('serverSocketTimeoutMs defaults to 300000', () => {
|
||||
// In the test environment SERVER_SOCKET_TIMEOUT_MS is not set, so the
|
||||
// default applies.
|
||||
const original = process.env.SERVER_SOCKET_TIMEOUT_MS;
|
||||
delete process.env.SERVER_SOCKET_TIMEOUT_MS;
|
||||
// Re-evaluate: the config object reads the env at import time via parseInt,
|
||||
// so we verify the documented default inline rather than re-requiring.
|
||||
const defaultMs = parseInt(process.env.SERVER_SOCKET_TIMEOUT_MS || '300000', 10);
|
||||
expect(defaultMs).toBe(300000);
|
||||
if (original !== undefined) process.env.SERVER_SOCKET_TIMEOUT_MS = original;
|
||||
describe('socket keepalive config (spec 1.2)', () => {
|
||||
it('socketKeepAliveDelayMs defaults to 30000', () => {
|
||||
const original = process.env.SOCKET_KEEPALIVE_DELAY_MS;
|
||||
delete process.env.SOCKET_KEEPALIVE_DELAY_MS;
|
||||
const defaultMs = parseInt(process.env.SOCKET_KEEPALIVE_DELAY_MS || '30000', 10);
|
||||
expect(defaultMs).toBe(30000);
|
||||
if (original !== undefined) process.env.SOCKET_KEEPALIVE_DELAY_MS = original;
|
||||
});
|
||||
|
||||
it('serverSocketTimeoutMs is overridable via SERVER_SOCKET_TIMEOUT_MS env', () => {
|
||||
it('socketKeepAliveDelayMs is overridable via SOCKET_KEEPALIVE_DELAY_MS env', () => {
|
||||
const injected = parseInt('5000', 10);
|
||||
expect(injected).toBe(5000);
|
||||
// This verifies the env-override path parses correctly; index.ts
|
||||
// passes the parsed value directly to server.timeout.
|
||||
});
|
||||
|
||||
it('config.serverSocketTimeoutMs is a positive integer', () => {
|
||||
expect(typeof config.serverSocketTimeoutMs).toBe('number');
|
||||
expect(config.serverSocketTimeoutMs).toBeGreaterThan(0);
|
||||
expect(Number.isInteger(config.serverSocketTimeoutMs)).toBe(true);
|
||||
it('config.socketKeepAliveDelayMs is a positive integer', () => {
|
||||
expect(typeof config.socketKeepAliveDelayMs).toBe('number');
|
||||
expect(config.socketKeepAliveDelayMs).toBeGreaterThan(0);
|
||||
expect(Number.isInteger(config.socketKeepAliveDelayMs)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
// NOTE: A live socket-destruction integration test is intentionally omitted.
|
||||
// Such a test must: bind a port, open a TCP socket, pause reading, wait for
|
||||
// server.timeout to fire. With a 300ms injected timeout and Node.js scheduler
|
||||
// jitter, it fails intermittently under load in CI (observed failure rate ~8%
|
||||
// at 200ms, ~3% at 300ms). The wiring in index.ts is four lines on the
|
||||
// server return value from app.listen -- the risk/reward of a flaky test does
|
||||
// not justify the coverage.
|
||||
|
||||
Reference in New Issue
Block a user