From 01fc8cf23ef90c7c1004f4f17665281825de9e56 Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Thu, 7 May 2026 11:11:19 +0000 Subject: [PATCH 1/7] feat(telemetry): instrument connection lifecycle --- src/api/coderApi.ts | 33 ++- src/extension.ts | 1 + src/remote/remote.ts | 8 +- src/remote/sshProcess.ts | 177 ++++++++++++++- src/websocket/reconnectingWebSocket.ts | 208 ++++++++++++++++-- test/unit/remote/sshProcess.test.ts | 158 +++++++++++++ .../websocket/reconnectingWebSocket.test.ts | 181 ++++++++++++++- 7 files changed, 723 insertions(+), 43 deletions(-) diff --git a/src/api/coderApi.ts b/src/api/coderApi.ts index 57e27117..d2c3694d 100644 --- a/src/api/coderApi.ts +++ b/src/api/coderApi.ts @@ -36,6 +36,7 @@ import { } from "../logging/types"; import { sizeOf } from "../logging/utils"; import { getHeaderCommand } from "../settings/headers"; +import { type TelemetryService } from "../telemetry/service"; import { HttpStatusCode, WebSocketCloseCode } from "../websocket/codes"; import { type UnidirectionalStream, @@ -49,6 +50,7 @@ import { import { ConnectionState, ReconnectingWebSocket, + type ReconnectingWebSocketOptions, type SocketFactory, } from "../websocket/reconnectingWebSocket"; import { SseConnection } from "../websocket/sseConnection"; @@ -86,7 +88,10 @@ export class CoderApi extends Api implements vscode.Disposable { >(); private readonly configWatcher: vscode.Disposable; - private constructor(private readonly output: Logger) { + private constructor( + private readonly output: Logger, + private readonly telemetry: TelemetryService | undefined, + ) { super(); this.configWatcher = this.watchConfigChanges(); } @@ -99,8 +104,9 @@ export class CoderApi extends Api implements vscode.Disposable { baseUrl: string, token: string | undefined, output: Logger, + telemetry?: TelemetryService, ): CoderApi { - const client = new CoderApi(output); + const client = new CoderApi(output, telemetry); client.setCredentials(baseUrl, token); setupInterceptors(client, output); @@ -449,18 +455,23 @@ export class CoderApi extends Api implements vscode.Disposable { private async createReconnectingSocket( socketFactory: SocketFactory, ): Promise> { + const options: ReconnectingWebSocketOptions = { + onCertificateRefreshNeeded: async () => { + const refreshCommand = getRefreshCommand(); + if (!refreshCommand) { + return false; + } + return refreshCertificates(refreshCommand, this.output); + }, + }; + if (this.telemetry) { + options.telemetry = this.telemetry; + } + const reconnectingSocket = await ReconnectingWebSocket.create( socketFactory, this.output, - { - onCertificateRefreshNeeded: async () => { - const refreshCommand = getRefreshCommand(); - if (!refreshCommand) { - return false; - } - return refreshCertificates(refreshCommand, this.output); - }, - }, + options, () => this.reconnectingSockets.delete(reconnectingSocket), ); diff --git a/src/extension.ts b/src/extension.ts index 0ba96aff..23d5258b 100644 --- a/src/extension.ts +++ b/src/extension.ts @@ -114,6 +114,7 @@ export async function activate(ctx: vscode.ExtensionContext): Promise { (await secretsManager.getSessionAuth(deployment?.safeHostname ?? "")) ?.token, output, + serviceContainer.getTelemetryService(), ); ctx.subscriptions.push(client); diff --git a/src/remote/remote.ts b/src/remote/remote.ts index 0f8714d5..237b7f0b 100644 --- a/src/remote/remote.ts +++ b/src/remote/remote.ts @@ -190,7 +190,12 @@ export class Remote { // break this connection. We could force close the remote session or // disallow logging out/in altogether, but for now just use a separate // client to remain unaffected by whatever the plugin is doing. - const workspaceClient = CoderApi.create(baseUrlRaw, token, this.logger); + const workspaceClient = CoderApi.create( + baseUrlRaw, + token, + this.logger, + this.serviceContainer.getTelemetryService(), + ); disposables.push(workspaceClient); // Create 401 interceptor - handles auth failures with re-login dialog @@ -505,6 +510,7 @@ export class Remote { logger: this.logger, codeLogDir: this.pathResolver.getCodeLogDir(), remoteSshExtensionId, + telemetry: this.serviceContainer.getTelemetryService(), }); disposables.push(sshMonitor); diff --git a/src/remote/sshProcess.ts b/src/remote/sshProcess.ts index a117fa71..809997e4 100644 --- a/src/remote/sshProcess.ts +++ b/src/remote/sshProcess.ts @@ -3,6 +3,7 @@ import * as fs from "node:fs/promises"; import * as path from "node:path"; import * as vscode from "vscode"; +import { type TelemetryService } from "../telemetry/service"; import { findPort } from "../util"; import { cleanupFiles } from "../util/fileCleanup"; @@ -40,6 +41,7 @@ export interface SshProcessMonitorOptions { // For port-based SSH process discovery codeLogDir: string; remoteSshExtensionId: string; + telemetry?: TelemetryService; } // 1 hour cleanup threshold for old network info files @@ -48,6 +50,21 @@ const CLEANUP_NETWORK_MAX_AGE_MS = 60 * 60 * 1000; const CLEANUP_LOG_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // Maximum number of proxy log files to keep during cleanup const CLEANUP_MAX_LOG_FILES = 20; +const NETWORK_TELEMETRY_SAMPLE_INTERVAL_MS = 60_000; +const NETWORK_TELEMETRY_LATENCY_CHANGE_RATIO = 0.1; + +type ProcessLossCause = + | "stale_network_info" + | "missing_network_info" + | "process_changed" + | "disposed"; + +interface NetworkTelemetrySample { + readonly emittedAtMs: number; + readonly p2p: boolean; + readonly derp: string; + readonly latencyMs: number; +} /** * Monitors the SSH process for a Coder workspace connection and displays @@ -56,8 +73,11 @@ const CLEANUP_MAX_LOG_FILES = 20; export class SshProcessMonitor implements vscode.Disposable { private readonly statusBarItem: vscode.StatusBarItem; private readonly options: Required< - SshProcessMonitorOptions & { proxyLogDir: string | undefined } - >; + Omit + > & { + readonly proxyLogDir: string | undefined; + readonly telemetry: TelemetryService | undefined; + }; private readonly _onLogFilePathChange = new vscode.EventEmitter< string | undefined @@ -80,6 +100,9 @@ export class SshProcessMonitor implements vscode.Disposable { private pendingTimeout: NodeJS.Timeout | undefined; private lastStaleSearchTime = 0; private readonly reporter: NetworkStatusReporter; + private processStartedAtMs: number | undefined; + private processLostAtMs: number | undefined; + private lastNetworkTelemetrySample: NetworkTelemetrySample | undefined; /** * Cleans up network info files older than the specified age. @@ -125,6 +148,7 @@ export class SshProcessMonitor implements vscode.Disposable { maxDiscoveryBackoffMs: options.maxDiscoveryBackoffMs ?? 30_000, // Matches the SSH update interval networkPollInterval: options.networkPollInterval ?? 3000, + telemetry: options.telemetry, }; this.statusBarItem = vscode.window.createStatusBarItem( vscode.StatusBarAlignment.Left, @@ -181,6 +205,7 @@ export class SshProcessMonitor implements vscode.Disposable { if (this.disposed) { return; } + this.emitProcessLost("disposed"); this.disposed = true; if (this.pendingTimeout) { clearTimeout(this.pendingTimeout); @@ -211,6 +236,29 @@ export class SshProcessMonitor implements vscode.Disposable { * Starts monitoring when it finds the process through the port. */ private async searchForProcess(): Promise { + const pid = await this.traceProcessDiscovery(() => + this.discoverSshProcess(), + ); + if (pid === undefined || this.disposed) { + return; + } + + this.setCurrentPid(pid); + this.startMonitoring(); + } + + private async traceProcessDiscovery( + fn: () => Promise, + ): Promise { + const { telemetry } = this.options; + if (!telemetry) { + return fn(); + } + + return telemetry.trace("ssh.process.discovered", fn); + } + + private async discoverSshProcess(): Promise { const { discoveryPollIntervalMs, maxDiscoveryBackoffMs, logger, sshHost } = this.options; let attempt = 0; @@ -244,14 +292,14 @@ export class SshProcessMonitor implements vscode.Disposable { } if (pid !== undefined) { - this.setCurrentPid(pid); - this.startMonitoring(); - return; + return pid; } await this.delay(currentBackoff); currentBackoff = Math.min(currentBackoff * 2, maxDiscoveryBackoffMs); } + + return undefined; } /** @@ -304,12 +352,27 @@ export class SshProcessMonitor implements vscode.Disposable { */ private setCurrentPid(pid: number): void { const previousPid = this.currentPid; + const now = performance.now(); this.currentPid = pid; if (previousPid === undefined) { + this.processStartedAtMs = now; this.options.logger.info(`SSH connection established (PID: ${pid})`); this._onPidChange.fire(pid); - } else if (previousPid !== pid) { + return; + } + + if (previousPid !== pid && this.processLostAtMs === undefined) { + this.emitProcessLost("process_changed"); + } + + if (this.processLostAtMs !== undefined) { + this.emitProcessRecovered(); + } + + if (previousPid !== pid) { + this.processStartedAtMs = now; + this.lastNetworkTelemetrySample = undefined; this.options.logger.info( `SSH process changed from ${previousPid} to ${pid}`, ); @@ -319,6 +382,34 @@ export class SshProcessMonitor implements vscode.Disposable { } } + private emitProcessLost(cause: ProcessLossCause): void { + if (this.currentPid === undefined || this.processLostAtMs !== undefined) { + return; + } + + const now = performance.now(); + const startedAt = this.processStartedAtMs ?? now; + this.processLostAtMs = now; + this.options.telemetry?.log( + "ssh.process.lost", + { cause }, + { uptimeMs: now - startedAt }, + ); + } + + private emitProcessRecovered(now = performance.now()): void { + if (this.processLostAtMs === undefined) { + return; + } + + this.options.telemetry?.log( + "ssh.process.recovered", + {}, + { recoveryDurationMs: now - this.processLostAtMs }, + ); + this.processLostAtMs = undefined; + } + /** * Starts monitoring tasks after finding the SSH process. */ @@ -408,6 +499,7 @@ export class SshProcessMonitor implements vscode.Disposable { const network = JSON.parse(content) as NetworkInfo; const isStale = ageMs > networkPollInterval * 2; this.reporter.update(network, isStale); + this.sampleNetworkTelemetry(network); } } catch (error) { readFailures++; @@ -420,6 +512,10 @@ export class SshProcessMonitor implements vscode.Disposable { } if (searchReason !== undefined) { + const lossCause: ProcessLossCause = + readFailures >= maxReadFailures + ? "missing_network_info" + : "stale_network_info"; const timeSinceLastSearch = Date.now() - this.lastStaleSearchTime; if (timeSinceLastSearch < staleThreshold) { await this.delay(staleThreshold - timeSinceLastSearch); @@ -427,6 +523,7 @@ export class SshProcessMonitor implements vscode.Disposable { } logger.debug(`${searchReason}, searching for new SSH process`); + this.emitProcessLost(lossCause); // searchForProcess will update PID if a different process is found this.lastStaleSearchTime = Date.now(); await this.searchForProcess(); @@ -436,6 +533,74 @@ export class SshProcessMonitor implements vscode.Disposable { await this.delay(networkPollInterval); } } + + private sampleNetworkTelemetry(network: NetworkInfo): void { + if (!this.options.telemetry) { + return; + } + + const sample = toNetworkTelemetrySample(network); + if (!this.shouldEmitNetworkTelemetry(sample)) { + return; + } + + this.options.telemetry.log( + "ssh.network.info", + { + p2p: String(network.p2p), + derp: sample.derp, + }, + { + latencyMs: sample.latencyMs, + downloadMbits: bytesPerSecondToMbits(network.download_bytes_sec), + uploadMbits: bytesPerSecondToMbits(network.upload_bytes_sec), + }, + ); + this.lastNetworkTelemetrySample = sample; + } + + private shouldEmitNetworkTelemetry(sample: NetworkTelemetrySample): boolean { + const previous = this.lastNetworkTelemetrySample; + if (!previous) { + return true; + } + + return ( + sample.emittedAtMs - previous.emittedAtMs >= + NETWORK_TELEMETRY_SAMPLE_INTERVAL_MS || + sample.p2p !== previous.p2p || + sample.derp !== previous.derp || + hasMeaningfulLatencyChange(sample.latencyMs, previous.latencyMs) + ); + } +} + +function toNetworkTelemetrySample( + network: NetworkInfo, +): NetworkTelemetrySample { + return { + emittedAtMs: Date.now(), + p2p: network.p2p, + derp: network.preferred_derp, + latencyMs: network.latency, + }; +} + +function hasMeaningfulLatencyChange( + current: number, + previous: number, +): boolean { + if (previous === 0) { + return current !== 0; + } + return ( + Math.abs(current - previous) / previous > + NETWORK_TELEMETRY_LATENCY_CHANGE_RATIO + ); +} + +function bytesPerSecondToMbits(bytesPerSecond: number): number { + return (bytesPerSecond * 8) / 1_000_000; } /** diff --git a/src/websocket/reconnectingWebSocket.ts b/src/websocket/reconnectingWebSocket.ts index bd33a247..39d603d1 100644 --- a/src/websocket/reconnectingWebSocket.ts +++ b/src/websocket/reconnectingWebSocket.ts @@ -11,8 +11,10 @@ import { import type { WebSocketEventType } from "coder/site/src/utils/OneWayWebSocket"; import type { Logger } from "../logging/logger"; +import type { TelemetryService } from "../telemetry/service"; import type { + CloseEvent, EventHandler, UnidirectionalStream, } from "./eventStreamConnection"; @@ -45,6 +47,17 @@ type StateAction = | { readonly type: "DISCONNECT" } | { readonly type: "DISPOSE" }; +type ReconnectMeasurements = Record<"attempts" | "totalDurationMs", number>; + +interface ReconnectTelemetryCycle { + readonly startMs: number; + readonly measurements: ReconnectMeasurements; + readonly resolve: () => void; + readonly reject: (error: Error) => void; + attempts: number; + completed: boolean; +} + /** * Pure reducer function for state transitions. */ @@ -103,6 +116,7 @@ export interface ReconnectingWebSocketOptions { initialBackoffMs?: number; maxBackoffMs?: number; jitterFactor?: number; + telemetry?: TelemetryService; /** Callback invoked when a refreshable certificate error is detected. Returns true if refresh succeeded. */ onCertificateRefreshNeeded: () => Promise; } @@ -112,7 +126,8 @@ export class ReconnectingWebSocket< > implements UnidirectionalStream { readonly #socketFactory: SocketFactory; readonly #logger: Logger; - readonly #options: Required; + readonly #telemetry: TelemetryService | undefined; + readonly #options: Required>; readonly #eventHandlers: { [K in WebSocketEventType]: Set>; } = { @@ -128,24 +143,33 @@ export class ReconnectingWebSocket< #reconnectTimeoutId: NodeJS.Timeout | null = null; #state: ConnectionState = ConnectionState.IDLE; #certRefreshAttempted = false; // Tracks if cert refresh was already attempted this connection cycle + #connectionOpenedAtMs: number | undefined; + #connectionDropped = false; + #reconnectCycle: ReconnectTelemetryCycle | undefined; readonly #onDispose?: () => void; /** * Dispatch an action to transition state. Returns true if transition is allowed. */ - #dispatch(action: StateAction): boolean { - const newState = reduceState(this.#state, action); - if (newState === this.#state) { + #dispatch(action: StateAction, reason: string): boolean { + const previousState = this.#state; + const newState = reduceState(previousState, action); + if (newState === previousState) { // Allow CONNECT from CONNECTING as a "restart" operation if ( action.type === "CONNECT" && - this.#state === ConnectionState.CONNECTING + previousState === ConnectionState.CONNECTING ) { return true; } return false; } this.#state = newState; + this.#telemetry?.log("connection.state_transition", { + from: previousState, + to: newState, + reason, + }); return true; } @@ -157,6 +181,7 @@ export class ReconnectingWebSocket< ) { this.#socketFactory = socketFactory; this.#logger = logger; + this.#telemetry = options.telemetry; this.#options = { initialBackoffMs: options.initialBackoffMs ?? 250, maxBackoffMs: options.maxBackoffMs ?? 30000, @@ -181,7 +206,7 @@ export class ReconnectingWebSocket< ); // connect() handles all errors internally - await instance.connect(); + await instance.connect("initial_connect"); return instance; } @@ -232,6 +257,10 @@ export class ReconnectingWebSocket< return; } + if (this.#state !== ConnectionState.IDLE) { + this.startReconnectTelemetry("manual_reconnect"); + } + if (this.#state === ConnectionState.DISCONNECTED) { this.#backoffMs = this.#options.initialBackoffMs; this.#certRefreshAttempted = false; // User-initiated reconnect, allow retry @@ -243,16 +272,29 @@ export class ReconnectingWebSocket< } // connect() handles all errors internally - void this.connect(); + void this.connect("manual_reconnect"); } /** * Temporarily disconnect the socket. Can be resumed via reconnect(). */ public disconnect(code?: number, reason?: string): void { - if (!this.#dispatch({ type: "DISCONNECT" })) { + this.disconnectWithReason("disconnect", code, reason); + } + + private disconnectWithReason( + telemetryReason: string, + code?: number, + reason?: string, + ): void { + if (!this.#dispatch({ type: "DISCONNECT" }, telemetryReason)) { return; } + this.emitConnectionDrop("manual_disconnect", code); + this.finishReconnectTelemetry( + false, + new Error(`WebSocket disconnected: ${telemetryReason}`), + ); this.clearCurrentSocket(code, reason); } @@ -273,13 +315,16 @@ export class ReconnectingWebSocket< this.dispose(code, reason); } - private async connect(): Promise { - if (!this.#dispatch({ type: "CONNECT" })) { + private async connect(reason: string): Promise { + const connectStartedAtMs = performance.now(); + if (!this.#dispatch({ type: "CONNECT" }, reason)) { return; } + this.recordReconnectAttempt(); try { // Close any existing socket before creating a new one if (this.#currentSocket) { + this.emitConnectionDrop("replaced", WebSocketCloseCode.NORMAL); this.#currentSocket.close( WebSocketCloseCode.NORMAL, "Replacing connection", @@ -303,9 +348,18 @@ export class ReconnectingWebSocket< return; } - if (!this.#dispatch({ type: "OPEN" })) { + if (!this.#dispatch({ type: "OPEN" }, "open")) { return; } + const openedAtMs = performance.now(); + this.#connectionOpenedAtMs = openedAtMs; + this.#connectionDropped = false; + this.#telemetry?.log( + "connection.open", + { url: this.#route }, + { connectDurationMs: openedAtMs - connectStartedAtMs }, + ); + this.finishReconnectTelemetry(true); // Reset backoff on successful connection this.#backoffMs = this.#options.initialBackoffMs; this.#certRefreshAttempted = false; @@ -348,28 +402,41 @@ export class ReconnectingWebSocket< this.executeHandlers("close", event); if (UNRECOVERABLE_WS_CLOSE_CODES.has(event.code)) { + const error = closeEventError(event); + this.emitConnectionDrop("unrecoverable_close", event.code, error); this.#logger.error( `WebSocket connection closed with unrecoverable error code ${event.code}`, ); - this.disconnect(); + this.disconnectWithReason( + "unrecoverable_close", + event.code, + event.reason, + ); return; } if (NORMAL_CLOSURE_CODES.has(event.code)) { + this.emitConnectionDrop("normal_close", event.code); return; } - this.scheduleReconnect(); + this.emitConnectionDrop( + "unexpected_close", + event.code, + closeEventError(event), + ); + this.scheduleReconnect("unexpected_close"); }); } catch (error) { await this.handleConnectionError(error); } } - private scheduleReconnect(): void { - if (!this.#dispatch({ type: "SCHEDULE_RETRY" })) { + private scheduleReconnect(reason: string): void { + if (!this.#dispatch({ type: "SCHEDULE_RETRY" }, reason)) { return; } + this.startReconnectTelemetry(reason); const jitter = this.#backoffMs * this.#options.jitterFactor * (Math.random() * 2 - 1); @@ -382,7 +449,7 @@ export class ReconnectingWebSocket< this.#reconnectTimeoutId = setTimeout(() => { this.#reconnectTimeoutId = null; // connect() handles all errors internally - void this.connect(); + void this.connect("scheduled_reconnect"); }, delayMs); this.#backoffMs = Math.min(this.#backoffMs * 2, this.#options.maxBackoffMs); @@ -467,7 +534,8 @@ export class ReconnectingWebSocket< `Unrecoverable HTTP error during connection for ${this.#route}`, error, ); - this.disconnect(); + this.emitConnectionDrop("error", undefined, error); + this.disconnectWithReason("unrecoverable_http"); return; } @@ -477,13 +545,15 @@ export class ReconnectingWebSocket< if (await this.handleClientCertificateError(certError)) { this.reconnect(); } else { - this.disconnect(); + this.emitConnectionDrop("error", undefined, error); + this.disconnectWithReason("certificate_error"); } return; } this.#logger.warn(`WebSocket connection failed for ${this.#route}`, error); - this.scheduleReconnect(); + this.emitConnectionDrop("error", undefined, error); + this.scheduleReconnect("connection_error"); } /** @@ -500,9 +570,14 @@ export class ReconnectingWebSocket< } private dispose(code?: number, reason?: string): void { - if (!this.#dispatch({ type: "DISPOSE" })) { + if (!this.#dispatch({ type: "DISPOSE" }, "dispose")) { return; } + this.emitConnectionDrop("disposed", code); + this.finishReconnectTelemetry( + false, + new Error("WebSocket disposed before reconnect completed"), + ); this.clearCurrentSocket(code, reason); for (const set of Object.values(this.#eventHandlers)) { @@ -522,5 +597,98 @@ export class ReconnectingWebSocket< this.#currentSocket.close(code, reason); this.#currentSocket = null; } + this.#connectionOpenedAtMs = undefined; + this.#connectionDropped = false; } + + private emitConnectionDrop( + cause: string, + closeCode?: number, + error?: unknown, + ): void { + if (this.#connectionOpenedAtMs === undefined || this.#connectionDropped) { + return; + } + + const properties = { + cause, + closeCode: closeCode === undefined ? "" : String(closeCode), + }; + const measurements = { + connectionDurationMs: performance.now() - this.#connectionOpenedAtMs, + }; + if (error === undefined) { + this.#telemetry?.log("connection.drop", properties, measurements); + } else { + this.#telemetry?.logError( + "connection.drop", + error, + properties, + measurements, + ); + } + this.#connectionDropped = true; + } + + private startReconnectTelemetry(reason: string): void { + if (!this.#telemetry || this.#reconnectCycle) { + return; + } + + const measurements: ReconnectMeasurements = { + attempts: 0, + totalDurationMs: 0, + }; + let resolveCycle!: () => void; + let rejectCycle!: (error: Error) => void; + const cycleDone = new Promise((resolve, reject) => { + resolveCycle = resolve; + rejectCycle = reject; + }); + + const startMs = performance.now(); + this.#reconnectCycle = { + startMs, + measurements, + resolve: resolveCycle, + reject: rejectCycle, + attempts: 0, + completed: false, + }; + void this.#telemetry + .trace("connection.reconnect", () => cycleDone, { reason }, measurements) + .catch(() => undefined); + } + + private recordReconnectAttempt(): void { + const cycle = this.#reconnectCycle; + if (!cycle) { + return; + } + cycle.attempts += 1; + cycle.measurements.attempts = cycle.attempts; + cycle.measurements.totalDurationMs = performance.now() - cycle.startMs; + } + + private finishReconnectTelemetry(success: boolean, error?: Error): void { + const cycle = this.#reconnectCycle; + if (!cycle || cycle.completed) { + return; + } + + cycle.completed = true; + cycle.measurements.totalDurationMs = performance.now() - cycle.startMs; + this.#reconnectCycle = undefined; + if (success) { + cycle.resolve(); + } else { + cycle.reject(error ?? new Error("WebSocket reconnect failed")); + } + } +} + +function closeEventError(event: CloseEvent): Error { + return new Error( + `WebSocket closed unexpectedly with code ${event.code}: ${event.reason}`, + ); } diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index f562ea5d..7528a3a4 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -10,7 +10,10 @@ import { type NetworkInfo, type SshProcessMonitorOptions, } from "@/remote/sshProcess"; +import { buildSession } from "@/telemetry/event"; +import { TelemetryService } from "@/telemetry/service"; +import { TestSink } from "../../mocks/telemetry"; import { createMockLogger, makeNetworkInfo, @@ -312,6 +315,150 @@ describe("SshProcessMonitor", () => { }); }); + describe("telemetry", () => { + const sshLog = { + "/logs/ms-vscode-remote.remote-ssh/1-Remote - SSH.log": + "-> socksPort 12345 ->", + }; + + function networkTelemetryEvents(sink: TestSink) { + return sink.events.filter( + (event) => event.eventName === "ssh.network.info", + ); + } + + async function advanceUntilNetworkEvents( + sink: TestSink, + count: number, + ): Promise { + for (let i = 0; i < 20; i++) { + await vi.advanceTimersByTimeAsync(1); + if (networkTelemetryEvents(sink).length >= count) { + return; + } + } + expect(networkTelemetryEvents(sink)).toHaveLength(count); + } + + function keepNetworkFileFresh(filePath: string, ms = 90_000): void { + const mtime = (Date.now() + ms) / 1000; + vol.utimesSync(filePath, mtime, mtime); + } + + it("emits a process discovered trace", async () => { + const { telemetry, sink } = createTelemetry(); + vol.fromJSON(sshLog); + + const monitor = createMonitor({ telemetry }); + await waitForEvent(monitor.onPidChange); + + const event = sink.events.find( + (event) => event.eventName === "ssh.process.discovered", + ); + expect(event?.properties.result).toBe("success"); + expect(event?.measurements.durationMs).toEqual(expect.any(Number)); + await telemetry.dispose(); + }); + + it("emits process lost and recovered events after stale network info", async () => { + const { telemetry, sink } = createTelemetry(); + vol.fromJSON({ + ...sshLog, + "/network/999.json": makeNetworkJson(), + }); + vi.mocked(find) + .mockResolvedValueOnce([{ pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }]) + .mockResolvedValue([{ pid: 888, ppid: 1, name: "ssh", cmd: "ssh" }]); + + const monitor = createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10, + telemetry, + }); + await waitForEvent(monitor.onPidChange); + + await waitFor( + () => + sink.events.some((event) => event.eventName === "ssh.process.lost") && + sink.events.some( + (event) => event.eventName === "ssh.process.recovered", + ), + 500, + ); + + const lost = sink.events.find( + (event) => event.eventName === "ssh.process.lost", + )!; + expect(lost.properties).toEqual({ cause: "stale_network_info" }); + expect(lost.measurements.uptimeMs).toEqual(expect.any(Number)); + + const recovered = sink.events.find( + (event) => event.eventName === "ssh.process.recovered", + )!; + expect(recovered.measurements.recoveryDurationMs).toEqual( + expect.any(Number), + ); + await telemetry.dispose(); + }); + + it("samples network info on first read and every 60 seconds, not every poll", async () => { + vi.useFakeTimers(); + const { telemetry, sink } = createTelemetry(); + vol.fromJSON({ + ...sshLog, + "/network/999.json": makeNetworkJson({ latency: 25 }), + }); + keepNetworkFileFresh("/network/999.json"); + + createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10_000, + telemetry, + }); + await advanceUntilNetworkEvents(sink, 1); + expect(networkTelemetryEvents(sink)).toHaveLength(1); + + await vi.advanceTimersByTimeAsync(50_000); + expect(networkTelemetryEvents(sink)).toHaveLength(1); + + await vi.advanceTimersByTimeAsync(10_000); + expect(networkTelemetryEvents(sink)).toHaveLength(2); + const first = networkTelemetryEvents(sink)[0]; + expect(first.properties).toEqual({ p2p: "true", derp: "NYC" }); + expect(first.measurements).toMatchObject({ + latencyMs: 25, + downloadMbits: 50, + uploadMbits: 10, + }); + await telemetry.dispose(); + }); + + it("samples network info on p2p flip inside the 60 second window", async () => { + vi.useFakeTimers(); + const { telemetry, sink } = createTelemetry(); + vol.fromJSON({ + ...sshLog, + "/network/999.json": makeNetworkJson({ p2p: true }), + }); + keepNetworkFileFresh("/network/999.json"); + + createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10, + telemetry, + }); + await advanceUntilNetworkEvents(sink, 1); + + vol.writeFileSync("/network/999.json", makeNetworkJson({ p2p: false })); + keepNetworkFileFresh("/network/999.json"); + await vi.advanceTimersByTimeAsync(10); + + expect(networkTelemetryEvents(sink)).toHaveLength(2); + expect(networkTelemetryEvents(sink)[1].properties.p2p).toBe("false"); + await telemetry.dispose(); + }); + }); + describe("log file discovery", () => { it("finds log file matching PID pattern", async () => { vol.fromJSON({ @@ -817,6 +964,17 @@ describe("SshProcessMonitor", () => { activeMonitors.push(monitor); return monitor; } + + function createTelemetry(): { telemetry: TelemetryService; sink: TestSink } { + new MockConfigurationProvider().set("coder.telemetry.level", "local"); + const sink = new TestSink(); + const telemetry = new TelemetryService( + buildSession("test", "test-session"), + [sink], + createMockLogger(), + ); + return { telemetry, sink }; + } }); /** diff --git a/test/unit/websocket/reconnectingWebSocket.test.ts b/test/unit/websocket/reconnectingWebSocket.test.ts index 8e028e7f..08a4ec58 100644 --- a/test/unit/websocket/reconnectingWebSocket.test.ts +++ b/test/unit/websocket/reconnectingWebSocket.test.ts @@ -1,5 +1,7 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { buildSession } from "@/telemetry/event"; +import { TelemetryService } from "@/telemetry/service"; import { WebSocketCloseCode, HttpStatusCode } from "@/websocket/codes"; import { ConnectionState, @@ -7,7 +9,11 @@ import { type SocketFactory, } from "@/websocket/reconnectingWebSocket"; -import { createMockLogger } from "../../mocks/testHelpers"; +import { TestSink } from "../../mocks/telemetry"; +import { + createMockLogger, + MockConfigurationProvider, +} from "../../mocks/testHelpers"; import type { CloseEvent, Event as WsEvent } from "ws"; @@ -587,6 +593,153 @@ describe("ReconnectingWebSocket", () => { }); }); + describe("Telemetry", () => { + it("emits state transition events for reducer changes", async () => { + const { telemetry, sink } = createTelemetry(); + const { ws, sockets } = await createReconnectingWebSocket( + undefined, + telemetry, + ); + + sockets[0].fireOpen(); + sockets[0].fireClose({ + code: WebSocketCloseCode.ABNORMAL, + reason: "Network error", + }); + await vi.advanceTimersByTimeAsync(300); + sockets[1].fireOpen(); + ws.close(); + + const transitions = sink.events.filter( + (event) => event.eventName === "connection.state_transition", + ); + expect(transitions.map((event) => event.properties)).toEqual([ + { from: "IDLE", to: "CONNECTING", reason: "initial_connect" }, + { from: "CONNECTING", to: "CONNECTED", reason: "open" }, + { + from: "CONNECTED", + to: "AWAITING_RETRY", + reason: "unexpected_close", + }, + { + from: "AWAITING_RETRY", + to: "CONNECTING", + reason: "scheduled_reconnect", + }, + { from: "CONNECTING", to: "CONNECTED", reason: "open" }, + { from: "CONNECTED", to: "DISPOSED", reason: "dispose" }, + ]); + await telemetry.dispose(); + }); + + it("emits open and unexpected drop events", async () => { + const { telemetry, sink } = createTelemetry(); + const { ws, sockets } = await createReconnectingWebSocket( + undefined, + telemetry, + ); + + sockets[0].fireOpen(); + sockets[0].fireClose({ + code: WebSocketCloseCode.ABNORMAL, + reason: "Network error", + }); + + const open = sink.events.find( + (event) => event.eventName === "connection.open", + ); + expect(open?.properties).toEqual({ url: "/api/test" }); + expect(open?.measurements.connectDurationMs).toEqual(expect.any(Number)); + + const drop = sink.events.find( + (event) => event.eventName === "connection.drop", + ); + expect(drop?.properties).toEqual({ + cause: "unexpected_close", + closeCode: String(WebSocketCloseCode.ABNORMAL), + }); + expect(drop?.measurements.connectionDurationMs).toEqual( + expect.any(Number), + ); + expect(drop?.error?.message).toContain("1006"); + + ws.close(); + await telemetry.dispose(); + }); + + it("emits reconnect aggregation after recovery", async () => { + const { telemetry, sink } = createTelemetry(); + const { ws, sockets } = await createReconnectingWebSocket( + undefined, + telemetry, + ); + + sockets[0].fireOpen(); + sockets[0].fireClose({ + code: WebSocketCloseCode.ABNORMAL, + reason: "Network error", + }); + await vi.advanceTimersByTimeAsync(300); + sockets[1].fireOpen(); + + await vi.waitFor(() => { + expect( + sink.events.filter( + (event) => event.eventName === "connection.reconnect", + ), + ).toHaveLength(1); + }); + const reconnect = sink.events.find( + (event) => event.eventName === "connection.reconnect", + )!; + expect(reconnect.properties).toMatchObject({ + result: "success", + reason: "unexpected_close", + }); + expect(reconnect.measurements.attempts).toBe(1); + expect(reconnect.measurements.totalDurationMs).toEqual( + expect.any(Number), + ); + + ws.close(); + await telemetry.dispose(); + }); + + it("emits failed reconnect aggregation when reconnect ends disconnected", async () => { + const { telemetry, sink } = createTelemetry(); + const { ws, sockets, setFactoryError } = + await createReconnectingWebSocketWithErrorControl(telemetry); + sockets[0].fireOpen(); + + setFactoryError( + new Error(`Unexpected server response: ${HttpStatusCode.FORBIDDEN}`), + ); + ws.reconnect(); + await Promise.resolve(); + + await vi.waitFor(() => { + expect(ws.state).toBe(ConnectionState.DISCONNECTED); + expect( + sink.events.filter( + (event) => event.eventName === "connection.reconnect", + ), + ).toHaveLength(1); + }); + const reconnect = sink.events.find( + (event) => event.eventName === "connection.reconnect", + )!; + expect(reconnect.properties).toMatchObject({ + result: "error", + reason: "manual_reconnect", + }); + expect(reconnect.measurements.attempts).toBe(1); + expect(reconnect.error?.message).toContain("unrecoverable_http"); + + ws.close(); + await telemetry.dispose(); + }); + }); + describe("Certificate Refresh", () => { const setupRefreshTest = async (onRefresh: () => Promise) => { const sockets: MockSocket[] = []; @@ -747,7 +900,10 @@ function createMockSocket(): MockSocket { }; } -async function createReconnectingWebSocket(onDispose?: () => void): Promise<{ +async function createReconnectingWebSocket( + onDispose?: () => void, + telemetry?: TelemetryService, +): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; }> { @@ -757,7 +913,7 @@ async function createReconnectingWebSocket(onDispose?: () => void): Promise<{ sockets.push(socket); return Promise.resolve(socket); }); - const ws = await fromFactory(factory, onDispose); + const ws = await fromFactory(factory, onDispose, undefined, telemetry); // We start with one socket expect(sockets).toHaveLength(1); @@ -765,7 +921,9 @@ async function createReconnectingWebSocket(onDispose?: () => void): Promise<{ return { ws, sockets }; } -async function createReconnectingWebSocketWithErrorControl(): Promise<{ +async function createReconnectingWebSocketWithErrorControl( + telemetry?: TelemetryService, +): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; setFactoryError: (error: Error | null) => void; @@ -782,7 +940,7 @@ async function createReconnectingWebSocketWithErrorControl(): Promise<{ return Promise.resolve(socket); }); - const ws = await fromFactory(factory); + const ws = await fromFactory(factory, undefined, undefined, telemetry); expect(sockets).toHaveLength(1); return { @@ -798,6 +956,7 @@ async function fromFactory( factory: SocketFactory, onDispose?: () => void, onCertificateRefreshNeeded?: () => Promise, + telemetry?: TelemetryService, ): Promise> { return await ReconnectingWebSocket.create( factory, @@ -805,11 +964,23 @@ async function fromFactory( { onCertificateRefreshNeeded: onCertificateRefreshNeeded ?? (() => Promise.resolve(false)), + ...(telemetry && { telemetry }), }, onDispose, ); } +function createTelemetry(): { telemetry: TelemetryService; sink: TestSink } { + new MockConfigurationProvider().set("coder.telemetry.level", "local"); + const sink = new TestSink(); + const telemetry = new TelemetryService( + buildSession("test", "test-session"), + [sink], + createMockLogger(), + ); + return { telemetry, sink }; +} + async function createBlockingReconnectingWebSocket(): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; From d5c726363a1fb6b90e0887e4d87a2c6a6b215ceb Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 14:51:52 +0300 Subject: [PATCH 2/7] refactor(telemetry): simplify connection instrumentation --- src/api/coderApi.ts | 13 +- src/instrumentation/ssh.ts | 156 +++++++++ src/instrumentation/websocket.ts | 168 ++++++++++ src/remote/sshProcess.ts | 184 ++--------- src/telemetry/reporter.ts | 29 ++ src/telemetry/service.ts | 16 +- src/telemetry/span.ts | 8 +- src/websocket/reconnectingWebSocket.ts | 311 ++++++------------ test/mocks/telemetry.ts | 30 +- test/unit/instrumentation/ssh.test.ts | 123 +++++++ test/unit/remote/sshProcess.test.ts | 70 ++-- .../websocket/reconnectingWebSocket.test.ts | 129 ++++---- 12 files changed, 736 insertions(+), 501 deletions(-) create mode 100644 src/instrumentation/ssh.ts create mode 100644 src/instrumentation/websocket.ts create mode 100644 src/telemetry/reporter.ts create mode 100644 test/unit/instrumentation/ssh.test.ts diff --git a/src/api/coderApi.ts b/src/api/coderApi.ts index d2c3694d..e350127f 100644 --- a/src/api/coderApi.ts +++ b/src/api/coderApi.ts @@ -36,7 +36,10 @@ import { } from "../logging/types"; import { sizeOf } from "../logging/utils"; import { getHeaderCommand } from "../settings/headers"; -import { type TelemetryService } from "../telemetry/service"; +import { + NOOP_TELEMETRY_REPORTER, + type TelemetryReporter, +} from "../telemetry/reporter"; import { HttpStatusCode, WebSocketCloseCode } from "../websocket/codes"; import { type UnidirectionalStream, @@ -90,7 +93,7 @@ export class CoderApi extends Api implements vscode.Disposable { private constructor( private readonly output: Logger, - private readonly telemetry: TelemetryService | undefined, + private readonly telemetry: TelemetryReporter, ) { super(); this.configWatcher = this.watchConfigChanges(); @@ -104,7 +107,7 @@ export class CoderApi extends Api implements vscode.Disposable { baseUrl: string, token: string | undefined, output: Logger, - telemetry?: TelemetryService, + telemetry: TelemetryReporter = NOOP_TELEMETRY_REPORTER, ): CoderApi { const client = new CoderApi(output, telemetry); client.setCredentials(baseUrl, token); @@ -456,6 +459,7 @@ export class CoderApi extends Api implements vscode.Disposable { socketFactory: SocketFactory, ): Promise> { const options: ReconnectingWebSocketOptions = { + telemetry: this.telemetry, onCertificateRefreshNeeded: async () => { const refreshCommand = getRefreshCommand(); if (!refreshCommand) { @@ -464,9 +468,6 @@ export class CoderApi extends Api implements vscode.Disposable { return refreshCertificates(refreshCommand, this.output); }, }; - if (this.telemetry) { - options.telemetry = this.telemetry; - } const reconnectingSocket = await ReconnectingWebSocket.create( socketFactory, diff --git a/src/instrumentation/ssh.ts b/src/instrumentation/ssh.ts new file mode 100644 index 00000000..5068de27 --- /dev/null +++ b/src/instrumentation/ssh.ts @@ -0,0 +1,156 @@ +import type { NetworkInfo } from "../remote/sshProcess"; +import type { TelemetryReporter } from "../telemetry/reporter"; + +const NETWORK_SAMPLE_INTERVAL_MS = 60_000; +const NETWORK_LATENCY_CHANGE_RATIO = 0.1; + +export type ProcessLossCause = + | "stale_network_info" + | "missing_network_info" + | "process_changed" + | "disposed"; + +interface NetworkSample { + readonly emittedAtMs: number; + readonly p2p: boolean; + readonly derp: string; + readonly latencyMs: number; +} + +export interface ProcessDiscoveryResult { + readonly pid: number | undefined; + readonly attempts: number; +} + +export class SshTelemetry { + readonly #telemetry: TelemetryReporter; + #processStartedAtMs: number | undefined; + #processLostAtMs: number | undefined; + #lastNetworkSample: NetworkSample | undefined; + + public constructor(telemetry: TelemetryReporter) { + this.#telemetry = telemetry; + } + + public traceProcessDiscovery( + fn: () => Promise, + ): Promise { + return this.#telemetry.trace("ssh.process.discovered", async (span) => { + const { pid, attempts } = await fn(); + span.setMeasurement("attempts", attempts); + return pid; + }); + } + + public processStarted(): void { + this.#processStartedAtMs = performance.now(); + this.#processLostAtMs = undefined; + } + + public processLost(cause: ProcessLossCause): void { + if ( + this.#processStartedAtMs === undefined || + this.#processLostAtMs !== undefined + ) { + return; + } + const now = performance.now(); + this.#processLostAtMs = now; + this.#telemetry.log( + "ssh.process.lost", + { cause }, + { uptimeMs: now - this.#processStartedAtMs }, + ); + } + + public processRecovered(): void { + if (this.#processLostAtMs === undefined) { + return; + } + this.#telemetry.log( + "ssh.process.recovered", + {}, + { recoveryDurationMs: performance.now() - this.#processLostAtMs }, + ); + this.#processLostAtMs = undefined; + } + + /** + * Handover from one SSH process to another. Closes out the prior process + * (recovery if lost, replacement otherwise) and starts fresh tracking. + */ + public processReplaced(): void { + const now = performance.now(); + if (this.#processLostAtMs !== undefined) { + this.processRecovered(); + } else if (this.#processStartedAtMs !== undefined) { + this.#telemetry.log( + "ssh.process.replaced", + {}, + { previousUptimeMs: now - this.#processStartedAtMs }, + ); + } + this.#processStartedAtMs = now; + this.#processLostAtMs = undefined; + this.#lastNetworkSample = undefined; + } + + public networkSampled(network: NetworkInfo): void { + const now = performance.now(); + const previous = this.#lastNetworkSample; + if (previous && !shouldEmitSample(previous, network, now)) { + return; + } + + this.#lastNetworkSample = { + emittedAtMs: now, + p2p: network.p2p, + derp: network.preferred_derp, + latencyMs: network.latency, + }; + this.#telemetry.log( + "ssh.network.sample", + { + p2p: String(network.p2p), + derp: network.preferred_derp, + }, + { + latencyMs: network.latency, + downloadMbits: bytesPerSecondToMbits(network.download_bytes_sec), + uploadMbits: bytesPerSecondToMbits(network.upload_bytes_sec), + }, + ); + } +} + +/** Emit on p2p flip, DERP change, large latency swing, or heartbeat interval. */ +function shouldEmitSample( + previous: NetworkSample, + current: NetworkInfo, + now: number, +): boolean { + if (now - previous.emittedAtMs >= NETWORK_SAMPLE_INTERVAL_MS) { + return true; + } + if (current.p2p !== previous.p2p) { + return true; + } + if (current.preferred_derp !== previous.derp) { + return true; + } + return hasMeaningfulLatencyChange(current.latency, previous.latencyMs); +} + +function hasMeaningfulLatencyChange( + current: number, + previous: number, +): boolean { + if (previous === 0) { + return current !== 0; + } + return Math.abs(current - previous) / previous > NETWORK_LATENCY_CHANGE_RATIO; +} + +function bytesPerSecondToMbits(bytesPerSecond: number): number { + return (bytesPerSecond * 8) / 1_000_000; +} diff --git a/src/instrumentation/websocket.ts b/src/instrumentation/websocket.ts new file mode 100644 index 00000000..ca2b9e6a --- /dev/null +++ b/src/instrumentation/websocket.ts @@ -0,0 +1,168 @@ +import type { CallerProperties } from "../telemetry/event"; +import type { TelemetryReporter } from "../telemetry/reporter"; +import type { ConnectionState } from "../websocket/reconnectingWebSocket"; + +export type ConnectionStateReason = + | "initial_connect" + | "manual_reconnect" + | "scheduled_reconnect" + | "open" + | "disconnect" + | "dispose" + | "unrecoverable_close" + | "unrecoverable_http" + | "certificate_error" + | "connection_error" + | "unexpected_close"; + +export type ConnectionDropCause = + | "manual_disconnect" + | "replaced" + | "unrecoverable_close" + | "normal_close" + | "unexpected_close" + | "disposed" + | "error"; + +export type ReconnectOutcome = + | { readonly result: "success" } + | { + readonly result: "error"; + readonly terminationReason: ConnectionStateReason; + }; + +interface ReconnectCycle { + readonly startMs: number; + readonly reason: ConnectionStateReason; + attempts: number; +} + +interface DropOptions { + cause: ConnectionDropCause; + code?: number; + error?: unknown; +} + +export class WebSocketTelemetry { + readonly #telemetry: TelemetryReporter; + #connectStartedAtMs: number | undefined; + #connectionOpenedAtMs: number | undefined; + #connectionDropEmitted = false; + #reconnectCycle: ReconnectCycle | undefined; + + public constructor(telemetry: TelemetryReporter) { + this.#telemetry = telemetry; + } + + public stateTransition( + from: ConnectionState, + to: ConnectionState, + reason: ConnectionStateReason, + ): void { + this.#telemetry.log("connection.state_transition", { from, to, reason }); + } + + /** Stamp the connect-start time; counts an attempt if a cycle is open. */ + public connectStarted(): void { + this.#connectStartedAtMs = performance.now(); + if (this.#reconnectCycle) { + this.#reconnectCycle.attempts += 1; + } + } + + public opened(route: string): void { + const now = performance.now(); + const start = this.#connectStartedAtMs ?? now; + this.#connectionOpenedAtMs = now; + this.#connectionDropEmitted = false; + this.#connectStartedAtMs = undefined; + this.#telemetry.log( + "connection.open", + { url: route }, + { connectDurationMs: now - start }, + ); + this.#finishReconnect({ result: "success" }); + } + + public dropped( + cause: ConnectionDropCause, + closeCode?: number, + error?: unknown, + ): void { + if ( + this.#connectionOpenedAtMs === undefined || + this.#connectionDropEmitted + ) { + return; + } + + const properties: CallerProperties = { cause }; + if (closeCode !== undefined) { + properties.closeCode = String(closeCode); + } + const measurements = { + connectionDurationMs: performance.now() - this.#connectionOpenedAtMs, + }; + if (error === undefined) { + this.#telemetry.log("connection.drop", properties, measurements); + } else { + this.#telemetry.logError( + "connection.drop", + error, + properties, + measurements, + ); + } + this.#connectionDropEmitted = true; + } + + public reset(): void { + this.#connectStartedAtMs = undefined; + this.#connectionOpenedAtMs = undefined; + this.#connectionDropEmitted = false; + } + + /** Open a reconnect cycle. No-op if one is already open. */ + public reconnectStarted(reason: ConnectionStateReason): void { + if (this.#reconnectCycle) { + return; + } + this.#reconnectCycle = { + startMs: performance.now(), + reason, + attempts: 0, + }; + } + + /** Drop and end the reconnect cycle as a failure. */ + public terminated(reason: ConnectionStateReason, options: DropOptions): void { + this.dropped(options.cause, options.code, options.error); + this.#finishReconnect({ result: "error", terminationReason: reason }); + } + + /** Drop and (re)open a reconnect cycle. */ + public retrying(reason: ConnectionStateReason, options: DropOptions): void { + this.dropped(options.cause, options.code, options.error); + this.reconnectStarted(reason); + } + + #finishReconnect(outcome: ReconnectOutcome): void { + const cycle = this.#reconnectCycle; + if (!cycle) { + return; + } + this.#reconnectCycle = undefined; + + const properties: Record = { + result: outcome.result, + reason: cycle.reason, + }; + if (outcome.result === "error") { + properties.terminationReason = outcome.terminationReason; + } + this.#telemetry.log("connection.reconnect", properties, { + attempts: cycle.attempts, + totalDurationMs: performance.now() - cycle.startMs, + }); + } +} diff --git a/src/remote/sshProcess.ts b/src/remote/sshProcess.ts index 809997e4..a67fa448 100644 --- a/src/remote/sshProcess.ts +++ b/src/remote/sshProcess.ts @@ -3,13 +3,14 @@ import * as fs from "node:fs/promises"; import * as path from "node:path"; import * as vscode from "vscode"; -import { type TelemetryService } from "../telemetry/service"; +import { SshTelemetry, type ProcessLossCause } from "../instrumentation/ssh"; import { findPort } from "../util"; import { cleanupFiles } from "../util/fileCleanup"; import { NetworkStatusReporter } from "./networkStatus"; import type { Logger } from "../logging/logger"; +import type { TelemetryReporter } from "../telemetry/reporter"; /** * Network information from the Coder CLI. @@ -41,7 +42,7 @@ export interface SshProcessMonitorOptions { // For port-based SSH process discovery codeLogDir: string; remoteSshExtensionId: string; - telemetry?: TelemetryService; + telemetry: TelemetryReporter; } // 1 hour cleanup threshold for old network info files @@ -50,21 +51,6 @@ const CLEANUP_NETWORK_MAX_AGE_MS = 60 * 60 * 1000; const CLEANUP_LOG_MAX_AGE_MS = 7 * 24 * 60 * 60 * 1000; // Maximum number of proxy log files to keep during cleanup const CLEANUP_MAX_LOG_FILES = 20; -const NETWORK_TELEMETRY_SAMPLE_INTERVAL_MS = 60_000; -const NETWORK_TELEMETRY_LATENCY_CHANGE_RATIO = 0.1; - -type ProcessLossCause = - | "stale_network_info" - | "missing_network_info" - | "process_changed" - | "disposed"; - -interface NetworkTelemetrySample { - readonly emittedAtMs: number; - readonly p2p: boolean; - readonly derp: string; - readonly latencyMs: number; -} /** * Monitors the SSH process for a Coder workspace connection and displays @@ -76,8 +62,8 @@ export class SshProcessMonitor implements vscode.Disposable { Omit > & { readonly proxyLogDir: string | undefined; - readonly telemetry: TelemetryService | undefined; }; + private readonly telemetry: SshTelemetry; private readonly _onLogFilePathChange = new vscode.EventEmitter< string | undefined @@ -100,9 +86,6 @@ export class SshProcessMonitor implements vscode.Disposable { private pendingTimeout: NodeJS.Timeout | undefined; private lastStaleSearchTime = 0; private readonly reporter: NetworkStatusReporter; - private processStartedAtMs: number | undefined; - private processLostAtMs: number | undefined; - private lastNetworkTelemetrySample: NetworkTelemetrySample | undefined; /** * Cleans up network info files older than the specified age. @@ -148,8 +131,8 @@ export class SshProcessMonitor implements vscode.Disposable { maxDiscoveryBackoffMs: options.maxDiscoveryBackoffMs ?? 30_000, // Matches the SSH update interval networkPollInterval: options.networkPollInterval ?? 3000, - telemetry: options.telemetry, }; + this.telemetry = new SshTelemetry(options.telemetry); this.statusBarItem = vscode.window.createStatusBarItem( vscode.StatusBarAlignment.Left, 1000, @@ -205,7 +188,7 @@ export class SshProcessMonitor implements vscode.Disposable { if (this.disposed) { return; } - this.emitProcessLost("disposed"); + this.telemetry.processLost("disposed"); this.disposed = true; if (this.pendingTimeout) { clearTimeout(this.pendingTimeout); @@ -236,7 +219,7 @@ export class SshProcessMonitor implements vscode.Disposable { * Starts monitoring when it finds the process through the port. */ private async searchForProcess(): Promise { - const pid = await this.traceProcessDiscovery(() => + const pid = await this.telemetry.traceProcessDiscovery(() => this.discoverSshProcess(), ); if (pid === undefined || this.disposed) { @@ -247,30 +230,22 @@ export class SshProcessMonitor implements vscode.Disposable { this.startMonitoring(); } - private async traceProcessDiscovery( - fn: () => Promise, - ): Promise { - const { telemetry } = this.options; - if (!telemetry) { - return fn(); - } - - return telemetry.trace("ssh.process.discovered", fn); - } - - private async discoverSshProcess(): Promise { + private async discoverSshProcess(): Promise<{ + pid: number | undefined; + attempts: number; + }> { const { discoveryPollIntervalMs, maxDiscoveryBackoffMs, logger, sshHost } = this.options; - let attempt = 0; + let attempts = 0; let currentBackoff = discoveryPollIntervalMs; let lastFoundPort: number | undefined; while (!this.disposed) { - attempt++; + attempts++; - if (attempt === 1 || attempt % 10 === 0) { + if (attempts === 1 || attempts % 10 === 0) { logger.debug( - `SSH process search attempt ${attempt} for host: ${sshHost}`, + `SSH process search attempt ${attempts} for host: ${sshHost}`, ); } @@ -292,14 +267,14 @@ export class SshProcessMonitor implements vscode.Disposable { } if (pid !== undefined) { - return pid; + return { pid, attempts }; } await this.delay(currentBackoff); currentBackoff = Math.min(currentBackoff * 2, maxDiscoveryBackoffMs); } - return undefined; + return { pid: undefined, attempts }; } /** @@ -352,62 +327,27 @@ export class SshProcessMonitor implements vscode.Disposable { */ private setCurrentPid(pid: number): void { const previousPid = this.currentPid; - const now = performance.now(); this.currentPid = pid; if (previousPid === undefined) { - this.processStartedAtMs = now; + this.telemetry.processStarted(); this.options.logger.info(`SSH connection established (PID: ${pid})`); this._onPidChange.fire(pid); return; } - if (previousPid !== pid && this.processLostAtMs === undefined) { - this.emitProcessLost("process_changed"); - } - - if (this.processLostAtMs !== undefined) { - this.emitProcessRecovered(); - } - - if (previousPid !== pid) { - this.processStartedAtMs = now; - this.lastNetworkTelemetrySample = undefined; - this.options.logger.info( - `SSH process changed from ${previousPid} to ${pid}`, - ); - this.logFilePath = undefined; - this._onLogFilePathChange.fire(undefined); - this._onPidChange.fire(pid); - } - } - - private emitProcessLost(cause: ProcessLossCause): void { - if (this.currentPid === undefined || this.processLostAtMs !== undefined) { - return; - } - - const now = performance.now(); - const startedAt = this.processStartedAtMs ?? now; - this.processLostAtMs = now; - this.options.telemetry?.log( - "ssh.process.lost", - { cause }, - { uptimeMs: now - startedAt }, - ); - } - - private emitProcessRecovered(now = performance.now()): void { - if (this.processLostAtMs === undefined) { + if (previousPid === pid) { + this.telemetry.processRecovered(); return; } - this.options.telemetry?.log( - "ssh.process.recovered", - {}, - { recoveryDurationMs: now - this.processLostAtMs }, + this.telemetry.processReplaced(); + this.options.logger.info( + `SSH process changed from ${previousPid} to ${pid}`, ); - this.processLostAtMs = undefined; + this.logFilePath = undefined; + this._onLogFilePathChange.fire(undefined); + this._onPidChange.fire(pid); } /** @@ -499,7 +439,7 @@ export class SshProcessMonitor implements vscode.Disposable { const network = JSON.parse(content) as NetworkInfo; const isStale = ageMs > networkPollInterval * 2; this.reporter.update(network, isStale); - this.sampleNetworkTelemetry(network); + this.telemetry.networkSampled(network); } } catch (error) { readFailures++; @@ -523,7 +463,7 @@ export class SshProcessMonitor implements vscode.Disposable { } logger.debug(`${searchReason}, searching for new SSH process`); - this.emitProcessLost(lossCause); + this.telemetry.processLost(lossCause); // searchForProcess will update PID if a different process is found this.lastStaleSearchTime = Date.now(); await this.searchForProcess(); @@ -533,74 +473,6 @@ export class SshProcessMonitor implements vscode.Disposable { await this.delay(networkPollInterval); } } - - private sampleNetworkTelemetry(network: NetworkInfo): void { - if (!this.options.telemetry) { - return; - } - - const sample = toNetworkTelemetrySample(network); - if (!this.shouldEmitNetworkTelemetry(sample)) { - return; - } - - this.options.telemetry.log( - "ssh.network.info", - { - p2p: String(network.p2p), - derp: sample.derp, - }, - { - latencyMs: sample.latencyMs, - downloadMbits: bytesPerSecondToMbits(network.download_bytes_sec), - uploadMbits: bytesPerSecondToMbits(network.upload_bytes_sec), - }, - ); - this.lastNetworkTelemetrySample = sample; - } - - private shouldEmitNetworkTelemetry(sample: NetworkTelemetrySample): boolean { - const previous = this.lastNetworkTelemetrySample; - if (!previous) { - return true; - } - - return ( - sample.emittedAtMs - previous.emittedAtMs >= - NETWORK_TELEMETRY_SAMPLE_INTERVAL_MS || - sample.p2p !== previous.p2p || - sample.derp !== previous.derp || - hasMeaningfulLatencyChange(sample.latencyMs, previous.latencyMs) - ); - } -} - -function toNetworkTelemetrySample( - network: NetworkInfo, -): NetworkTelemetrySample { - return { - emittedAtMs: Date.now(), - p2p: network.p2p, - derp: network.preferred_derp, - latencyMs: network.latency, - }; -} - -function hasMeaningfulLatencyChange( - current: number, - previous: number, -): boolean { - if (previous === 0) { - return current !== 0; - } - return ( - Math.abs(current - previous) / previous > - NETWORK_TELEMETRY_LATENCY_CHANGE_RATIO - ); -} - -function bytesPerSecondToMbits(bytesPerSecond: number): number { - return (bytesPerSecond * 8) / 1_000_000; } /** diff --git a/src/telemetry/reporter.ts b/src/telemetry/reporter.ts new file mode 100644 index 00000000..e46811b4 --- /dev/null +++ b/src/telemetry/reporter.ts @@ -0,0 +1,29 @@ +import { NOOP_SPAN, type Span } from "./span"; + +import type { CallerMeasurements, CallerProperties } from "./event"; + +export interface TelemetryReporter { + log( + eventName: string, + properties?: CallerProperties, + measurements?: CallerMeasurements, + ): void; + logError( + eventName: string, + error: unknown, + properties?: CallerProperties, + measurements?: CallerMeasurements, + ): void; + trace( + eventName: string, + fn: (span: Span) => Promise, + properties?: CallerProperties, + measurements?: CallerMeasurements, + ): Promise; +} + +export const NOOP_TELEMETRY_REPORTER: TelemetryReporter = { + log: () => undefined, + logError: () => undefined, + trace: (_eventName, fn) => fn(NOOP_SPAN), +}; diff --git a/src/telemetry/service.ts b/src/telemetry/service.ts index 9ada725d..cc78a638 100644 --- a/src/telemetry/service.ts +++ b/src/telemetry/service.ts @@ -19,6 +19,8 @@ import { import { newSpanId, newTraceId } from "./ids"; import { NOOP_SPAN, type Span } from "./span"; +import type { TelemetryReporter } from "./reporter"; + const LEVEL_ORDER: Readonly> = { off: 0, local: 1, @@ -44,7 +46,7 @@ interface EmitOptions extends Partial { * by `minLevel` and may self-gate. `dispose` flushes are best-effort since * VS Code does not await deactivation. */ -export class TelemetryService implements vscode.Disposable { +export class TelemetryService implements vscode.Disposable, TelemetryReporter { #level: TelemetryLevel; #nextSequence = 0; #deploymentUrl = ""; @@ -141,6 +143,8 @@ export class TelemetryService implements vscode.Disposable { spanOpts: SpanOptions, ): Promise { const eventId = newSpanId(); + const spanProperties = { ...properties }; + const spanMeasurements = { ...measurements }; const { traceId, traceLevel } = spanOpts; const span: Span = { traceId, @@ -161,13 +165,19 @@ export class TelemetryService implements vscode.Disposable { { traceId, parentEventId: eventId, traceLevel }, ); }, + setProperty(name: string, value: string): void { + spanProperties[name] = value; + }, + setMeasurement(name: string, value: number): void { + spanMeasurements[name] = value; + }, }; return this.#emitTimed( eventId, eventName, () => fn(span), - properties, - measurements, + spanProperties, + spanMeasurements, spanOpts, ); } diff --git a/src/telemetry/span.ts b/src/telemetry/span.ts index 626c4bb4..0e27696b 100644 --- a/src/telemetry/span.ts +++ b/src/telemetry/span.ts @@ -1,4 +1,4 @@ -import { type CallerProperties } from "./event"; +import type { CallerProperties } from "./event"; /** * Parent span handle. Children's `eventName` composes as `${parent.eventName}.${phaseName}`. @@ -16,6 +16,10 @@ export interface Span { properties?: CallerProperties, measurements?: Record, ): Promise; + /** Add or replace a property on the event emitted for this span. */ + setProperty(name: string, value: string): void; + /** Add or replace a measurement on the event emitted for this span. */ + setMeasurement(name: string, value: number): void; } /** No-op `Span` used when telemetry is off. Runs phase fns but emits nothing. */ @@ -31,4 +35,6 @@ export const NOOP_SPAN: Span = { ): Promise { return fn(NOOP_SPAN); }, + setProperty: () => undefined, + setMeasurement: () => undefined, }; diff --git a/src/websocket/reconnectingWebSocket.ts b/src/websocket/reconnectingWebSocket.ts index 39d603d1..071ecd64 100644 --- a/src/websocket/reconnectingWebSocket.ts +++ b/src/websocket/reconnectingWebSocket.ts @@ -1,5 +1,10 @@ import { ClientCertificateError } from "../error/clientCertificateError"; import { toError } from "../error/errorUtils"; +import { + WebSocketTelemetry, + type ConnectionDropCause, + type ConnectionStateReason, +} from "../instrumentation/websocket"; import { WebSocketCloseCode, @@ -11,7 +16,7 @@ import { import type { WebSocketEventType } from "coder/site/src/utils/OneWayWebSocket"; import type { Logger } from "../logging/logger"; -import type { TelemetryService } from "../telemetry/service"; +import type { TelemetryReporter } from "../telemetry/reporter"; import type { CloseEvent, @@ -19,6 +24,12 @@ import type { UnidirectionalStream, } from "./eventStreamConnection"; +function toCloseEventError(event: CloseEvent): Error { + return new Error( + `WebSocket closed unexpectedly with code ${event.code}: ${event.reason}`, + ); +} + /** * Connection states for the ReconnectingWebSocket state machine. */ @@ -47,17 +58,6 @@ type StateAction = | { readonly type: "DISCONNECT" } | { readonly type: "DISPOSE" }; -type ReconnectMeasurements = Record<"attempts" | "totalDurationMs", number>; - -interface ReconnectTelemetryCycle { - readonly startMs: number; - readonly measurements: ReconnectMeasurements; - readonly resolve: () => void; - readonly reject: (error: Error) => void; - attempts: number; - completed: boolean; -} - /** * Pure reducer function for state transitions. */ @@ -116,7 +116,7 @@ export interface ReconnectingWebSocketOptions { initialBackoffMs?: number; maxBackoffMs?: number; jitterFactor?: number; - telemetry?: TelemetryService; + telemetry: TelemetryReporter; /** Callback invoked when a refreshable certificate error is detected. Returns true if refresh succeeded. */ onCertificateRefreshNeeded: () => Promise; } @@ -126,7 +126,7 @@ export class ReconnectingWebSocket< > implements UnidirectionalStream { readonly #socketFactory: SocketFactory; readonly #logger: Logger; - readonly #telemetry: TelemetryService | undefined; + readonly #telemetry: WebSocketTelemetry; readonly #options: Required>; readonly #eventHandlers: { [K in WebSocketEventType]: Set>; @@ -143,15 +143,12 @@ export class ReconnectingWebSocket< #reconnectTimeoutId: NodeJS.Timeout | null = null; #state: ConnectionState = ConnectionState.IDLE; #certRefreshAttempted = false; // Tracks if cert refresh was already attempted this connection cycle - #connectionOpenedAtMs: number | undefined; - #connectionDropped = false; - #reconnectCycle: ReconnectTelemetryCycle | undefined; readonly #onDispose?: () => void; /** * Dispatch an action to transition state. Returns true if transition is allowed. */ - #dispatch(action: StateAction, reason: string): boolean { + #dispatch(action: StateAction, reason: ConnectionStateReason): boolean { const previousState = this.#state; const newState = reduceState(previousState, action); if (newState === previousState) { @@ -165,11 +162,7 @@ export class ReconnectingWebSocket< return false; } this.#state = newState; - this.#telemetry?.log("connection.state_transition", { - from: previousState, - to: newState, - reason, - }); + this.#telemetry.stateTransition(previousState, newState, reason); return true; } @@ -181,7 +174,7 @@ export class ReconnectingWebSocket< ) { this.#socketFactory = socketFactory; this.#logger = logger; - this.#telemetry = options.telemetry; + this.#telemetry = new WebSocketTelemetry(options.telemetry); this.#options = { initialBackoffMs: options.initialBackoffMs ?? 250, maxBackoffMs: options.maxBackoffMs ?? 30000, @@ -205,7 +198,6 @@ export class ReconnectingWebSocket< onDispose, ); - // connect() handles all errors internally await instance.connect("initial_connect"); return instance; } @@ -258,7 +250,7 @@ export class ReconnectingWebSocket< } if (this.#state !== ConnectionState.IDLE) { - this.startReconnectTelemetry("manual_reconnect"); + this.#telemetry.reconnectStarted("manual_reconnect"); } if (this.#state === ConnectionState.DISCONNECTED) { @@ -271,7 +263,6 @@ export class ReconnectingWebSocket< this.#reconnectTimeoutId = null; } - // connect() handles all errors internally void this.connect("manual_reconnect"); } @@ -279,23 +270,26 @@ export class ReconnectingWebSocket< * Temporarily disconnect the socket. Can be resumed via reconnect(). */ public disconnect(code?: number, reason?: string): void { - this.disconnectWithReason("disconnect", code, reason); + this.disconnectWithReason("disconnect", "manual_disconnect", { + code, + closeReason: reason, + }); } private disconnectWithReason( - telemetryReason: string, - code?: number, - reason?: string, + reason: ConnectionStateReason, + cause: ConnectionDropCause, + options: { code?: number; closeReason?: string; error?: unknown } = {}, ): void { - if (!this.#dispatch({ type: "DISCONNECT" }, telemetryReason)) { + if (!this.#dispatch({ type: "DISCONNECT" }, reason)) { return; } - this.emitConnectionDrop("manual_disconnect", code); - this.finishReconnectTelemetry( - false, - new Error(`WebSocket disconnected: ${telemetryReason}`), - ); - this.clearCurrentSocket(code, reason); + this.#telemetry.terminated(reason, { + cause, + code: options.code, + error: options.error, + }); + this.clearCurrentSocket(options.code, options.closeReason); } public close(code?: number, reason?: string): void { @@ -315,16 +309,14 @@ export class ReconnectingWebSocket< this.dispose(code, reason); } - private async connect(reason: string): Promise { - const connectStartedAtMs = performance.now(); + private async connect(reason: ConnectionStateReason): Promise { if (!this.#dispatch({ type: "CONNECT" }, reason)) { return; } - this.recordReconnectAttempt(); + this.#telemetry.connectStarted(); try { - // Close any existing socket before creating a new one if (this.#currentSocket) { - this.emitConnectionDrop("replaced", WebSocketCloseCode.NORMAL); + this.#telemetry.dropped("replaced", WebSocketCloseCode.NORMAL); this.#currentSocket.close( WebSocketCloseCode.NORMAL, "Replacing connection", @@ -351,15 +343,7 @@ export class ReconnectingWebSocket< if (!this.#dispatch({ type: "OPEN" }, "open")) { return; } - const openedAtMs = performance.now(); - this.#connectionOpenedAtMs = openedAtMs; - this.#connectionDropped = false; - this.#telemetry?.log( - "connection.open", - { url: this.#route }, - { connectDurationMs: openedAtMs - connectStartedAtMs }, - ); - this.finishReconnectTelemetry(true); + this.#telemetry.opened(this.#route); // Reset backoff on successful connection this.#backoffMs = this.#options.initialBackoffMs; this.#certRefreshAttempted = false; @@ -388,55 +372,61 @@ export class ReconnectingWebSocket< }); socket.addEventListener("close", (event) => { - if (this.#currentSocket !== socket) { - return; - } - - if ( - this.#state === ConnectionState.DISPOSED || - this.#state === ConnectionState.DISCONNECTED - ) { - return; + if (this.#currentSocket === socket) { + this.handleSocketClose(event); } + }); + } catch (error) { + await this.handleConnectionError(error); + } + } - this.executeHandlers("close", event); - - if (UNRECOVERABLE_WS_CLOSE_CODES.has(event.code)) { - const error = closeEventError(event); - this.emitConnectionDrop("unrecoverable_close", event.code, error); - this.#logger.error( - `WebSocket connection closed with unrecoverable error code ${event.code}`, - ); - this.disconnectWithReason( - "unrecoverable_close", - event.code, - event.reason, - ); - return; - } + private handleSocketClose(event: CloseEvent): void { + if ( + this.#state === ConnectionState.DISPOSED || + this.#state === ConnectionState.DISCONNECTED + ) { + return; + } - if (NORMAL_CLOSURE_CODES.has(event.code)) { - this.emitConnectionDrop("normal_close", event.code); - return; - } + this.executeHandlers("close", event); - this.emitConnectionDrop( - "unexpected_close", - event.code, - closeEventError(event), - ); - this.scheduleReconnect("unexpected_close"); + if (UNRECOVERABLE_WS_CLOSE_CODES.has(event.code)) { + this.#logger.error( + `WebSocket connection closed with unrecoverable error code ${event.code}`, + ); + this.disconnectWithReason("unrecoverable_close", "unrecoverable_close", { + code: event.code, + closeReason: event.reason, + error: toCloseEventError(event), }); - } catch (error) { - await this.handleConnectionError(error); + return; + } + + if (NORMAL_CLOSURE_CODES.has(event.code)) { + this.#telemetry.dropped("normal_close", event.code); + return; } + + this.scheduleReconnect("unexpected_close", "unexpected_close", { + code: event.code, + error: toCloseEventError(event), + }); } - private scheduleReconnect(reason: string): void { + private scheduleReconnect( + reason: ConnectionStateReason, + cause: ConnectionDropCause, + options: { code?: number; error?: unknown } = {}, + ): void { if (!this.#dispatch({ type: "SCHEDULE_RETRY" }, reason)) { return; } - this.startReconnectTelemetry(reason); + this.#telemetry.retrying(reason, { + cause, + code: options.code, + error: options.error, + }); const jitter = this.#backoffMs * this.#options.jitterFactor * (Math.random() * 2 - 1); @@ -448,33 +438,16 @@ export class ReconnectingWebSocket< this.#reconnectTimeoutId = setTimeout(() => { this.#reconnectTimeoutId = null; - // connect() handles all errors internally void this.connect("scheduled_reconnect"); }, delayMs); this.#backoffMs = Math.min(this.#backoffMs * 2, this.#options.maxBackoffMs); } - /** - * Attempt to refresh certificates and return true if refresh succeeded. - */ - private async attemptCertificateRefresh(): Promise { - try { - return await this.#options.onCertificateRefreshNeeded(); - } catch (refreshError) { - this.#logger.error("Error during certificate refresh:", refreshError); - return false; - } - } - - /** - * Handle client certificate errors by attempting refresh for refreshable errors. - * @returns true if refresh succeeded. - */ + /** Returns true if refresh succeeded and the caller should retry. */ private async handleClientCertificateError( certError: ClientCertificateError, ): Promise { - // Only attempt refresh once per connection cycle if (this.#certRefreshAttempted) { this.#logger.warn("Certificate refresh already attempted, not retrying"); void certError.showNotification(); @@ -482,17 +455,20 @@ export class ReconnectingWebSocket< } if (certError.isRefreshable) { - this.#certRefreshAttempted = true; // Mark that we're attempting + this.#certRefreshAttempted = true; this.#logger.info( `Client certificate error (alert ${certError.alertCode}), attempting refresh...`, ); - if (await this.attemptCertificateRefresh()) { - this.#logger.info("Certificate refresh succeeded, reconnecting..."); - return true; + try { + if (await this.#options.onCertificateRefreshNeeded()) { + this.#logger.info("Certificate refresh succeeded, reconnecting..."); + return true; + } + } catch (refreshError) { + this.#logger.error("Error during certificate refresh:", refreshError); } } - // Show notification for failed/non-refreshable errors void certError.showNotification(); return false; } @@ -534,8 +510,7 @@ export class ReconnectingWebSocket< `Unrecoverable HTTP error during connection for ${this.#route}`, error, ); - this.emitConnectionDrop("error", undefined, error); - this.disconnectWithReason("unrecoverable_http"); + this.disconnectWithReason("unrecoverable_http", "error", { error }); return; } @@ -545,15 +520,13 @@ export class ReconnectingWebSocket< if (await this.handleClientCertificateError(certError)) { this.reconnect(); } else { - this.emitConnectionDrop("error", undefined, error); - this.disconnectWithReason("certificate_error"); + this.disconnectWithReason("certificate_error", "error", { error }); } return; } this.#logger.warn(`WebSocket connection failed for ${this.#route}`, error); - this.emitConnectionDrop("error", undefined, error); - this.scheduleReconnect("connection_error"); + this.scheduleReconnect("connection_error", "error", { error }); } /** @@ -573,11 +546,7 @@ export class ReconnectingWebSocket< if (!this.#dispatch({ type: "DISPOSE" }, "dispose")) { return; } - this.emitConnectionDrop("disposed", code); - this.finishReconnectTelemetry( - false, - new Error("WebSocket disposed before reconnect completed"), - ); + this.#telemetry.terminated("dispose", { cause: "disposed", code }); this.clearCurrentSocket(code, reason); for (const set of Object.values(this.#eventHandlers)) { @@ -597,98 +566,6 @@ export class ReconnectingWebSocket< this.#currentSocket.close(code, reason); this.#currentSocket = null; } - this.#connectionOpenedAtMs = undefined; - this.#connectionDropped = false; - } - - private emitConnectionDrop( - cause: string, - closeCode?: number, - error?: unknown, - ): void { - if (this.#connectionOpenedAtMs === undefined || this.#connectionDropped) { - return; - } - - const properties = { - cause, - closeCode: closeCode === undefined ? "" : String(closeCode), - }; - const measurements = { - connectionDurationMs: performance.now() - this.#connectionOpenedAtMs, - }; - if (error === undefined) { - this.#telemetry?.log("connection.drop", properties, measurements); - } else { - this.#telemetry?.logError( - "connection.drop", - error, - properties, - measurements, - ); - } - this.#connectionDropped = true; + this.#telemetry.reset(); } - - private startReconnectTelemetry(reason: string): void { - if (!this.#telemetry || this.#reconnectCycle) { - return; - } - - const measurements: ReconnectMeasurements = { - attempts: 0, - totalDurationMs: 0, - }; - let resolveCycle!: () => void; - let rejectCycle!: (error: Error) => void; - const cycleDone = new Promise((resolve, reject) => { - resolveCycle = resolve; - rejectCycle = reject; - }); - - const startMs = performance.now(); - this.#reconnectCycle = { - startMs, - measurements, - resolve: resolveCycle, - reject: rejectCycle, - attempts: 0, - completed: false, - }; - void this.#telemetry - .trace("connection.reconnect", () => cycleDone, { reason }, measurements) - .catch(() => undefined); - } - - private recordReconnectAttempt(): void { - const cycle = this.#reconnectCycle; - if (!cycle) { - return; - } - cycle.attempts += 1; - cycle.measurements.attempts = cycle.attempts; - cycle.measurements.totalDurationMs = performance.now() - cycle.startMs; - } - - private finishReconnectTelemetry(success: boolean, error?: Error): void { - const cycle = this.#reconnectCycle; - if (!cycle || cycle.completed) { - return; - } - - cycle.completed = true; - cycle.measurements.totalDurationMs = performance.now() - cycle.startMs; - this.#reconnectCycle = undefined; - if (success) { - cycle.resolve(); - } else { - cycle.reject(error ?? new Error("WebSocket reconnect failed")); - } - } -} - -function closeEventError(event: CloseEvent): Error { - return new Error( - `WebSocket closed unexpectedly with code ${event.code}: ${event.reason}`, - ); } diff --git a/test/mocks/telemetry.ts b/test/mocks/telemetry.ts index 4a9c1886..a3372ff5 100644 --- a/test/mocks/telemetry.ts +++ b/test/mocks/telemetry.ts @@ -1,10 +1,14 @@ import { vi } from "vitest"; -import type { - TelemetryEvent, - TelemetryLevel, - TelemetrySink, +import { + buildSession, + type TelemetryEvent, + type TelemetryLevel, + type TelemetrySink, } from "@/telemetry/event"; +import { TelemetryService } from "@/telemetry/service"; + +import { createMockLogger, MockConfigurationProvider } from "./testHelpers"; /** * In-memory `TelemetrySink` for tests. Captures every written event and @@ -26,3 +30,21 @@ export class TestSink implements TelemetrySink { this.events.push(event); } } + +/** + * Build a real `TelemetryService` wired to a single `TestSink` for assertions. + * Sets `coder.telemetry.level` to `"local"` so locally-scoped events flow. + */ +export function createTestTelemetry(): { + telemetry: TelemetryService; + sink: TestSink; +} { + new MockConfigurationProvider().set("coder.telemetry.level", "local"); + const sink = new TestSink(); + const telemetry = new TelemetryService( + buildSession("test", "test-session"), + [sink], + createMockLogger(), + ); + return { telemetry, sink }; +} diff --git a/test/unit/instrumentation/ssh.test.ts b/test/unit/instrumentation/ssh.test.ts new file mode 100644 index 00000000..11f5fefa --- /dev/null +++ b/test/unit/instrumentation/ssh.test.ts @@ -0,0 +1,123 @@ +import { describe, it, expect } from "vitest"; + +import { SshTelemetry } from "@/instrumentation/ssh"; + +import { createTestTelemetry } from "../../mocks/telemetry"; + +import type { NetworkInfo } from "@/remote/sshProcess"; + +function makeNetwork(overrides: Partial = {}): NetworkInfo { + return { + p2p: true, + latency: 50, + preferred_derp: "NYC", + derp_latency: { NYC: 10 }, + upload_bytes_sec: 1_250_000, + download_bytes_sec: 6_250_000, + using_coder_connect: false, + ...overrides, + }; +} + +describe("SshTelemetry", () => { + describe("processReplaced", () => { + it("emits a recovery when the prior process was already marked lost", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.processStarted(); + ssh.processLost("stale_network_info"); + ssh.processReplaced(); + + const events = sink.events.map((e) => e.eventName); + expect(events).toEqual(["ssh.process.lost", "ssh.process.recovered"]); + await telemetry.dispose(); + }); + + it("emits a replacement event for an instant handover", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.processStarted(); + ssh.processReplaced(); + + const replaced = sink.events.filter( + (e) => e.eventName === "ssh.process.replaced", + ); + expect(replaced).toHaveLength(1); + expect(replaced[0].measurements).toMatchObject({ + previousUptimeMs: expect.any(Number), + }); + await telemetry.dispose(); + }); + + it("emits nothing if there was no prior process", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.processReplaced(); + + expect(sink.events).toHaveLength(0); + await telemetry.dispose(); + }); + }); + + describe("processLost", () => { + it("is a no-op when there is no started process", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.processLost("disposed"); + + expect(sink.events).toHaveLength(0); + await telemetry.dispose(); + }); + + it("does not double-emit when called twice without a recovery", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.processStarted(); + ssh.processLost("stale_network_info"); + ssh.processLost("missing_network_info"); + + const lost = sink.events.filter( + (e) => e.eventName === "ssh.process.lost", + ); + expect(lost).toHaveLength(1); + expect(lost[0].properties).toMatchObject({ cause: "stale_network_info" }); + await telemetry.dispose(); + }); + }); + + describe("networkSampled", () => { + it("emits the first sample and skips unchanged follow-ups", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.networkSampled(makeNetwork()); + ssh.networkSampled(makeNetwork()); + + const samples = sink.events.filter( + (e) => e.eventName === "ssh.network.sample", + ); + expect(samples).toHaveLength(1); + await telemetry.dispose(); + }); + + it("re-samples on a p2p flip inside the heartbeat window", async () => { + const { telemetry, sink } = createTestTelemetry(); + const ssh = new SshTelemetry(telemetry); + + ssh.networkSampled(makeNetwork({ p2p: true })); + ssh.networkSampled(makeNetwork({ p2p: false })); + + const samples = sink.events.filter( + (e) => e.eventName === "ssh.network.sample", + ); + expect(samples).toHaveLength(2); + expect(samples[1].properties.p2p).toBe("false"); + await telemetry.dispose(); + }); + }); +}); diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index 7528a3a4..71d2529a 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -10,10 +10,9 @@ import { type NetworkInfo, type SshProcessMonitorOptions, } from "@/remote/sshProcess"; -import { buildSession } from "@/telemetry/event"; -import { TelemetryService } from "@/telemetry/service"; +import { NOOP_TELEMETRY_REPORTER } from "@/telemetry/reporter"; -import { TestSink } from "../../mocks/telemetry"; +import { createTestTelemetry, type TestSink } from "../../mocks/telemetry"; import { createMockLogger, makeNetworkInfo, @@ -321,10 +320,12 @@ describe("SshProcessMonitor", () => { "-> socksPort 12345 ->", }; + function eventsNamed(sink: TestSink, eventName: string) { + return sink.events.filter((event) => event.eventName === eventName); + } + function networkTelemetryEvents(sink: TestSink) { - return sink.events.filter( - (event) => event.eventName === "ssh.network.info", - ); + return eventsNamed(sink, "ssh.network.sample"); } async function advanceUntilNetworkEvents( @@ -346,22 +347,24 @@ describe("SshProcessMonitor", () => { } it("emits a process discovered trace", async () => { - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); vol.fromJSON(sshLog); const monitor = createMonitor({ telemetry }); await waitForEvent(monitor.onPidChange); - const event = sink.events.find( - (event) => event.eventName === "ssh.process.discovered", - ); - expect(event?.properties.result).toBe("success"); - expect(event?.measurements.durationMs).toEqual(expect.any(Number)); + expect(eventsNamed(sink, "ssh.process.discovered")[0]).toMatchObject({ + properties: { result: "success" }, + measurements: { + durationMs: expect.any(Number), + attempts: expect.any(Number), + }, + }); await telemetry.dispose(); }); it("emits process lost and recovered events after stale network info", async () => { - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ ...sshLog, "/network/999.json": makeNetworkJson(), @@ -379,31 +382,24 @@ describe("SshProcessMonitor", () => { await waitFor( () => - sink.events.some((event) => event.eventName === "ssh.process.lost") && - sink.events.some( - (event) => event.eventName === "ssh.process.recovered", - ), + eventsNamed(sink, "ssh.process.lost").length > 0 && + eventsNamed(sink, "ssh.process.recovered").length > 0, 500, ); - const lost = sink.events.find( - (event) => event.eventName === "ssh.process.lost", - )!; - expect(lost.properties).toEqual({ cause: "stale_network_info" }); - expect(lost.measurements.uptimeMs).toEqual(expect.any(Number)); - - const recovered = sink.events.find( - (event) => event.eventName === "ssh.process.recovered", - )!; - expect(recovered.measurements.recoveryDurationMs).toEqual( - expect.any(Number), - ); + expect(eventsNamed(sink, "ssh.process.lost")[0]).toMatchObject({ + properties: { cause: "stale_network_info" }, + measurements: { uptimeMs: expect.any(Number) }, + }); + expect(eventsNamed(sink, "ssh.process.recovered")[0]).toMatchObject({ + measurements: { recoveryDurationMs: expect.any(Number) }, + }); await telemetry.dispose(); }); it("samples network info on first read and every 60 seconds, not every poll", async () => { vi.useFakeTimers(); - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ ...sshLog, "/network/999.json": makeNetworkJson({ latency: 25 }), @@ -435,7 +431,7 @@ describe("SshProcessMonitor", () => { it("samples network info on p2p flip inside the 60 second window", async () => { vi.useFakeTimers(); - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ ...sshLog, "/network/999.json": makeNetworkJson({ p2p: true }), @@ -959,22 +955,12 @@ describe("SshProcessMonitor", () => { discoveryPollIntervalMs: 10, maxDiscoveryBackoffMs: 100, networkPollInterval: 10, + telemetry: NOOP_TELEMETRY_REPORTER, ...overrides, }); activeMonitors.push(monitor); return monitor; } - - function createTelemetry(): { telemetry: TelemetryService; sink: TestSink } { - new MockConfigurationProvider().set("coder.telemetry.level", "local"); - const sink = new TestSink(); - const telemetry = new TelemetryService( - buildSession("test", "test-session"), - [sink], - createMockLogger(), - ); - return { telemetry, sink }; - } }); /** diff --git a/test/unit/websocket/reconnectingWebSocket.test.ts b/test/unit/websocket/reconnectingWebSocket.test.ts index 08a4ec58..e127bc2f 100644 --- a/test/unit/websocket/reconnectingWebSocket.test.ts +++ b/test/unit/websocket/reconnectingWebSocket.test.ts @@ -1,7 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; -import { buildSession } from "@/telemetry/event"; -import { TelemetryService } from "@/telemetry/service"; +import { NOOP_TELEMETRY_REPORTER } from "@/telemetry/reporter"; import { WebSocketCloseCode, HttpStatusCode } from "@/websocket/codes"; import { ConnectionState, @@ -9,14 +8,13 @@ import { type SocketFactory, } from "@/websocket/reconnectingWebSocket"; -import { TestSink } from "../../mocks/telemetry"; -import { - createMockLogger, - MockConfigurationProvider, -} from "../../mocks/testHelpers"; +import { createTestTelemetry, type TestSink } from "../../mocks/telemetry"; +import { createMockLogger } from "../../mocks/testHelpers"; import type { CloseEvent, Event as WsEvent } from "ws"; +import type { TelemetryEvent } from "@/telemetry/event"; +import type { TelemetryService } from "@/telemetry/service"; import type { UnidirectionalStream } from "@/websocket/eventStreamConnection"; describe("ReconnectingWebSocket", () => { @@ -109,7 +107,10 @@ describe("ReconnectingWebSocket", () => { const ws = await ReconnectingWebSocket.create( factory, createMockLogger(), - { onCertificateRefreshNeeded: () => Promise.resolve(false) }, + { + telemetry: NOOP_TELEMETRY_REPORTER, + onCertificateRefreshNeeded: () => Promise.resolve(false), + }, ); // Should be disconnected after unrecoverable HTTP error @@ -594,8 +595,8 @@ describe("ReconnectingWebSocket", () => { }); describe("Telemetry", () => { - it("emits state transition events for reducer changes", async () => { - const { telemetry, sink } = createTelemetry(); + it("emits state transition events for connection lifecycle changes", async () => { + const { telemetry, sink } = createTestTelemetry(); const { ws, sockets } = await createReconnectingWebSocket( undefined, telemetry, @@ -610,10 +611,11 @@ describe("ReconnectingWebSocket", () => { sockets[1].fireOpen(); ws.close(); - const transitions = sink.events.filter( - (event) => event.eventName === "connection.state_transition", - ); - expect(transitions.map((event) => event.properties)).toEqual([ + expect( + eventsNamed(sink, "connection.state_transition").map( + (event) => event.properties, + ), + ).toEqual([ { from: "IDLE", to: "CONNECTING", reason: "initial_connect" }, { from: "CONNECTING", to: "CONNECTED", reason: "open" }, { @@ -633,7 +635,7 @@ describe("ReconnectingWebSocket", () => { }); it("emits open and unexpected drop events", async () => { - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); const { ws, sockets } = await createReconnectingWebSocket( undefined, telemetry, @@ -645,30 +647,26 @@ describe("ReconnectingWebSocket", () => { reason: "Network error", }); - const open = sink.events.find( - (event) => event.eventName === "connection.open", - ); - expect(open?.properties).toEqual({ url: "/api/test" }); - expect(open?.measurements.connectDurationMs).toEqual(expect.any(Number)); + expect(firstEventNamed(sink, "connection.open")).toMatchObject({ + properties: { url: "/api/test" }, + measurements: { connectDurationMs: expect.any(Number) }, + }); - const drop = sink.events.find( - (event) => event.eventName === "connection.drop", - ); - expect(drop?.properties).toEqual({ - cause: "unexpected_close", - closeCode: String(WebSocketCloseCode.ABNORMAL), + expect(firstEventNamed(sink, "connection.drop")).toMatchObject({ + properties: { + cause: "unexpected_close", + closeCode: String(WebSocketCloseCode.ABNORMAL), + }, + measurements: { connectionDurationMs: expect.any(Number) }, + error: { message: expect.stringContaining("1006") }, }); - expect(drop?.measurements.connectionDurationMs).toEqual( - expect.any(Number), - ); - expect(drop?.error?.message).toContain("1006"); ws.close(); await telemetry.dispose(); }); it("emits reconnect aggregation after recovery", async () => { - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); const { ws, sockets } = await createReconnectingWebSocket( undefined, telemetry, @@ -682,31 +680,23 @@ describe("ReconnectingWebSocket", () => { await vi.advanceTimersByTimeAsync(300); sockets[1].fireOpen(); - await vi.waitFor(() => { - expect( - sink.events.filter( - (event) => event.eventName === "connection.reconnect", - ), - ).toHaveLength(1); - }); - const reconnect = sink.events.find( - (event) => event.eventName === "connection.reconnect", - )!; - expect(reconnect.properties).toMatchObject({ - result: "success", - reason: "unexpected_close", - }); - expect(reconnect.measurements.attempts).toBe(1); - expect(reconnect.measurements.totalDurationMs).toEqual( - expect.any(Number), + await vi.waitFor(() => + expect(eventsNamed(sink, "connection.reconnect")).toHaveLength(1), ); + expect(firstEventNamed(sink, "connection.reconnect")).toMatchObject({ + properties: { result: "success", reason: "unexpected_close" }, + measurements: { + attempts: 1, + totalDurationMs: expect.any(Number), + }, + }); ws.close(); await telemetry.dispose(); }); it("emits failed reconnect aggregation when reconnect ends disconnected", async () => { - const { telemetry, sink } = createTelemetry(); + const { telemetry, sink } = createTestTelemetry(); const { ws, sockets, setFactoryError } = await createReconnectingWebSocketWithErrorControl(telemetry); sockets[0].fireOpen(); @@ -719,21 +709,16 @@ describe("ReconnectingWebSocket", () => { await vi.waitFor(() => { expect(ws.state).toBe(ConnectionState.DISCONNECTED); - expect( - sink.events.filter( - (event) => event.eventName === "connection.reconnect", - ), - ).toHaveLength(1); + expect(eventsNamed(sink, "connection.reconnect")).toHaveLength(1); }); - const reconnect = sink.events.find( - (event) => event.eventName === "connection.reconnect", - )!; - expect(reconnect.properties).toMatchObject({ - result: "error", - reason: "manual_reconnect", + expect(firstEventNamed(sink, "connection.reconnect")).toMatchObject({ + properties: { + result: "error", + reason: "manual_reconnect", + terminationReason: "unrecoverable_http", + }, + measurements: { attempts: 1 }, }); - expect(reconnect.measurements.attempts).toBe(1); - expect(reconnect.error?.message).toContain("unrecoverable_http"); ws.close(); await telemetry.dispose(); @@ -962,23 +947,23 @@ async function fromFactory( factory, createMockLogger(), { + telemetry: telemetry ?? NOOP_TELEMETRY_REPORTER, onCertificateRefreshNeeded: onCertificateRefreshNeeded ?? (() => Promise.resolve(false)), - ...(telemetry && { telemetry }), }, onDispose, ); } -function createTelemetry(): { telemetry: TelemetryService; sink: TestSink } { - new MockConfigurationProvider().set("coder.telemetry.level", "local"); - const sink = new TestSink(); - const telemetry = new TelemetryService( - buildSession("test", "test-session"), - [sink], - createMockLogger(), - ); - return { telemetry, sink }; +function eventsNamed(sink: TestSink, eventName: string): TelemetryEvent[] { + return sink.events.filter((event) => event.eventName === eventName); +} + +function firstEventNamed( + sink: TestSink, + eventName: string, +): TelemetryEvent | undefined { + return eventsNamed(sink, eventName)[0]; } async function createBlockingReconnectingWebSocket(): Promise<{ From cbffa11d20fe733888a3bf3aeeb856a9b11e9f60 Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 14:51:55 +0300 Subject: [PATCH 3/7] add changelog entry --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dcc92f1..04dc7fa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,10 @@ - Every `coder.*` command now records a `command.invoked` telemetry event with its duration and outcome, so command latency and failures are captured alongside other local telemetry. +- Connection lifecycle now records local telemetry: SSH process + discovery/loss/recovery with sampled network info, and reconnecting + WebSocket open, drop, reconnect, and state transitions, so connection + stability is captured alongside other local telemetry. ### Fixed From edd48dba661fe8227166154d2790c7bb6c0a3832 Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 16:37:58 +0300 Subject: [PATCH 4/7] test(telemetry): tighten connection telemetry tests --- test/mocks/telemetry.ts | 4 + test/unit/instrumentation/ssh.test.ts | 136 ++++++------ test/unit/instrumentation/websocket.test.ts | 193 ++++++++++++++++++ test/unit/remote/sshProcess.test.ts | 89 ++------ .../websocket/reconnectingWebSocket.test.ts | 177 +++------------- 5 files changed, 315 insertions(+), 284 deletions(-) create mode 100644 test/unit/instrumentation/websocket.test.ts diff --git a/test/mocks/telemetry.ts b/test/mocks/telemetry.ts index a3372ff5..7ddc13eb 100644 --- a/test/mocks/telemetry.ts +++ b/test/mocks/telemetry.ts @@ -29,6 +29,10 @@ export class TestSink implements TelemetrySink { write(event: TelemetryEvent): void { this.events.push(event); } + + eventsNamed(name: string): TelemetryEvent[] { + return this.events.filter((e) => e.eventName === name); + } } /** diff --git a/test/unit/instrumentation/ssh.test.ts b/test/unit/instrumentation/ssh.test.ts index 11f5fefa..cba77faa 100644 --- a/test/unit/instrumentation/ssh.test.ts +++ b/test/unit/instrumentation/ssh.test.ts @@ -1,123 +1,127 @@ -import { describe, it, expect } from "vitest"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { SshTelemetry } from "@/instrumentation/ssh"; import { createTestTelemetry } from "../../mocks/telemetry"; +import { makeNetworkInfo } from "../../mocks/testHelpers"; -import type { NetworkInfo } from "@/remote/sshProcess"; - -function makeNetwork(overrides: Partial = {}): NetworkInfo { - return { - p2p: true, - latency: 50, - preferred_derp: "NYC", - derp_latency: { NYC: 10 }, - upload_bytes_sec: 1_250_000, - download_bytes_sec: 6_250_000, - using_coder_connect: false, - ...overrides, - }; +function setup() { + const { telemetry, sink } = createTestTelemetry(); + return { ssh: new SshTelemetry(telemetry), sink }; } describe("SshTelemetry", () => { describe("processReplaced", () => { - it("emits a recovery when the prior process was already marked lost", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("emits a recovery when the prior process was already marked lost", () => { + const { ssh, sink } = setup(); ssh.processStarted(); ssh.processLost("stale_network_info"); ssh.processReplaced(); - const events = sink.events.map((e) => e.eventName); - expect(events).toEqual(["ssh.process.lost", "ssh.process.recovered"]); - await telemetry.dispose(); + expect(sink.events.map((e) => e.eventName)).toEqual([ + "ssh.process.lost", + "ssh.process.recovered", + ]); }); - it("emits a replacement event for an instant handover", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("emits a replacement event for an instant handover", () => { + const { ssh, sink } = setup(); ssh.processStarted(); ssh.processReplaced(); - const replaced = sink.events.filter( - (e) => e.eventName === "ssh.process.replaced", - ); + const replaced = sink.eventsNamed("ssh.process.replaced"); expect(replaced).toHaveLength(1); expect(replaced[0].measurements).toMatchObject({ previousUptimeMs: expect.any(Number), }); - await telemetry.dispose(); }); - it("emits nothing if there was no prior process", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("emits nothing if there was no prior process", () => { + const { ssh, sink } = setup(); ssh.processReplaced(); expect(sink.events).toHaveLength(0); - await telemetry.dispose(); }); }); describe("processLost", () => { - it("is a no-op when there is no started process", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("is a no-op when there is no started process", () => { + const { ssh, sink } = setup(); ssh.processLost("disposed"); expect(sink.events).toHaveLength(0); - await telemetry.dispose(); }); - it("does not double-emit when called twice without a recovery", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("does not double-emit when called twice without a recovery", () => { + const { ssh, sink } = setup(); ssh.processStarted(); ssh.processLost("stale_network_info"); ssh.processLost("missing_network_info"); - const lost = sink.events.filter( - (e) => e.eventName === "ssh.process.lost", - ); + const lost = sink.eventsNamed("ssh.process.lost"); expect(lost).toHaveLength(1); - expect(lost[0].properties).toMatchObject({ cause: "stale_network_info" }); - await telemetry.dispose(); + expect(lost[0].properties.cause).toBe("stale_network_info"); }); }); describe("networkSampled", () => { - it("emits the first sample and skips unchanged follow-ups", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); - - ssh.networkSampled(makeNetwork()); - ssh.networkSampled(makeNetwork()); - - const samples = sink.events.filter( - (e) => e.eventName === "ssh.network.sample", - ); - expect(samples).toHaveLength(1); - await telemetry.dispose(); + beforeEach(() => vi.useFakeTimers()); + afterEach(() => vi.useRealTimers()); + + it.each([ + { name: "no change in window", next: {}, advanceMs: 1_000, expected: 1 }, + { + name: "small latency change (under 10%)", + next: { latency: 51 }, + advanceMs: 1_000, + expected: 1, + }, + { name: "p2p flip", next: { p2p: false }, advanceMs: 1_000, expected: 2 }, + { + name: "DERP region change", + next: { preferred_derp: "SFO" }, + advanceMs: 1_000, + expected: 2, + }, + { + name: "large latency swing (over 10%)", + next: { latency: 100 }, + advanceMs: 1_000, + expected: 2, + }, + { + name: "heartbeat after 60s without change", + next: {}, + advanceMs: 60_000, + expected: 2, + }, + ])("$name -> $expected sample(s)", ({ next, advanceMs, expected }) => { + const { ssh, sink } = setup(); + + ssh.networkSampled(makeNetworkInfo()); + vi.advanceTimersByTime(advanceMs); + ssh.networkSampled(makeNetworkInfo(next)); + + expect(sink.eventsNamed("ssh.network.sample")).toHaveLength(expected); }); - it("re-samples on a p2p flip inside the heartbeat window", async () => { - const { telemetry, sink } = createTestTelemetry(); - const ssh = new SshTelemetry(telemetry); + it("includes p2p, derp, latency, and bandwidth in the emitted sample", () => { + const { ssh, sink } = setup(); - ssh.networkSampled(makeNetwork({ p2p: true })); - ssh.networkSampled(makeNetwork({ p2p: false })); + ssh.networkSampled(makeNetworkInfo({ latency: 25 })); - const samples = sink.events.filter( - (e) => e.eventName === "ssh.network.sample", - ); - expect(samples).toHaveLength(2); - expect(samples[1].properties.p2p).toBe("false"); - await telemetry.dispose(); + const [sample] = sink.eventsNamed("ssh.network.sample"); + expect(sample.properties).toEqual({ p2p: "true", derp: "NYC" }); + expect(sample.measurements).toMatchObject({ + latencyMs: 25, + downloadMbits: expect.any(Number), + uploadMbits: expect.any(Number), + }); }); }); }); diff --git a/test/unit/instrumentation/websocket.test.ts b/test/unit/instrumentation/websocket.test.ts new file mode 100644 index 00000000..74df249d --- /dev/null +++ b/test/unit/instrumentation/websocket.test.ts @@ -0,0 +1,193 @@ +import { describe, expect, it } from "vitest"; + +import { WebSocketTelemetry } from "@/instrumentation/websocket"; +import { ConnectionState } from "@/websocket/reconnectingWebSocket"; + +import { createTestTelemetry } from "../../mocks/telemetry"; + +function setup() { + const { telemetry, sink } = createTestTelemetry(); + return { ws: new WebSocketTelemetry(telemetry), sink }; +} + +describe("WebSocketTelemetry", () => { + describe("stateTransition", () => { + it("emits a connection.state_transition event with from/to/reason", () => { + const { ws, sink } = setup(); + + ws.stateTransition( + ConnectionState.IDLE, + ConnectionState.CONNECTING, + "initial_connect", + ); + + const [event] = sink.eventsNamed("connection.state_transition"); + expect(event.properties).toEqual({ + from: "IDLE", + to: "CONNECTING", + reason: "initial_connect", + }); + }); + }); + + describe("opened", () => { + it("emits connection.open with route and connect duration", () => { + const { ws, sink } = setup(); + + ws.connectStarted(); + ws.opened("/api/test"); + + const [event] = sink.eventsNamed("connection.open"); + expect(event).toMatchObject({ + properties: { url: "/api/test" }, + measurements: { connectDurationMs: expect.any(Number) }, + }); + }); + + it("uses 0 duration when connectStarted was not called", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + + const [event] = sink.eventsNamed("connection.open"); + expect(event.measurements.connectDurationMs).toBe(0); + }); + }); + + describe("dropped", () => { + it("is silent when no connection has been opened", () => { + const { ws, sink } = setup(); + + ws.dropped("error"); + + expect(sink.events).toHaveLength(0); + }); + + it("emits connection.drop with cause and close code", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + ws.dropped("unexpected_close", 1006); + + const [event] = sink.eventsNamed("connection.drop"); + expect(event.properties).toMatchObject({ + cause: "unexpected_close", + closeCode: "1006", + }); + expect(event.measurements.connectionDurationMs).toEqual( + expect.any(Number), + ); + }); + + it("emits via logError when an error is provided", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + ws.dropped("error", 1006, new Error("boom")); + + const [event] = sink.eventsNamed("connection.drop"); + expect(event.error).toMatchObject({ message: "boom" }); + }); + + it("does not double-emit when called twice", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + ws.dropped("normal_close"); + ws.dropped("error"); + + expect(sink.eventsNamed("connection.drop")).toHaveLength(1); + }); + }); + + describe("reset", () => { + it("clears state so a subsequent dropped is silent", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + ws.reset(); + ws.dropped("error"); + + expect(sink.eventsNamed("connection.drop")).toHaveLength(0); + }); + }); + + describe("reconnect cycle", () => { + it("emits connection.reconnect with success when opened closes the cycle", () => { + const { ws, sink } = setup(); + + ws.reconnectStarted("manual_reconnect"); + ws.connectStarted(); + ws.opened("/api/test"); + + const [event] = sink.eventsNamed("connection.reconnect"); + expect(event).toMatchObject({ + properties: { result: "success", reason: "manual_reconnect" }, + measurements: { attempts: 1, totalDurationMs: expect.any(Number) }, + }); + }); + + it("emits connection.reconnect with error when terminated closes the cycle", () => { + const { ws, sink } = setup(); + + ws.reconnectStarted("manual_reconnect"); + ws.terminated("unrecoverable_http", { cause: "error" }); + + const [event] = sink.eventsNamed("connection.reconnect"); + expect(event.properties).toEqual({ + result: "error", + reason: "manual_reconnect", + terminationReason: "unrecoverable_http", + }); + }); + + it("does not emit when terminated is called outside a cycle", () => { + const { ws, sink } = setup(); + + ws.terminated("dispose", { cause: "disposed" }); + + expect(sink.eventsNamed("connection.reconnect")).toHaveLength(0); + }); + + it("counts each connectStarted as an attempt within the cycle", () => { + const { ws, sink } = setup(); + + ws.reconnectStarted("scheduled_reconnect"); + ws.connectStarted(); + ws.connectStarted(); + ws.connectStarted(); + ws.opened("/api/test"); + + expect( + sink.eventsNamed("connection.reconnect")[0].measurements.attempts, + ).toBe(3); + }); + + it("ignores reconnectStarted while a cycle is already open", () => { + const { ws, sink } = setup(); + + ws.reconnectStarted("manual_reconnect"); + ws.reconnectStarted("scheduled_reconnect"); + ws.opened("/api/test"); + + expect( + sink.eventsNamed("connection.reconnect")[0].properties.reason, + ).toBe("manual_reconnect"); + }); + + it("retrying drops the connection and opens a cycle", () => { + const { ws, sink } = setup(); + + ws.opened("/api/test"); + ws.retrying("unexpected_close", { + cause: "unexpected_close", + code: 1006, + }); + + expect(sink.eventsNamed("connection.drop")).toHaveLength(1); + + ws.opened("/api/test"); + expect(sink.eventsNamed("connection.reconnect")).toHaveLength(1); + }); + }); +}); diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index 71d2529a..83c5f9f0 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -12,7 +12,7 @@ import { } from "@/remote/sshProcess"; import { NOOP_TELEMETRY_REPORTER } from "@/telemetry/reporter"; -import { createTestTelemetry, type TestSink } from "../../mocks/telemetry"; +import { createTestTelemetry } from "../../mocks/telemetry"; import { createMockLogger, makeNetworkInfo, @@ -320,50 +320,28 @@ describe("SshProcessMonitor", () => { "-> socksPort 12345 ->", }; - function eventsNamed(sink: TestSink, eventName: string) { - return sink.events.filter((event) => event.eventName === eventName); - } - - function networkTelemetryEvents(sink: TestSink) { - return eventsNamed(sink, "ssh.network.sample"); - } - - async function advanceUntilNetworkEvents( - sink: TestSink, - count: number, - ): Promise { - for (let i = 0; i < 20; i++) { - await vi.advanceTimersByTimeAsync(1); - if (networkTelemetryEvents(sink).length >= count) { - return; - } - } - expect(networkTelemetryEvents(sink)).toHaveLength(count); - } - function keepNetworkFileFresh(filePath: string, ms = 90_000): void { const mtime = (Date.now() + ms) / 1000; vol.utimesSync(filePath, mtime, mtime); } - it("emits a process discovered trace", async () => { + it("emits a process discovered trace once a PID is found", async () => { const { telemetry, sink } = createTestTelemetry(); vol.fromJSON(sshLog); const monitor = createMonitor({ telemetry }); await waitForEvent(monitor.onPidChange); - expect(eventsNamed(sink, "ssh.process.discovered")[0]).toMatchObject({ + expect(sink.eventsNamed("ssh.process.discovered")[0]).toMatchObject({ properties: { result: "success" }, measurements: { durationMs: expect.any(Number), attempts: expect.any(Number), }, }); - await telemetry.dispose(); }); - it("emits process lost and recovered events after stale network info", async () => { + it("emits lost and recovered events around a stale-network reconnect", async () => { const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ ...sshLog, @@ -379,26 +357,22 @@ describe("SshProcessMonitor", () => { telemetry, }); await waitForEvent(monitor.onPidChange); - await waitFor( () => - eventsNamed(sink, "ssh.process.lost").length > 0 && - eventsNamed(sink, "ssh.process.recovered").length > 0, - 500, + sink.eventsNamed("ssh.process.lost").length > 0 && + sink.eventsNamed("ssh.process.recovered").length > 0, ); - expect(eventsNamed(sink, "ssh.process.lost")[0]).toMatchObject({ + expect(sink.eventsNamed("ssh.process.lost")[0]).toMatchObject({ properties: { cause: "stale_network_info" }, measurements: { uptimeMs: expect.any(Number) }, }); - expect(eventsNamed(sink, "ssh.process.recovered")[0]).toMatchObject({ + expect(sink.eventsNamed("ssh.process.recovered")[0]).toMatchObject({ measurements: { recoveryDurationMs: expect.any(Number) }, }); - await telemetry.dispose(); }); - it("samples network info on first read and every 60 seconds, not every poll", async () => { - vi.useFakeTimers(); + it("forwards network reads to the telemetry as samples", async () => { const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ ...sshLog, @@ -406,52 +380,17 @@ describe("SshProcessMonitor", () => { }); keepNetworkFileFresh("/network/999.json"); - createMonitor({ - networkInfoPath: "/network", - networkPollInterval: 10_000, - telemetry, - }); - await advanceUntilNetworkEvents(sink, 1); - expect(networkTelemetryEvents(sink)).toHaveLength(1); - - await vi.advanceTimersByTimeAsync(50_000); - expect(networkTelemetryEvents(sink)).toHaveLength(1); - - await vi.advanceTimersByTimeAsync(10_000); - expect(networkTelemetryEvents(sink)).toHaveLength(2); - const first = networkTelemetryEvents(sink)[0]; - expect(first.properties).toEqual({ p2p: "true", derp: "NYC" }); - expect(first.measurements).toMatchObject({ - latencyMs: 25, - downloadMbits: 50, - uploadMbits: 10, - }); - await telemetry.dispose(); - }); - - it("samples network info on p2p flip inside the 60 second window", async () => { - vi.useFakeTimers(); - const { telemetry, sink } = createTestTelemetry(); - vol.fromJSON({ - ...sshLog, - "/network/999.json": makeNetworkJson({ p2p: true }), - }); - keepNetworkFileFresh("/network/999.json"); - createMonitor({ networkInfoPath: "/network", networkPollInterval: 10, telemetry, }); - await advanceUntilNetworkEvents(sink, 1); + await waitFor(() => sink.eventsNamed("ssh.network.sample").length > 0); - vol.writeFileSync("/network/999.json", makeNetworkJson({ p2p: false })); - keepNetworkFileFresh("/network/999.json"); - await vi.advanceTimersByTimeAsync(10); - - expect(networkTelemetryEvents(sink)).toHaveLength(2); - expect(networkTelemetryEvents(sink)[1].properties.p2p).toBe("false"); - await telemetry.dispose(); + expect(sink.eventsNamed("ssh.network.sample")[0]).toMatchObject({ + properties: { p2p: "true", derp: "NYC" }, + measurements: { latencyMs: 25 }, + }); }); }); diff --git a/test/unit/websocket/reconnectingWebSocket.test.ts b/test/unit/websocket/reconnectingWebSocket.test.ts index e127bc2f..10b54732 100644 --- a/test/unit/websocket/reconnectingWebSocket.test.ts +++ b/test/unit/websocket/reconnectingWebSocket.test.ts @@ -8,12 +8,11 @@ import { type SocketFactory, } from "@/websocket/reconnectingWebSocket"; -import { createTestTelemetry, type TestSink } from "../../mocks/telemetry"; +import { createTestTelemetry } from "../../mocks/telemetry"; import { createMockLogger } from "../../mocks/testHelpers"; import type { CloseEvent, Event as WsEvent } from "ws"; -import type { TelemetryEvent } from "@/telemetry/event"; import type { TelemetryService } from "@/telemetry/service"; import type { UnidirectionalStream } from "@/websocket/eventStreamConnection"; @@ -104,14 +103,7 @@ describe("ReconnectingWebSocket", () => { }); // create() returns a disconnected instance instead of throwing - const ws = await ReconnectingWebSocket.create( - factory, - createMockLogger(), - { - telemetry: NOOP_TELEMETRY_REPORTER, - onCertificateRefreshNeeded: () => Promise.resolve(false), - }, - ); + const ws = await fromFactory(factory); // Should be disconnected after unrecoverable HTTP error expect(ws.state).toBe(ConnectionState.DISCONNECTED); @@ -370,7 +362,9 @@ describe("ReconnectingWebSocket", () => { it("calls onDispose callback once, even with multiple close() calls", async () => { let disposeCount = 0; - const { ws } = await createReconnectingWebSocket(() => ++disposeCount); + const { ws } = await createReconnectingWebSocket({ + onDispose: () => ++disposeCount, + }); ws.close(); ws.close(); @@ -381,9 +375,9 @@ describe("ReconnectingWebSocket", () => { it("suspends (not disposes) on unrecoverable WebSocket close code", async () => { let disposeCount = 0; - const { ws, sockets } = await createReconnectingWebSocket( - () => ++disposeCount, - ); + const { ws, sockets } = await createReconnectingWebSocket({ + onDispose: () => ++disposeCount, + }); sockets[0].fireOpen(); expect(ws.state).toBe(ConnectionState.CONNECTED); @@ -410,9 +404,9 @@ describe("ReconnectingWebSocket", () => { it("does not call onDispose callback during reconnection", async () => { let disposeCount = 0; - const { ws, sockets } = await createReconnectingWebSocket( - () => ++disposeCount, - ); + const { ws, sockets } = await createReconnectingWebSocket({ + onDispose: () => ++disposeCount, + }); sockets[0].fireOpen(); sockets[0].fireClose({ @@ -594,13 +588,10 @@ describe("ReconnectingWebSocket", () => { }); }); - describe("Telemetry", () => { - it("emits state transition events for connection lifecycle changes", async () => { + describe("Telemetry wiring", () => { + it("walks the state machine through a full reconnect lifecycle", async () => { const { telemetry, sink } = createTestTelemetry(); - const { ws, sockets } = await createReconnectingWebSocket( - undefined, - telemetry, - ); + const { ws, sockets } = await createReconnectingWebSocket({ telemetry }); sockets[0].fireOpen(); sockets[0].fireClose({ @@ -612,9 +603,9 @@ describe("ReconnectingWebSocket", () => { ws.close(); expect( - eventsNamed(sink, "connection.state_transition").map( - (event) => event.properties, - ), + sink + .eventsNamed("connection.state_transition") + .map((e) => e.properties), ).toEqual([ { from: "IDLE", to: "CONNECTING", reason: "initial_connect" }, { from: "CONNECTING", to: "CONNECTED", reason: "open" }, @@ -631,97 +622,6 @@ describe("ReconnectingWebSocket", () => { { from: "CONNECTING", to: "CONNECTED", reason: "open" }, { from: "CONNECTED", to: "DISPOSED", reason: "dispose" }, ]); - await telemetry.dispose(); - }); - - it("emits open and unexpected drop events", async () => { - const { telemetry, sink } = createTestTelemetry(); - const { ws, sockets } = await createReconnectingWebSocket( - undefined, - telemetry, - ); - - sockets[0].fireOpen(); - sockets[0].fireClose({ - code: WebSocketCloseCode.ABNORMAL, - reason: "Network error", - }); - - expect(firstEventNamed(sink, "connection.open")).toMatchObject({ - properties: { url: "/api/test" }, - measurements: { connectDurationMs: expect.any(Number) }, - }); - - expect(firstEventNamed(sink, "connection.drop")).toMatchObject({ - properties: { - cause: "unexpected_close", - closeCode: String(WebSocketCloseCode.ABNORMAL), - }, - measurements: { connectionDurationMs: expect.any(Number) }, - error: { message: expect.stringContaining("1006") }, - }); - - ws.close(); - await telemetry.dispose(); - }); - - it("emits reconnect aggregation after recovery", async () => { - const { telemetry, sink } = createTestTelemetry(); - const { ws, sockets } = await createReconnectingWebSocket( - undefined, - telemetry, - ); - - sockets[0].fireOpen(); - sockets[0].fireClose({ - code: WebSocketCloseCode.ABNORMAL, - reason: "Network error", - }); - await vi.advanceTimersByTimeAsync(300); - sockets[1].fireOpen(); - - await vi.waitFor(() => - expect(eventsNamed(sink, "connection.reconnect")).toHaveLength(1), - ); - expect(firstEventNamed(sink, "connection.reconnect")).toMatchObject({ - properties: { result: "success", reason: "unexpected_close" }, - measurements: { - attempts: 1, - totalDurationMs: expect.any(Number), - }, - }); - - ws.close(); - await telemetry.dispose(); - }); - - it("emits failed reconnect aggregation when reconnect ends disconnected", async () => { - const { telemetry, sink } = createTestTelemetry(); - const { ws, sockets, setFactoryError } = - await createReconnectingWebSocketWithErrorControl(telemetry); - sockets[0].fireOpen(); - - setFactoryError( - new Error(`Unexpected server response: ${HttpStatusCode.FORBIDDEN}`), - ); - ws.reconnect(); - await Promise.resolve(); - - await vi.waitFor(() => { - expect(ws.state).toBe(ConnectionState.DISCONNECTED); - expect(eventsNamed(sink, "connection.reconnect")).toHaveLength(1); - }); - expect(firstEventNamed(sink, "connection.reconnect")).toMatchObject({ - properties: { - result: "error", - reason: "manual_reconnect", - terminationReason: "unrecoverable_http", - }, - measurements: { attempts: 1 }, - }); - - ws.close(); - await telemetry.dispose(); }); }); @@ -734,7 +634,9 @@ describe("ReconnectingWebSocket", () => { sockets.push(socket); return Promise.resolve(socket); }); - const ws = await fromFactory(factory, undefined, refreshCallback); + const ws = await fromFactory(factory, { + onCertificateRefreshNeeded: refreshCallback, + }); sockets[0].fireOpen(); return { ws, sockets, refreshCallback }; }; @@ -885,9 +787,14 @@ function createMockSocket(): MockSocket { }; } +interface FactoryOptions { + onDispose?: () => void; + onCertificateRefreshNeeded?: () => Promise; + telemetry?: TelemetryService; +} + async function createReconnectingWebSocket( - onDispose?: () => void, - telemetry?: TelemetryService, + options: FactoryOptions = {}, ): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; @@ -898,16 +805,13 @@ async function createReconnectingWebSocket( sockets.push(socket); return Promise.resolve(socket); }); - const ws = await fromFactory(factory, onDispose, undefined, telemetry); - - // We start with one socket + const ws = await fromFactory(factory, options); expect(sockets).toHaveLength(1); - return { ws, sockets }; } async function createReconnectingWebSocketWithErrorControl( - telemetry?: TelemetryService, + options: FactoryOptions = {}, ): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; @@ -925,7 +829,7 @@ async function createReconnectingWebSocketWithErrorControl( return Promise.resolve(socket); }); - const ws = await fromFactory(factory, undefined, undefined, telemetry); + const ws = await fromFactory(factory, options); expect(sockets).toHaveLength(1); return { @@ -939,33 +843,20 @@ async function createReconnectingWebSocketWithErrorControl( async function fromFactory( factory: SocketFactory, - onDispose?: () => void, - onCertificateRefreshNeeded?: () => Promise, - telemetry?: TelemetryService, + options: FactoryOptions = {}, ): Promise> { return await ReconnectingWebSocket.create( factory, createMockLogger(), { - telemetry: telemetry ?? NOOP_TELEMETRY_REPORTER, + telemetry: options.telemetry ?? NOOP_TELEMETRY_REPORTER, onCertificateRefreshNeeded: - onCertificateRefreshNeeded ?? (() => Promise.resolve(false)), + options.onCertificateRefreshNeeded ?? (() => Promise.resolve(false)), }, - onDispose, + options.onDispose, ); } -function eventsNamed(sink: TestSink, eventName: string): TelemetryEvent[] { - return sink.events.filter((event) => event.eventName === eventName); -} - -function firstEventNamed( - sink: TestSink, - eventName: string, -): TelemetryEvent | undefined { - return eventsNamed(sink, eventName)[0]; -} - async function createBlockingReconnectingWebSocket(): Promise<{ ws: ReconnectingWebSocket; sockets: MockSocket[]; From 5ed31ab387e7ded252993709807ba10057404f9d Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 19:53:53 +0300 Subject: [PATCH 5/7] refactor(telemetry): address review feedback for connection lifecycle - ssh.process.discovered: emit found=true|false so abandoned searches are distinguishable - Add ssh.process.disposed as a dedicated terminal event with wasLost flag - ssh.process.replaced: always emit, never conflate with recovered - Remove dead process_changed and disposed members from ProcessLossCause - Rename connection events to past-tense and url -> route property - Route normal_close through disconnectWithReason for state-machine consistency - Distinguish certificate_refresh from manual_reconnect on cycle reason - Clear #reconnectCycle in WebSocketTelemetry.reset() - Restore Only attempt refresh once per connection cycle comment - Unblock SshProcessMonitor.delay() on dispose so abandoned traces complete - Add unit and integration coverage for the new event shapes --- src/instrumentation/ssh.ts | 57 +++++--- src/instrumentation/websocket.ts | 21 ++- src/remote/sshProcess.ts | 8 +- src/websocket/reconnectingWebSocket.ts | 20 ++- test/unit/instrumentation/ssh.test.ts | 79 +++++++++++- test/unit/instrumentation/websocket.test.ts | 51 +++++--- test/unit/remote/sshProcess.test.ts | 122 +++++++++++++++++- .../websocket/reconnectingWebSocket.test.ts | 70 +++++++++- 8 files changed, 359 insertions(+), 69 deletions(-) diff --git a/src/instrumentation/ssh.ts b/src/instrumentation/ssh.ts index 5068de27..d4e2721c 100644 --- a/src/instrumentation/ssh.ts +++ b/src/instrumentation/ssh.ts @@ -4,16 +4,12 @@ import type { TelemetryReporter } from "../telemetry/reporter"; const NETWORK_SAMPLE_INTERVAL_MS = 60_000; const NETWORK_LATENCY_CHANGE_RATIO = 0.1; -export type ProcessLossCause = - | "stale_network_info" - | "missing_network_info" - | "process_changed" - | "disposed"; +export type ProcessLossCause = "stale_network_info" | "missing_network_info"; interface NetworkSample { readonly emittedAtMs: number; readonly p2p: boolean; - readonly derp: string; + readonly preferredDerp: string; readonly latencyMs: number; } @@ -37,6 +33,7 @@ export class SshTelemetry { ): Promise { return this.#telemetry.trace("ssh.process.discovered", async (span) => { const { pid, attempts } = await fn(); + span.setProperty("found", String(pid !== undefined)); span.setMeasurement("attempts", attempts); return pid; }); @@ -75,19 +72,23 @@ export class SshTelemetry { this.#processLostAtMs = undefined; } - /** - * Handover from one SSH process to another. Closes out the prior process - * (recovery if lost, replacement otherwise) and starts fresh tracking. - */ + /** Handover to a different SSH process. Always emits `ssh.process.replaced`, + * even when the prior process was already lost (replacement is operationally + * distinct from recovery). */ public processReplaced(): void { const now = performance.now(); - if (this.#processLostAtMs !== undefined) { - this.processRecovered(); - } else if (this.#processStartedAtMs !== undefined) { + if (this.#processStartedAtMs !== undefined) { + const wasLost = this.#processLostAtMs !== undefined; + const measurements: Record = { + previousUptimeMs: now - this.#processStartedAtMs, + }; + if (this.#processLostAtMs !== undefined) { + measurements.lostDurationMs = now - this.#processLostAtMs; + } this.#telemetry.log( "ssh.process.replaced", - {}, - { previousUptimeMs: now - this.#processStartedAtMs }, + { wasLost: String(wasLost) }, + measurements, ); } this.#processStartedAtMs = now; @@ -95,6 +96,24 @@ export class SshTelemetry { this.#lastNetworkSample = undefined; } + /** Terminal teardown signal. Emits regardless of prior lost state so + * consumers always see a session-ending event. */ + public disposed(): void { + if (this.#processStartedAtMs === undefined) { + return; + } + const now = performance.now(); + const wasLost = this.#processLostAtMs !== undefined; + this.#telemetry.log( + "ssh.process.disposed", + { wasLost: String(wasLost) }, + { uptimeMs: now - this.#processStartedAtMs }, + ); + this.#processStartedAtMs = undefined; + this.#processLostAtMs = undefined; + this.#lastNetworkSample = undefined; + } + public networkSampled(network: NetworkInfo): void { const now = performance.now(); const previous = this.#lastNetworkSample; @@ -105,14 +124,14 @@ export class SshTelemetry { this.#lastNetworkSample = { emittedAtMs: now, p2p: network.p2p, - derp: network.preferred_derp, + preferredDerp: network.preferred_derp, latencyMs: network.latency, }; this.#telemetry.log( - "ssh.network.sample", + "ssh.network.sampled", { p2p: String(network.p2p), - derp: network.preferred_derp, + preferredDerp: network.preferred_derp, }, { latencyMs: network.latency, @@ -135,7 +154,7 @@ function shouldEmitSample( if (current.p2p !== previous.p2p) { return true; } - if (current.preferred_derp !== previous.derp) { + if (current.preferred_derp !== previous.preferredDerp) { return true; } return hasMeaningfulLatencyChange(current.latency, previous.latencyMs); diff --git a/src/instrumentation/websocket.ts b/src/instrumentation/websocket.ts index ca2b9e6a..23fb39d6 100644 --- a/src/instrumentation/websocket.ts +++ b/src/instrumentation/websocket.ts @@ -5,6 +5,7 @@ import type { ConnectionState } from "../websocket/reconnectingWebSocket"; export type ConnectionStateReason = | "initial_connect" | "manual_reconnect" + | "certificate_refresh" | "scheduled_reconnect" | "open" | "disconnect" @@ -13,6 +14,7 @@ export type ConnectionStateReason = | "unrecoverable_http" | "certificate_error" | "connection_error" + | "normal_close" | "unexpected_close"; export type ConnectionDropCause = @@ -24,7 +26,7 @@ export type ConnectionDropCause = | "disposed" | "error"; -export type ReconnectOutcome = +type ReconnectOutcome = | { readonly result: "success" } | { readonly result: "error"; @@ -59,7 +61,11 @@ export class WebSocketTelemetry { to: ConnectionState, reason: ConnectionStateReason, ): void { - this.#telemetry.log("connection.state_transition", { from, to, reason }); + this.#telemetry.log("connection.state_transitioned", { + from, + to, + reason, + }); } /** Stamp the connect-start time; counts an attempt if a cycle is open. */ @@ -77,8 +83,8 @@ export class WebSocketTelemetry { this.#connectionDropEmitted = false; this.#connectStartedAtMs = undefined; this.#telemetry.log( - "connection.open", - { url: route }, + "connection.opened", + { route }, { connectDurationMs: now - start }, ); this.#finishReconnect({ result: "success" }); @@ -104,10 +110,10 @@ export class WebSocketTelemetry { connectionDurationMs: performance.now() - this.#connectionOpenedAtMs, }; if (error === undefined) { - this.#telemetry.log("connection.drop", properties, measurements); + this.#telemetry.log("connection.dropped", properties, measurements); } else { this.#telemetry.logError( - "connection.drop", + "connection.dropped", error, properties, measurements, @@ -120,6 +126,7 @@ export class WebSocketTelemetry { this.#connectStartedAtMs = undefined; this.#connectionOpenedAtMs = undefined; this.#connectionDropEmitted = false; + this.#reconnectCycle = undefined; } /** Open a reconnect cycle. No-op if one is already open. */ @@ -160,7 +167,7 @@ export class WebSocketTelemetry { if (outcome.result === "error") { properties.terminationReason = outcome.terminationReason; } - this.#telemetry.log("connection.reconnect", properties, { + this.#telemetry.log("connection.reconnected", properties, { attempts: cycle.attempts, totalDurationMs: performance.now() - cycle.startMs, }); diff --git a/src/remote/sshProcess.ts b/src/remote/sshProcess.ts index a67fa448..4c007407 100644 --- a/src/remote/sshProcess.ts +++ b/src/remote/sshProcess.ts @@ -84,6 +84,7 @@ export class SshProcessMonitor implements vscode.Disposable { private currentPid: number | undefined; private logFilePath: string | undefined; private pendingTimeout: NodeJS.Timeout | undefined; + private pendingDelayResolve: (() => void) | undefined; private lastStaleSearchTime = 0; private readonly reporter: NetworkStatusReporter; @@ -188,12 +189,15 @@ export class SshProcessMonitor implements vscode.Disposable { if (this.disposed) { return; } - this.telemetry.processLost("disposed"); + this.telemetry.disposed(); this.disposed = true; if (this.pendingTimeout) { clearTimeout(this.pendingTimeout); this.pendingTimeout = undefined; } + // Unblock any in-flight delay() so discovery loops exit promptly. + this.pendingDelayResolve?.(); + this.pendingDelayResolve = undefined; this.statusBarItem.dispose(); this._onLogFilePathChange.dispose(); this._onPidChange.dispose(); @@ -207,8 +211,10 @@ export class SshProcessMonitor implements vscode.Disposable { return; } await new Promise((resolve) => { + this.pendingDelayResolve = resolve; this.pendingTimeout = setTimeout(() => { this.pendingTimeout = undefined; + this.pendingDelayResolve = undefined; resolve(); }, ms); }); diff --git a/src/websocket/reconnectingWebSocket.ts b/src/websocket/reconnectingWebSocket.ts index 071ecd64..fb9e1307 100644 --- a/src/websocket/reconnectingWebSocket.ts +++ b/src/websocket/reconnectingWebSocket.ts @@ -25,9 +25,7 @@ import type { } from "./eventStreamConnection"; function toCloseEventError(event: CloseEvent): Error { - return new Error( - `WebSocket closed unexpectedly with code ${event.code}: ${event.reason}`, - ); + return new Error(`WebSocket closed with code ${event.code}: ${event.reason}`); } /** @@ -245,12 +243,16 @@ export class ReconnectingWebSocket< * Resumes the socket if previously disconnected via disconnect(). */ public reconnect(): void { + this.#reconnectInternal("manual_reconnect"); + } + + #reconnectInternal(reason: ConnectionStateReason): void { if (this.#state === ConnectionState.DISPOSED) { return; } if (this.#state !== ConnectionState.IDLE) { - this.#telemetry.reconnectStarted("manual_reconnect"); + this.#telemetry.reconnectStarted(reason); } if (this.#state === ConnectionState.DISCONNECTED) { @@ -263,7 +265,7 @@ export class ReconnectingWebSocket< this.#reconnectTimeoutId = null; } - void this.connect("manual_reconnect"); + void this.connect(reason); } /** @@ -404,7 +406,10 @@ export class ReconnectingWebSocket< } if (NORMAL_CLOSURE_CODES.has(event.code)) { - this.#telemetry.dropped("normal_close", event.code); + this.disconnectWithReason("normal_close", "normal_close", { + code: event.code, + closeReason: event.reason, + }); return; } @@ -448,6 +453,7 @@ export class ReconnectingWebSocket< private async handleClientCertificateError( certError: ClientCertificateError, ): Promise { + // Only attempt refresh once per connection cycle if (this.#certRefreshAttempted) { this.#logger.warn("Certificate refresh already attempted, not retrying"); void certError.showNotification(); @@ -518,7 +524,7 @@ export class ReconnectingWebSocket< const certError = ClientCertificateError.fromError(error); if (certError) { if (await this.handleClientCertificateError(certError)) { - this.reconnect(); + this.#reconnectInternal("certificate_refresh"); } else { this.disconnectWithReason("certificate_error", "error", { error }); } diff --git a/test/unit/instrumentation/ssh.test.ts b/test/unit/instrumentation/ssh.test.ts index cba77faa..10ea34c0 100644 --- a/test/unit/instrumentation/ssh.test.ts +++ b/test/unit/instrumentation/ssh.test.ts @@ -11,8 +11,33 @@ function setup() { } describe("SshTelemetry", () => { + describe("traceProcessDiscovery", () => { + interface DiscoveryCase { + pid: number | undefined; + attempts: number; + found: string; + } + it.each([ + { pid: 123, attempts: 2, found: "true" }, + { pid: undefined, attempts: 5, found: "false" }, + ])( + "emits found=$found and attempts=$attempts based on the result", + async ({ pid, attempts, found }) => { + const { ssh, sink } = setup(); + + await ssh.traceProcessDiscovery(() => + Promise.resolve({ pid, attempts }), + ); + + const [event] = sink.eventsNamed("ssh.process.discovered"); + expect(event.properties).toMatchObject({ result: "success", found }); + expect(event.measurements.attempts).toBe(attempts); + }, + ); + }); + describe("processReplaced", () => { - it("emits a recovery when the prior process was already marked lost", () => { + it("emits a replacement (not recovery) when the prior process was lost", () => { const { ssh, sink } = setup(); ssh.processStarted(); @@ -21,8 +46,14 @@ describe("SshTelemetry", () => { expect(sink.events.map((e) => e.eventName)).toEqual([ "ssh.process.lost", - "ssh.process.recovered", + "ssh.process.replaced", ]); + const [replaced] = sink.eventsNamed("ssh.process.replaced"); + expect(replaced.properties).toMatchObject({ wasLost: "true" }); + expect(replaced.measurements).toMatchObject({ + previousUptimeMs: expect.any(Number), + lostDurationMs: expect.any(Number), + }); }); it("emits a replacement event for an instant handover", () => { @@ -33,9 +64,11 @@ describe("SshTelemetry", () => { const replaced = sink.eventsNamed("ssh.process.replaced"); expect(replaced).toHaveLength(1); + expect(replaced[0].properties).toMatchObject({ wasLost: "false" }); expect(replaced[0].measurements).toMatchObject({ previousUptimeMs: expect.any(Number), }); + expect(replaced[0].measurements.lostDurationMs).toBeUndefined(); }); it("emits nothing if there was no prior process", () => { @@ -51,7 +84,7 @@ describe("SshTelemetry", () => { it("is a no-op when there is no started process", () => { const { ssh, sink } = setup(); - ssh.processLost("disposed"); + ssh.processLost("stale_network_info"); expect(sink.events).toHaveLength(0); }); @@ -69,6 +102,38 @@ describe("SshTelemetry", () => { }); }); + describe("disposed", () => { + it("is a no-op when there is no started process", () => { + const { ssh, sink } = setup(); + + ssh.disposed(); + + expect(sink.events).toHaveLength(0); + }); + + interface DisposedCase { + name: string; + lose: boolean; + wasLost: string; + } + it.each([ + { name: "from a healthy state", lose: false, wasLost: "false" }, + { name: "after the process was lost", lose: true, wasLost: "true" }, + ])("emits a terminal event $name", ({ lose, wasLost }) => { + const { ssh, sink } = setup(); + + ssh.processStarted(); + if (lose) { + ssh.processLost("stale_network_info"); + } + ssh.disposed(); + + const [event] = sink.eventsNamed("ssh.process.disposed"); + expect(event.properties).toMatchObject({ wasLost }); + expect(event.measurements.uptimeMs).toEqual(expect.any(Number)); + }); + }); + describe("networkSampled", () => { beforeEach(() => vi.useFakeTimers()); afterEach(() => vi.useRealTimers()); @@ -107,16 +172,16 @@ describe("SshTelemetry", () => { vi.advanceTimersByTime(advanceMs); ssh.networkSampled(makeNetworkInfo(next)); - expect(sink.eventsNamed("ssh.network.sample")).toHaveLength(expected); + expect(sink.eventsNamed("ssh.network.sampled")).toHaveLength(expected); }); - it("includes p2p, derp, latency, and bandwidth in the emitted sample", () => { + it("includes p2p, preferredDerp, latency, and bandwidth in the emitted sample", () => { const { ssh, sink } = setup(); ssh.networkSampled(makeNetworkInfo({ latency: 25 })); - const [sample] = sink.eventsNamed("ssh.network.sample"); - expect(sample.properties).toEqual({ p2p: "true", derp: "NYC" }); + const [sample] = sink.eventsNamed("ssh.network.sampled"); + expect(sample.properties).toEqual({ p2p: "true", preferredDerp: "NYC" }); expect(sample.measurements).toMatchObject({ latencyMs: 25, downloadMbits: expect.any(Number), diff --git a/test/unit/instrumentation/websocket.test.ts b/test/unit/instrumentation/websocket.test.ts index 74df249d..1dafa041 100644 --- a/test/unit/instrumentation/websocket.test.ts +++ b/test/unit/instrumentation/websocket.test.ts @@ -12,7 +12,7 @@ function setup() { describe("WebSocketTelemetry", () => { describe("stateTransition", () => { - it("emits a connection.state_transition event with from/to/reason", () => { + it("emits a connection.state_transitioned event with from/to/reason", () => { const { ws, sink } = setup(); ws.stateTransition( @@ -21,7 +21,7 @@ describe("WebSocketTelemetry", () => { "initial_connect", ); - const [event] = sink.eventsNamed("connection.state_transition"); + const [event] = sink.eventsNamed("connection.state_transitioned"); expect(event.properties).toEqual({ from: "IDLE", to: "CONNECTING", @@ -31,15 +31,15 @@ describe("WebSocketTelemetry", () => { }); describe("opened", () => { - it("emits connection.open with route and connect duration", () => { + it("emits connection.opened with route and connect duration", () => { const { ws, sink } = setup(); ws.connectStarted(); ws.opened("/api/test"); - const [event] = sink.eventsNamed("connection.open"); + const [event] = sink.eventsNamed("connection.opened"); expect(event).toMatchObject({ - properties: { url: "/api/test" }, + properties: { route: "/api/test" }, measurements: { connectDurationMs: expect.any(Number) }, }); }); @@ -49,7 +49,7 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); - const [event] = sink.eventsNamed("connection.open"); + const [event] = sink.eventsNamed("connection.opened"); expect(event.measurements.connectDurationMs).toBe(0); }); }); @@ -63,13 +63,13 @@ describe("WebSocketTelemetry", () => { expect(sink.events).toHaveLength(0); }); - it("emits connection.drop with cause and close code", () => { + it("emits connection.dropped with cause and close code", () => { const { ws, sink } = setup(); ws.opened("/api/test"); ws.dropped("unexpected_close", 1006); - const [event] = sink.eventsNamed("connection.drop"); + const [event] = sink.eventsNamed("connection.dropped"); expect(event.properties).toMatchObject({ cause: "unexpected_close", closeCode: "1006", @@ -85,7 +85,7 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); ws.dropped("error", 1006, new Error("boom")); - const [event] = sink.eventsNamed("connection.drop"); + const [event] = sink.eventsNamed("connection.dropped"); expect(event.error).toMatchObject({ message: "boom" }); }); @@ -96,7 +96,7 @@ describe("WebSocketTelemetry", () => { ws.dropped("normal_close"); ws.dropped("error"); - expect(sink.eventsNamed("connection.drop")).toHaveLength(1); + expect(sink.eventsNamed("connection.dropped")).toHaveLength(1); }); }); @@ -108,32 +108,43 @@ describe("WebSocketTelemetry", () => { ws.reset(); ws.dropped("error"); - expect(sink.eventsNamed("connection.drop")).toHaveLength(0); + expect(sink.eventsNamed("connection.dropped")).toHaveLength(0); + }); + + it("clears any open reconnect cycle", () => { + const { ws, sink } = setup(); + + ws.reconnectStarted("manual_reconnect"); + ws.connectStarted(); + ws.reset(); + ws.opened("/api/test"); + + expect(sink.eventsNamed("connection.reconnected")).toHaveLength(0); }); }); describe("reconnect cycle", () => { - it("emits connection.reconnect with success when opened closes the cycle", () => { + it("emits connection.reconnected with success when opened closes the cycle", () => { const { ws, sink } = setup(); ws.reconnectStarted("manual_reconnect"); ws.connectStarted(); ws.opened("/api/test"); - const [event] = sink.eventsNamed("connection.reconnect"); + const [event] = sink.eventsNamed("connection.reconnected"); expect(event).toMatchObject({ properties: { result: "success", reason: "manual_reconnect" }, measurements: { attempts: 1, totalDurationMs: expect.any(Number) }, }); }); - it("emits connection.reconnect with error when terminated closes the cycle", () => { + it("emits connection.reconnected with error when terminated closes the cycle", () => { const { ws, sink } = setup(); ws.reconnectStarted("manual_reconnect"); ws.terminated("unrecoverable_http", { cause: "error" }); - const [event] = sink.eventsNamed("connection.reconnect"); + const [event] = sink.eventsNamed("connection.reconnected"); expect(event.properties).toEqual({ result: "error", reason: "manual_reconnect", @@ -146,7 +157,7 @@ describe("WebSocketTelemetry", () => { ws.terminated("dispose", { cause: "disposed" }); - expect(sink.eventsNamed("connection.reconnect")).toHaveLength(0); + expect(sink.eventsNamed("connection.reconnected")).toHaveLength(0); }); it("counts each connectStarted as an attempt within the cycle", () => { @@ -159,7 +170,7 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); expect( - sink.eventsNamed("connection.reconnect")[0].measurements.attempts, + sink.eventsNamed("connection.reconnected")[0].measurements.attempts, ).toBe(3); }); @@ -171,7 +182,7 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); expect( - sink.eventsNamed("connection.reconnect")[0].properties.reason, + sink.eventsNamed("connection.reconnected")[0].properties.reason, ).toBe("manual_reconnect"); }); @@ -184,10 +195,10 @@ describe("WebSocketTelemetry", () => { code: 1006, }); - expect(sink.eventsNamed("connection.drop")).toHaveLength(1); + expect(sink.eventsNamed("connection.dropped")).toHaveLength(1); ws.opened("/api/test"); - expect(sink.eventsNamed("connection.reconnect")).toHaveLength(1); + expect(sink.eventsNamed("connection.reconnected")).toHaveLength(1); }); }); }); diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index 83c5f9f0..aadd13b6 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -333,7 +333,7 @@ describe("SshProcessMonitor", () => { await waitForEvent(monitor.onPidChange); expect(sink.eventsNamed("ssh.process.discovered")[0]).toMatchObject({ - properties: { result: "success" }, + properties: { result: "success", found: "true" }, measurements: { durationMs: expect.any(Number), attempts: expect.any(Number), @@ -341,6 +341,120 @@ describe("SshProcessMonitor", () => { }); }); + it("emits found=false when discovery is abandoned by dispose", async () => { + const { telemetry, sink } = createTestTelemetry(); + vol.fromJSON({ + "/logs/ms-vscode-remote.remote-ssh/1-Remote - SSH.log": + "-> socksPort 12345 ->", + }); + vi.mocked(find).mockResolvedValue([]); + + const monitor = createMonitor({ telemetry }); + // Allow at least one discovery iteration to start, then abandon it. + await new Promise((r) => setTimeout(r, 30)); + monitor.dispose(); + + await waitFor( + () => sink.eventsNamed("ssh.process.discovered").length > 0, + ); + expect(sink.eventsNamed("ssh.process.discovered")[0]).toMatchObject({ + properties: { result: "success", found: "false" }, + }); + }); + + it("emits ssh.process.disposed when dispose follows a successful discovery", async () => { + const { telemetry, sink } = createTestTelemetry(); + vol.fromJSON(sshLog); + + const monitor = createMonitor({ telemetry }); + await waitForEvent(monitor.onPidChange); + monitor.dispose(); + + expect(sink.eventsNamed("ssh.process.disposed")[0]).toMatchObject({ + properties: { wasLost: "false" }, + measurements: { uptimeMs: expect.any(Number) }, + }); + }); + + it("emits ssh.process.disposed with wasLost=true when dispose follows a loss", async () => { + const { telemetry, sink } = createTestTelemetry(); + vol.fromJSON({ + ...sshLog, + "/network/999.json": makeNetworkJson(), + }); + // Second discovery hangs so the lost->disposed sequence is observable. + vi.mocked(find) + .mockResolvedValueOnce([{ pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }]) + .mockResolvedValue([]); + + const monitor = createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10, + telemetry, + }); + await waitForEvent(monitor.onPidChange); + await waitFor(() => sink.eventsNamed("ssh.process.lost").length > 0); + monitor.dispose(); + + const disposed = sink.eventsNamed("ssh.process.disposed"); + expect(disposed).toHaveLength(1); + expect(disposed[0].properties).toMatchObject({ wasLost: "true" }); + }); + + it("emits missing_network_info as the loss cause when reads fail repeatedly", async () => { + vi.useFakeTimers(); + const { telemetry, sink } = createTestTelemetry(); + vol.fromJSON(sshLog); + // No /network/999.json — every read fails until threshold is reached. + vi.mocked(find).mockResolvedValue([ + { pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }, + ]); + + const monitor = createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10, + telemetry, + }); + + await vi.advanceTimersByTimeAsync(500); + await vi.waitFor(() => + expect(sink.eventsNamed("ssh.process.lost").length).toBeGreaterThan(0), + ); + + expect(sink.eventsNamed("ssh.process.lost")[0].properties).toMatchObject({ + cause: "missing_network_info", + }); + + monitor.dispose(); + }); + + it("emits ssh.process.replaced (not recovered) when a different PID takes over", async () => { + const { telemetry, sink } = createTestTelemetry(); + vol.fromJSON({ + ...sshLog, + "/network/999.json": makeNetworkJson(), + }); + vi.mocked(find) + .mockResolvedValueOnce([{ pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }]) + .mockResolvedValue([{ pid: 888, ppid: 1, name: "ssh", cmd: "ssh" }]); + + const monitor = createMonitor({ + networkInfoPath: "/network", + networkPollInterval: 10, + telemetry, + }); + await waitForEvent(monitor.onPidChange); + await waitFor(() => sink.eventsNamed("ssh.process.replaced").length > 0); + + const replaced = sink.eventsNamed("ssh.process.replaced"); + expect(replaced[0].properties).toMatchObject({ wasLost: "true" }); + expect(replaced[0].measurements).toMatchObject({ + previousUptimeMs: expect.any(Number), + lostDurationMs: expect.any(Number), + }); + expect(sink.eventsNamed("ssh.process.recovered")).toHaveLength(0); + }); + it("emits lost and recovered events around a stale-network reconnect", async () => { const { telemetry, sink } = createTestTelemetry(); vol.fromJSON({ @@ -385,10 +499,10 @@ describe("SshProcessMonitor", () => { networkPollInterval: 10, telemetry, }); - await waitFor(() => sink.eventsNamed("ssh.network.sample").length > 0); + await waitFor(() => sink.eventsNamed("ssh.network.sampled").length > 0); - expect(sink.eventsNamed("ssh.network.sample")[0]).toMatchObject({ - properties: { p2p: "true", derp: "NYC" }, + expect(sink.eventsNamed("ssh.network.sampled")[0]).toMatchObject({ + properties: { p2p: "true", preferredDerp: "NYC" }, measurements: { latencyMs: 25 }, }); }); diff --git a/test/unit/websocket/reconnectingWebSocket.test.ts b/test/unit/websocket/reconnectingWebSocket.test.ts index 10b54732..06e343a6 100644 --- a/test/unit/websocket/reconnectingWebSocket.test.ts +++ b/test/unit/websocket/reconnectingWebSocket.test.ts @@ -1,6 +1,9 @@ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; -import { NOOP_TELEMETRY_REPORTER } from "@/telemetry/reporter"; +import { + NOOP_TELEMETRY_REPORTER, + type TelemetryReporter, +} from "@/telemetry/reporter"; import { WebSocketCloseCode, HttpStatusCode } from "@/websocket/codes"; import { ConnectionState, @@ -13,7 +16,6 @@ import { createMockLogger } from "../../mocks/testHelpers"; import type { CloseEvent, Event as WsEvent } from "ws"; -import type { TelemetryService } from "@/telemetry/service"; import type { UnidirectionalStream } from "@/websocket/eventStreamConnection"; describe("ReconnectingWebSocket", () => { @@ -604,7 +606,7 @@ describe("ReconnectingWebSocket", () => { expect( sink - .eventsNamed("connection.state_transition") + .eventsNamed("connection.state_transitioned") .map((e) => e.properties), ).toEqual([ { from: "IDLE", to: "CONNECTING", reason: "initial_connect" }, @@ -622,6 +624,66 @@ describe("ReconnectingWebSocket", () => { { from: "CONNECTING", to: "CONNECTED", reason: "open" }, { from: "CONNECTED", to: "DISPOSED", reason: "dispose" }, ]); + + expect(sink.eventsNamed("connection.opened")).toHaveLength(2); + expect(sink.eventsNamed("connection.dropped")).toHaveLength(2); + expect(sink.eventsNamed("connection.reconnected")).toMatchObject([ + { + properties: { result: "success", reason: "unexpected_close" }, + measurements: { attempts: 1, totalDurationMs: expect.any(Number) }, + }, + ]); + }); + + it("emits a normal-close drop and disconnects on server-initiated close", async () => { + const { telemetry, sink } = createTestTelemetry(); + const { ws, sockets } = await createReconnectingWebSocket({ telemetry }); + + sockets[0].fireOpen(); + sockets[0].fireClose({ + code: WebSocketCloseCode.GOING_AWAY, + reason: "server restarting", + }); + + expect(ws.state).toBe(ConnectionState.DISCONNECTED); + const dropped = sink.eventsNamed("connection.dropped"); + expect(dropped).toHaveLength(1); + expect(dropped[0].properties).toMatchObject({ + cause: "normal_close", + closeCode: String(WebSocketCloseCode.GOING_AWAY), + }); + expect( + sink + .eventsNamed("connection.state_transitioned") + .map((e) => e.properties.reason), + ).toContain("normal_close"); + + ws.close(); + }); + + it("records certificate_refresh as the reconnect reason on successful refresh", async () => { + const { telemetry, sink } = createTestTelemetry(); + const sockets: MockSocket[] = []; + const factory = vi.fn(() => { + const socket = createMockSocket(); + sockets.push(socket); + return Promise.resolve(socket); + }); + const ws = await fromFactory(factory, { + telemetry, + onCertificateRefreshNeeded: () => Promise.resolve(true), + }); + + sockets[0].fireOpen(); + sockets[0].fireError(new Error("ssl alert certificate_expired")); + await vi.waitFor(() => expect(sockets).toHaveLength(2)); + sockets[1].fireOpen(); + + const reconnected = sink.eventsNamed("connection.reconnected"); + expect(reconnected).toHaveLength(1); + expect(reconnected[0].properties.reason).toBe("certificate_refresh"); + + ws.close(); }); }); @@ -790,7 +852,7 @@ function createMockSocket(): MockSocket { interface FactoryOptions { onDispose?: () => void; onCertificateRefreshNeeded?: () => Promise; - telemetry?: TelemetryService; + telemetry?: TelemetryReporter; } async function createReconnectingWebSocket( From b2d1c2a2a69f1f88b6200a33a957a8af73bc39dc Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 20:58:39 +0300 Subject: [PATCH 6/7] fix(telemetry): track concurrent delays and tighten ssh tests - Replace single pendingDelayResolve with a Set so dispose unblocks every concurrent searchForLogFile/monitorNetwork delay, not just the most recent one. - Use a single PID throughout the stale-network reconnect test so the asserted recovered event matches the stale-network cycle the test name describes. - Dispose the monitor before the negative recovered assertion in the process-replaced test to keep the 888 monitoring loop from racing. - Add dedicated processRecovered unit tests (early return, emission shape, single-emit guard). --- src/remote/sshProcess.ts | 31 +++++++++++++------------ test/unit/instrumentation/ssh.test.ts | 33 +++++++++++++++++++++++++++ test/unit/remote/sshProcess.test.ts | 9 +++++--- 3 files changed, 56 insertions(+), 17 deletions(-) diff --git a/src/remote/sshProcess.ts b/src/remote/sshProcess.ts index 4c007407..77b1d68a 100644 --- a/src/remote/sshProcess.ts +++ b/src/remote/sshProcess.ts @@ -83,8 +83,10 @@ export class SshProcessMonitor implements vscode.Disposable { private disposed = false; private currentPid: number | undefined; private logFilePath: string | undefined; - private pendingTimeout: NodeJS.Timeout | undefined; - private pendingDelayResolve: (() => void) | undefined; + private readonly pendingDelays = new Set<{ + timer: NodeJS.Timeout; + resolve: () => void; + }>(); private lastStaleSearchTime = 0; private readonly reporter: NetworkStatusReporter; @@ -191,13 +193,12 @@ export class SshProcessMonitor implements vscode.Disposable { } this.telemetry.disposed(); this.disposed = true; - if (this.pendingTimeout) { - clearTimeout(this.pendingTimeout); - this.pendingTimeout = undefined; + // Unblock all in-flight delay() calls so concurrent loops exit promptly. + for (const handle of this.pendingDelays) { + clearTimeout(handle.timer); + handle.resolve(); } - // Unblock any in-flight delay() so discovery loops exit promptly. - this.pendingDelayResolve?.(); - this.pendingDelayResolve = undefined; + this.pendingDelays.clear(); this.statusBarItem.dispose(); this._onLogFilePathChange.dispose(); this._onPidChange.dispose(); @@ -211,12 +212,14 @@ export class SshProcessMonitor implements vscode.Disposable { return; } await new Promise((resolve) => { - this.pendingDelayResolve = resolve; - this.pendingTimeout = setTimeout(() => { - this.pendingTimeout = undefined; - this.pendingDelayResolve = undefined; - resolve(); - }, ms); + const handle = { + timer: setTimeout(() => { + this.pendingDelays.delete(handle); + resolve(); + }, ms), + resolve, + }; + this.pendingDelays.add(handle); }); } diff --git a/test/unit/instrumentation/ssh.test.ts b/test/unit/instrumentation/ssh.test.ts index 10ea34c0..fbeea890 100644 --- a/test/unit/instrumentation/ssh.test.ts +++ b/test/unit/instrumentation/ssh.test.ts @@ -80,6 +80,39 @@ describe("SshTelemetry", () => { }); }); + describe("processRecovered", () => { + it("is a no-op when the process is not currently lost", () => { + const { ssh, sink } = setup(); + + ssh.processStarted(); + ssh.processRecovered(); + + expect(sink.events).toHaveLength(0); + }); + + it("emits ssh.process.recovered with recoveryDurationMs after a loss", () => { + const { ssh, sink } = setup(); + + ssh.processStarted(); + ssh.processLost("stale_network_info"); + ssh.processRecovered(); + + const [event] = sink.eventsNamed("ssh.process.recovered"); + expect(event.measurements.recoveryDurationMs).toEqual(expect.any(Number)); + }); + + it("does not double-emit when called twice without another loss", () => { + const { ssh, sink } = setup(); + + ssh.processStarted(); + ssh.processLost("stale_network_info"); + ssh.processRecovered(); + ssh.processRecovered(); + + expect(sink.eventsNamed("ssh.process.recovered")).toHaveLength(1); + }); + }); + describe("processLost", () => { it("is a no-op when there is no started process", () => { const { ssh, sink } = setup(); diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index aadd13b6..01d0e6e4 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -445,6 +445,9 @@ describe("SshProcessMonitor", () => { }); await waitForEvent(monitor.onPidChange); await waitFor(() => sink.eventsNamed("ssh.process.replaced").length > 0); + // Halt monitoring before the negative assertion so the 888 loop + // can't race ahead and emit its own lost/recovered cycle. + monitor.dispose(); const replaced = sink.eventsNamed("ssh.process.replaced"); expect(replaced[0].properties).toMatchObject({ wasLost: "true" }); @@ -461,9 +464,9 @@ describe("SshProcessMonitor", () => { ...sshLog, "/network/999.json": makeNetworkJson(), }); - vi.mocked(find) - .mockResolvedValueOnce([{ pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }]) - .mockResolvedValue([{ pid: 888, ppid: 1, name: "ssh", cmd: "ssh" }]); + vi.mocked(find).mockResolvedValue([ + { pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }, + ]); const monitor = createMonitor({ networkInfoPath: "/network", From a8a1853219866a6a825582a797b3e031065b8863 Mon Sep 17 00:00:00 2001 From: Ehab Younes Date: Fri, 8 May 2026 21:39:45 +0300 Subject: [PATCH 7/7] refactor(telemetry): clarify reconnect cycle event name - Rename connection.reconnected to connection.reconnect_resolved so the event name reflects "cycle ended, outcome in result" instead of implying success. Failed cycles emit with result=error and are no longer miscounted by name-only consumers. - Drop unused export on ProcessDiscoveryResult. - Replace em-dash in sshProcess.test.ts comment. --- src/instrumentation/ssh.ts | 2 +- src/instrumentation/websocket.ts | 2 +- test/unit/instrumentation/websocket.test.ts | 19 ++++++++++--------- test/unit/remote/sshProcess.test.ts | 2 +- .../websocket/reconnectingWebSocket.test.ts | 4 ++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/instrumentation/ssh.ts b/src/instrumentation/ssh.ts index d4e2721c..690d2ff2 100644 --- a/src/instrumentation/ssh.ts +++ b/src/instrumentation/ssh.ts @@ -13,7 +13,7 @@ interface NetworkSample { readonly latencyMs: number; } -export interface ProcessDiscoveryResult { +interface ProcessDiscoveryResult { readonly pid: number | undefined; readonly attempts: number; } diff --git a/src/instrumentation/websocket.ts b/src/instrumentation/websocket.ts index 23fb39d6..9d52ecdd 100644 --- a/src/instrumentation/websocket.ts +++ b/src/instrumentation/websocket.ts @@ -167,7 +167,7 @@ export class WebSocketTelemetry { if (outcome.result === "error") { properties.terminationReason = outcome.terminationReason; } - this.#telemetry.log("connection.reconnected", properties, { + this.#telemetry.log("connection.reconnect_resolved", properties, { attempts: cycle.attempts, totalDurationMs: performance.now() - cycle.startMs, }); diff --git a/test/unit/instrumentation/websocket.test.ts b/test/unit/instrumentation/websocket.test.ts index 1dafa041..e4420fa8 100644 --- a/test/unit/instrumentation/websocket.test.ts +++ b/test/unit/instrumentation/websocket.test.ts @@ -119,32 +119,32 @@ describe("WebSocketTelemetry", () => { ws.reset(); ws.opened("/api/test"); - expect(sink.eventsNamed("connection.reconnected")).toHaveLength(0); + expect(sink.eventsNamed("connection.reconnect_resolved")).toHaveLength(0); }); }); describe("reconnect cycle", () => { - it("emits connection.reconnected with success when opened closes the cycle", () => { + it("emits connection.reconnect_resolved with success when opened closes the cycle", () => { const { ws, sink } = setup(); ws.reconnectStarted("manual_reconnect"); ws.connectStarted(); ws.opened("/api/test"); - const [event] = sink.eventsNamed("connection.reconnected"); + const [event] = sink.eventsNamed("connection.reconnect_resolved"); expect(event).toMatchObject({ properties: { result: "success", reason: "manual_reconnect" }, measurements: { attempts: 1, totalDurationMs: expect.any(Number) }, }); }); - it("emits connection.reconnected with error when terminated closes the cycle", () => { + it("emits connection.reconnect_resolved with error when terminated closes the cycle", () => { const { ws, sink } = setup(); ws.reconnectStarted("manual_reconnect"); ws.terminated("unrecoverable_http", { cause: "error" }); - const [event] = sink.eventsNamed("connection.reconnected"); + const [event] = sink.eventsNamed("connection.reconnect_resolved"); expect(event.properties).toEqual({ result: "error", reason: "manual_reconnect", @@ -157,7 +157,7 @@ describe("WebSocketTelemetry", () => { ws.terminated("dispose", { cause: "disposed" }); - expect(sink.eventsNamed("connection.reconnected")).toHaveLength(0); + expect(sink.eventsNamed("connection.reconnect_resolved")).toHaveLength(0); }); it("counts each connectStarted as an attempt within the cycle", () => { @@ -170,7 +170,8 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); expect( - sink.eventsNamed("connection.reconnected")[0].measurements.attempts, + sink.eventsNamed("connection.reconnect_resolved")[0].measurements + .attempts, ).toBe(3); }); @@ -182,7 +183,7 @@ describe("WebSocketTelemetry", () => { ws.opened("/api/test"); expect( - sink.eventsNamed("connection.reconnected")[0].properties.reason, + sink.eventsNamed("connection.reconnect_resolved")[0].properties.reason, ).toBe("manual_reconnect"); }); @@ -198,7 +199,7 @@ describe("WebSocketTelemetry", () => { expect(sink.eventsNamed("connection.dropped")).toHaveLength(1); ws.opened("/api/test"); - expect(sink.eventsNamed("connection.reconnected")).toHaveLength(1); + expect(sink.eventsNamed("connection.reconnect_resolved")).toHaveLength(1); }); }); }); diff --git a/test/unit/remote/sshProcess.test.ts b/test/unit/remote/sshProcess.test.ts index 01d0e6e4..fed7e4ec 100644 --- a/test/unit/remote/sshProcess.test.ts +++ b/test/unit/remote/sshProcess.test.ts @@ -405,7 +405,7 @@ describe("SshProcessMonitor", () => { vi.useFakeTimers(); const { telemetry, sink } = createTestTelemetry(); vol.fromJSON(sshLog); - // No /network/999.json — every read fails until threshold is reached. + // No /network/999.json. Every read fails until threshold is reached. vi.mocked(find).mockResolvedValue([ { pid: 999, ppid: 1, name: "ssh", cmd: "ssh" }, ]); diff --git a/test/unit/websocket/reconnectingWebSocket.test.ts b/test/unit/websocket/reconnectingWebSocket.test.ts index 06e343a6..3dafb38d 100644 --- a/test/unit/websocket/reconnectingWebSocket.test.ts +++ b/test/unit/websocket/reconnectingWebSocket.test.ts @@ -627,7 +627,7 @@ describe("ReconnectingWebSocket", () => { expect(sink.eventsNamed("connection.opened")).toHaveLength(2); expect(sink.eventsNamed("connection.dropped")).toHaveLength(2); - expect(sink.eventsNamed("connection.reconnected")).toMatchObject([ + expect(sink.eventsNamed("connection.reconnect_resolved")).toMatchObject([ { properties: { result: "success", reason: "unexpected_close" }, measurements: { attempts: 1, totalDurationMs: expect.any(Number) }, @@ -679,7 +679,7 @@ describe("ReconnectingWebSocket", () => { await vi.waitFor(() => expect(sockets).toHaveLength(2)); sockets[1].fireOpen(); - const reconnected = sink.eventsNamed("connection.reconnected"); + const reconnected = sink.eventsNamed("connection.reconnect_resolved"); expect(reconnected).toHaveLength(1); expect(reconnected[0].properties.reason).toBe("certificate_refresh");