From d53d02733279787c981001b1df4d35e1a450c6ce Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Tue, 23 Jun 2026 15:33:46 +0100 Subject: [PATCH 01/10] fix(auth): don't eject users to invite screen on transient access-check failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updateCodeAccessFromSession ran on every token refresh and treated any non-success response (403, 5xx, timeout, network error) as a definitive "no access", flipping hasCodeAccess to false and bouncing authenticated users to the invite screen mid-session — e.g. during a brief API outage. Now only a definitive 200 response changes the gate. Transient failures are retried with backoff and, if they persist, the last known value is preserved (a granted user stays in the app; a never-determined user stays at null, showing the "Checking access" spinner) instead of collapsing to false. Co-Authored-By: Claude Opus 4.8 Generated-By: PostHog Code Task-Id: 8d2fbac8-0fbd-493a-b267-eeb5ba2ba542 --- packages/core/src/auth/auth.test.ts | 112 ++++++++++++++++++++++++++++ packages/core/src/auth/auth.ts | 69 +++++++++++++---- 2 files changed, 166 insertions(+), 15 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 1b9448e04..8c8e3e720 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1342,4 +1342,116 @@ describe("AuthService", () => { expect(redeemCallCount).toBe(2); }); }); + + describe("code access check resilience", () => { + const stubFetchWithCheckAccess = ( + checkAccess: (callCount: number) => Response, + ): { getCheckAccessCalls: () => number } => { + let checkAccessCalls = 0; + vi.stubGlobal( + "fetch", + vi.fn(async (input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + + if (url.includes("/api/users/@me/")) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + uuid: "user-1", + organization: { id: "org-1" }, + }), + } as unknown as Response; + } + + if (/\/api\/organizations\/[^/]+\/$/.test(url)) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + name: "Org 1", + teams: [{ id: 42, name: "Project 42" }], + }), + } as unknown as Response; + } + + if (url.includes("/invites/check-access/")) { + checkAccessCalls++; + return checkAccess(checkAccessCalls); + } + + return { + ok: true, + json: vi.fn().mockResolvedValue({}), + } as unknown as Response; + }) as unknown as typeof fetch, + ); + return { getCheckAccessCalls: () => checkAccessCalls }; + }; + + const restoreSession = async () => { + seedStoredSession({ selectedProjectId: 42 }); + oauthFlow.refreshToken.mockResolvedValue( + mockTokenResponse({ + accessToken: "access-token", + refreshToken: "refresh-token", + }), + ); + await service.initialize(); + }; + + it("retries a transient failure and grants access on the retry", async () => { + const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => + call === 1 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + + expect(service.getState().hasCodeAccess).toBe(true); + expect(getCheckAccessCalls()).toBe(2); + }); + + it("preserves prior state instead of ejecting the user when the check keeps failing", async () => { + const { getCheckAccessCalls } = stubFetchWithCheckAccess( + () => + ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + }) as unknown as Response, + ); + + await restoreSession(); + + const state = service.getState(); + expect(state.status).toBe("authenticated"); + // Access was never determined, so it stays null (the "Checking access" + // spinner) — never false, which would wrongly show the invite screen. + expect(state.hasCodeAccess).toBeNull(); + expect(getCheckAccessCalls()).toBe(3); + }); + + it("still gates the user when the server definitively reports no access", async () => { + stubFetchWithCheckAccess( + () => + ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: false }), + }) as unknown as Response, + ); + + await restoreSession(); + + expect(service.getState().hasCodeAccess).toBe(false); + }); + }); }); diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index b7b47b424..c812134ed 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -910,27 +910,66 @@ export class AuthService extends TypedEventEmitter { return; } - try { - const apiHost = getCloudUrlFromRegion(this.session.cloudRegion); - const response = await this.executeAuthenticatedFetch( - fetch, - `${apiHost}/api/code/invites/check-access/`, - {}, - this.session.accessToken, - ); - const data = (await response.json().catch(() => ({}))) as { - has_access?: boolean; - }; + const apiHost = getCloudUrlFromRegion(this.session.cloudRegion); + const accessToken = this.session.accessToken; - this.updateState({ hasCodeAccess: data.has_access === true }); - } catch (error) { - this.logger.warn("Failed to update code access state", { error }); - this.updateState({ hasCodeAccess: false }); + for ( + let attempt = 0; + attempt < AuthService.CODE_ACCESS_MAX_ATTEMPTS; + attempt++ + ) { + try { + const response = await this.executeAuthenticatedFetch( + fetch, + `${apiHost}/api/code/invites/check-access/`, + {}, + accessToken, + ); + + // Only a definitive 200 response is allowed to change the gate. The + // negative case (genuinely no access) comes back as 200 with + // `has_access: false`; a non-2xx status means the request itself + // failed (outage, rejected token, rate limit) and is never a reliable + // "no access" signal, so we retry rather than eject the user. + if (response.ok) { + const data = (await response.json().catch(() => ({}))) as { + has_access?: boolean; + }; + this.updateState({ hasCodeAccess: data.has_access === true }); + return; + } + + this.logger.warn("Transient failure checking code access, retrying", { + attempt, + status: response.status, + }); + } catch (error) { + this.logger.warn("Failed to reach code access check, retrying", { + attempt, + error, + }); + } + + const isLastAttempt = + attempt === AuthService.CODE_ACCESS_MAX_ATTEMPTS - 1; + if (isLastAttempt) break; + + await sleepWithBackoff(attempt, AuthService.REFRESH_BACKOFF); } + + // Every attempt failed transiently. Preserve the last known value rather + // than asserting `false`: a previously granted user stays in the app, and + // a never-determined user stays at `null` (the "Checking access" spinner) + // instead of being wrongly bounced to the invite screen during an outage. + this.logger.warn( + "Code access check failed after retries, preserving previous state", + { hasCodeAccess: this.state.hasCodeAccess }, + ); } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; + private static readonly CODE_ACCESS_MAX_ATTEMPTS = 3; private static readonly REFRESH_BACKOFF: BackoffOptions = { initialDelayMs: 1_000, maxDelayMs: 5_000, From 6c0bd8c063b2016c247f16dd888eaaf3fb9cd4a7 Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Tue, 23 Jun 2026 16:04:06 +0100 Subject: [PATCH 02/10] fix(auth): retry code-access check with prime backoff, fail closed after ~90s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the access-check resilience: one awaited attempt keeps the healthy path synchronous, and on a transient failure we retry in the background with prime-number backoff (2,3,5,7,11,13,17,19,23s) up to a ~90s budget instead of ejecting the user. The current value is preserved across the retry window so a brief outage no longer bounces active users to the invite screen. Once the budget is exhausted without a definitive 200, we now fail closed (hasCodeAccess=false) rather than preserving access indefinitely — so the gate can't be bypassed by keeping the client offline. The per-attempt fetch timeout for this check is tightened to 10s, and the background loop is decoupled from the token-refresh path so it can't stall other API calls. Generated-By: PostHog Code Task-Id: 8d2fbac8-0fbd-493a-b267-eeb5ba2ba542 --- packages/core/src/auth/auth.test.ts | 28 +++--- packages/core/src/auth/auth.ts | 140 ++++++++++++++++++---------- 2 files changed, 110 insertions(+), 58 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 8c8e3e720..cafd9a987 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1398,9 +1398,11 @@ describe("AuthService", () => { await service.initialize(); }; - it("retries a transient failure and grants access on the retry", async () => { + it("recovers in the background after transient failures without ejecting the user", async () => { + // Fails twice, then succeeds — a real user weathering a blip must end up + // with access, never having been flipped to `false`. const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => - call === 1 + call <= 2 ? ({ ok: false, status: 403, @@ -1415,11 +1417,13 @@ describe("AuthService", () => { await restoreSession(); - expect(service.getState().hasCodeAccess).toBe(true); - expect(getCheckAccessCalls()).toBe(2); + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(true), + ); + expect(getCheckAccessCalls()).toBe(3); }); - it("preserves prior state instead of ejecting the user when the check keeps failing", async () => { + it("fails closed once the retry budget is exhausted", async () => { const { getCheckAccessCalls } = stubFetchWithCheckAccess( () => ({ @@ -1431,12 +1435,14 @@ describe("AuthService", () => { await restoreSession(); - const state = service.getState(); - expect(state.status).toBe("authenticated"); - // Access was never determined, so it stays null (the "Checking access" - // spinner) — never false, which would wrongly show the invite screen. - expect(state.hasCodeAccess).toBeNull(); - expect(getCheckAccessCalls()).toBe(3); + // Persistent inability to call home eventually revokes access so the + // invite gate can't be bypassed by staying offline. + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(false), + ); + expect(service.getState().status).toBe("authenticated"); + // 1 awaited attempt + one per prime-backoff step before the budget caps. + expect(getCheckAccessCalls()).toBe(9); }); it("still gates the user when the server definitively reports no access", async () => { diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index c812134ed..326908d98 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -910,71 +910,117 @@ export class AuthService extends TypedEventEmitter { return; } - const apiHost = getCloudUrlFromRegion(this.session.cloudRegion); - const accessToken = this.session.accessToken; - - for ( - let attempt = 0; - attempt < AuthService.CODE_ACCESS_MAX_ATTEMPTS; - attempt++ - ) { - try { - const response = await this.executeAuthenticatedFetch( - fetch, - `${apiHost}/api/code/invites/check-access/`, - {}, - accessToken, - ); + // The healthy path stays synchronous: one awaited attempt resolves the + // gate before the app renders. A transient failure must NOT eject an + // active user, so we keep the current value and retry in the background, + // only failing closed once the retry budget is exhausted (see + // `runCodeAccessRetry`). That stops a brief outage from bouncing real + // users to the invite screen, while still preventing the gate from being + // bypassed by simply keeping the client offline indefinitely. + const resolved = await this.checkCodeAccessOnce(); + if (!resolved) { + this.scheduleCodeAccessRetry(); + } + } + // Returns true when the server gave a definitive answer (a 200, which also + // updates the gate) and false when the request failed transiently and the + // caller should retry. Genuine "no access" is a 200 with + // `has_access: false`; a non-2xx status (rejected token, rate limit, outage) + // is never a reliable "no access" signal. + private async checkCodeAccessOnce(): Promise { + const session = this.session; + if (!session) return false; - // Only a definitive 200 response is allowed to change the gate. The - // negative case (genuinely no access) comes back as 200 with - // `has_access: false`; a non-2xx status means the request itself - // failed (outage, rejected token, rate limit) and is never a reliable - // "no access" signal, so we retry rather than eject the user. - if (response.ok) { - const data = (await response.json().catch(() => ({}))) as { - has_access?: boolean; - }; - this.updateState({ hasCodeAccess: data.has_access === true }); - return; - } + try { + const apiHost = getCloudUrlFromRegion(session.cloudRegion); + const response = await this.executeAuthenticatedFetch( + fetch, + `${apiHost}/api/code/invites/check-access/`, + { + signal: AbortSignal.timeout( + AuthService.CODE_ACCESS_FETCH_TIMEOUT_MS, + ), + }, + session.accessToken, + ); - this.logger.warn("Transient failure checking code access, retrying", { - attempt, + if (!response.ok) { + this.logger.warn("Transient failure checking code access", { status: response.status, }); - } catch (error) { - this.logger.warn("Failed to reach code access check, retrying", { - attempt, - error, - }); + return false; } - const isLastAttempt = - attempt === AuthService.CODE_ACCESS_MAX_ATTEMPTS - 1; - if (isLastAttempt) break; + const data = (await response.json().catch(() => ({}))) as { + has_access?: boolean; + }; + this.updateState({ hasCodeAccess: data.has_access === true }); + return true; + } catch (error) { + this.logger.warn("Failed to reach code access check", { error }); + return false; + } + } + private scheduleCodeAccessRetry(): void { + if (this.codeAccessRetryPromise) return; + this.codeAccessRetryPromise = this.runCodeAccessRetry().finally(() => { + this.codeAccessRetryPromise = null; + }); + } + // Background retry loop with prime-number backoff — primes keep retries from + // synchronising into bursts. We keep retrying until the cumulative backoff + // would exceed the budget (~90s), then fail closed so a sustained inability + // to call home revokes access rather than granting it forever. + private async runCodeAccessRetry(): Promise { + let backoffMs = 0; + + for (let attempt = 0; ; attempt++) { + const delayMs = + AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS[ + Math.min( + attempt, + AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS.length - 1, + ) + ]; + if (backoffMs + delayMs > AuthService.CODE_ACCESS_RETRY_BUDGET_MS) { + break; + } - await sleepWithBackoff(attempt, AuthService.REFRESH_BACKOFF); + await sleepWithBackoff(0, { initialDelayMs: delayMs }); + backoffMs += delayMs; + + // The session may have ended (e.g. logout) while we were waiting. + if (!this.session) return; + if (await this.checkCodeAccessOnce()) return; } - // Every attempt failed transiently. Preserve the last known value rather - // than asserting `false`: a previously granted user stays in the app, and - // a never-determined user stays at `null` (the "Checking access" spinner) - // instead of being wrongly bounced to the invite screen during an outage. - this.logger.warn( - "Code access check failed after retries, preserving previous state", - { hasCodeAccess: this.state.hasCodeAccess }, - ); + // Could not confirm access within the budget: fail closed so the invite + // gate can't be bypassed by keeping the client offline. + if (this.session) { + this.logger.warn( + "Code access check failed within budget, failing closed", + { backoffMs }, + ); + this.updateState({ hasCodeAccess: false }); + } } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; - private static readonly CODE_ACCESS_MAX_ATTEMPTS = 3; + private static readonly CODE_ACCESS_FETCH_TIMEOUT_MS = 10_000; + private static readonly CODE_ACCESS_RETRY_BUDGET_MS = 90_000; + // Prime-number backoff (ms). Stepping through primes (rather than a doubling + // schedule) keeps many clients' retries from lining up after an outage. We + // walk the sequence until the cumulative wait would exceed the budget above. + private static readonly CODE_ACCESS_BACKOFF_PRIMES_MS = [ + 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, + ]; private static readonly REFRESH_BACKOFF: BackoffOptions = { initialDelayMs: 1_000, maxDelayMs: 5_000, multiplier: 2, }; + private codeAccessRetryPromise: Promise | null = null; private recoveryPromise: Promise | null = null; private orgProjectsRefreshPromise: Promise | null = null; private connectivityUnsubscribe: (() => void) | null = null; From bd764b0342777992d85254f815be8204de563400 Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Tue, 23 Jun 2026 16:36:25 +0100 Subject: [PATCH 03/10] fix(auth): close stale-retry race and simplify code-access backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses qa-swarm review on the PR: - Race (HIGH): a background retry loop from an earlier check could fail closed and clobber a fresher `hasCodeAccess: true` set by a later token-refresh check, ejecting an active user. Each authoritative check now bumps a sequence and the retry loop bails if it's been superseded (or the session ended) before mutating state. - Collapse the redundant 90s budget + prime-array into a single stop condition: the loop now just walks the prime schedule (~77s total), which removes the unreachable Math.min clamp and the ~90s-vs-~77s comment drift. - Replace the sleepWithBackoff(0, …) plain-sleep hack with a real sleep() helper added to @posthog/shared. - Tests: de-magic the call-count assertion, assert the actual prime-backoff schedule is exercised, and add a regression test for the stale-loop race. Generated-By: PostHog Code Task-Id: 8d2fbac8-0fbd-493a-b267-eeb5ba2ba542 --- packages/core/src/auth/auth.test.ts | 55 ++++++++++++++++++++-- packages/core/src/auth/auth.ts | 73 +++++++++++------------------ packages/shared/src/backoff.ts | 7 +++ packages/shared/src/index.ts | 1 + 4 files changed, 87 insertions(+), 49 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index cafd9a987..ba6b21214 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1,6 +1,6 @@ import type { RootLogger } from "@posthog/di/logger"; import type { IPowerManager } from "@posthog/platform/power-manager"; -import { OAUTH_SCOPE_VERSION } from "@posthog/shared"; +import { OAUTH_SCOPE_VERSION, sleep } from "@posthog/shared"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { AuthService } from "./auth"; import type { @@ -21,6 +21,7 @@ vi.mock("@posthog/shared", async (importOriginal) => { return { ...actual, sleepWithBackoff: vi.fn().mockResolvedValue(undefined), + sleep: vi.fn().mockResolvedValue(undefined), }; }); @@ -1441,8 +1442,56 @@ describe("AuthService", () => { expect(service.getState().hasCodeAccess).toBe(false), ); expect(service.getState().status).toBe("authenticated"); - // 1 awaited attempt + one per prime-backoff step before the budget caps. - expect(getCheckAccessCalls()).toBe(9); + // 1 awaited attempt + one retry per prime-backoff step (9 primes). + expect(getCheckAccessCalls()).toBe(10); + // The backoff schedule is actually exercised: each retry waits the next + // prime delay (the mocked sleep records the requested durations). + expect(vi.mocked(sleep).mock.calls.map((call) => call[0])).toEqual([ + 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, + ]); + }); + + it("does not let a stale retry loop clobber a fresh grant", async () => { + // Park the retry loop on its first backoff so we can interleave a newer + // authoritative check that succeeds before the stale loop wakes. + let releaseSleep!: () => void; + const gate = new Promise((resolve) => { + releaseSleep = resolve; + }); + vi.mocked(sleep).mockImplementationOnce(() => gate); + + let call = 0; + stubFetchWithCheckAccess(() => { + call++; + // First (restore) attempt fails and schedules the retry loop; every + // later check succeeds with access. + return call === 1 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response); + }); + + await restoreSession(); + // Restore's attempt failed transiently; gate is unresolved (null, not false). + expect(service.getState().hasCodeAccess).toBeNull(); + + // A fresh authoritative check (token refresh) resolves access true and + // supersedes the parked loop's sequence. + await service.refreshAccessToken(); + expect(service.getState().hasCodeAccess).toBe(true); + + // Wake the stale loop: it must notice it was superseded and NOT fail closed. + releaseSleep(); + await Promise.resolve(); + await Promise.resolve(); + expect(service.getState().hasCodeAccess).toBe(true); }); it("still gates the user when the server definitively reports no access", async () => { diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 326908d98..0dc010b2d 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -9,6 +9,7 @@ import { getCloudUrlFromRegion, NotAuthenticatedError, OAUTH_SCOPE_VERSION, + sleep, sleepWithBackoff, TypedEventEmitter, withTimeout, @@ -910,16 +911,21 @@ export class AuthService extends TypedEventEmitter { return; } + // Every call is a fresh authoritative check; bumping the sequence + // supersedes any retry loop still running from an earlier call so a stale + // loop can never clobber a newer result. + const seq = ++this.codeAccessCheckSeq; + // The healthy path stays synchronous: one awaited attempt resolves the // gate before the app renders. A transient failure must NOT eject an // active user, so we keep the current value and retry in the background, - // only failing closed once the retry budget is exhausted (see + // only failing closed once the retry window is exhausted (see // `runCodeAccessRetry`). That stops a brief outage from bouncing real // users to the invite screen, while still preventing the gate from being // bypassed by simply keeping the client offline indefinitely. const resolved = await this.checkCodeAccessOnce(); if (!resolved) { - this.scheduleCodeAccessRetry(); + void this.runCodeAccessRetry(seq); } } // Returns true when the server gave a definitive answer (a 200, which also @@ -961,57 +967,32 @@ export class AuthService extends TypedEventEmitter { return false; } } - private scheduleCodeAccessRetry(): void { - if (this.codeAccessRetryPromise) return; - this.codeAccessRetryPromise = this.runCodeAccessRetry().finally(() => { - this.codeAccessRetryPromise = null; - }); - } - // Background retry loop with prime-number backoff — primes keep retries from - // synchronising into bursts. We keep retrying until the cumulative backoff - // would exceed the budget (~90s), then fail closed so a sustained inability - // to call home revokes access rather than granting it forever. - private async runCodeAccessRetry(): Promise { - let backoffMs = 0; - - for (let attempt = 0; ; attempt++) { - const delayMs = - AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS[ - Math.min( - attempt, - AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS.length - 1, - ) - ]; - if (backoffMs + delayMs > AuthService.CODE_ACCESS_RETRY_BUDGET_MS) { - break; - } - - await sleepWithBackoff(0, { initialDelayMs: delayMs }); - backoffMs += delayMs; - - // The session may have ended (e.g. logout) while we were waiting. - if (!this.session) return; + // Background retry loop with prime-number backoff. Stepping through primes + // (rather than a doubling schedule) keeps many clients' retries from lining + // up into bursts after a shared outage. The prime sequence is the whole + // budget — it sums to ~77s of waiting — after which we fail closed so a + // sustained inability to call home revokes access rather than granting it + // forever. `seq` guards against a newer authoritative check (or logout) + // having superseded this loop while it slept. + private async runCodeAccessRetry(seq: number): Promise { + for (const delayMs of AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS) { + await sleep(delayMs); + if (this.codeAccessCheckSeq !== seq || !this.session) return; if (await this.checkCodeAccessOnce()) return; } - // Could not confirm access within the budget: fail closed so the invite - // gate can't be bypassed by keeping the client offline. - if (this.session) { - this.logger.warn( - "Code access check failed within budget, failing closed", - { backoffMs }, - ); - this.updateState({ hasCodeAccess: false }); - } + if (this.codeAccessCheckSeq !== seq || !this.session) return; + // Could not confirm access within the retry window: fail closed so the + // invite gate can't be bypassed by keeping the client offline. + this.logger.warn("Code access check failed within retry window, failing closed"); + this.updateState({ hasCodeAccess: false }); } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; private static readonly CODE_ACCESS_FETCH_TIMEOUT_MS = 10_000; - private static readonly CODE_ACCESS_RETRY_BUDGET_MS = 90_000; - // Prime-number backoff (ms). Stepping through primes (rather than a doubling - // schedule) keeps many clients' retries from lining up after an outage. We - // walk the sequence until the cumulative wait would exceed the budget above. + // Prime-number backoff (ms) for the code-access retry loop; the sequence + // sums to ~77s, which is the full retry window before failing closed. private static readonly CODE_ACCESS_BACKOFF_PRIMES_MS = [ 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, ]; @@ -1020,7 +1001,7 @@ export class AuthService extends TypedEventEmitter { maxDelayMs: 5_000, multiplier: 2, }; - private codeAccessRetryPromise: Promise | null = null; + private codeAccessCheckSeq = 0; private recoveryPromise: Promise | null = null; private orgProjectsRefreshPromise: Promise | null = null; private connectivityUnsubscribe: (() => void) | null = null; diff --git a/packages/shared/src/backoff.ts b/packages/shared/src/backoff.ts index 8552773ae..4ac881ad3 100644 --- a/packages/shared/src/backoff.ts +++ b/packages/shared/src/backoff.ts @@ -29,3 +29,10 @@ export function sleepWithBackoff( const delay = getBackoffDelay(attempt, options); return new Promise((resolve) => setTimeout(resolve, delay)); } + +/** + * Sleep for a fixed number of milliseconds. + */ +export function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index 8ecefc4d5..b0d97d870 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -4,6 +4,7 @@ export { withTimeout } from "./async"; export { type BackoffOptions, getBackoffDelay, + sleep, sleepWithBackoff, } from "./backoff"; export { From 623e8a7e775c7dc9532acb66d52f8c9396566e3a Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Tue, 23 Jun 2026 16:48:34 +0100 Subject: [PATCH 04/10] test(auth): cover thrown network errors in the code-access retry loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses a greptile review note: the existing resilience tests drove the non-happy path with non-2xx responses only, leaving the checkCodeAccessOnce catch branch (thrown network errors — timeouts, DNS failures, fetch rejections) unexercised. Adds a test where the check throws, asserting it's treated as a transient failure (retried, then failed closed). Generated-By: PostHog Code Task-Id: 8d2fbac8-0fbd-493a-b267-eeb5ba2ba542 --- packages/core/src/auth/auth.test.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index ba6b21214..171fc1393 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1451,6 +1451,22 @@ describe("AuthService", () => { ]); }); + it("treats thrown network errors like transient failures and fails closed", async () => { + // The check throws (network timeout / DNS failure) rather than returning + // a non-2xx — exercises the catch branch, which must behave like a + // transient failure: retry, then fail closed. + const { getCheckAccessCalls } = stubFetchWithCheckAccess(() => { + throw new TypeError("network down"); + }); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(false), + ); + expect(getCheckAccessCalls()).toBe(10); + }); + it("does not let a stale retry loop clobber a fresh grant", async () => { // Park the retry loop on its first backoff so we can interleave a newer // authoritative check that succeeds before the stale loop wakes. From c0fa3608c4a1c32f9cbffc8f1f6d50456dd75619 Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Tue, 23 Jun 2026 16:55:06 +0100 Subject: [PATCH 05/10] fix(auth): hard-stop and log out on 401 from code-access check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review follow-up: a 401 from the code-access check means the session token is rejected, so retrying with it is pointless — hard-stop and log the user out so they re-authenticate, rather than retrying for ~77s and dropping them on the fail-closed invite screen. A 403 stays in the transient/retry path: during an outage it isn't a reliable "your access is gone" signal. Generated-By: PostHog Code Task-Id: 8d2fbac8-0fbd-493a-b267-eeb5ba2ba542 --- packages/core/src/auth/auth.test.ts | 20 ++++++++++++++++++++ packages/core/src/auth/auth.ts | 21 ++++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 171fc1393..eb87b9fa9 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1510,6 +1510,26 @@ describe("AuthService", () => { expect(service.getState().hasCodeAccess).toBe(true); }); + it("logs out on a 401 instead of retrying", async () => { + const { getCheckAccessCalls } = stubFetchWithCheckAccess( + () => + ({ + ok: false, + status: 401, + json: () => Promise.resolve({}), + }) as unknown as Response, + ); + + await restoreSession(); + + // A rejected token is a hard stop: log the user out (no retry, no + // fail-closed invite screen) so they re-authenticate. + const state = service.getState(); + expect(state.status).toBe("anonymous"); + expect(state.hasCodeAccess).toBeNull(); + expect(getCheckAccessCalls()).toBe(1); + }); + it("still gates the user when the server definitively reports no access", async () => { stubFetchWithCheckAccess( () => diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 0dc010b2d..1fa48b659 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -928,11 +928,12 @@ export class AuthService extends TypedEventEmitter { void this.runCodeAccessRetry(seq); } } - // Returns true when the server gave a definitive answer (a 200, which also - // updates the gate) and false when the request failed transiently and the - // caller should retry. Genuine "no access" is a 200 with - // `has_access: false`; a non-2xx status (rejected token, rate limit, outage) - // is never a reliable "no access" signal. + // Returns true when the check reached a definitive end and the caller must + // NOT retry: either a 200 (which updates the gate from `has_access`) or a + // 401 (the token itself is rejected, so we hard-stop and log out). Returns + // false only for transient failures worth retrying — a 403 (which during an + // outage looks transient), 429, 5xx, or a thrown network error. Genuine + // "no access" is a 200 with `has_access: false`, never a non-2xx status. private async checkCodeAccessOnce(): Promise { const session = this.session; if (!session) return false; @@ -950,6 +951,16 @@ export class AuthService extends TypedEventEmitter { session.accessToken, ); + if (response.status === 401) { + // The token is rejected — retrying can't fix it. Hard-stop and log out + // so the user re-authenticates instead of being retried into a + // fail-closed invite screen. (A 403 stays in the transient path below: + // during an outage it's not a reliable "your access is gone" signal.) + this.logger.warn("Code access check returned 401, logging out"); + await this.logout(); + return true; + } + if (!response.ok) { this.logger.warn("Transient failure checking code access", { status: response.status, From bb49f240bdf71d7c310d8ecf0f5fd6afba700a4f Mon Sep 17 00:00:00 2001 From: pauldambra Date: Thu, 25 Jun 2026 20:10:14 +0100 Subject: [PATCH 06/10] fix: address qa-swarm backoff comment accuracy and sleep delegation Correct retry-budget comment from ~77s to ~100s (2+3+5+7+11+13+17+19+23=100) in two locations in auth.ts. Make sleepWithBackoff delegate to sleep so the Promise/setTimeout body is expressed OnceAndOnlyOnce. Generated-By: PostHog Code Task-Id: 47d056fa-5014-40c5-b330-d5c34d9d49bc --- packages/core/src/auth/auth.ts | 12 ++++++------ packages/shared/src/backoff.ts | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 1fa48b659..e76bdd690 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -944,9 +944,7 @@ export class AuthService extends TypedEventEmitter { fetch, `${apiHost}/api/code/invites/check-access/`, { - signal: AbortSignal.timeout( - AuthService.CODE_ACCESS_FETCH_TIMEOUT_MS, - ), + signal: AbortSignal.timeout(AuthService.CODE_ACCESS_FETCH_TIMEOUT_MS), }, session.accessToken, ); @@ -981,7 +979,7 @@ export class AuthService extends TypedEventEmitter { // Background retry loop with prime-number backoff. Stepping through primes // (rather than a doubling schedule) keeps many clients' retries from lining // up into bursts after a shared outage. The prime sequence is the whole - // budget — it sums to ~77s of waiting — after which we fail closed so a + // budget — it sums to ~100s of waiting — after which we fail closed so a // sustained inability to call home revokes access rather than granting it // forever. `seq` guards against a newer authoritative check (or logout) // having superseded this loop while it slept. @@ -995,7 +993,9 @@ export class AuthService extends TypedEventEmitter { if (this.codeAccessCheckSeq !== seq || !this.session) return; // Could not confirm access within the retry window: fail closed so the // invite gate can't be bypassed by keeping the client offline. - this.logger.warn("Code access check failed within retry window, failing closed"); + this.logger.warn( + "Code access check failed within retry window, failing closed", + ); this.updateState({ hasCodeAccess: false }); } private static readonly REFRESH_MAX_ATTEMPTS = 3; @@ -1003,7 +1003,7 @@ export class AuthService extends TypedEventEmitter { private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; private static readonly CODE_ACCESS_FETCH_TIMEOUT_MS = 10_000; // Prime-number backoff (ms) for the code-access retry loop; the sequence - // sums to ~77s, which is the full retry window before failing closed. + // sums to ~100s, which is the full retry window before failing closed. private static readonly CODE_ACCESS_BACKOFF_PRIMES_MS = [ 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, ]; diff --git a/packages/shared/src/backoff.ts b/packages/shared/src/backoff.ts index 4ac881ad3..26ffa114d 100644 --- a/packages/shared/src/backoff.ts +++ b/packages/shared/src/backoff.ts @@ -26,8 +26,7 @@ export function sleepWithBackoff( attempt: number, options: BackoffOptions, ): Promise { - const delay = getBackoffDelay(attempt, options); - return new Promise((resolve) => setTimeout(resolve, delay)); + return sleep(getBackoffDelay(attempt, options)); } /** From 372b1d6180b4638044674915b96694ceb157f5ec Mon Sep 17 00:00:00 2001 From: pauldambra Date: Thu, 25 Jun 2026 20:44:23 +0100 Subject: [PATCH 07/10] fix(auth): close logout-mid-refresh race and address code-access review Resolves the deferred qa-swarm findings on the code-access check: - guard the shared refresh against a 401 logout landing mid-sync, so an awaited refresh rejects instead of handing back a torn-down session's token - model the check result as a named CodeAccessCheckOutcome instead of a bare boolean, removing the paraphrasing comment - treat an unreadable 200 body as a transient failure (retry) rather than gating the user on garbage - extract codeAccessCheckIsStale for the duplicated supersession guard Adds tests for the mid-sync-logout rejection, the unreadable-200 path, and honouring a success on the final retry. Generated-By: PostHog Code Task-Id: 47d056fa-5014-40c5-b330-d5c34d9d49bc --- packages/core/src/auth/auth.test.ts | 89 ++++++++++++++++++++++++++++- packages/core/src/auth/auth.ts | 64 +++++++++++++-------- 2 files changed, 127 insertions(+), 26 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index eb87b9fa9..81f0b4038 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1,6 +1,10 @@ import type { RootLogger } from "@posthog/di/logger"; import type { IPowerManager } from "@posthog/platform/power-manager"; -import { OAUTH_SCOPE_VERSION, sleep } from "@posthog/shared"; +import { + NotAuthenticatedError, + OAUTH_SCOPE_VERSION, + sleep, +} from "@posthog/shared"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { AuthService } from "./auth"; import type { @@ -1544,5 +1548,88 @@ describe("AuthService", () => { expect(service.getState().hasCodeAccess).toBe(false); }); + + it("treats an unreadable 200 body like a transient failure", async () => { + // A 200 whose body can't be parsed (e.g. a proxy returning truncated/HTML + // content) is not a definitive answer — it must retry, not gate the user, + // and recover once a real body arrives. + const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => + call <= 1 + ? ({ + ok: true, + status: 200, + json: () => Promise.reject(new Error("unexpected token <")), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(true), + ); + expect(getCheckAccessCalls()).toBe(2); + }); + + it("honours a success on the final retry instead of failing closed", async () => { + // 1 awaited attempt + 8 transient failures, then the 9th (final) retry + // succeeds: the loop must settle to access and never reach the + // fail-closed write, exercising the final-iteration boundary. + const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => + call < 10 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(true), + ); + expect(getCheckAccessCalls()).toBe(10); + }); + + it("rejects an in-flight refresh that logs out mid-sync on a 401", async () => { + // The race the seq guard alone doesn't cover: a 401 from the code-access + // check logs out *during* syncAuthenticatedSession, inside the shared + // refresh promise. The awaited refresh must reject rather than resolve + // with the torn-down session's (now dead) access token. + let reject401 = false; + stubFetchWithCheckAccess(() => + reject401 + ? ({ + ok: false, + status: 401, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + expect(service.getState().status).toBe("authenticated"); + + reject401 = true; + await expect(service.refreshAccessToken()).rejects.toBeInstanceOf( + NotAuthenticatedError, + ); + expect(service.getState().status).toBe("anonymous"); + expect(service.getState().hasCodeAccess).toBeNull(); + }); }); }); diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index e76bdd690..7a6ed8e1d 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -72,6 +72,8 @@ interface TokenResponseOptions { selectedProjectId: number | null; } +type CodeAccessCheckOutcome = "settled" | "retry"; + @injectable() export class AuthService extends TypedEventEmitter { private state: AuthState = { @@ -505,6 +507,12 @@ export class AuthService extends TypedEventEmitter { const refreshAndSync = async (): Promise => { const session = await this.refreshSession(sessionInput); await this.syncAuthenticatedSession(session); + // The code-access check inside the sync can hit a 401 and log us out + // mid-refresh. If it did, this session is already torn down, so reject + // rather than hand a dead token to callers awaiting the shared refresh. + if (this.session !== session) { + throw new NotAuthenticatedError(); + } return session; }; @@ -923,20 +931,20 @@ export class AuthService extends TypedEventEmitter { // `runCodeAccessRetry`). That stops a brief outage from bouncing real // users to the invite screen, while still preventing the gate from being // bypassed by simply keeping the client offline indefinitely. - const resolved = await this.checkCodeAccessOnce(); - if (!resolved) { + if ((await this.checkCodeAccessOnce()) === "retry") { void this.runCodeAccessRetry(seq); } } - // Returns true when the check reached a definitive end and the caller must - // NOT retry: either a 200 (which updates the gate from `has_access`) or a - // 401 (the token itself is rejected, so we hard-stop and log out). Returns - // false only for transient failures worth retrying — a 403 (which during an - // outage looks transient), 429, 5xx, or a thrown network error. Genuine - // "no access" is a 200 with `has_access: false`, never a non-2xx status. - private async checkCodeAccessOnce(): Promise { + // A 200 updates the gate from `has_access` and settles; a 401 means the token + // itself is rejected, so we hard-stop and log out (still "settled" — retrying + // can't fix a rejected token). Everything else is a transient failure worth + // retrying: a 403 (which during an outage isn't a reliable "your access is + // gone" signal), a 429, a 5xx, an unreadable 200 body, or a thrown network + // error. Genuine "no access" is a 200 with `has_access: false`, never a + // non-2xx status. + private async checkCodeAccessOnce(): Promise { const session = this.session; - if (!session) return false; + if (!session) return "retry"; try { const apiHost = getCloudUrlFromRegion(session.cloudRegion); @@ -950,30 +958,33 @@ export class AuthService extends TypedEventEmitter { ); if (response.status === 401) { - // The token is rejected — retrying can't fix it. Hard-stop and log out - // so the user re-authenticates instead of being retried into a - // fail-closed invite screen. (A 403 stays in the transient path below: - // during an outage it's not a reliable "your access is gone" signal.) this.logger.warn("Code access check returned 401, logging out"); await this.logout(); - return true; + return "settled"; } if (!response.ok) { this.logger.warn("Transient failure checking code access", { status: response.status, }); - return false; + return "retry"; } - const data = (await response.json().catch(() => ({}))) as { + const data = (await response.json().catch(() => null)) as { has_access?: boolean; - }; + } | null; + if (data === null) { + // An unreadable 200 (truncated/HTML from a proxy) is not a definitive + // answer; treat it like any other transient failure rather than gating + // the user on garbage. + this.logger.warn("Code access check returned an unreadable 200 body"); + return "retry"; + } this.updateState({ hasCodeAccess: data.has_access === true }); - return true; + return "settled"; } catch (error) { this.logger.warn("Failed to reach code access check", { error }); - return false; + return "retry"; } } // Background retry loop with prime-number backoff. Stepping through primes @@ -981,16 +992,16 @@ export class AuthService extends TypedEventEmitter { // up into bursts after a shared outage. The prime sequence is the whole // budget — it sums to ~100s of waiting — after which we fail closed so a // sustained inability to call home revokes access rather than granting it - // forever. `seq` guards against a newer authoritative check (or logout) - // having superseded this loop while it slept. + // forever. `codeAccessCheckIsStale` guards against a newer authoritative + // check (or logout) having superseded this loop while it slept. private async runCodeAccessRetry(seq: number): Promise { for (const delayMs of AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS) { await sleep(delayMs); - if (this.codeAccessCheckSeq !== seq || !this.session) return; - if (await this.checkCodeAccessOnce()) return; + if (this.codeAccessCheckIsStale(seq)) return; + if ((await this.checkCodeAccessOnce()) === "settled") return; } - if (this.codeAccessCheckSeq !== seq || !this.session) return; + if (this.codeAccessCheckIsStale(seq)) return; // Could not confirm access within the retry window: fail closed so the // invite gate can't be bypassed by keeping the client offline. this.logger.warn( @@ -998,6 +1009,9 @@ export class AuthService extends TypedEventEmitter { ); this.updateState({ hasCodeAccess: false }); } + private codeAccessCheckIsStale(seq: number): boolean { + return this.codeAccessCheckSeq !== seq || !this.session; + } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; From 4b2a430cfe5368611592a5f8fff3cdc5cf08010a Mon Sep 17 00:00:00 2001 From: pauldambra Date: Thu, 25 Jun 2026 22:59:06 +0100 Subject: [PATCH 08/10] refactor(auth): supersede code-access retries by session identity, not a counter Replace the monotonic codeAccessCheckSeq counter (and the codeAccessCheckIsStale helper) with a plain reference check against the session the retry loop was started for. A token refresh or logout already swaps this.session for a new object (or null), so `this.session !== session` is a complete supersession signal -- the same expression the mid-sync-logout guard already uses. No extra mutable bookkeeping state. Generated-By: PostHog Code Task-Id: 47d056fa-5014-40c5-b330-d5c34d9d49bc --- packages/core/src/auth/auth.test.ts | 4 ++-- packages/core/src/auth/auth.ts | 26 ++++++++++---------------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 81f0b4038..3b67b5b89 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1502,8 +1502,8 @@ describe("AuthService", () => { // Restore's attempt failed transiently; gate is unresolved (null, not false). expect(service.getState().hasCodeAccess).toBeNull(); - // A fresh authoritative check (token refresh) resolves access true and - // supersedes the parked loop's sequence. + // A fresh authoritative check (token refresh) installs a new session and + // resolves access true, superseding the parked loop. await service.refreshAccessToken(); expect(service.getState().hasCodeAccess).toBe(true); diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 7a6ed8e1d..a82675e61 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -914,16 +914,12 @@ export class AuthService extends TypedEventEmitter { }); } private async updateCodeAccessFromSession(): Promise { - if (!this.session) { + const session = this.session; + if (!session) { this.updateState({ hasCodeAccess: null }); return; } - // Every call is a fresh authoritative check; bumping the sequence - // supersedes any retry loop still running from an earlier call so a stale - // loop can never clobber a newer result. - const seq = ++this.codeAccessCheckSeq; - // The healthy path stays synchronous: one awaited attempt resolves the // gate before the app renders. A transient failure must NOT eject an // active user, so we keep the current value and retry in the background, @@ -932,7 +928,7 @@ export class AuthService extends TypedEventEmitter { // users to the invite screen, while still preventing the gate from being // bypassed by simply keeping the client offline indefinitely. if ((await this.checkCodeAccessOnce()) === "retry") { - void this.runCodeAccessRetry(seq); + void this.runCodeAccessRetry(session); } } // A 200 updates the gate from `has_access` and settles; a 401 means the token @@ -992,16 +988,18 @@ export class AuthService extends TypedEventEmitter { // up into bursts after a shared outage. The prime sequence is the whole // budget — it sums to ~100s of waiting — after which we fail closed so a // sustained inability to call home revokes access rather than granting it - // forever. `codeAccessCheckIsStale` guards against a newer authoritative - // check (or logout) having superseded this loop while it slept. - private async runCodeAccessRetry(seq: number): Promise { + // forever. The loop runs for one `session`; a token refresh or logout swaps + // `this.session` for a new object (or null), so `this.session !== session` + // means a newer authoritative check has superseded this loop and it must + // bow out rather than clobber the newer result. + private async runCodeAccessRetry(session: InMemorySession): Promise { for (const delayMs of AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS) { await sleep(delayMs); - if (this.codeAccessCheckIsStale(seq)) return; + if (this.session !== session) return; if ((await this.checkCodeAccessOnce()) === "settled") return; } - if (this.codeAccessCheckIsStale(seq)) return; + if (this.session !== session) return; // Could not confirm access within the retry window: fail closed so the // invite gate can't be bypassed by keeping the client offline. this.logger.warn( @@ -1009,9 +1007,6 @@ export class AuthService extends TypedEventEmitter { ); this.updateState({ hasCodeAccess: false }); } - private codeAccessCheckIsStale(seq: number): boolean { - return this.codeAccessCheckSeq !== seq || !this.session; - } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; @@ -1026,7 +1021,6 @@ export class AuthService extends TypedEventEmitter { maxDelayMs: 5_000, multiplier: 2, }; - private codeAccessCheckSeq = 0; private recoveryPromise: Promise | null = null; private orgProjectsRefreshPromise: Promise | null = null; private connectivityUnsubscribe: (() => void) | null = null; From 5ee6da135d50aa2dcabeedc0ca4cff937698836b Mon Sep 17 00:00:00 2001 From: pauldambra Date: Thu, 25 Jun 2026 23:10:12 +0100 Subject: [PATCH 09/10] refactor(auth): simplify code-access check (reuse backoff, zod, fewer branches) - Replace the bespoke 9-prime delay array with the existing exponential sleepWithBackoff helper + an attempt count, matching the REFRESH_BACKOFF pattern already in this class. The prime "anti-thundering-herd" rationale didn't hold (identical client schedules still align; only jitter de-syncs), and desktop clients already refresh on staggered wall-clocks. - Drop the now-unused fixed-delay `sleep` primitive this PR had added to @posthog/shared; sleepWithBackoff goes back to its inline timer. - Parse the check-access response with a zod schema (schemas.ts) instead of an unchecked cast, per the runtime-boundary convention. - Drop the speculative unreadable-200 retry branch (YAGNI). - Trim comments that restated the code; keep the why. Generated-By: PostHog Code Task-Id: 47d056fa-5014-40c5-b330-d5c34d9d49bc --- packages/core/src/auth/auth.test.ts | 42 +++--------------- packages/core/src/auth/auth.ts | 69 +++++++++++++---------------- packages/core/src/auth/schemas.ts | 4 ++ packages/shared/src/backoff.ts | 10 +---- packages/shared/src/index.ts | 1 - 5 files changed, 45 insertions(+), 81 deletions(-) diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 3b67b5b89..14d7d570c 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -3,7 +3,7 @@ import type { IPowerManager } from "@posthog/platform/power-manager"; import { NotAuthenticatedError, OAUTH_SCOPE_VERSION, - sleep, + sleepWithBackoff, } from "@posthog/shared"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { AuthService } from "./auth"; @@ -25,7 +25,6 @@ vi.mock("@posthog/shared", async (importOriginal) => { return { ...actual, sleepWithBackoff: vi.fn().mockResolvedValue(undefined), - sleep: vi.fn().mockResolvedValue(undefined), }; }); @@ -1446,13 +1445,12 @@ describe("AuthService", () => { expect(service.getState().hasCodeAccess).toBe(false), ); expect(service.getState().status).toBe("authenticated"); - // 1 awaited attempt + one retry per prime-backoff step (9 primes). + // 1 awaited attempt + one retry per backoff step. expect(getCheckAccessCalls()).toBe(10); - // The backoff schedule is actually exercised: each retry waits the next - // prime delay (the mocked sleep records the requested durations). - expect(vi.mocked(sleep).mock.calls.map((call) => call[0])).toEqual([ - 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, - ]); + // Each retry waits the next backoff step (attempt 0..8). + expect( + vi.mocked(sleepWithBackoff).mock.calls.map((call) => call[0]), + ).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8]); }); it("treats thrown network errors like transient failures and fails closed", async () => { @@ -1478,7 +1476,7 @@ describe("AuthService", () => { const gate = new Promise((resolve) => { releaseSleep = resolve; }); - vi.mocked(sleep).mockImplementationOnce(() => gate); + vi.mocked(sleepWithBackoff).mockImplementationOnce(() => gate); let call = 0; stubFetchWithCheckAccess(() => { @@ -1549,32 +1547,6 @@ describe("AuthService", () => { expect(service.getState().hasCodeAccess).toBe(false); }); - it("treats an unreadable 200 body like a transient failure", async () => { - // A 200 whose body can't be parsed (e.g. a proxy returning truncated/HTML - // content) is not a definitive answer — it must retry, not gate the user, - // and recover once a real body arrives. - const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => - call <= 1 - ? ({ - ok: true, - status: 200, - json: () => Promise.reject(new Error("unexpected token <")), - } as unknown as Response) - : ({ - ok: true, - status: 200, - json: () => Promise.resolve({ has_access: true }), - } as unknown as Response), - ); - - await restoreSession(); - - await vi.waitFor(() => - expect(service.getState().hasCodeAccess).toBe(true), - ); - expect(getCheckAccessCalls()).toBe(2); - }); - it("honours a success on the final retry instead of failing closed", async () => { // 1 awaited attempt + 8 transient failures, then the 9th (final) retry // succeeds: the loop must settle to access and never reach the diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index a82675e61..74b5eac03 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -9,7 +9,6 @@ import { getCloudUrlFromRegion, NotAuthenticatedError, OAUTH_SCOPE_VERSION, - sleep, sleepWithBackoff, TypedEventEmitter, withTimeout, @@ -33,6 +32,7 @@ import { type AuthServiceEvents, type AuthState, type AuthTokenResponse, + codeAccessResponse, findOrgForProject, flattenProjectIds, type OrgProjects, @@ -920,13 +920,10 @@ export class AuthService extends TypedEventEmitter { return; } - // The healthy path stays synchronous: one awaited attempt resolves the - // gate before the app renders. A transient failure must NOT eject an - // active user, so we keep the current value and retry in the background, - // only failing closed once the retry window is exhausted (see - // `runCodeAccessRetry`). That stops a brief outage from bouncing real - // users to the invite screen, while still preventing the gate from being - // bypassed by simply keeping the client offline indefinitely. + // The first attempt is awaited so the gate resolves before render; a + // transient failure keeps the current value and retries in the background + // (`runCodeAccessRetry`), so a brief outage can't eject an active user yet + // staying offline can't keep the gate open forever. if ((await this.checkCodeAccessOnce()) === "retry") { void this.runCodeAccessRetry(session); } @@ -935,9 +932,8 @@ export class AuthService extends TypedEventEmitter { // itself is rejected, so we hard-stop and log out (still "settled" — retrying // can't fix a rejected token). Everything else is a transient failure worth // retrying: a 403 (which during an outage isn't a reliable "your access is - // gone" signal), a 429, a 5xx, an unreadable 200 body, or a thrown network - // error. Genuine "no access" is a 200 with `has_access: false`, never a - // non-2xx status. + // gone" signal), a 429, a 5xx, or a thrown network error. Genuine "no access" + // is a 200 with `has_access: false`, never a non-2xx status. private async checkCodeAccessOnce(): Promise { const session = this.session; if (!session) return "retry"; @@ -966,35 +962,31 @@ export class AuthService extends TypedEventEmitter { return "retry"; } - const data = (await response.json().catch(() => null)) as { - has_access?: boolean; - } | null; - if (data === null) { - // An unreadable 200 (truncated/HTML from a proxy) is not a definitive - // answer; treat it like any other transient failure rather than gating - // the user on garbage. - this.logger.warn("Code access check returned an unreadable 200 body"); - return "retry"; - } - this.updateState({ hasCodeAccess: data.has_access === true }); + const body = await response.json().catch(() => null); + const parsed = codeAccessResponse.safeParse(body); + this.updateState({ + hasCodeAccess: parsed.success && parsed.data.has_access, + }); return "settled"; } catch (error) { this.logger.warn("Failed to reach code access check", { error }); return "retry"; } } - // Background retry loop with prime-number backoff. Stepping through primes - // (rather than a doubling schedule) keeps many clients' retries from lining - // up into bursts after a shared outage. The prime sequence is the whole - // budget — it sums to ~100s of waiting — after which we fail closed so a + // Background retry loop on a bounded exponential backoff. The loop runs for + // one `session`; a token refresh or logout swaps `this.session` for a new + // object (or null), so `this.session !== session` means a newer authoritative + // check has superseded this loop and it must bow out rather than clobber the + // newer result. If the whole budget is exhausted we fail closed, so a // sustained inability to call home revokes access rather than granting it - // forever. The loop runs for one `session`; a token refresh or logout swaps - // `this.session` for a new object (or null), so `this.session !== session` - // means a newer authoritative check has superseded this loop and it must - // bow out rather than clobber the newer result. + // forever. private async runCodeAccessRetry(session: InMemorySession): Promise { - for (const delayMs of AuthService.CODE_ACCESS_BACKOFF_PRIMES_MS) { - await sleep(delayMs); + for ( + let attempt = 0; + attempt < AuthService.CODE_ACCESS_RETRY_ATTEMPTS; + attempt++ + ) { + await sleepWithBackoff(attempt, AuthService.CODE_ACCESS_BACKOFF); if (this.session !== session) return; if ((await this.checkCodeAccessOnce()) === "settled") return; } @@ -1011,11 +1003,14 @@ export class AuthService extends TypedEventEmitter { private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; private static readonly CODE_ACCESS_FETCH_TIMEOUT_MS = 10_000; - // Prime-number backoff (ms) for the code-access retry loop; the sequence - // sums to ~100s, which is the full retry window before failing closed. - private static readonly CODE_ACCESS_BACKOFF_PRIMES_MS = [ - 2_000, 3_000, 5_000, 7_000, 11_000, 13_000, 17_000, 19_000, 23_000, - ]; + private static readonly CODE_ACCESS_RETRY_ATTEMPTS = 9; + // Bounded exponential backoff for the code-access retry loop; ~130s of waiting + // across the attempts above before failing closed. + private static readonly CODE_ACCESS_BACKOFF: BackoffOptions = { + initialDelayMs: 2_000, + maxDelayMs: 20_000, + multiplier: 2, + }; private static readonly REFRESH_BACKOFF: BackoffOptions = { initialDelayMs: 1_000, maxDelayMs: 5_000, diff --git a/packages/core/src/auth/schemas.ts b/packages/core/src/auth/schemas.ts index 3fa2d74ad..6e4e69d7d 100644 --- a/packages/core/src/auth/schemas.ts +++ b/packages/core/src/auth/schemas.ts @@ -112,6 +112,10 @@ export const validAccessTokenOutput = z.object({ }); export type ValidAccessTokenOutput = z.infer; +export const codeAccessResponse = z.object({ + has_access: z.boolean(), +}); + export const AuthServiceEvent = { StateChanged: "state-changed", } as const; diff --git a/packages/shared/src/backoff.ts b/packages/shared/src/backoff.ts index 26ffa114d..8552773ae 100644 --- a/packages/shared/src/backoff.ts +++ b/packages/shared/src/backoff.ts @@ -26,12 +26,6 @@ export function sleepWithBackoff( attempt: number, options: BackoffOptions, ): Promise { - return sleep(getBackoffDelay(attempt, options)); -} - -/** - * Sleep for a fixed number of milliseconds. - */ -export function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); + const delay = getBackoffDelay(attempt, options); + return new Promise((resolve) => setTimeout(resolve, delay)); } diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index b0d97d870..8ecefc4d5 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -4,7 +4,6 @@ export { withTimeout } from "./async"; export { type BackoffOptions, getBackoffDelay, - sleep, sleepWithBackoff, } from "./backoff"; export { From 31e1d9333768814bf5af1fd88429536d85d0e4c2 Mon Sep 17 00:00:00 2001 From: pauldambra Date: Fri, 26 Jun 2026 10:26:07 +0100 Subject: [PATCH 10/10] style(auth): unwrap the code-access retry loop header Alias the attempt count into a local so the for-header fits one line instead of Biome wrapping it across four. Direct AuthService.* access elsewhere keeps the statics counted as used. Generated-By: PostHog Code Task-Id: 47d056fa-5014-40c5-b330-d5c34d9d49bc --- packages/core/src/auth/auth.ts | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index 74b5eac03..80891bf52 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -981,11 +981,8 @@ export class AuthService extends TypedEventEmitter { // sustained inability to call home revokes access rather than granting it // forever. private async runCodeAccessRetry(session: InMemorySession): Promise { - for ( - let attempt = 0; - attempt < AuthService.CODE_ACCESS_RETRY_ATTEMPTS; - attempt++ - ) { + const attempts = AuthService.CODE_ACCESS_RETRY_ATTEMPTS; + for (let attempt = 0; attempt < attempts; attempt++) { await sleepWithBackoff(attempt, AuthService.CODE_ACCESS_BACKOFF); if (this.session !== session) return; if ((await this.checkCodeAccessOnce()) === "settled") return;