diff --git a/packages/core/src/auth/auth.test.ts b/packages/core/src/auth/auth.test.ts index 1b9448e04..14d7d570c 100644 --- a/packages/core/src/auth/auth.test.ts +++ b/packages/core/src/auth/auth.test.ts @@ -1,6 +1,10 @@ import type { RootLogger } from "@posthog/di/logger"; import type { IPowerManager } from "@posthog/platform/power-manager"; -import { OAUTH_SCOPE_VERSION } from "@posthog/shared"; +import { + NotAuthenticatedError, + OAUTH_SCOPE_VERSION, + sleepWithBackoff, +} from "@posthog/shared"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { AuthService } from "./auth"; import type { @@ -1342,4 +1346,262 @@ describe("AuthService", () => { expect(redeemCallCount).toBe(2); }); }); + + describe("code access check resilience", () => { + const stubFetchWithCheckAccess = ( + checkAccess: (callCount: number) => Response, + ): { getCheckAccessCalls: () => number } => { + let checkAccessCalls = 0; + vi.stubGlobal( + "fetch", + vi.fn(async (input: string | Request) => { + const url = typeof input === "string" ? input : input.url; + + if (url.includes("/api/users/@me/")) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + uuid: "user-1", + organization: { id: "org-1" }, + }), + } as unknown as Response; + } + + if (/\/api\/organizations\/[^/]+\/$/.test(url)) { + return { + ok: true, + json: vi.fn().mockResolvedValue({ + name: "Org 1", + teams: [{ id: 42, name: "Project 42" }], + }), + } as unknown as Response; + } + + if (url.includes("/invites/check-access/")) { + checkAccessCalls++; + return checkAccess(checkAccessCalls); + } + + return { + ok: true, + json: vi.fn().mockResolvedValue({}), + } as unknown as Response; + }) as unknown as typeof fetch, + ); + return { getCheckAccessCalls: () => checkAccessCalls }; + }; + + const restoreSession = async () => { + seedStoredSession({ selectedProjectId: 42 }); + oauthFlow.refreshToken.mockResolvedValue( + mockTokenResponse({ + accessToken: "access-token", + refreshToken: "refresh-token", + }), + ); + await service.initialize(); + }; + + it("recovers in the background after transient failures without ejecting the user", async () => { + // Fails twice, then succeeds — a real user weathering a blip must end up + // with access, never having been flipped to `false`. + const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => + call <= 2 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(true), + ); + expect(getCheckAccessCalls()).toBe(3); + }); + + it("fails closed once the retry budget is exhausted", async () => { + const { getCheckAccessCalls } = stubFetchWithCheckAccess( + () => + ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + }) as unknown as Response, + ); + + await restoreSession(); + + // Persistent inability to call home eventually revokes access so the + // invite gate can't be bypassed by staying offline. + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(false), + ); + expect(service.getState().status).toBe("authenticated"); + // 1 awaited attempt + one retry per backoff step. + expect(getCheckAccessCalls()).toBe(10); + // Each retry waits the next backoff step (attempt 0..8). + expect( + vi.mocked(sleepWithBackoff).mock.calls.map((call) => call[0]), + ).toEqual([0, 1, 2, 3, 4, 5, 6, 7, 8]); + }); + + it("treats thrown network errors like transient failures and fails closed", async () => { + // The check throws (network timeout / DNS failure) rather than returning + // a non-2xx — exercises the catch branch, which must behave like a + // transient failure: retry, then fail closed. + const { getCheckAccessCalls } = stubFetchWithCheckAccess(() => { + throw new TypeError("network down"); + }); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(false), + ); + expect(getCheckAccessCalls()).toBe(10); + }); + + it("does not let a stale retry loop clobber a fresh grant", async () => { + // Park the retry loop on its first backoff so we can interleave a newer + // authoritative check that succeeds before the stale loop wakes. + let releaseSleep!: () => void; + const gate = new Promise((resolve) => { + releaseSleep = resolve; + }); + vi.mocked(sleepWithBackoff).mockImplementationOnce(() => gate); + + let call = 0; + stubFetchWithCheckAccess(() => { + call++; + // First (restore) attempt fails and schedules the retry loop; every + // later check succeeds with access. + return call === 1 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response); + }); + + await restoreSession(); + // Restore's attempt failed transiently; gate is unresolved (null, not false). + expect(service.getState().hasCodeAccess).toBeNull(); + + // A fresh authoritative check (token refresh) installs a new session and + // resolves access true, superseding the parked loop. + await service.refreshAccessToken(); + expect(service.getState().hasCodeAccess).toBe(true); + + // Wake the stale loop: it must notice it was superseded and NOT fail closed. + releaseSleep(); + await Promise.resolve(); + await Promise.resolve(); + expect(service.getState().hasCodeAccess).toBe(true); + }); + + it("logs out on a 401 instead of retrying", async () => { + const { getCheckAccessCalls } = stubFetchWithCheckAccess( + () => + ({ + ok: false, + status: 401, + json: () => Promise.resolve({}), + }) as unknown as Response, + ); + + await restoreSession(); + + // A rejected token is a hard stop: log the user out (no retry, no + // fail-closed invite screen) so they re-authenticate. + const state = service.getState(); + expect(state.status).toBe("anonymous"); + expect(state.hasCodeAccess).toBeNull(); + expect(getCheckAccessCalls()).toBe(1); + }); + + it("still gates the user when the server definitively reports no access", async () => { + stubFetchWithCheckAccess( + () => + ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: false }), + }) as unknown as Response, + ); + + await restoreSession(); + + expect(service.getState().hasCodeAccess).toBe(false); + }); + + it("honours a success on the final retry instead of failing closed", async () => { + // 1 awaited attempt + 8 transient failures, then the 9th (final) retry + // succeeds: the loop must settle to access and never reach the + // fail-closed write, exercising the final-iteration boundary. + const { getCheckAccessCalls } = stubFetchWithCheckAccess((call) => + call < 10 + ? ({ + ok: false, + status: 403, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + + await vi.waitFor(() => + expect(service.getState().hasCodeAccess).toBe(true), + ); + expect(getCheckAccessCalls()).toBe(10); + }); + + it("rejects an in-flight refresh that logs out mid-sync on a 401", async () => { + // The race the seq guard alone doesn't cover: a 401 from the code-access + // check logs out *during* syncAuthenticatedSession, inside the shared + // refresh promise. The awaited refresh must reject rather than resolve + // with the torn-down session's (now dead) access token. + let reject401 = false; + stubFetchWithCheckAccess(() => + reject401 + ? ({ + ok: false, + status: 401, + json: () => Promise.resolve({}), + } as unknown as Response) + : ({ + ok: true, + status: 200, + json: () => Promise.resolve({ has_access: true }), + } as unknown as Response), + ); + + await restoreSession(); + expect(service.getState().status).toBe("authenticated"); + + reject401 = true; + await expect(service.refreshAccessToken()).rejects.toBeInstanceOf( + NotAuthenticatedError, + ); + expect(service.getState().status).toBe("anonymous"); + expect(service.getState().hasCodeAccess).toBeNull(); + }); + }); }); diff --git a/packages/core/src/auth/auth.ts b/packages/core/src/auth/auth.ts index b7b47b424..80891bf52 100644 --- a/packages/core/src/auth/auth.ts +++ b/packages/core/src/auth/auth.ts @@ -32,6 +32,7 @@ import { type AuthServiceEvents, type AuthState, type AuthTokenResponse, + codeAccessResponse, findOrgForProject, flattenProjectIds, type OrgProjects, @@ -71,6 +72,8 @@ interface TokenResponseOptions { selectedProjectId: number | null; } +type CodeAccessCheckOutcome = "settled" | "retry"; + @injectable() export class AuthService extends TypedEventEmitter { private state: AuthState = { @@ -504,6 +507,12 @@ export class AuthService extends TypedEventEmitter { const refreshAndSync = async (): Promise => { const session = await this.refreshSession(sessionInput); await this.syncAuthenticatedSession(session); + // The code-access check inside the sync can hit a 401 and log us out + // mid-refresh. If it did, this session is already torn down, so reject + // rather than hand a dead token to callers awaiting the shared refresh. + if (this.session !== session) { + throw new NotAuthenticatedError(); + } return session; }; @@ -905,32 +914,100 @@ export class AuthService extends TypedEventEmitter { }); } private async updateCodeAccessFromSession(): Promise { - if (!this.session) { + const session = this.session; + if (!session) { this.updateState({ hasCodeAccess: null }); return; } + // The first attempt is awaited so the gate resolves before render; a + // transient failure keeps the current value and retries in the background + // (`runCodeAccessRetry`), so a brief outage can't eject an active user yet + // staying offline can't keep the gate open forever. + if ((await this.checkCodeAccessOnce()) === "retry") { + void this.runCodeAccessRetry(session); + } + } + // A 200 updates the gate from `has_access` and settles; a 401 means the token + // itself is rejected, so we hard-stop and log out (still "settled" — retrying + // can't fix a rejected token). Everything else is a transient failure worth + // retrying: a 403 (which during an outage isn't a reliable "your access is + // gone" signal), a 429, a 5xx, or a thrown network error. Genuine "no access" + // is a 200 with `has_access: false`, never a non-2xx status. + private async checkCodeAccessOnce(): Promise { + const session = this.session; + if (!session) return "retry"; + try { - const apiHost = getCloudUrlFromRegion(this.session.cloudRegion); + const apiHost = getCloudUrlFromRegion(session.cloudRegion); const response = await this.executeAuthenticatedFetch( fetch, `${apiHost}/api/code/invites/check-access/`, - {}, - this.session.accessToken, + { + signal: AbortSignal.timeout(AuthService.CODE_ACCESS_FETCH_TIMEOUT_MS), + }, + session.accessToken, ); - const data = (await response.json().catch(() => ({}))) as { - has_access?: boolean; - }; - this.updateState({ hasCodeAccess: data.has_access === true }); + if (response.status === 401) { + this.logger.warn("Code access check returned 401, logging out"); + await this.logout(); + return "settled"; + } + + if (!response.ok) { + this.logger.warn("Transient failure checking code access", { + status: response.status, + }); + return "retry"; + } + + const body = await response.json().catch(() => null); + const parsed = codeAccessResponse.safeParse(body); + this.updateState({ + hasCodeAccess: parsed.success && parsed.data.has_access, + }); + return "settled"; } catch (error) { - this.logger.warn("Failed to update code access state", { error }); - this.updateState({ hasCodeAccess: false }); + this.logger.warn("Failed to reach code access check", { error }); + return "retry"; + } + } + // Background retry loop on a bounded exponential backoff. The loop runs for + // one `session`; a token refresh or logout swaps `this.session` for a new + // object (or null), so `this.session !== session` means a newer authoritative + // check has superseded this loop and it must bow out rather than clobber the + // newer result. If the whole budget is exhausted we fail closed, so a + // sustained inability to call home revokes access rather than granting it + // forever. + private async runCodeAccessRetry(session: InMemorySession): Promise { + const attempts = AuthService.CODE_ACCESS_RETRY_ATTEMPTS; + for (let attempt = 0; attempt < attempts; attempt++) { + await sleepWithBackoff(attempt, AuthService.CODE_ACCESS_BACKOFF); + if (this.session !== session) return; + if ((await this.checkCodeAccessOnce()) === "settled") return; } + + if (this.session !== session) return; + // Could not confirm access within the retry window: fail closed so the + // invite gate can't be bypassed by keeping the client offline. + this.logger.warn( + "Code access check failed within retry window, failing closed", + ); + this.updateState({ hasCodeAccess: false }); } private static readonly REFRESH_MAX_ATTEMPTS = 3; private static readonly ORG_FETCH_MAX_ATTEMPTS = 3; private static readonly ORG_RECOVERY_MAX_ATTEMPTS = 5; + private static readonly CODE_ACCESS_FETCH_TIMEOUT_MS = 10_000; + private static readonly CODE_ACCESS_RETRY_ATTEMPTS = 9; + // Bounded exponential backoff for the code-access retry loop; ~130s of waiting + // across the attempts above before failing closed. + private static readonly CODE_ACCESS_BACKOFF: BackoffOptions = { + initialDelayMs: 2_000, + maxDelayMs: 20_000, + multiplier: 2, + }; private static readonly REFRESH_BACKOFF: BackoffOptions = { initialDelayMs: 1_000, maxDelayMs: 5_000, diff --git a/packages/core/src/auth/schemas.ts b/packages/core/src/auth/schemas.ts index 3fa2d74ad..6e4e69d7d 100644 --- a/packages/core/src/auth/schemas.ts +++ b/packages/core/src/auth/schemas.ts @@ -112,6 +112,10 @@ export const validAccessTokenOutput = z.object({ }); export type ValidAccessTokenOutput = z.infer; +export const codeAccessResponse = z.object({ + has_access: z.boolean(), +}); + export const AuthServiceEvent = { StateChanged: "state-changed", } as const;