From 122187a57658574f1e5c9c6d49bbfda9427c4f87 Mon Sep 17 00:00:00 2001 From: anderdc Date: Wed, 20 May 2026 12:06:20 -0500 Subject: [PATCH] fix(webhook): don't retain failed PR metadata jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A `fetch-pr-metadata` job uses a stable custom jobId per PR (`meta-${repoFullName}-${prNumber}`). BullMQ ignores `add()` when a job with that id already exists in any state, including the failed-retention set. With `removeOnFail: 50`, a metadata job that exhausts its 3 retries during a transient GitHub outage sat in the failed set and blocked every later `edited`/`closed`/`reopened`/`synchronize` webhook for the same PR until the global 50-slot cap evicted it — leaving `body`, `last_edited_at`, `closing_issue_numbers`, and downstream `issues.solved_by_pr` stale. Set `removeOnFail: true` for these enqueues so failed jobs are evicted immediately and the next webhook gets a fresh fetch. Failure detail remains in service logs. Fixes #75. --- packages/das/src/queue/fetch.processor.ts | 4 +++- packages/das/src/webhook/handlers/pull-request.handler.ts | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/packages/das/src/queue/fetch.processor.ts b/packages/das/src/queue/fetch.processor.ts index 27860d2..e3e442e 100644 --- a/packages/das/src/queue/fetch.processor.ts +++ b/packages/das/src/queue/fetch.processor.ts @@ -147,7 +147,9 @@ export class FetchProcessor extends WorkerHost { { jobId: `meta-${repoFullName}-${prNumber}`, removeOnComplete: true, - removeOnFail: 50, + // Match the webhook handler — failed metadata jobs must not squat + // on the stable per-PR jobId (#75). 
+ removeOnFail: true, attempts: 3, backoff: { type: "exponential", delay: 5000 }, }, diff --git a/packages/das/src/webhook/handlers/pull-request.handler.ts b/packages/das/src/webhook/handlers/pull-request.handler.ts index bd70e83..53dbee8 100644 --- a/packages/das/src/webhook/handlers/pull-request.handler.ts +++ b/packages/das/src/webhook/handlers/pull-request.handler.ts @@ -79,9 +79,11 @@ export class PullRequestHandler { { repoFullName, prNumber }, { jobId, - // Replace any pending job for the same PR (e.g. rapid pushes) + // Pending/active jobs for the same PR still dedupe by jobId. + // Don't retain failed jobs — they'd block future enqueues for this + // PR until the failed-set cap evicts them (#75). removeOnComplete: true, - removeOnFail: 50, + removeOnFail: true, attempts: 3, backoff: { type: "exponential", delay: 5000 }, },