From faf10b363c13102f98eb529a4dc137300dc67a93 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 22 Jan 2024 17:17:04 +0100
Subject: [PATCH 01/14] hash: Skip the unbusy ceremony for private objects

This reduces a great deal of spurious activity on the private_oh waiting
list and removes unncessary conditionals when dealing with cacheable (or
at least searchable) objects.
---
 bin/varnishd/cache/cache_hash.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 0c6c8d654d..5d133a88f8 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -907,24 +907,27 @@ HSH_Unbusy(struct worker *wrk, struct objcore *oc)
 
 	oh = oc->objhead;
 	CHECK_OBJ(oh, OBJHEAD_MAGIC);
-	INIT_OBJ(&rush, RUSH_MAGIC);
 
 	AN(oc->stobj->stevedore);
 	AN(oc->flags & OC_F_BUSY);
 	assert(oh->refcnt > 0);
 	assert(oc->refcnt > 0);
 
-	if (!(oc->flags & OC_F_PRIVATE)) {
-		BAN_NewObjCore(oc);
-		AN(oc->ban);
+	if (oc->flags & OC_F_PRIVATE) {
+		oc->flags &= ~OC_F_BUSY;
+		return;
 	}
 
+	INIT_OBJ(&rush, RUSH_MAGIC);
+
+	BAN_NewObjCore(oc);
+	AN(oc->ban);
+
 	/* XXX: pretouch neighbors on oh->objcs to prevent page-on under mtx */
 	Lck_Lock(&oh->mtx);
 	assert(oh->refcnt > 0);
 	assert(oc->refcnt > 0);
-	if (!(oc->flags & OC_F_PRIVATE))
-		EXP_RefNewObjcore(oc); /* Takes a ref for expiry */
+	EXP_RefNewObjcore(oc); /* Takes a ref for expiry */
 	/* XXX: strictly speaking, we should sort in Date: order. */
 	VTAILQ_REMOVE(&oh->objcs, oc, hsh_list);
 	VTAILQ_INSERT_HEAD(&oh->objcs, oc, hsh_list);
@@ -934,8 +937,7 @@ HSH_Unbusy(struct worker *wrk, struct objcore *oc)
 		hsh_rush1(wrk, oh, &rush, HSH_RUSH_POLICY);
 	}
 	Lck_Unlock(&oh->mtx);
-	EXP_Insert(wrk, oc); /* Does nothing unless EXP_RefNewObjcore was
-			      * called */
+	EXP_Insert(wrk, oc);
 	hsh_rush2(wrk, &rush);
 }
 

From 180592125b8edf0b83a5f12cec7a380c716849b9 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 4 Mar 2024 18:31:10 +0100
Subject: [PATCH 02/14] hash: Only grant OC_F_BUSY to lookup-able objects

The OC_F_BUSY flag is used to decide whether to disembark a request into
an objhead's waiting list. Therefore, it makes sense to only give this
flag to objcores for which the outcome may be a shareable object.

This excludes synthetic and private (req.body and passed req) objcores,
and objects inserted with HSH_Insert(). Inserted objects are never busy
since they come already ready to use.

Pass transactions don't need this flag either. The OC_F_BUSY flag was
somewhat conflated with struct busyobj in the fetch state machine. We
don't need OC_F_BUSY to convey that a fetch task is coordinated with a
req task, oc->boc is already doing that. The OC_F_BUSY|OC_F_PRIVATE bits
should be considered an oxymoron since OC_F_BUSY drives waiting list
activity and OC_F_PRIVATE is for objects that are not shareable by
definition.
---
 bin/varnishd/cache/cache_fetch.c |  5 +++--
 bin/varnishd/cache/cache_hash.c  | 25 +++++++++++++++++--------
 bin/varnishd/cache/cache_obj.c   |  2 --
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index 4ea54d9f90..c707a22b67 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -934,7 +934,8 @@ vbf_stp_error(struct worker *wrk, struct busyobj *bo)
 	CHECK_OBJ_NOTNULL(bo, BUSYOBJ_MAGIC);
 	oc = bo->fetch_objcore;
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
-	AN(oc->flags & OC_F_BUSY);
+	CHECK_OBJ_NOTNULL(oc->boc, BOC_MAGIC);
+	assert(oc->boc->state < BOS_STREAM);
 	assert(bo->director_state == DIR_S_NULL);
 
 	if (wrk->vpi->handling != VCL_RET_ERROR)
@@ -1168,7 +1169,6 @@ VBF_Fetch(struct worker *wrk, struct req *req, struct objcore *oc,
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
-	AN(oc->flags & OC_F_BUSY);
 	CHECK_OBJ_ORNULL(oldoc, OBJCORE_MAGIC);
 
 	bo = VBO_GetBusyObj(wrk, req);
@@ -1177,6 +1177,7 @@ VBF_Fetch(struct worker *wrk, struct req *req, struct objcore *oc,
 
 	boc = HSH_RefBoc(oc);
 	CHECK_OBJ_NOTNULL(boc, BOC_MAGIC);
+	assert(boc->state < BOS_STREAM);
 	boc->transit_buffer = cache_param->transit_buffer;
 
 	switch (mode) {
diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 5d133a88f8..cd7c85c749 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -316,7 +316,7 @@ HSH_Insert(struct worker *wrk, const void *digest, struct objcore *oc,
 	AN(digest);
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 	AN(ban);
-	AN(oc->flags & OC_F_BUSY);
+	AZ(oc->flags & OC_F_BUSY);
 	AZ(oc->flags & OC_F_PRIVATE);
 	assert(oc->refcnt == 1);
 	INIT_OBJ(&rush, RUSH_MAGIC);
@@ -344,7 +344,6 @@ HSH_Insert(struct worker *wrk, const void *digest, struct objcore *oc,
 	Lck_Lock(&oh->mtx);
 	VTAILQ_REMOVE(&oh->objcs, oc, hsh_list);
 	VTAILQ_INSERT_HEAD(&oh->objcs, oc, hsh_list);
-	oc->flags &= ~OC_F_BUSY;
 	if (!VTAILQ_EMPTY(&oh->waitinglist))
 		hsh_rush1(wrk, oh, &rush, HSH_RUSH_POLICY);
 	Lck_Unlock(&oh->mtx);
@@ -370,7 +369,8 @@ hsh_insert_busyobj(const struct worker *wrk, struct objhead *oh)
 	wrk->wpriv->nobjcore = NULL;
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 
-	AN(oc->flags & OC_F_BUSY);
+	AZ(oc->flags & OC_F_BUSY);
+	oc->flags |= OC_F_BUSY;
 	oc->refcnt = 1;		/* Owned by busyobj */
 	oc->objhead = oh;
 	VTAILQ_INSERT_TAIL(&oh->objcs, oc, hsh_list);
@@ -821,15 +821,24 @@ HSH_Fail(struct objcore *oc)
 	struct objhead *oh;
 
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
+	CHECK_OBJ_NOTNULL(oc->boc, BOC_MAGIC);
 	oh = oc->objhead;
 	CHECK_OBJ(oh, OBJHEAD_MAGIC);
 
 	/*
-	 * We have to have either a busy bit, so that HSH_Lookup
-	 * will not consider this oc, or an object hung of the oc
+	 * We either failed before the end of vcl_backend_response
+	 * and a cache miss has the busy bit, so that HSH_Lookup()
+	 * will not consider this oc, or an object hung off the oc
 	 * so that it can consider it.
+	 *
+	 * We can only fail an ongoing fetch in a backend context
+	 * so we can safely check the BOC state as it won't change
+	 * under our feet.
 	 */
-	assert((oc->flags & OC_F_BUSY) || (oc->stobj->stevedore != NULL));
+	if (oc->boc->state < BOS_STREAM)
+		assert(oc->flags & (OC_F_BUSY|OC_F_PRIVATE));
+	else
+		assert(oc->stobj->stevedore != NULL);
 
 	Lck_Lock(&oh->mtx);
 	oc->flags |= OC_F_FAILED;
@@ -909,15 +918,15 @@ HSH_Unbusy(struct worker *wrk, struct objcore *oc)
 	CHECK_OBJ(oh, OBJHEAD_MAGIC);
 
 	AN(oc->stobj->stevedore);
-	AN(oc->flags & OC_F_BUSY);
 	assert(oh->refcnt > 0);
 	assert(oc->refcnt > 0);
 
 	if (oc->flags & OC_F_PRIVATE) {
-		oc->flags &= ~OC_F_BUSY;
+		AZ(oc->flags & OC_F_BUSY);
 		return;
 	}
 
+	AN(oc->flags & OC_F_BUSY);
 	INIT_OBJ(&rush, RUSH_MAGIC);
 
 	BAN_NewObjCore(oc);
diff --git a/bin/varnishd/cache/cache_obj.c b/bin/varnishd/cache/cache_obj.c
index c85ecd6185..4c40d3b8dd 100644
--- a/bin/varnishd/cache/cache_obj.c
+++ b/bin/varnishd/cache/cache_obj.c
@@ -140,8 +140,6 @@ ObjNew(const struct worker *wrk)
 	AN(oc);
 	wrk->stats->n_objectcore++;
 	oc->last_lru = NAN;
-	oc->flags = OC_F_BUSY;
-
 	oc->boc = obj_newboc();
 
 	return (oc);

From b895a20b700eb7dace578ad6b42311273c7fb9e1 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 22 Jan 2024 15:55:55 +0100
Subject: [PATCH 03/14] hash: Make HSH_Fail() rush the waiting list

This is not a nice caching outcome. In fact, when a fetch fails we don't
know the outcome. In that case we need to rush one request that will
trigger a new fetch and hopefully complete with a proper outcome to rush
other requests accordingly.

As far as lookup is concerned there is no point in keeping the OC_F_BUSY
flag so in the case where a failure happens before a proper HSH_Unbusy()
it can be dropped before the rush. This would otherwise happen once the
last objcore reference is dropped. One is held by the current fetch task
and the other one by the client task that triggerred the fetch. It would
amount to an unnecessary delay.

The OC_F_FAILED flag will prevent it from being looked up, so there is
no reason to delay the rush until the final objcore reference is dropped.

This also means that once the client task that triggered the fetch task
resumes it can only be operating on a non-busy object.
---
 bin/varnishd/cache/cache_fetch.c   | 13 +++++--------
 bin/varnishd/cache/cache_hash.c    | 10 +++++++++-
 bin/varnishd/cache/cache_objhead.h |  2 +-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index c707a22b67..79798847d6 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -1060,9 +1060,8 @@ vbf_stp_fail(struct worker *wrk, struct busyobj *bo)
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 
 	assert(oc->boc->state < BOS_FINISHED);
-	HSH_Fail(oc);
-	if (!(oc->flags & OC_F_BUSY))
-		HSH_Kill(oc);
+	HSH_Fail(wrk, oc);
+	HSH_Kill(oc);
 	ObjSetState(wrk, oc, BOS_FAILED);
 	return (F_STP_DONE);
 }
@@ -1250,11 +1249,9 @@ VBF_Fetch(struct worker *wrk, struct req *req, struct objcore *oc,
 			(void)VRB_Ignore(req);
 		} else {
 			ObjWaitState(oc, BOS_STREAM);
-			if (oc->boc->state == BOS_FAILED) {
-				AN((oc->flags & OC_F_FAILED));
-			} else {
-				AZ(oc->flags & OC_F_BUSY);
-			}
+			AZ(oc->flags & OC_F_BUSY);
+			if (oc->boc->state == BOS_FAILED)
+				AN(oc->flags & OC_F_FAILED);
 		}
 	}
 	AZ(bo);
diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index cd7c85c749..ed8244f2ab 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -816,14 +816,17 @@ HSH_Purge(struct worker *wrk, struct objhead *oh, vtim_real ttl_now,
  */
 
 void
-HSH_Fail(struct objcore *oc)
+HSH_Fail(struct worker *wrk, struct objcore *oc)
 {
 	struct objhead *oh;
+	struct rush rush;
 
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 	CHECK_OBJ_NOTNULL(oc->boc, BOC_MAGIC);
 	oh = oc->objhead;
 	CHECK_OBJ(oh, OBJHEAD_MAGIC);
+	INIT_OBJ(&rush, RUSH_MAGIC);
 
 	/*
 	 * We either failed before the end of vcl_backend_response
@@ -842,7 +845,12 @@ HSH_Fail(struct objcore *oc)
 
 	Lck_Lock(&oh->mtx);
 	oc->flags |= OC_F_FAILED;
+	if (oc->flags & OC_F_BUSY) {
+		oc->flags &= ~OC_F_BUSY;
+		hsh_rush1(wrk, oh, &rush, 1);
+	}
 	Lck_Unlock(&oh->mtx);
+	hsh_rush2(wrk, &rush);
 }
 
 /*---------------------------------------------------------------------
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index 2edfb5a883..866dbc4a22 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -65,11 +65,11 @@ enum lookup_e {
 	HSH_BUSY,
 };
 
-void HSH_Fail(struct objcore *);
 void HSH_Kill(struct objcore *);
 void HSH_Replace(struct objcore *, const struct objcore *);
 void HSH_Insert(struct worker *, const void *hash, struct objcore *,
     struct ban *);
+void HSH_Fail(struct worker *, struct objcore *);
 void HSH_Unbusy(struct worker *, struct objcore *);
 int HSH_Snipe(const struct worker *, struct objcore *);
 struct boc *HSH_RefBoc(const struct objcore *);

From a6a715b4064ea69e32823b48053ed49840b1dc97 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 22 Jan 2024 14:58:27 +0100
Subject: [PATCH 04/14] obj: Rush objcores based on BOC state changes

The BOS_PREP_STREAM state is retired. It was used as a stop-gap to
"unbusy" objects before waking up the client task for good. Instead,
the HSH_Unbusy() and HSH_Fail() functions are called once the gap is
closed, depending on the outcome.

This removes one unnecessary signal and consolidates multiple call
sites, ensuring that objects always drop the OC_F_BUSY flag from a
central location, for fetch tasks.
---
 bin/varnishd/cache/cache_fetch.c    | 20 ++++----------------
 bin/varnishd/cache/cache_obj.c      | 10 +++++++---
 bin/varnishd/cache/cache_varnishd.h |  3 +--
 include/tbl/boc_state.h             |  1 -
 4 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index 79798847d6..f005168125 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -712,11 +712,8 @@ vbf_stp_fetch(struct worker *wrk, struct busyobj *bo)
 
 	assert(oc->boc->state == BOS_REQ_DONE);
 
-	if (bo->do_stream) {
-		ObjSetState(wrk, oc, BOS_PREP_STREAM);
-		HSH_Unbusy(wrk, oc);
+	if (bo->do_stream)
 		ObjSetState(wrk, oc, BOS_STREAM);
-	}
 
 	VSLb(bo->vsl, SLT_Fetch_Body, "%u %s %s",
 	    bo->htc->body_status->nbr, bo->htc->body_status->name,
@@ -751,11 +748,8 @@ vbf_stp_fetchend(struct worker *wrk, struct busyobj *bo)
 
 	if (bo->do_stream)
 		assert(oc->boc->state == BOS_STREAM);
-	else {
+	else
 		assert(oc->boc->state == BOS_REQ_DONE);
-		ObjSetState(wrk, oc, BOS_PREP_STREAM);
-		HSH_Unbusy(wrk, oc);
-	}
 
 	ObjSetState(wrk, oc, BOS_FINISHED);
 	VSLb_ts_busyobj(bo, "BerespBody", W_TIM_real(wrk));
@@ -884,11 +878,8 @@ vbf_stp_condfetch(struct worker *wrk, struct busyobj *bo)
 		ObjSetFlag(bo->wrk, oc, OF_IMSCAND, 0);
 	AZ(ObjCopyAttr(bo->wrk, oc, stale_oc, OA_GZIPBITS));
 
-	if (bo->do_stream) {
-		ObjSetState(wrk, oc, BOS_PREP_STREAM);
-		HSH_Unbusy(wrk, oc);
+	if (bo->do_stream)
 		ObjSetState(wrk, oc, BOS_STREAM);
-	}
 
 	INIT_OBJ(vop, VBF_OBITER_PRIV_MAGIC);
 	vop->bo = bo;
@@ -1038,8 +1029,6 @@ vbf_stp_error(struct worker *wrk, struct busyobj *bo)
 	assert(o == VSB_len(synth_body));
 	AZ(ObjSetU64(wrk, oc, OA_LEN, o));
 	VSB_destroy(&synth_body);
-	ObjSetState(wrk, oc, BOS_PREP_STREAM);
-	HSH_Unbusy(wrk, oc);
 	if (stale != NULL && oc->ttl > 0)
 		HSH_Kill(stale);
 	ObjSetState(wrk, oc, BOS_FINISHED);
@@ -1060,9 +1049,8 @@ vbf_stp_fail(struct worker *wrk, struct busyobj *bo)
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 
 	assert(oc->boc->state < BOS_FINISHED);
-	HSH_Fail(wrk, oc);
-	HSH_Kill(oc);
 	ObjSetState(wrk, oc, BOS_FAILED);
+	HSH_Kill(oc);
 	return (F_STP_DONE);
 }
 
diff --git a/bin/varnishd/cache/cache_obj.c b/bin/varnishd/cache/cache_obj.c
index 4c40d3b8dd..172e29a8bd 100644
--- a/bin/varnishd/cache/cache_obj.c
+++ b/bin/varnishd/cache/cache_obj.c
@@ -86,6 +86,7 @@
 
 #include "cache_varnishd.h"
 #include "cache_obj.h"
+#include "cache_objhead.h"
 #include "vend.h"
 #include "storage/storage.h"
 
@@ -490,8 +491,7 @@ ObjVAICancel(struct worker *wrk, struct boc *boc, struct vai_qe *qe)
  */
 
 void
-ObjSetState(struct worker *wrk, const struct objcore *oc,
-    enum boc_state_e next)
+ObjSetState(struct worker *wrk, struct objcore *oc, enum boc_state_e next)
 {
 	const struct obj_methods *om;
 
@@ -500,7 +500,6 @@ ObjSetState(struct worker *wrk, const struct objcore *oc,
 	assert(next > oc->boc->state);
 
 	CHECK_OBJ_ORNULL(oc->stobj->stevedore, STEVEDORE_MAGIC);
-	assert(next != BOS_STREAM || oc->boc->state == BOS_PREP_STREAM);
 	assert(next != BOS_FINISHED || (oc->oa_present & (1 << OA_LEN)));
 
 	if (oc->stobj->stevedore != NULL) {
@@ -509,6 +508,11 @@ ObjSetState(struct worker *wrk, const struct objcore *oc,
 			om->objsetstate(wrk, oc, next);
 	}
 
+	if (next == BOS_FAILED)
+		HSH_Fail(wrk, oc);
+	else if (oc->boc->state < BOS_STREAM && next >= BOS_STREAM)
+		HSH_Unbusy(wrk, oc);
+
 	Lck_Lock(&oc->boc->mtx);
 	oc->boc->state = next;
 	obj_boc_notify(oc->boc);
diff --git a/bin/varnishd/cache/cache_varnishd.h b/bin/varnishd/cache/cache_varnishd.h
index 2892ef188d..2cd5cd3dc9 100644
--- a/bin/varnishd/cache/cache_varnishd.h
+++ b/bin/varnishd/cache/cache_varnishd.h
@@ -342,8 +342,7 @@ int ObjGetSpace(struct worker *, struct objcore *, ssize_t *sz, uint8_t **ptr);
 void ObjExtend(struct worker *, struct objcore *, ssize_t l, int final);
 uint64_t ObjWaitExtend(const struct worker *, const struct objcore *,
     uint64_t l, enum boc_state_e *statep);
-void ObjSetState(struct worker *, const struct objcore *,
-    enum boc_state_e next);
+void ObjSetState(struct worker *, struct objcore *, enum boc_state_e next);
 void ObjWaitState(const struct objcore *, enum boc_state_e want);
 void ObjTouch(struct worker *, struct objcore *, vtim_real now);
 void ObjFreeObj(struct worker *, struct objcore *);
diff --git a/include/tbl/boc_state.h b/include/tbl/boc_state.h
index 44984de81b..2e04136606 100644
--- a/include/tbl/boc_state.h
+++ b/include/tbl/boc_state.h
@@ -32,7 +32,6 @@
 
 BOC_STATE(INVALID,	invalid)	/* don't touch (yet) */
 BOC_STATE(REQ_DONE,	req_done)	/* bereq.* can be examined */
-BOC_STATE(PREP_STREAM,	prep_stream)	/* Prepare for streaming */
 BOC_STATE(STREAM,	stream)		/* beresp.* can be examined */
 BOC_STATE(FINISHED,	finished)	/* object is complete */
 BOC_STATE(FAILED,	failed)		/* something went wrong */

From 4a16b529de9a8fe74335dd3ee0e11246416534cd Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Tue, 19 Dec 2023 15:18:02 +0100
Subject: [PATCH 05/14] hash: Trigger a rush when an objcore is withdrawn

This is the counterpart of HSH_Unbusy() for the cases where the req task
will not schedule a fetch task, at which point we know we can already
wake up a request, if there is one on the waiting list.

This encapsulates the "wake up only one request" semantic using a new
OC_F_WITHDRAWN flag. Although this flag is not being used yet, it will
when the state of the objcore determines the rush policy instead of the
call site.

When a bgfetch is withdrawn, the extra rush becomes redundant with the
policy rush on the hit objcore. There will eventually be no more rush
for the hit objcore, which will remove the redundancy just introduced.

In order to withdraw objcores in a single critical section, a function
HSH_DerefObjCoreUnlock() is added.
---
 bin/varnishd/cache/cache_hash.c    | 50 +++++++++++++++++++++++++++++-
 bin/varnishd/cache/cache_objhead.h |  2 ++
 bin/varnishd/cache/cache_req_fsm.c |  6 ++--
 include/tbl/oc_flags.h             |  3 +-
 4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index ed8244f2ab..69c7da8fde 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -908,6 +908,37 @@ HSH_Cancel(struct worker *wrk, struct objcore *oc, struct boc *boc)
 	ObjSlim(wrk, oc);
 }
 
+/*---------------------------------------------------------------------
+ * Withdraw an objcore that will not proceed with a fetch.
+ */
+
+void
+HSH_Withdraw(struct worker *wrk, struct objcore **ocp)
+{
+	struct objhead *oh;
+	struct objcore *oc;
+	struct rush rush;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	TAKE_OBJ_NOTNULL(oc, ocp, OBJCORE_MAGIC);
+	INIT_OBJ(&rush, RUSH_MAGIC);
+
+	oh = oc->objhead;
+	CHECK_OBJ(oh, OBJHEAD_MAGIC);
+
+	Lck_Lock(&oh->mtx);
+	AZ(oc->stobj->stevedore);
+	AN(oc->flags & OC_F_BUSY);
+	assert(oc->refcnt == 1);
+	assert(oh->refcnt > 0);
+	oc->flags &= ~OC_F_BUSY;
+	oc->flags |= OC_F_WITHDRAWN;
+	hsh_rush1(wrk, oh, &rush, 1);
+	AZ(HSH_DerefObjCoreUnlock(wrk, &oc, 0));
+
+	hsh_rush2(wrk, &rush);
+}
+
 /*---------------------------------------------------------------------
  * Unbusy an objcore when the object is completely fetched.
  */
@@ -1094,6 +1125,23 @@ HSH_DerefBoc(struct worker *wrk, struct objcore *oc)
 
 int
 HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp, int rushmax)
+{
+	struct objcore *oc;
+	struct objhead *oh;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	TAKE_OBJ_NOTNULL(oc, ocp, OBJCORE_MAGIC);
+	assert(oc->refcnt > 0);
+
+	oh = oc->objhead;
+	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
+
+	Lck_Lock(&oh->mtx);
+	return (HSH_DerefObjCoreUnlock(wrk, &oc, rushmax));
+}
+
+int
+HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
 {
 	struct objcore *oc;
 	struct objhead *oh;
@@ -1108,7 +1156,7 @@ HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp, int rushmax)
 	oh = oc->objhead;
 	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 
-	Lck_Lock(&oh->mtx);
+	Lck_AssertHeld(&oh->mtx);
 	assert(oh->refcnt > 0);
 	r = --oc->refcnt;
 	if (!r)
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index 866dbc4a22..8d598d7b14 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -69,6 +69,7 @@ void HSH_Kill(struct objcore *);
 void HSH_Replace(struct objcore *, const struct objcore *);
 void HSH_Insert(struct worker *, const void *hash, struct objcore *,
     struct ban *);
+void HSH_Withdraw(struct worker *, struct objcore **);
 void HSH_Fail(struct worker *, struct objcore *);
 void HSH_Unbusy(struct worker *, struct objcore *);
 int HSH_Snipe(const struct worker *, struct objcore *);
@@ -79,6 +80,7 @@ void HSH_DeleteObjHead(const struct worker *, struct objhead *);
 int HSH_DerefObjCore(struct worker *, struct objcore **, int rushmax);
 #define HSH_RUSH_POLICY -1
 
+int HSH_DerefObjCoreUnlock(struct worker *, struct objcore **, int rushmax);
 enum lookup_e HSH_Lookup(struct req *, struct objcore **, struct objcore **);
 void HSH_Ref(struct objcore *o);
 void HSH_AddString(struct req *, void *ctx, const char *str);
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index a8fa68be92..fa00acfdb0 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -688,7 +688,7 @@ cnt_lookup(struct worker *wrk, struct req *req)
 	(void)HSH_DerefObjCore(wrk, &req->objcore, HSH_RUSH_POLICY);
 
 	if (busy != NULL) {
-		(void)HSH_DerefObjCore(wrk, &busy, 0);
+		HSH_Withdraw(wrk, &busy);
 		VRY_Clear(req);
 	}
 
@@ -736,7 +736,7 @@ cnt_miss(struct worker *wrk, struct req *req)
 	VRY_Clear(req);
 	if (req->stale_oc != NULL)
 		(void)HSH_DerefObjCore(wrk, &req->stale_oc, 0);
-	AZ(HSH_DerefObjCore(wrk, &req->objcore, 1));
+	HSH_Withdraw(wrk, &req->objcore);
 	return (REQ_FSM_MORE);
 }
 
@@ -1111,7 +1111,7 @@ cnt_purge(struct worker *wrk, struct req *req)
 
 	(void)HSH_Purge(wrk, boc->objhead, req->t_req, 0, 0, 0);
 
-	AZ(HSH_DerefObjCore(wrk, &boc, 1));
+	HSH_Withdraw(wrk, &boc);
 
 	VCL_purge_method(req->vcl, wrk, req, NULL, NULL);
 	switch (wrk->vpi->handling) {
diff --git a/include/tbl/oc_flags.h b/include/tbl/oc_flags.h
index 768962fdae..0866cc38f0 100644
--- a/include/tbl/oc_flags.h
+++ b/include/tbl/oc_flags.h
@@ -30,7 +30,8 @@
 
 /*lint -save -e525 -e539 */
 
-OC_FLAG(BUSY,		busy,		(1<<1))		//lint !e835
+OC_FLAG(WITHDRAWN,	withdrawn,	(1<<0))		//lint !e835
+OC_FLAG(BUSY,		busy,		(1<<1))
 OC_FLAG(HFM,		hfm,		(1<<2))
 OC_FLAG(HFP,		hfp,		(1<<3))
 OC_FLAG(CANCEL,		cancel,		(1<<4))

From 4828b95e9a7d84ede19c61f6141206e9ec28f1da Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Tue, 19 Dec 2023 15:24:14 +0100
Subject: [PATCH 06/14] hash: Dropping an objcore does not rush the waiting
 list

Rushing the waiting list because an objcore was dropped was a source
of spurious wakeups, including the case where the dropped objcore is not
even relevant to the objhead waiting list. It is now guaranteed that an
object either dropped its OC_F_BUSY flag once ready (otherwise failed or
withdrawn) or never had this flag to begin with for objcores that can't
be looked up.
---
 bin/varnishd/cache/cache_hash.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 69c7da8fde..ad70639763 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -1145,13 +1145,13 @@ HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
 {
 	struct objcore *oc;
 	struct objhead *oh;
-	struct rush rush;
 	int r;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	TAKE_OBJ_NOTNULL(oc, ocp, OBJCORE_MAGIC);
 	assert(oc->refcnt > 0);
-	INIT_OBJ(&rush, RUSH_MAGIC);
+
+	(void)rushmax;
 
 	oh = oc->objhead;
 	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
@@ -1161,15 +1161,11 @@ HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
 	r = --oc->refcnt;
 	if (!r)
 		VTAILQ_REMOVE(&oh->objcs, oc, hsh_list);
-	if (!VTAILQ_EMPTY(&oh->waitinglist)) {
-		assert(oh->refcnt > 1);
-		hsh_rush1(wrk, oh, &rush, rushmax);
-	}
 	Lck_Unlock(&oh->mtx);
-	hsh_rush2(wrk, &rush);
 	if (r != 0)
 		return (r);
 
+	AZ(oc->flags & OC_F_BUSY);
 	AZ(oc->exp_flags);
 
 	BAN_DestroyObj(oc);

From 7305c9eec527d56c0839dd4a61da1430796afb12 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 22 Jan 2024 17:01:56 +0100
Subject: [PATCH 07/14] hash: Operate waiting list rush on objcores

Operating on an objcore allows to preserve the different rush strategies
that existed so far:

- rush one request for failed fetch tasks
- rush one request for withdrawn objects
- rush rush_exponent requests otherwise

For the cases where no rush is expected, a null objcore is passed.

For cacheable objects, all waiters are temporarily woken up at once,
until waiting list matches are handled separately.
---
 bin/varnishd/cache/cache_hash.c | 57 ++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index ad70639763..5f8d99fad2 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -76,12 +76,12 @@ static const struct hash_slinger *hash;
 #define PRIVATE_OH_EXP 7
 static struct objhead private_ohs[1 << PRIVATE_OH_EXP];
 
-static void hsh_rush1(const struct worker *, struct objhead *,
-    struct rush *, int);
+static void hsh_rush1(const struct worker *, struct objcore *,
+    struct rush *);
 static void hsh_rush2(struct worker *, struct rush *);
 static int hsh_deref_objhead(struct worker *wrk, struct objhead **poh);
 static int hsh_deref_objhead_unlock(struct worker *wrk, struct objhead **poh,
-    int);
+    struct objcore *oc);
 
 /*---------------------------------------------------------------------*/
 
@@ -345,7 +345,7 @@ HSH_Insert(struct worker *wrk, const void *digest, struct objcore *oc,
 	VTAILQ_REMOVE(&oh->objcs, oc, hsh_list);
 	VTAILQ_INSERT_HEAD(&oh->objcs, oc, hsh_list);
 	if (!VTAILQ_EMPTY(&oh->waitinglist))
-		hsh_rush1(wrk, oh, &rush, HSH_RUSH_POLICY);
+		hsh_rush1(wrk, oc, &rush);
 	Lck_Unlock(&oh->mtx);
 	hsh_rush2(wrk, &rush);
 
@@ -535,7 +535,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	if (oc != NULL && oc->flags & OC_F_HFP) {
 		xid = VXID(ObjGetXID(wrk, oc));
 		dttl = EXP_Dttl(req, oc);
-		AN(hsh_deref_objhead_unlock(wrk, &oh, HSH_RUSH_POLICY));
+		AN(hsh_deref_objhead_unlock(wrk, &oh, oc));
 		wrk->stats->cache_hitpass++;
 		VSLb(req->vsl, SLT_HitPass, "%u %.6f", xid, dttl);
 		return (HSH_HITPASS);
@@ -555,7 +555,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 		}
 		oc->hits++;
 		boc_progress = oc->boc == NULL ? -1 : oc->boc->fetched_so_far;
-		AN(hsh_deref_objhead_unlock(wrk, &oh, HSH_RUSH_POLICY));
+		AN(hsh_deref_objhead_unlock(wrk, &oh, oc));
 		Req_LogHit(wrk, req, oc, boc_progress);
 		return (HSH_HIT);
 	}
@@ -604,7 +604,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 		exp_oc->refcnt++;
 		*ocp = exp_oc;
 		exp_oc->hits++;
-		AN(hsh_deref_objhead_unlock(wrk, &oh, 0));
+		AN(hsh_deref_objhead_unlock(wrk, &oh, NULL));
 		Req_LogHit(wrk, req, exp_oc, boc_progress);
 		return (HSH_GRACE);
 	}
@@ -637,22 +637,34 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
  */
 
 static void
-hsh_rush1(const struct worker *wrk, struct objhead *oh, struct rush *r, int max)
+hsh_rush1(const struct worker *wrk, struct objcore *oc, struct rush *r)
 {
-	int i;
+	struct objhead *oh;
 	struct req *req;
-
-	if (max == 0)
-		return;
-	if (max == HSH_RUSH_POLICY)
-		max = cache_param->rush_exponent;
-	assert(max > 0);
+	int i, max;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
+	CHECK_OBJ_ORNULL(oc, OBJCORE_MAGIC);
 	CHECK_OBJ_NOTNULL(r, RUSH_MAGIC);
 	VTAILQ_INIT(&r->reqs);
+
+	if (oc == NULL)
+		return;
+
+	oh = oc->objhead;
+	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 	Lck_AssertHeld(&oh->mtx);
+
+	AZ(oc->flags & OC_F_BUSY);
+	AZ(oc->flags & OC_F_PRIVATE);
+	if (oc->flags & (OC_F_WITHDRAWN|OC_F_FAILED))
+		max = 1;
+	else if (oc->flags & (OC_F_HFM|OC_F_HFP|OC_F_CANCEL|OC_F_DYING))
+		max = cache_param->rush_exponent;
+	else
+		max = INT_MAX; /* XXX: temp */
+	assert(max > 0);
+
 	for (i = 0; i < max; i++) {
 		req = VTAILQ_FIRST(&oh->waitinglist);
 		if (req == NULL)
@@ -847,7 +859,7 @@ HSH_Fail(struct worker *wrk, struct objcore *oc)
 	oc->flags |= OC_F_FAILED;
 	if (oc->flags & OC_F_BUSY) {
 		oc->flags &= ~OC_F_BUSY;
-		hsh_rush1(wrk, oh, &rush, 1);
+		hsh_rush1(wrk, oc, &rush);
 	}
 	Lck_Unlock(&oh->mtx);
 	hsh_rush2(wrk, &rush);
@@ -933,7 +945,7 @@ HSH_Withdraw(struct worker *wrk, struct objcore **ocp)
 	assert(oh->refcnt > 0);
 	oc->flags &= ~OC_F_BUSY;
 	oc->flags |= OC_F_WITHDRAWN;
-	hsh_rush1(wrk, oh, &rush, 1);
+	hsh_rush1(wrk, oc, &rush);
 	AZ(HSH_DerefObjCoreUnlock(wrk, &oc, 0));
 
 	hsh_rush2(wrk, &rush);
@@ -982,7 +994,7 @@ HSH_Unbusy(struct worker *wrk, struct objcore *oc)
 	oc->flags &= ~OC_F_BUSY;
 	if (!VTAILQ_EMPTY(&oh->waitinglist)) {
 		assert(oh->refcnt > 1);
-		hsh_rush1(wrk, oh, &rush, HSH_RUSH_POLICY);
+		hsh_rush1(wrk, oc, &rush);
 	}
 	Lck_Unlock(&oh->mtx);
 	EXP_Insert(wrk, oc);
@@ -1182,7 +1194,8 @@ HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
 }
 
 static int
-hsh_deref_objhead_unlock(struct worker *wrk, struct objhead **poh, int max)
+hsh_deref_objhead_unlock(struct worker *wrk, struct objhead **poh,
+    struct objcore *oc)
 {
 	struct objhead *oh;
 	struct rush rush;
@@ -1211,7 +1224,7 @@ hsh_deref_objhead_unlock(struct worker *wrk, struct objhead **poh, int max)
 	INIT_OBJ(&rush, RUSH_MAGIC);
 	if (!VTAILQ_EMPTY(&oh->waitinglist)) {
 		assert(oh->refcnt > 1);
-		hsh_rush1(wrk, oh, &rush, max);
+		hsh_rush1(wrk, oc, &rush);
 	}
 
 	if (oh->refcnt == 1)
@@ -1232,7 +1245,7 @@ hsh_deref_objhead(struct worker *wrk, struct objhead **poh)
 	TAKE_OBJ_NOTNULL(oh, poh, OBJHEAD_MAGIC);
 
 	Lck_Lock(&oh->mtx);
-	return (hsh_deref_objhead_unlock(wrk, &oh, 0));
+	return (hsh_deref_objhead_unlock(wrk, &oh, NULL));
 }
 
 void

From 5c3aa38c3aceca974ad915d517ea7d512398939e Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Tue, 19 Dec 2023 10:35:23 +0100
Subject: [PATCH 08/14] hash: Retire HSH_RUSH_POLICY

From now on, the policy will be derived from the objcore.
---
 bin/varnishd/cache/cache_ban_lurker.c          |  2 +-
 bin/varnishd/cache/cache_busyobj.c             |  3 +--
 bin/varnishd/cache/cache_expire.c              |  6 +++---
 bin/varnishd/cache/cache_fetch.c               |  8 ++++----
 bin/varnishd/cache/cache_hash.c                | 12 +++++-------
 bin/varnishd/cache/cache_objhead.h             |  7 ++-----
 bin/varnishd/cache/cache_req_body.c            | 12 ++++++------
 bin/varnishd/cache/cache_req_fsm.c             | 16 ++++++++--------
 bin/varnishd/cache/cache_vrt_var.c             |  2 +-
 bin/varnishd/storage/storage_lru.c             |  2 +-
 bin/varnishd/storage/storage_persistent_silo.c |  2 +-
 11 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/bin/varnishd/cache/cache_ban_lurker.c b/bin/varnishd/cache/cache_ban_lurker.c
index 3057312c08..a36bc32b2c 100644
--- a/bin/varnishd/cache/cache_ban_lurker.c
+++ b/bin/varnishd/cache/cache_ban_lurker.c
@@ -333,7 +333,7 @@ ban_lurker_test_ban(struct worker *wrk, struct ban *bt,
 			if (i)
 				ObjSendEvent(wrk, oc, OEV_BANCHG);
 		}
-		(void)HSH_DerefObjCore(wrk, &oc, 0);
+		(void)HSH_DerefObjCore(wrk, &oc);
 	}
 }
 
diff --git a/bin/varnishd/cache/cache_busyobj.c b/bin/varnishd/cache/cache_busyobj.c
index 42332ecd77..c8814a7bb3 100644
--- a/bin/varnishd/cache/cache_busyobj.c
+++ b/bin/varnishd/cache/cache_busyobj.c
@@ -157,8 +157,7 @@ VBO_ReleaseBusyObj(struct worker *wrk, struct busyobj **pbo)
 		wrk->stats->ws_backend_overflow++;
 
 	if (bo->fetch_objcore != NULL) {
-		(void)HSH_DerefObjCore(wrk, &bo->fetch_objcore,
-		    HSH_RUSH_POLICY);
+		(void)HSH_DerefObjCore(wrk, &bo->fetch_objcore);
 	}
 
 	VRT_Assign_Backend(&bo->director_req, NULL);
diff --git a/bin/varnishd/cache/cache_expire.c b/bin/varnishd/cache/cache_expire.c
index 8e459140b9..75a45bc5a2 100644
--- a/bin/varnishd/cache/cache_expire.c
+++ b/bin/varnishd/cache/cache_expire.c
@@ -213,7 +213,7 @@ EXP_Insert(struct worker *wrk, struct objcore *oc)
 		ObjSendEvent(wrk, oc, OEV_EXPIRE);
 		tmpoc = oc;
 		assert(oc->refcnt >= 2); /* Silence coverity */
-		(void)HSH_DerefObjCore(wrk, &oc, 0);
+		(void)HSH_DerefObjCore(wrk, &oc);
 		AZ(oc);
 		assert(tmpoc->refcnt >= 1); /* Silence coverity */
 	}
@@ -309,7 +309,7 @@ exp_inbox(struct exp_priv *ep, struct objcore *oc, unsigned flags, vtim_real now
 		    VXID(ObjGetXID(ep->wrk, oc)), EXP_Ttl(NULL, oc) - now,
 		    (intmax_t)oc->hits);
 		ObjSendEvent(ep->wrk, oc, OEV_EXPIRE);
-		(void)HSH_DerefObjCore(ep->wrk, &oc, 0);
+		(void)HSH_DerefObjCore(ep->wrk, &oc);
 		return;
 	}
 
@@ -387,7 +387,7 @@ exp_expire(struct exp_priv *ep, vtim_real now)
 		    VXID(ObjGetXID(ep->wrk, oc)), EXP_Ttl(NULL, oc) - now,
 		    (intmax_t)oc->hits);
 		ObjSendEvent(ep->wrk, oc, OEV_EXPIRE);
-		(void)HSH_DerefObjCore(ep->wrk, &oc, 0);
+		(void)HSH_DerefObjCore(ep->wrk, &oc);
 	}
 	return (0);
 }
diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index f005168125..8af011fb72 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -1122,7 +1122,7 @@ vbf_fetch_thread(struct worker *wrk, void *priv)
 	http_Teardown(bo->beresp);
 	// cannot make assumptions about the number of references here #3434
 	if (bo->bereq_body != NULL)
-		(void) HSH_DerefObjCore(bo->wrk, &bo->bereq_body, 0);
+		(void)HSH_DerefObjCore(bo->wrk, &bo->bereq_body);
 
 	if (oc->boc->state == BOS_FINISHED) {
 		AZ(oc->flags & OC_F_FAILED);
@@ -1132,7 +1132,7 @@ vbf_fetch_thread(struct worker *wrk, void *priv)
 	// AZ(oc->boc);	// XXX
 
 	if (bo->stale_oc != NULL)
-		(void)HSH_DerefObjCore(wrk, &bo->stale_oc, 0);
+		(void)HSH_DerefObjCore(wrk, &bo->stale_oc);
 
 	wrk->vsl = NULL;
 	HSH_DerefBoc(wrk, oc);
@@ -1224,7 +1224,7 @@ VBF_Fetch(struct worker *wrk, struct req *req, struct objcore *oc,
 		    "No thread available for bgfetch");
 		(void)vbf_stp_fail(req->wrk, bo);
 		if (bo->stale_oc != NULL)
-			(void)HSH_DerefObjCore(wrk, &bo->stale_oc, 0);
+			(void)HSH_DerefObjCore(wrk, &bo->stale_oc);
 		HSH_DerefBoc(wrk, oc);
 		SES_Rel(bo->sp);
 		THR_SetBusyobj(NULL);
@@ -1247,5 +1247,5 @@ VBF_Fetch(struct worker *wrk, struct req *req, struct objcore *oc,
 	assert(oc->boc == boc);
 	HSH_DerefBoc(wrk, oc);
 	if (mode == VBF_BACKGROUND)
-		(void)HSH_DerefObjCore(wrk, &oc, HSH_RUSH_POLICY);
+		(void)HSH_DerefObjCore(wrk, &oc);
 }
diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 5f8d99fad2..14259fa5de 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -794,7 +794,7 @@ HSH_Purge(struct worker *wrk, struct objhead *oh, vtim_real ttl_now,
 				EXP_Remove(ocp[i], NULL);
 			else
 				EXP_Reduce(ocp[i], ttl_now, ttl, grace, keep);
-			(void)HSH_DerefObjCore(wrk, &ocp[i], 0);
+			(void)HSH_DerefObjCore(wrk, &ocp[i]);
 			AZ(ocp[i]);
 			total++;
 		}
@@ -946,7 +946,7 @@ HSH_Withdraw(struct worker *wrk, struct objcore **ocp)
 	oc->flags &= ~OC_F_BUSY;
 	oc->flags |= OC_F_WITHDRAWN;
 	hsh_rush1(wrk, oc, &rush);
-	AZ(HSH_DerefObjCoreUnlock(wrk, &oc, 0));
+	AZ(HSH_DerefObjCoreUnlock(wrk, &oc));
 
 	hsh_rush2(wrk, &rush);
 }
@@ -1136,7 +1136,7 @@ HSH_DerefBoc(struct worker *wrk, struct objcore *oc)
  */
 
 int
-HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp, int rushmax)
+HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp)
 {
 	struct objcore *oc;
 	struct objhead *oh;
@@ -1149,11 +1149,11 @@ HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp, int rushmax)
 	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 
 	Lck_Lock(&oh->mtx);
-	return (HSH_DerefObjCoreUnlock(wrk, &oc, rushmax));
+	return (HSH_DerefObjCoreUnlock(wrk, &oc));
 }
 
 int
-HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
+HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp)
 {
 	struct objcore *oc;
 	struct objhead *oh;
@@ -1163,8 +1163,6 @@ HSH_DerefObjCoreUnlock(struct worker *wrk, struct objcore **ocp, int rushmax)
 	TAKE_OBJ_NOTNULL(oc, ocp, OBJCORE_MAGIC);
 	assert(oc->refcnt > 0);
 
-	(void)rushmax;
-
 	oh = oc->objhead;
 	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index 8d598d7b14..94ef81db52 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -76,11 +76,8 @@ int HSH_Snipe(const struct worker *, struct objcore *);
 struct boc *HSH_RefBoc(const struct objcore *);
 void HSH_DerefBoc(struct worker *wrk, struct objcore *);
 void HSH_DeleteObjHead(const struct worker *, struct objhead *);
-
-int HSH_DerefObjCore(struct worker *, struct objcore **, int rushmax);
-#define HSH_RUSH_POLICY -1
-
-int HSH_DerefObjCoreUnlock(struct worker *, struct objcore **, int rushmax);
+int HSH_DerefObjCore(struct worker *, struct objcore **);
+int HSH_DerefObjCoreUnlock(struct worker *, struct objcore **);
 enum lookup_e HSH_Lookup(struct req *, struct objcore **, struct objcore **);
 void HSH_Ref(struct objcore *o);
 void HSH_AddString(struct req *, void *ctx, const char *str);
diff --git a/bin/varnishd/cache/cache_req_body.c b/bin/varnishd/cache/cache_req_body.c
index 9a4cd54d7e..cd19421636 100644
--- a/bin/varnishd/cache/cache_req_body.c
+++ b/bin/varnishd/cache/cache_req_body.c
@@ -80,7 +80,7 @@ vrb_pull(struct req *req, ssize_t maxsize, objiterate_f *func, void *priv)
 	if (STV_NewObject(req->wrk, req->body_oc, stv, 0) == 0) {
 		req->req_body_status = BS_ERROR;
 		HSH_DerefBoc(req->wrk, req->body_oc);
-		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc, 0));
+		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc));
 		(void)VFP_Error(vfc, "Object allocation failed:"
 		    " Ran out of space in %s", stv->vclname);
 		return (-1);
@@ -96,7 +96,7 @@ vrb_pull(struct req *req, ssize_t maxsize, objiterate_f *func, void *priv)
 		(void)VFP_Error(vfc, "req.body filters failed");
 		req->req_body_status = BS_ERROR;
 		HSH_DerefBoc(req->wrk, req->body_oc);
-		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc, 0));
+		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc));
 		return (-1);
 	}
 
@@ -104,7 +104,7 @@ vrb_pull(struct req *req, ssize_t maxsize, objiterate_f *func, void *priv)
 	if (VFP_Open(ctx, vfc) < 0) {
 		req->req_body_status = BS_ERROR;
 		HSH_DerefBoc(req->wrk, req->body_oc);
-		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc, 0));
+		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc));
 		return (-1);
 	}
 
@@ -152,7 +152,7 @@ vrb_pull(struct req *req, ssize_t maxsize, objiterate_f *func, void *priv)
 	VSLb_ts_req(req, "ReqBody", VTIM_real());
 	if (func != NULL) {
 		HSH_DerefBoc(req->wrk, req->body_oc);
-		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc, 0));
+		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc));
 		if (vfps == VFP_END && r == 0 && (flush & OBJ_ITER_END) == 0)
 			r = func(priv, flush | OBJ_ITER_END, NULL, 0);
 		if (vfps != VFP_END) {
@@ -168,7 +168,7 @@ vrb_pull(struct req *req, ssize_t maxsize, objiterate_f *func, void *priv)
 
 	if (vfps != VFP_END) {
 		req->req_body_status = BS_ERROR;
-		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc, 0));
+		AZ(HSH_DerefObjCore(req->wrk, &req->body_oc));
 		return (-1);
 	}
 
@@ -289,7 +289,7 @@ VRB_Free(struct req *req)
 	if (req->body_oc == NULL)
 		return;
 
-	r = HSH_DerefObjCore(req->wrk, &req->body_oc, 0);
+	r = HSH_DerefObjCore(req->wrk, &req->body_oc);
 
 	// each busyobj may have gained a reference
 	assert (r >= 0);
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index fa00acfdb0..77481a41da 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -222,7 +222,7 @@ cnt_deliver(struct worker *wrk, struct req *req)
 	ObjTouch(req->wrk, req->objcore, req->t_prev);
 
 	if (Resp_Setup_Deliver(req)) {
-		(void)HSH_DerefObjCore(wrk, &req->objcore, HSH_RUSH_POLICY);
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		req->err_code = 500;
 		req->req_step = R_STP_SYNTH;
 		return (REQ_FSM_MORE);
@@ -240,7 +240,7 @@ cnt_deliver(struct worker *wrk, struct req *req)
 
 	if (wrk->vpi->handling != VCL_RET_DELIVER) {
 		HSH_Cancel(wrk, req->objcore, NULL);
-		(void)HSH_DerefObjCore(wrk, &req->objcore, HSH_RUSH_POLICY);
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		http_Teardown(req->resp);
 
 		switch (wrk->vpi->handling) {
@@ -413,7 +413,7 @@ cnt_synth(struct worker *wrk, struct req *req)
 		VSLb(req->vsl, SLT_Error, "Could not get storage");
 		req->doclose = SC_OVERLOAD;
 		VSLb_ts_req(req, "Resp", W_TIM_real(wrk));
-		(void)HSH_DerefObjCore(wrk, &req->objcore, 1);
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		http_Teardown(req->resp);
 		return (REQ_FSM_DONE);
 	}
@@ -529,7 +529,7 @@ cnt_finish(struct worker *wrk, struct req *req)
 		req->boc = NULL;
 	}
 
-	(void)HSH_DerefObjCore(wrk, &req->objcore, HSH_RUSH_POLICY);
+	(void)HSH_DerefObjCore(wrk, &req->objcore);
 	http_Teardown(req->resp);
 
 	req->vdp_filter_list = NULL;
@@ -557,7 +557,7 @@ cnt_fetch(struct worker *wrk, struct req *req)
 	if (req->objcore->flags & OC_F_FAILED) {
 		req->err_code = 503;
 		req->req_step = R_STP_SYNTH;
-		(void)HSH_DerefObjCore(wrk, &req->objcore, 1);
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		AZ(req->objcore);
 		return (REQ_FSM_MORE);
 	}
@@ -685,7 +685,7 @@ cnt_lookup(struct worker *wrk, struct req *req)
 	}
 
 	/* Drop our object, we won't need it */
-	(void)HSH_DerefObjCore(wrk, &req->objcore, HSH_RUSH_POLICY);
+	(void)HSH_DerefObjCore(wrk, &req->objcore);
 
 	if (busy != NULL) {
 		HSH_Withdraw(wrk, &busy);
@@ -715,7 +715,7 @@ cnt_miss(struct worker *wrk, struct req *req)
 		wrk->stats->cache_miss++;
 		VBF_Fetch(wrk, req, req->objcore, req->stale_oc, VBF_NORMAL);
 		if (req->stale_oc != NULL)
-			(void)HSH_DerefObjCore(wrk, &req->stale_oc, 0);
+			(void)HSH_DerefObjCore(wrk, &req->stale_oc);
 		req->req_step = R_STP_FETCH;
 		return (REQ_FSM_MORE);
 	case VCL_RET_FAIL:
@@ -735,7 +735,7 @@ cnt_miss(struct worker *wrk, struct req *req)
 	}
 	VRY_Clear(req);
 	if (req->stale_oc != NULL)
-		(void)HSH_DerefObjCore(wrk, &req->stale_oc, 0);
+		(void)HSH_DerefObjCore(wrk, &req->stale_oc);
 	HSH_Withdraw(wrk, &req->objcore);
 	return (REQ_FSM_MORE);
 }
diff --git a/bin/varnishd/cache/cache_vrt_var.c b/bin/varnishd/cache/cache_vrt_var.c
index b01b974602..4fcdf52c6f 100644
--- a/bin/varnishd/cache/cache_vrt_var.c
+++ b/bin/varnishd/cache/cache_vrt_var.c
@@ -644,7 +644,7 @@ VRT_u_bereq_body(VRT_CTX)
 	http_Unset(ctx->bo->bereq, H_Content_Length);
 
 	if (ctx->bo->bereq_body != NULL)
-		(void)HSH_DerefObjCore(ctx->bo->wrk, &ctx->bo->bereq_body, 0);
+		(void)HSH_DerefObjCore(ctx->bo->wrk, &ctx->bo->bereq_body);
 
 	if (ctx->bo->req != NULL) {
 		CHECK_OBJ(ctx->bo->req, REQ_MAGIC);
diff --git a/bin/varnishd/storage/storage_lru.c b/bin/varnishd/storage/storage_lru.c
index 5e046e7946..ec0369c817 100644
--- a/bin/varnishd/storage/storage_lru.c
+++ b/bin/varnishd/storage/storage_lru.c
@@ -205,6 +205,6 @@ LRU_NukeOne(struct worker *wrk, struct lru *lru)
 	ObjSlim(wrk, oc);
 
 	VSLb(wrk->vsl, SLT_ExpKill, "LRU xid=%ju", VXID(ObjGetXID(wrk, oc)));
-	(void)HSH_DerefObjCore(wrk, &oc, 0);	// Ref from HSH_Snipe
+	(void)HSH_DerefObjCore(wrk, &oc);	// Ref from HSH_Snipe
 	return (1);
 }
diff --git a/bin/varnishd/storage/storage_persistent_silo.c b/bin/varnishd/storage/storage_persistent_silo.c
index 81c24b4367..eb96acd3ad 100644
--- a/bin/varnishd/storage/storage_persistent_silo.c
+++ b/bin/varnishd/storage/storage_persistent_silo.c
@@ -178,7 +178,7 @@ smp_load_seg(struct worker *wrk, const struct smp_sc *sc,
 		HSH_Insert(wrk, so->hash, oc, ban);
 		AN(oc->ban);
 		HSH_DerefBoc(wrk, oc);	// XXX Keep it an stream resurrection?
-		(void)HSH_DerefObjCore(wrk, &oc, HSH_RUSH_POLICY);
+		(void)HSH_DerefObjCore(wrk, &oc);
 		wrk->stats->n_vampireobject++;
 	}
 	Pool_Sumstat(wrk);

From bb39efc2331876724e4b662313b54cabdc5eea20 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 22 Jan 2024 23:17:44 +0100
Subject: [PATCH 09/14] hash: Treat a waiting list match as a revalidation

A rush used to be indiscriminate, and it is now operated based on the
state of an objcore, by inspecting its flags. That change of behavior
freed call sites from having to know what kind of rush to operate, and
more importantly removed the need to attempt a rush whenever an objcore
reference was released. It ensured that an objcore would only get the
OC_F_BUSY flag if it was created for the purpose of a fetch, with the
guarantee that a rush would be triggered as soon as the fetch got an
outcome:

- withdrawn (fetch not even attempted)
- failed
- ready (either for streaming, or complete)

All these prior changes built up to this change, allowing the effective
implementation of the unqualified form for the cache-control no-cache
directive.

RFC9112 describe the server-side no-cache directive like this:

> The no-cache response directive, in its unqualified form (without an
> argument), indicates that the response MUST NOT be used to satisfy any
> other request without forwarding it for validation and receiving a
> successful response; see Section 4.3.

This can translate in VCL as beresp.ttl == 0 and beresp.grace == 0, but
still allowing to serve from cache.

Doing this naively leads to an infamous "waiting list serialization"
phenomenon that tends to take time to diagnose and depending on the
setup may be easy to remediate.

One way to solve this from a Varnish perspective is to assimilate the
waiting list as a collection of requests waiting for a response to be
validated or revalidated. In the event of a no-cache directive (or VCL
override to zero beresp.ttl and beresp.grace) and in the absence of
any other criterion preventing an object to be cached, all the requests
can consume the just-validated response.

To make this possible, a few things needed to change. A happens-before
relationship between entering the waiting list and taking part in the
rush is needed, and the time reference for this relationship is the
insertion of a new objcore in the cache. Let's call this time t_insert.

When a rush begins there are two scenarios for requests.

Requests that entered the waiting list before t_insert should evaluate
the objcore, and when applicable, propagate the rush exponentially.

Requests that entered the waiting list after t_insert already evaluated
the objcore because by the time a rush begins, the OC_F_BUSY flag got
cleared already. They should not evaluate the objcore again, and another
incoming fetch outcome will eventually wake them up.

What guarantees the happens-before relationship is that t_insert happens
under the objhead lock.

The relationship is effectively implemented with a generation counter
for the objhead. Requests inherit the generation when they enter the
waiting list. Objcores inherit the generation when they begin the rush.

This allows the exponential rush propagation to know when to stop, and
the objcore can be evaluated outside of the lookup critical section.
Only incompatible variants should force requests to reenter the waiting
list, reducing the scope for the serialization problem to a profusion
of variants (like the classic 'Vary: User-Agent' pitfall).

Since most spurious rushes at every corner of objhead activity are gone,
this change puts all the spurious activity on the incompatible variants
alone instead of all objects on more occasions.

If a cacheable object was inserted in the cache, but already expired,
this behavior enables cache hits. This can be common with multi-tier
Varnish setups where one Varnish server may serve a graced object to
another, but true of any origin server that may serve stale-yet-valid
responses.

The waiting list enables a proper response-wide no-cache behavior from
now on, but the built-in VCL prevents it by default. This is also the
first step towards implementing no-cache and private support at the
header field granularity.
---
 bin/varnishd/cache/cache.h         |   5 +-
 bin/varnishd/cache/cache_hash.c    | 119 ++++++++++++++++++----
 bin/varnishd/cache/cache_objhead.h |   1 +
 bin/varnishd/cache/cache_req_fsm.c |  15 +--
 bin/varnishd/cache/cache_vary.c    |   3 +-
 bin/varnishtest/tests/c00125.vtc   | 156 +++++++++++++++++++++++++++++
 include/tbl/req_flags.h            |   1 -
 7 files changed, 267 insertions(+), 33 deletions(-)
 create mode 100644 bin/varnishtest/tests/c00125.vtc

diff --git a/bin/varnishd/cache/cache.h b/bin/varnishd/cache/cache.h
index fd6cdfab82..fc5b9cf933 100644
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -358,6 +358,7 @@ struct objcore {
 	uint16_t		oa_present;
 
 	unsigned		timer_idx;	// XXX 4Gobj limit
+	unsigned		waitinglist_gen;
 	vtim_real		last_lru;
 	VTAILQ_ENTRY(objcore)	hsh_list;
 	VTAILQ_ENTRY(objcore)	lru_list;
@@ -474,6 +475,7 @@ struct req {
 	stream_close_t		doclose;
 	unsigned		restarts;
 	unsigned		max_restarts;
+	unsigned		waitinglist_gen;
 
 	const struct req_step	*req_step;
 	struct reqtop		*top;	/* esi_level == 0 request */
@@ -495,9 +497,6 @@ struct req {
 
 	struct objcore		*body_oc;
 
-	/* The busy objhead we sleep on */
-	struct objhead		*hash_objhead;
-
 	/* Built Vary string == workspace reservation */
 	uint8_t			*vary_b;
 	uint8_t			*vary_e;
diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 14259fa5de..4d3b5e290f 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -101,6 +101,7 @@ hsh_initobjhead(struct objhead *oh)
 	XXXAN(oh);
 	INIT_OBJ(oh, OBJHEAD_MAGIC);
 	oh->refcnt = 1;
+	oh->waitinglist_gen = 1;
 	VTAILQ_INIT(&oh->objcs);
 	VTAILQ_INIT(&oh->waitinglist);
 	Lck_New(&oh->mtx, lck_objhdr);
@@ -377,6 +378,42 @@ hsh_insert_busyobj(const struct worker *wrk, struct objhead *oh)
 	return (oc);
 }
 
+/*---------------------------------------------------------------------
+ */
+
+static unsigned
+hsh_rush_match(struct req *req)
+{
+	struct objhead *oh;
+	struct objcore *oc;
+	const uint8_t *vary;
+
+	oc = req->objcore;
+	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
+	assert(oc->refcnt > 0);
+
+	AZ(oc->flags & OC_F_BUSY);
+	AZ(oc->flags & OC_F_PRIVATE);
+	if (oc->flags & (OC_F_WITHDRAWN|OC_F_HFM|OC_F_HFP|OC_F_CANCEL|
+	    OC_F_FAILED))
+		return (0);
+
+	if (req->vcf != NULL) /* NB: must operate under oh lock. */
+		return (0);
+
+	oh = oc->objhead;
+	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
+
+	if (req->hash_ignore_vary)
+		return (1);
+	if (!ObjHasAttr(req->wrk, oc, OA_VARY))
+		return (1);
+
+	vary = ObjGetAttr(req->wrk, oc, OA_VARY, NULL);
+	AN(vary);
+	return (VRY_Match(req, vary));
+}
+
 /*---------------------------------------------------------------------
  */
 
@@ -407,6 +444,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(wrk->wpriv, WORKER_PRIV_MAGIC);
 	CHECK_OBJ_NOTNULL(req->http, HTTP_MAGIC);
+	CHECK_OBJ_ORNULL(req->objcore, OBJCORE_MAGIC);
 	CHECK_OBJ_ORNULL(req->vcf, VCF_MAGIC);
 	AN(hash);
 
@@ -414,15 +452,33 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	if (DO_DEBUG(DBG_HASHEDGE))
 		hsh_testmagic(req->digest);
 
-	if (req->hash_objhead != NULL) {
-		/*
-		 * This req came off the waiting list, and brings an
-		 * oh refcnt with it.
+	/*
+	 * When a req rushes off the waiting list, it brings an implicit
+	 * oh refcnt acquired at disembark time and an oc ref (with its
+	 * own distinct oh ref) acquired during rush hour.
+	 */
+
+	if (req->objcore != NULL && hsh_rush_match(req)) {
+		TAKE_OBJ_NOTNULL(oc, &req->objcore, OBJCORE_MAGIC);
+		*ocp = oc;
+		oh = oc->objhead;
+		Lck_Lock(&oh->mtx);
+		oc->hits++;
+		boc_progress = oc->boc == NULL ? -1 : oc->boc->fetched_so_far;
+		AN(hsh_deref_objhead_unlock(wrk, &oh, oc));
+		Req_LogHit(wrk, req, oc, boc_progress);
+		/* NB: since this hit comes from the waiting list instead of
+		 * a regular lookup, grace is not considered. The object is
+		 * fresh in the context of the waiting list, even expired: it
+		 * was successfully just [re]validated by a fetch task.
 		 */
-		CHECK_OBJ_NOTNULL(req->hash_objhead, OBJHEAD_MAGIC);
-		oh = req->hash_objhead;
+		return (HSH_HIT);
+	}
+
+	if (req->objcore != NULL) {
+		oh = req->objcore->objhead;
+		(void)HSH_DerefObjCore(wrk, &req->objcore);
 		Lck_Lock(&oh->mtx);
-		req->hash_objhead = NULL;
 	} else {
 		AN(wrk->wpriv->nobjhead);
 		oh = hash->lookup(wrk, req->digest, &wrk->wpriv->nobjhead);
@@ -615,13 +671,14 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	AZ(req->hash_ignore_busy);
 
 	/*
-	 * The objhead reference transfers to the sess, we get it
-	 * back when the sess comes off the waiting list and
-	 * calls us again
+	 * The objhead reference is held by req while it is parked on the
+	 * waiting list. The oh pointer is taken back from the objcore that
+	 * triggers a rush of req off the waiting list.
 	 */
-	req->hash_objhead = oh;
+	assert(oh->refcnt > 1);
+
 	req->wrk = NULL;
-	req->waitinglist = 1;
+	req->waitinglist_gen = oh->waitinglist_gen;
 
 	if (DO_DEBUG(DBG_WAITINGLIST))
 		VSLb(req->vsl, SLT_Debug, "on waiting list <%p>", oh);
@@ -657,24 +714,44 @@ hsh_rush1(const struct worker *wrk, struct objcore *oc, struct rush *r)
 
 	AZ(oc->flags & OC_F_BUSY);
 	AZ(oc->flags & OC_F_PRIVATE);
+	max = cache_param->rush_exponent;
 	if (oc->flags & (OC_F_WITHDRAWN|OC_F_FAILED))
 		max = 1;
-	else if (oc->flags & (OC_F_HFM|OC_F_HFP|OC_F_CANCEL|OC_F_DYING))
-		max = cache_param->rush_exponent;
-	else
-		max = INT_MAX; /* XXX: temp */
 	assert(max > 0);
 
+	if (oc->waitinglist_gen == 0) {
+		oc->waitinglist_gen = oh->waitinglist_gen;
+		oh->waitinglist_gen++;
+	}
+
 	for (i = 0; i < max; i++) {
 		req = VTAILQ_FIRST(&oh->waitinglist);
 		if (req == NULL)
 			break;
-		CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-		wrk->stats->busy_wakeup++;
+
+		CHECK_OBJ(req, REQ_MAGIC);
+
+		/* NB: The waiting list is naturally sorted by generation.
+		 *
+		 * Because of the exponential nature of the rush, it is
+		 * possible that new requests enter the waiting list before
+		 * the rush for this oc completes. Because the OC_F_BUSY flag
+		 * was cleared before the beginning of the rush, requests
+		 * from a newer generation already got a chance to evaluate
+		 * oc during a lookup and it didn't match their criteria.
+		 *
+		 * Therefore there's no point propagating the exponential
+		 * rush of this oc when we see a newer generation.
+		 */
+		if (req->waitinglist_gen > oc->waitinglist_gen)
+			break;
+
 		AZ(req->wrk);
 		VTAILQ_REMOVE(&oh->waitinglist, req, w_list);
 		VTAILQ_INSERT_TAIL(&r->reqs, req, w_list);
-		req->waitinglist = 0;
+		req->objcore = oc;
+		oc->refcnt++;
+		wrk->stats->busy_wakeup++;
 	}
 }
 
@@ -945,8 +1022,8 @@ HSH_Withdraw(struct worker *wrk, struct objcore **ocp)
 	assert(oh->refcnt > 0);
 	oc->flags &= ~OC_F_BUSY;
 	oc->flags |= OC_F_WITHDRAWN;
-	hsh_rush1(wrk, oc, &rush);
-	AZ(HSH_DerefObjCoreUnlock(wrk, &oc));
+	hsh_rush1(wrk, oc, &rush); /* grabs up to 1 oc ref */
+	assert(HSH_DerefObjCoreUnlock(wrk, &oc) <= 1);
 
 	hsh_rush2(wrk, &rush);
 }
diff --git a/bin/varnishd/cache/cache_objhead.h b/bin/varnishd/cache/cache_objhead.h
index 94ef81db52..162aaeb7fa 100644
--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -40,6 +40,7 @@ struct objhead {
 	struct lock		mtx;
 	VTAILQ_HEAD(,objcore)	objcs;
 	uint8_t			digest[DIGEST_LEN];
+	unsigned		waitinglist_gen;
 	VTAILQ_HEAD(, req)	waitinglist;
 
 	/*----------------------------------------------------
diff --git a/bin/varnishd/cache/cache_req_fsm.c b/bin/varnishd/cache/cache_req_fsm.c
index 77481a41da..43cb180062 100644
--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -576,20 +576,23 @@ cnt_lookup(struct worker *wrk, struct req *req)
 {
 	struct objcore *oc, *busy;
 	enum lookup_e lr;
-	int had_objhead = 0;
+	int waitinglist_gen;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-	AZ(req->objcore);
 	AZ(req->stale_oc);
 
 	AN(req->vcl);
 
 	VRY_Prep(req);
+	waitinglist_gen = req->waitinglist_gen;
+
+	if (req->waitinglist_gen) {
+		CHECK_OBJ_NOTNULL(req->objcore, OBJCORE_MAGIC);
+		req->waitinglist_gen = 0;
+	} else
+		AZ(req->objcore);
 
-	AZ(req->objcore);
-	if (req->hash_objhead)
-		had_objhead = 1;
 	wrk->strangelove = 0;
 	lr = HSH_Lookup(req, &oc, &busy);
 	if (lr == HSH_BUSY) {
@@ -605,7 +608,7 @@ cnt_lookup(struct worker *wrk, struct req *req)
 	if ((unsigned)wrk->strangelove >= cache_param->vary_notice)
 		VSLb(req->vsl, SLT_Notice, "vsl: High number of variants (%d)",
 		    wrk->strangelove);
-	if (had_objhead)
+	if (waitinglist_gen)
 		VSLb_ts_req(req, "Waitinglist", W_TIM_real(wrk));
 
 	if (req->vcf != NULL) {
diff --git a/bin/varnishd/cache/cache_vary.c b/bin/varnishd/cache/cache_vary.c
index c9c46231b9..672a5b598a 100644
--- a/bin/varnishd/cache/cache_vary.c
+++ b/bin/varnishd/cache/cache_vary.c
@@ -226,8 +226,7 @@ vry_cmp(const uint8_t *v1, const uint8_t *v2)
 void
 VRY_Prep(struct req *req)
 {
-	if (req->hash_objhead == NULL) {
-		/* Not a waiting list return */
+	if (req->waitinglist_gen == 0) {
 		AZ(req->vary_b);
 		AZ(req->vary_e);
 		(void)WS_ReserveAll(req->ws);
diff --git a/bin/varnishtest/tests/c00125.vtc b/bin/varnishtest/tests/c00125.vtc
new file mode 100644
index 0000000000..8fbca4f46b
--- /dev/null
+++ b/bin/varnishtest/tests/c00125.vtc
@@ -0,0 +1,156 @@
+varnishtest "successful expired waiting list hit"
+
+barrier b1 cond 2
+barrier b2 cond 2
+barrier b3 cond 2
+barrier b4 cond 2
+
+
+server s1 {
+	rxreq
+	expect req.http.user-agent == c1
+	expect req.http.bgfetch == false
+	barrier b1 sync
+	barrier b2 sync
+	txresp -hdr "Cache-Control: max-age=60" -hdr "Age: 120"
+
+	rxreq
+	expect req.http.user-agent == c3
+	expect req.http.bgfetch == true
+	txresp
+
+	# The no-cache case only works with a complicit VCL, for now.
+	rxreq
+	expect req.http.user-agent == c4
+	expect req.http.bgfetch == false
+	barrier b3 sync
+	barrier b4 sync
+	txresp -hdr "Cache-Control: no-cache"
+
+	rxreq
+	expect req.http.user-agent == c6
+	expect req.http.bgfetch == false
+	txresp -hdr "Cache-Control: no-cache"
+} -start
+
+varnish v1 -cliok "param.set default_grace 1h"
+varnish v1 -cliok "param.set thread_pools 1"
+varnish v1 -cliok "param.set debug +syncvsl,+waitinglist"
+varnish v1 -vcl+backend {
+	sub vcl_backend_fetch {
+		set bereq.http.bgfetch = bereq.is_bgfetch;
+	}
+	sub vcl_beresp_stale {
+		# We just validated a stale object, do not mark it as
+		# uncacheable. The object remains available for grace
+		# hits and background fetches.
+		return;
+	}
+	sub vcl_beresp_control {
+		if (beresp.http.cache-control == "no-cache") {
+			# Keep beresp.uncacheable clear.
+			return;
+		}
+	}
+	sub vcl_deliver {
+		set resp.http.obj-hits = obj.hits;
+		set resp.http.obj-ttl = obj.ttl;
+	}
+} -start
+
+client c1 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1001
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl < 0
+} -start
+
+barrier b1 sync
+
+client c2 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1004 1002"
+	expect resp.http.obj-hits == 1
+	expect resp.http.obj-ttl < 0
+} -start
+
+varnish v1 -expect busy_sleep == 1
+barrier b2 sync
+
+client c1 -wait
+client c2 -wait
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 1
+varnish v1 -expect cache_hit == 1
+varnish v1 -expect cache_hit_grace == 0
+varnish v1 -expect s_bgfetch == 0
+
+client c3 {
+	txreq -url "/stale-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1006 1002"
+	expect resp.http.obj-hits == 2
+	expect resp.http.obj-ttl < 0
+} -run
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 1
+varnish v1 -expect cache_hit == 2
+varnish v1 -expect cache_hit_grace == 1
+varnish v1 -expect s_bgfetch == 1
+
+# The only way for a plain no-cache to be hit is to have a non-zero keep.
+varnish v1 -cliok "param.set default_ttl 0"
+varnish v1 -cliok "param.set default_grace 0"
+varnish v1 -cliok "param.set default_keep 1h"
+
+client c4 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1009
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl <= 0
+} -start
+
+barrier b3 sync
+
+client c5 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == "1012 1010"
+	expect resp.http.obj-hits == 1
+	expect resp.http.obj-ttl <= 0
+} -start
+
+varnish v1 -expect busy_sleep == 2
+barrier b4 sync
+
+client c4 -wait
+client c5 -wait
+
+varnish v1 -vsl_catchup
+
+varnish v1 -expect cache_miss == 2
+varnish v1 -expect cache_hit == 3
+varnish v1 -expect cache_hit_grace == 1
+varnish v1 -expect s_bgfetch == 1
+
+# No hit when not on the waiting list
+client c6 {
+	txreq -url "/no-cache-hit"
+	rxresp
+	expect resp.status == 200
+	expect resp.http.x-varnish == 1014
+	expect resp.http.obj-hits == 0
+	expect resp.http.obj-ttl <= 0
+} -run
diff --git a/include/tbl/req_flags.h b/include/tbl/req_flags.h
index dde208d160..8d73e620e4 100644
--- a/include/tbl/req_flags.h
+++ b/include/tbl/req_flags.h
@@ -37,7 +37,6 @@ REQ_FLAG(hash_ignore_busy,	1, 1, "")
 REQ_FLAG(hash_ignore_vary,	1, 1, "")
 REQ_FLAG(hash_always_miss,	1, 1, "")
 REQ_FLAG(is_hit,		0, 0, "")
-REQ_FLAG(waitinglist,		0, 0, "")
 REQ_FLAG(want100cont,		0, 0, "")
 REQ_FLAG(late100cont,		0, 0, "")
 REQ_FLAG(req_reset,		0, 0, "")

From 30ef04fc74a8f195111bca6930a3fc6ac7931d7b Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Tue, 19 Mar 2024 11:43:17 +0100
Subject: [PATCH 10/14] fetch: Retire waiting list safety net

If a synthetic beresp is inserted into the cache with no TTL, grace or
keep at all, it is now considered validated (although not in the best
circumstances) and will be passed to all requests in the waiting list.
---
 bin/varnishd/cache/cache_fetch.c  | 24 ++---------
 bin/varnishtest/tests/c00131.vtc  | 71 +++++++++++++++++++++++++++++++
 doc/sphinx/reference/vcl_step.rst | 21 ++++-----
 3 files changed, 83 insertions(+), 33 deletions(-)
 create mode 100644 bin/varnishtest/tests/c00131.vtc

diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index 8af011fb72..f4f9ee4a16 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -900,9 +900,7 @@ vbf_stp_condfetch(struct worker *wrk, struct busyobj *bo)
  *
  * replaces a stale object unless
  * - abandoning the bereq or
- * - leaving vcl_backend_error with return (deliver) and beresp.ttl == 0s or
- * - there is a waitinglist on this object because in this case the default ttl
- *   would be 1s, so we might be looking at the same case as the previous
+ * - leaving vcl_backend_error with return (deliver)
  *
  * We do want the stale replacement to avoid an object pileup with short ttl and
  * long grace/keep, yet there could exist cases where a cache object is
@@ -960,23 +958,9 @@ vbf_stp_error(struct worker *wrk, struct busyobj *bo)
 
 	stale = bo->stale_oc;
 	oc->t_origin = now;
-	if (!VTAILQ_EMPTY(&oc->objhead->waitinglist)) {
-		/*
-		 * If there is a waitinglist, it means that there is no
-		 * grace-able object, so cache the error return for a
-		 * short time, so the waiting list can drain, rather than
-		 * each objcore on the waiting list sequentially attempt
-		 * to fetch from the backend.
-		 */
-		oc->ttl = 1;
-		oc->grace = 5;
-		oc->keep = 5;
-		stale = NULL;
-	} else {
-		oc->ttl = 0;
-		oc->grace = 0;
-		oc->keep = 0;
-	}
+	oc->ttl = 0;
+	oc->grace = 0;
+	oc->keep = 0;
 
 	synth_body = VSB_new_auto();
 	AN(synth_body);
diff --git a/bin/varnishtest/tests/c00131.vtc b/bin/varnishtest/tests/c00131.vtc
new file mode 100644
index 0000000000..c9ce19bd58
--- /dev/null
+++ b/bin/varnishtest/tests/c00131.vtc
@@ -0,0 +1,71 @@
+varnishtest "waiting list rush from vcl_backend_error"
+
+barrier b1 sock 2
+barrier b2 sock 2
+
+server s1 {
+	rxreq
+	send garbage
+} -start
+
+varnish v1 -cliok "param.set debug +syncvsl,+waitinglist"
+varnish v1 -vcl+backend {
+	import vtc;
+	sub vcl_backend_fetch {
+		vtc.barrier_sync("${b1_sock}");
+		vtc.barrier_sync("${b2_sock}");
+	}
+	sub vcl_backend_error {
+		set beresp.http.error-vxid = bereq.xid;
+	}
+} -start
+
+client c1 {
+	txreq
+	rxresp
+	expect resp.http.error-vxid == 1002
+} -start
+
+barrier b1 sync
+
+logexpect l1 -v v1 -q Debug -g raw {
+	loop 4 {
+		expect * * Debug "on waiting list"
+	}
+} -start
+
+client c2 {
+	txreq
+	rxresp
+	expect resp.http.error-vxid == 1002
+} -start
+
+client c3 {
+	txreq
+	rxresp
+	expect resp.http.error-vxid == 1002
+} -start
+
+client c4 {
+	txreq
+	rxresp
+	expect resp.http.error-vxid == 1002
+} -start
+
+client c5 {
+	txreq
+	rxresp
+	expect resp.http.error-vxid == 1002
+} -start
+
+logexpect l1 -wait
+
+barrier b2 sync
+
+client c1 -wait
+client c2 -wait
+client c3 -wait
+client c4 -wait
+client c5 -wait
+
+varnish v1 -expect n_expired == 1
diff --git a/doc/sphinx/reference/vcl_step.rst b/doc/sphinx/reference/vcl_step.rst
index 67ddf0818b..81bb7b4e05 100644
--- a/doc/sphinx/reference/vcl_step.rst
+++ b/doc/sphinx/reference/vcl_step.rst
@@ -380,19 +380,14 @@ This subroutine is called if we fail the backend fetch or if
 
 Returning with :ref:`abandon` does not leave a cache object.
 
-If returning with ``deliver`` and a ``beresp.ttl > 0s``, a synthetic
-cache object is generated in VCL, whose body may be constructed using
-the ``synthetic()`` function.
-
-When there is a waiting list on the object, the default ``ttl`` will
-be positive (currently one second), set before entering
-``vcl_backend_error``. This is to avoid request serialization and
-hammering on a potentially failing backend.
-
-Since these synthetic objects are cached in these special
-circumstances, be cautious with putting private information there. If
-you really must, then you need to explicitly set ``beresp.ttl`` to
-zero in ``vcl_backend_error``.
+If returning with ``deliver`` and ``beresp.uncacheable == false``, a
+synthetic cache object is generated in VCL, whose body may be constructed
+using the ``synthetic()`` function.
+
+Since these synthetic objects are cached in these special circumstances,
+be cautious with putting private information there. If you really must,
+then you need to explicitly set ``beresp.uncacheable`` to ``true`` in
+``vcl_backend_error``.
 
 The `vcl_backend_error` subroutine may terminate with calling ``return()``
 with one of the following keywords:

From 253f052995038d843d2f3945809d66c6d5cf883b Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Tue, 22 Oct 2024 15:33:23 +0200
Subject: [PATCH 11/14] fetch: Operate BOC state changes on busyobj

This centralizes the rules regarding BOC state changes and whether to
inform clients of those changes. For example, BOS_REQ_DONE is only
needed by grace hits, saving one broadcast on the BOC condvar on misses
and straight passes.

It also formalizes when a busyobj will release its client request.
---
 bin/varnishd/cache/cache_busyobj.c  | 39 +++++++++++++++++++++++++++++
 bin/varnishd/cache/cache_fetch.c    | 22 +++++++---------
 bin/varnishd/cache/cache_obj.c      |  6 +++--
 bin/varnishd/cache/cache_varnishd.h |  5 +++-
 bin/varnishd/cache/cache_vrt_var.c  |  5 ++--
 5 files changed, 58 insertions(+), 19 deletions(-)

diff --git a/bin/varnishd/cache/cache_busyobj.c b/bin/varnishd/cache/cache_busyobj.c
index c8814a7bb3..9bda40bd54 100644
--- a/bin/varnishd/cache/cache_busyobj.c
+++ b/bin/varnishd/cache/cache_busyobj.c
@@ -169,3 +169,42 @@ VBO_ReleaseBusyObj(struct worker *wrk, struct busyobj **pbo)
 
 	MPL_Free(vbopool, bo);
 }
+
+void
+VBO_SetState(struct worker *wrk, struct busyobj *bo, enum boc_state_e next)
+{
+	unsigned broadcast;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	CHECK_OBJ_NOTNULL(bo, BUSYOBJ_MAGIC);
+
+	switch (next) {
+	case BOS_REQ_DONE:
+		AN(bo->req);
+		bo->req = NULL;
+		broadcast = bo->is_bgfetch;
+		break;
+	case BOS_STREAM:
+		AN(bo->do_stream);
+		AZ(bo->req);
+		broadcast = 1;
+		break;
+	case BOS_FINISHED:
+	case BOS_FAILED:
+		/* We can't assert that either state already released its
+		 * request because a fetch may fail before reaching the
+		 * BOS_REQ_DONE state. Failing can also mean executing
+		 * vcl_backend_error and reaching BOS_FINISHED from there.
+		 * One can legitemately return(retry) from there and proceed
+		 * again with a usable req if a return(error) transition led
+		 * to vcl_backend_error instead of a failed fetch attempt.
+		 */
+		bo->req = NULL;
+		broadcast = 1;
+		break;
+	default:
+		WRONG("unexpected BOC state");
+	}
+
+	ObjSetState(wrk, bo->fetch_objcore, next, broadcast);
+}
diff --git a/bin/varnishd/cache/cache_fetch.c b/bin/varnishd/cache/cache_fetch.c
index f4f9ee4a16..fe957f64b5 100644
--- a/bin/varnishd/cache/cache_fetch.c
+++ b/bin/varnishd/cache/cache_fetch.c
@@ -292,14 +292,12 @@ vbf_stp_mkbereq(struct worker *wrk, struct busyobj *bo)
 	HTTP_Clone(bo->bereq, bo->bereq0);
 
 	if (bo->req->req_body_status->avail == 0) {
-		bo->req = NULL;
-		ObjSetState(bo->wrk, oc, BOS_REQ_DONE);
+		VBO_SetState(bo->wrk, bo, BOS_REQ_DONE);
 	} else if (bo->req->req_body_status == BS_CACHED) {
 		AN(bo->req->body_oc);
 		bo->bereq_body = bo->req->body_oc;
 		HSH_Ref(bo->bereq_body);
-		bo->req = NULL;
-		ObjSetState(bo->wrk, oc, BOS_REQ_DONE);
+		VBO_SetState(bo->wrk, bo, BOS_REQ_DONE);
 	}
 	return (F_STP_STARTFETCH);
 }
@@ -544,10 +542,8 @@ vbf_stp_startfetch(struct worker *wrk, struct busyobj *bo)
 
 	VSLb_ts_busyobj(bo, "Process", W_TIM_real(wrk));
 	assert(oc->boc->state <= BOS_REQ_DONE);
-	if (oc->boc->state != BOS_REQ_DONE) {
-		bo->req = NULL;
-		ObjSetState(wrk, oc, BOS_REQ_DONE);
-	}
+	if (oc->boc->state != BOS_REQ_DONE)
+		VBO_SetState(wrk, bo, BOS_REQ_DONE);
 
 	if (bo->do_esi)
 		bo->do_stream = 0;
@@ -713,7 +709,7 @@ vbf_stp_fetch(struct worker *wrk, struct busyobj *bo)
 	assert(oc->boc->state == BOS_REQ_DONE);
 
 	if (bo->do_stream)
-		ObjSetState(wrk, oc, BOS_STREAM);
+		VBO_SetState(wrk, bo, BOS_STREAM);
 
 	VSLb(bo->vsl, SLT_Fetch_Body, "%u %s %s",
 	    bo->htc->body_status->nbr, bo->htc->body_status->name,
@@ -751,7 +747,7 @@ vbf_stp_fetchend(struct worker *wrk, struct busyobj *bo)
 	else
 		assert(oc->boc->state == BOS_REQ_DONE);
 
-	ObjSetState(wrk, oc, BOS_FINISHED);
+	VBO_SetState(wrk, bo, BOS_FINISHED);
 	VSLb_ts_busyobj(bo, "BerespBody", W_TIM_real(wrk));
 	if (bo->stale_oc != NULL) {
 		VSL(SLT_ExpKill, NO_VXID, "VBF_Superseded x=%ju n=%ju",
@@ -879,7 +875,7 @@ vbf_stp_condfetch(struct worker *wrk, struct busyobj *bo)
 	AZ(ObjCopyAttr(bo->wrk, oc, stale_oc, OA_GZIPBITS));
 
 	if (bo->do_stream)
-		ObjSetState(wrk, oc, BOS_STREAM);
+		VBO_SetState(wrk, bo, BOS_STREAM);
 
 	INIT_OBJ(vop, VBF_OBITER_PRIV_MAGIC);
 	vop->bo = bo;
@@ -1015,7 +1011,7 @@ vbf_stp_error(struct worker *wrk, struct busyobj *bo)
 	VSB_destroy(&synth_body);
 	if (stale != NULL && oc->ttl > 0)
 		HSH_Kill(stale);
-	ObjSetState(wrk, oc, BOS_FINISHED);
+	VBO_SetState(wrk, bo, BOS_FINISHED);
 	return (F_STP_DONE);
 }
 
@@ -1033,7 +1029,7 @@ vbf_stp_fail(struct worker *wrk, struct busyobj *bo)
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 
 	assert(oc->boc->state < BOS_FINISHED);
-	ObjSetState(wrk, oc, BOS_FAILED);
+	VBO_SetState(wrk, bo, BOS_FAILED);
 	HSH_Kill(oc);
 	return (F_STP_DONE);
 }
diff --git a/bin/varnishd/cache/cache_obj.c b/bin/varnishd/cache/cache_obj.c
index 172e29a8bd..804c226aa6 100644
--- a/bin/varnishd/cache/cache_obj.c
+++ b/bin/varnishd/cache/cache_obj.c
@@ -491,7 +491,8 @@ ObjVAICancel(struct worker *wrk, struct boc *boc, struct vai_qe *qe)
  */
 
 void
-ObjSetState(struct worker *wrk, struct objcore *oc, enum boc_state_e next)
+ObjSetState(struct worker *wrk, struct objcore *oc, enum boc_state_e next,
+    unsigned broadcast)
 {
 	const struct obj_methods *om;
 
@@ -515,7 +516,8 @@ ObjSetState(struct worker *wrk, struct objcore *oc, enum boc_state_e next)
 
 	Lck_Lock(&oc->boc->mtx);
 	oc->boc->state = next;
-	obj_boc_notify(oc->boc);
+	if (broadcast)
+		obj_boc_notify(oc->boc);
 	Lck_Unlock(&oc->boc->mtx);
 }
 
diff --git a/bin/varnishd/cache/cache_varnishd.h b/bin/varnishd/cache/cache_varnishd.h
index 2cd5cd3dc9..e0f41146ac 100644
--- a/bin/varnishd/cache/cache_varnishd.h
+++ b/bin/varnishd/cache/cache_varnishd.h
@@ -175,6 +175,8 @@ vtim_real BAN_Time(const struct ban *ban);
 /* cache_busyobj.c */
 struct busyobj *VBO_GetBusyObj(const struct worker *, const struct req *);
 void VBO_ReleaseBusyObj(struct worker *wrk, struct busyobj **busyobj);
+void VBO_SetState(struct worker *wrk, struct busyobj *bo,
+    enum boc_state_e next);
 
 /* cache_director.c */
 int VDI_GetHdr(struct busyobj *);
@@ -342,7 +344,8 @@ int ObjGetSpace(struct worker *, struct objcore *, ssize_t *sz, uint8_t **ptr);
 void ObjExtend(struct worker *, struct objcore *, ssize_t l, int final);
 uint64_t ObjWaitExtend(const struct worker *, const struct objcore *,
     uint64_t l, enum boc_state_e *statep);
-void ObjSetState(struct worker *, struct objcore *, enum boc_state_e next);
+void ObjSetState(struct worker *, struct objcore *, enum boc_state_e next,
+    unsigned broadcast);
 void ObjWaitState(const struct objcore *, enum boc_state_e want);
 void ObjTouch(struct worker *, struct objcore *, vtim_real now);
 void ObjFreeObj(struct worker *, struct objcore *);
diff --git a/bin/varnishd/cache/cache_vrt_var.c b/bin/varnishd/cache/cache_vrt_var.c
index 4fcdf52c6f..3415ff83c1 100644
--- a/bin/varnishd/cache/cache_vrt_var.c
+++ b/bin/varnishd/cache/cache_vrt_var.c
@@ -648,9 +648,8 @@ VRT_u_bereq_body(VRT_CTX)
 
 	if (ctx->bo->req != NULL) {
 		CHECK_OBJ(ctx->bo->req, REQ_MAGIC);
-		ctx->bo->req = NULL;
-		ObjSetState(ctx->bo->wrk,
-		    ctx->bo->fetch_objcore, BOS_REQ_DONE);
+		VBO_SetState(ctx->bo->wrk, ctx->bo, BOS_REQ_DONE);
+		AZ(ctx->bo->req);
 	}
 }
 

From fef09b63197c3b524aa6268311fde4bff3cc54fb Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 9 Dec 2024 10:11:21 +0100
Subject: [PATCH 12/14] param: Lower rush_exponent minimum value

To ease test coverage for waitling list operations, requiring less
disembarked requests for specific cases. Added is a test case showing
that a rush miss propagates further wake ups.
---
 bin/varnishtest/tests/c00138.vtc | 64 ++++++++++++++++++++++++++++++++
 include/tbl/params.h             |  6 ++-
 2 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 bin/varnishtest/tests/c00138.vtc

diff --git a/bin/varnishtest/tests/c00138.vtc b/bin/varnishtest/tests/c00138.vtc
new file mode 100644
index 0000000000..cf0cb8c198
--- /dev/null
+++ b/bin/varnishtest/tests/c00138.vtc
@@ -0,0 +1,64 @@
+varnishtest "subsequent rush on rush miss"
+
+barrier b1 cond 2
+barrier b2 cond 2
+barrier b3 cond 2
+barrier b4 cond 2
+
+server s1 {
+	rxreq
+	expect req.http.user-agent == c1
+	barrier b1 sync
+	barrier b2 sync
+	txresp -hdr "Vary: accept"
+
+	rxreq
+	expect req.http.user-agent == c2
+	txresp -hdr "Vary: accept"
+} -start
+
+varnish v1 -cliok "param.set thread_pools 1"
+varnish v1 -cliok "param.set rush_exponent 1"
+varnish v1 -cliok "param.set debug +syncvsl,+waitinglist"
+varnish v1 -vcl+backend "" -start
+
+client c1 {
+	txreq
+	rxresp
+	expect resp.http.x-varnish == 1001
+} -start
+
+barrier b1 sync
+
+logexpect l1 -v v1 -q Debug -g raw {
+	expect * * Debug "on waiting list"
+} -start
+
+client c2 {
+	txreq -hdr "accept: nothing"
+	rxresp
+	expect resp.http.x-varnish == 1004
+} -start
+
+logexpect l1 -wait -start
+
+client c3 {
+	txreq
+	rxresp
+	expect resp.http.x-varnish == "1006 1002"
+} -start
+
+logexpect l1 -wait
+
+barrier b2 sync
+
+client c1 -wait
+client c2 -wait
+client c3 -wait
+
+varnish v1 -expect cache_miss == 2
+varnish v1 -expect cache_hit == 1
+varnish v1 -expect cache_hit_grace == 0
+varnish v1 -expect s_bgfetch == 0
+varnish v1 -expect busy_sleep == 2
+varnish v1 -expect busy_wakeup == 2
diff --git a/include/tbl/params.h b/include/tbl/params.h
index df7415536c..e66a653cb6 100644
--- a/include/tbl/params.h
+++ b/include/tbl/params.h
@@ -910,7 +910,7 @@ PARAM_SIMPLE(
 PARAM_SIMPLE(
 	/* name */	rush_exponent,
 	/* type */	uint,
-	/* min */	"2",
+	/* min */	"1",
 	/* max */	NULL,
 	/* def */	"3",
 	/* units */	"requests per request",
@@ -918,7 +918,9 @@ PARAM_SIMPLE(
 	"How many parked request we start for each completed request on "
 	"the object.\n"
 	"NB: Even with the implicit delay of delivery, this parameter "
-	"controls an exponential increase in number of worker threads.",
+	"controls an exponential increase in number of worker threads. "
+	"A value of 1 will instead serialize requests resumption and is "
+	"only useful for testing purposes.",
 	/* flags */	EXPERIMENTAL
 )
 

From 6fcef42cd27b6ad406a6d2788562a8d492e8f854 Mon Sep 17 00:00:00 2001
From: Nils Goroll <nils.goroll@uplex.de>
Date: Tue, 20 May 2025 15:14:50 +0200
Subject: [PATCH 13/14] hash: Centralize vary match

Better diff with the --ignore-all-space option.
---
 bin/varnishd/cache/cache_hash.c | 39 +++++++++++++++++----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index 4d3b5e290f..a49a1f76a0 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -381,12 +381,26 @@ hsh_insert_busyobj(const struct worker *wrk, struct objhead *oh)
 /*---------------------------------------------------------------------
  */
 
+static int
+hsh_vry_match(struct req *req, struct objcore *oc, const uint8_t *vary)
+{
+
+	if (req->hash_ignore_vary)
+		return (1);
+	if (vary == NULL) {
+		if (! ObjHasAttr(req->wrk, oc, OA_VARY))
+			return (1);
+		vary = ObjGetAttr(req->wrk, oc, OA_VARY, NULL);
+		AN(vary);
+	}
+	return (VRY_Match(req, vary));
+}
+
 static unsigned
 hsh_rush_match(struct req *req)
 {
 	struct objhead *oh;
 	struct objcore *oc;
-	const uint8_t *vary;
 
 	oc = req->objcore;
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
@@ -404,14 +418,7 @@ hsh_rush_match(struct req *req)
 	oh = oc->objhead;
 	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 
-	if (req->hash_ignore_vary)
-		return (1);
-	if (!ObjHasAttr(req->wrk, oc, OA_VARY))
-		return (1);
-
-	vary = ObjGetAttr(req->wrk, oc, OA_VARY, NULL);
-	AN(vary);
-	return (VRY_Match(req, vary));
+	return (hsh_vry_match(req, oc, NULL));
 }
 
 /*---------------------------------------------------------------------
@@ -427,7 +434,6 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 	const struct vcf_return *vr;
 	vtim_real exp_t_origin;
 	int busy_found;
-	const uint8_t *vary;
 	intmax_t boc_progress;
 	unsigned xid = 0;
 	unsigned ban_checks;
@@ -520,8 +526,7 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 				continue;
 
 			if (oc->boc && oc->boc->vary != NULL &&
-			    !req->hash_ignore_vary &&
-			    !VRY_Match(req, oc->boc->vary)) {
+			    !hsh_vry_match(req, oc, oc->boc->vary)) {
 				wrk->strangelove++;
 				continue;
 			}
@@ -540,13 +545,9 @@ HSH_Lookup(struct req *req, struct objcore **ocp, struct objcore **bocp)
 			continue;
 		}
 
-		if (!req->hash_ignore_vary && ObjHasAttr(wrk, oc, OA_VARY)) {
-			vary = ObjGetAttr(wrk, oc, OA_VARY, NULL);
-			AN(vary);
-			if (!VRY_Match(req, vary)) {
-				wrk->strangelove++;
-				continue;
-			}
+		if (!hsh_vry_match(req, oc, NULL)) {
+			wrk->strangelove++;
+			continue;
 		}
 
 		if (ban_checks >= ban_any_variant

From 5d97332f04282970aebd3b0e024f5f0b2e63bd71 Mon Sep 17 00:00:00 2001
From: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
Date: Mon, 25 Aug 2025 15:25:59 +0200
Subject: [PATCH 14/14] hash: Missing oh null check in HSH_Fail()

---
 bin/varnishd/cache/cache_hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/varnishd/cache/cache_hash.c b/bin/varnishd/cache/cache_hash.c
index a49a1f76a0..551e4ab468 100644
--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -915,7 +915,7 @@ HSH_Fail(struct worker *wrk, struct objcore *oc)
 	CHECK_OBJ_NOTNULL(oc, OBJCORE_MAGIC);
 	CHECK_OBJ_NOTNULL(oc->boc, BOC_MAGIC);
 	oh = oc->objhead;
-	CHECK_OBJ(oh, OBJHEAD_MAGIC);
+	CHECK_OBJ_NOTNULL(oh, OBJHEAD_MAGIC);
 	INIT_OBJ(&rush, RUSH_MAGIC);
 
 	/*