diff --git a/bin/varnishd/Makefile.am b/bin/varnishd/Makefile.am
index 0d60f1b188..8ff33394bd 100644
--- a/bin/varnishd/Makefile.am
+++ b/bin/varnishd/Makefile.am
@@ -81,6 +81,7 @@ varnishd_SOURCES = \
 	http2/cache_http2_hpack.c \
 	http2/cache_http2_panic.c \
 	http2/cache_http2_proto.c \
+	http2/cache_http2_reqbody.c \
 	http2/cache_http2_send.c \
 	http2/cache_http2_session.c \
 	mgt/mgt_child.c \
diff --git a/bin/varnishd/cache/cache_session.c b/bin/varnishd/cache/cache_session.c
index c031cb665f..c0856b400b 100644
--- a/bin/varnishd/cache/cache_session.c
+++ b/bin/varnishd/cache/cache_session.c
@@ -258,18 +258,13 @@ HTC_Status(enum htc_status_e e, const char **name, const char **desc)
 void
 HTC_RxInit(struct http_conn *htc, struct ws *ws)
 {
-	unsigned rollback;
 	int l;
 
 	CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC);
 	htc->ws = ws;
 
-	/* NB: HTTP/1 keep-alive triggers a rollback, so does the first
-	 * request of a session or an h2 request where the rollback is a
-	 * no-op in terms of workspace usage.
-	 */
-	rollback = !strcasecmp(ws->id, "req") && htc->body_status == NULL;
-	l = WS_Pipeline(htc->ws, htc->pipeline_b, htc->pipeline_e, rollback);
+	l = WS_Pipeline(htc->ws, htc->pipeline_b, htc->pipeline_e,
+	    htc->pipeline_snap);
 	xxxassert(l >= 0);
 
 	htc->rxbuf_b = WS_Reservation(ws);
@@ -410,6 +405,38 @@ HTC_RxStuff(struct http_conn *htc, htc_complete_f *func,
 	}
 }
 
+/*--------------------------------------------------------------------
+ * Prune a vector of struct iovec: drop its first l bytes, updating *n
+ */
+
+void
+VIOV_prune(struct iovec *iov, unsigned *n, size_t l)
+{
+	unsigned u;
+
+	if (l == 0)
+		return;
+
+	AN(iov);
+	AN(n);
+	/* Skip fully consumed entries, then trim the partially consumed one */
+	u = 0;
+	while (l > 0) {
+		assert(u < *n);		/* l must fit within the vector */
+		if (iov[u].iov_len <= l) {
+			l -= iov[u].iov_len;
+			u++;
+		} else {
+			iov[u].iov_base = (char *)iov[u].iov_base + l;
+			iov[u].iov_len -= l;
+			break;
+		}
+	}
+	/* Move the surviving entries to the front and shrink the count */
+	memmove(iov, &iov[u], (*n - u) * sizeof *iov);
+	*n -= u;
+}
+
 /*--------------------------------------------------------------------
  * Get a new session, preferably by recycling an already ready one
  *
diff --git a/bin/varnishd/cache/cache_varnishd.h b/bin/varnishd/cache/cache_varnishd.h
index 2892ef188d..9eee2580af 100644
--- a/bin/varnishd/cache/cache_varnishd.h
+++ b/bin/varnishd/cache/cache_varnishd.h
@@ -104,6 +104,7 @@ struct http_conn {
 	char			*rxbuf_e;
 	char			*pipeline_b;
 	char			*pipeline_e;
+	uintptr_t		pipeline_snap;
 	ssize_t			content_length;
 	void			*priv;
 
@@ -462,6 +463,8 @@ enum htc_status_e HTC_RxStuff(struct http_conn *, htc_complete_f *,
     vtim_real *t1, vtim_real *t2, vtim_real ti, vtim_real tn, vtim_dur td,
     int maxbytes);
 
+void VIOV_prune(struct iovec *iov, unsigned *n, size_t l);
+
 #define SESS_ATTR(UP, low, typ, len)					\
 	int SES_Set_##low(const struct sess *sp, const typ *src);	\
 	int SES_Reserve_##low(struct sess *sp, typ **dst, ssize_t *sz);
@@ -567,7 +570,9 @@ WS_IsReserved(const struct ws *ws)
 
 void *WS_AtOffset(const struct ws *ws, unsigned off, unsigned len);
 unsigned WS_ReservationOffset(const struct ws *ws);
-int WS_Pipeline(struct ws *, const void *b, const void *e, unsigned rollback);
+
+extern uintptr_t const ws_pipeline_rollback;
+int WS_Pipeline(struct ws *, const void *b, const void *e, uintptr_t snap);
 
 /* cache_ws_common.c */
 void WS_Id(const struct ws *ws, char *id);
diff --git a/bin/varnishd/cache/cache_ws.c b/bin/varnishd/cache/cache_ws.c
index 3f2cc5309c..7895e6161d 100644
--- a/bin/varnishd/cache/cache_ws.c
+++ b/bin/varnishd/cache/cache_ws.c
@@ -136,14 +136,16 @@ WS_Reset(struct ws *ws, uintptr_t pp)
  */
 
 int
-WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback)
+WS_Pipeline(struct ws *ws, const void *b, const void *e, uintptr_t snap)
 {
 	unsigned r, l;
 
 	WS_Assert(ws);
 
-	if (rollback)
+	if (snap == ws_pipeline_rollback)
 		WS_Rollback(ws, 0);
+	else if (snap != 0)
+		WS_Rollback(ws, snap);
 
 	r = WS_ReserveAll(ws);
 
diff --git a/bin/varnishd/cache/cache_ws_common.c b/bin/varnishd/cache/cache_ws_common.c
index a23cd06af6..bcd28b1e7a 100644
--- a/bin/varnishd/cache/cache_ws_common.c
+++ b/bin/varnishd/cache/cache_ws_common.c
@@ -37,6 +37,8 @@
 
 #include "cache_varnishd.h"
 
+uintptr_t const ws_pipeline_rollback = (uintptr_t)&ws_pipeline_rollback;
+
 void
 WS_Id(const struct ws *ws, char *id)
 {
diff --git a/bin/varnishd/cache/cache_ws_emu.c b/bin/varnishd/cache/cache_ws_emu.c
index 767839d1e2..c6a1393197 100644
--- a/bin/varnishd/cache/cache_ws_emu.c
+++ b/bin/varnishd/cache/cache_ws_emu.c
@@ -222,7 +222,7 @@ WS_Reset(struct ws *ws, uintptr_t pp)
 }
 
 int
-WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback)
+WS_Pipeline(struct ws *ws, const void *b, const void *e, uintptr_t snap)
 {
 	void *tmp;
 	unsigned r, l;
@@ -248,8 +248,10 @@ WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback)
 		tmp = NULL;
 	}
 
-	if (rollback)
+	if (snap == ws_pipeline_rollback)
 		WS_Rollback(ws, 0);
+	else if (snap != 0)
+		WS_Rollback(ws, snap);
 
 	r = WS_ReserveAll(ws);
 
diff --git a/bin/varnishd/http1/cache_http1_fsm.c b/bin/varnishd/http1/cache_http1_fsm.c
index ac1dc012cf..f755f5fc77 100644
--- a/bin/varnishd/http1/cache_http1_fsm.c
+++ b/bin/varnishd/http1/cache_http1_fsm.c
@@ -111,6 +111,7 @@ http1_new_session(struct worker *wrk, void *arg)
 	sp = req->sp;
 	CHECK_OBJ_NOTNULL(sp, SESS_MAGIC);
 
+	req->htc->pipeline_snap = ws_pipeline_rollback;
 	HTC_RxInit(req->htc, req->ws);
 
 	sz = sizeof u;
diff --git a/bin/varnishd/http2/cache_http2.h b/bin/varnishd/http2/cache_http2.h
index ba036b84d6..19ad5e3dbd 100644
--- a/bin/varnishd/http2/cache_http2.h
+++ b/bin/varnishd/http2/cache_http2.h
@@ -35,6 +35,9 @@ struct h2h_decode;
 struct h2_frame_s;
 
 #include "hpack/vhp.h"
+#include "vefd.h"
+
+#define H2_TX_BUFSIZE                  1024
 
 /**********************************************************************/
 
@@ -140,42 +143,44 @@ struct h2_req {
 	int				counted;
 	struct h2_sess			*h2sess;
 	struct req			*req;
-	double				t_send;
-	double				t_winupd;
-	pthread_cond_t			*cond;
+	vtim_real			t_send;
+	vtim_real			t_win_low;
 	VTAILQ_ENTRY(h2_req)		list;
-	int64_t				t_window;
-	int64_t				r_window;
 
-	/* Where to wake this stream up */
-	struct worker			*wrk;
+	int64_t				tx_window;
+	int64_t				rx_window;
 
 	struct h2_rxbuf			*rxbuf;
+	struct h2_reqbody_waiter        *reqbody_waiter;
+	h2_error                        async_error;
 
-	VTAILQ_ENTRY(h2_req)		tx_list;
 	h2_error			error;
 };
 
 VTAILQ_HEAD(h2_req_s, h2_req);
 
+struct h2_send_large;
+VTAILQ_HEAD(h2_send_large_s, h2_send_large);
+
 struct h2_sess {
 	unsigned			magic;
 #define H2_SESS_MAGIC			0xa16f7e4b
 
+	unsigned			expect_settings_next;
+
 	pthread_t			rxthr;
-	pthread_cond_t			*cond;
-	pthread_cond_t			winupd_cond[1];
 
 	struct sess			*sess;
 	int				refcnt;
 	int				open_streams;
-	int				winup_streams;
+	int				win_low_streams;
 	uint32_t			highest_stream;
-	int				goaway;
 	int				bogosity;
-	int				do_sweep;
 
-	struct h2_req			*req0;
+	struct vefd                     efd[1];
+
+	int64_t				tx_window;
+	int64_t				rx_window;
 
 	struct h2_req_s			streams;
 
@@ -186,6 +191,23 @@ struct h2_sess {
 	struct h2h_decode		*decode;
 	struct vht_table		dectbl[1];
 
+	vtim_real			deadline;
+
+	struct iovec			tx_vec[2]; /* Must be 2 wide */
+	unsigned			tx_nvec;
+
+	unsigned			tx_stopped;
+
+	uint8_t				*tx_s_start;
+	uint8_t				*tx_s_end;
+	uint8_t				*tx_s_head;
+	uint8_t				*tx_s_mark;
+
+	struct h2_send_large_s		tx_l_queue;
+	struct h2_send_large		*tx_l_current;
+	uint8_t				tx_l_hdrbuf[9];
+	char				tx_l_stuck;
+
 	unsigned			rxf_len;
 	unsigned			rxf_type;
 	unsigned			rxf_flags;
@@ -195,11 +217,8 @@ struct h2_sess {
 	struct h2_settings		remote_settings;
 	struct h2_settings		local_settings;
 
-	struct req			*new_req;
+	struct h2_req			*hpack_lock;
 	vtim_real			t1;	// t_first for new_req
-	uint32_t			goaway_last_stream;
-
-	VTAILQ_HEAD(,h2_req)		txqueue;
 
 	h2_error			error;
 
@@ -213,7 +232,17 @@ struct h2_sess {
 	vtim_real			last_rst;
 };
 
-#define ASSERT_RXTHR(h2) do {assert(h2->rxthr == pthread_self());} while(0)
+/* Assert h2 is a valid session and that we are (SESS) / are not (REQ) rxthr */
+#define ASSERT_H2_SESS(h2)						\
+	do {								\
+		CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);			\
+		assert(pthread_equal((h2)->rxthr, pthread_self()));	\
+	} while (0)
+#define ASSERT_H2_REQ(h2) \
+	do {								\
+		CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);			\
+		assert(!pthread_equal((h2)->rxthr, pthread_self()));	\
+	} while (0)
 
 /* http2/cache_http2_panic.c */
 #ifdef TRANSPORT_MAGIC
@@ -235,7 +264,6 @@ struct h2h_decode {
 	unsigned			has_scheme:1;
 	h2_error			error;
 	enum vhd_ret_e			vhd_ret;
-	struct ws			*ws;
 	char				*out;
 	int64_t				limit;
 	size_t				out_l;
@@ -244,38 +272,42 @@ struct h2h_decode {
 	struct vhd_decode		vhd[1];
 };
 
-void h2h_decode_hdr_init(const struct h2_sess *h2);
-h2_error h2h_decode_hdr_fini(const struct h2_sess *h2);
+void h2h_decode_hdr_init(struct h2_sess *h2, struct h2_req *);
+h2_error h2h_decode_hdr_fini(struct h2_sess *h2);
 h2_error h2h_decode_bytes(struct h2_sess *h2, const uint8_t *ptr,
     size_t len);
 
 /* cache_http2_send.c */
-void H2_Send_Get(struct worker *, struct h2_sess *, struct h2_req *);
-void H2_Send_Rel(struct h2_sess *, const struct h2_req *);
-
-void H2_Send_Frame(struct worker *, struct h2_sess *,
-    h2_frame type, uint8_t flags, uint32_t len, uint32_t stream,
-    const void *);
-
-void H2_Send_RST(struct worker *wrk, struct h2_sess *h2,
-    const struct h2_req *r2, uint32_t stream, h2_error h2e);
-
-void H2_Send(struct worker *, struct h2_req *, h2_frame type, uint8_t flags,
-    uint32_t len, const void *, uint64_t *acct);
+int H2_Send_RST(struct h2_sess *h2, uint32_t stream, h2_error h2e);
+int H2_Send_SETTINGS(struct h2_sess *h2, uint8_t flags, ssize_t len,
+    const uint8_t *buf);
+int H2_Send_PING(struct h2_sess *h2, uint8_t flags, uint64_t data);
+int H2_Send_GOAWAY(struct h2_sess *h2, uint32_t last_stream_id, h2_error h2e);
+int H2_Send_WINDOW_UPDATE(struct h2_sess *h2, uint32_t stream, uint32_t incr);
+int H2_Send(struct vsl_log *vsl, struct h2_req *r2, h2_frame ftyp,
+    uint8_t flags, uint32_t len, const void *ptr);
+ssize_t H2_Send_TxStuff(struct h2_sess *h2);
+int H2_Send_Something(struct h2_sess *h2);
+int H2_Send_Pending(struct h2_sess *h2);
+void H2_Send_Shutdown(struct h2_sess *h2);
+void H2_Send_Stop(struct h2_sess *h2);
 
 /* cache_http2_proto.c */
-struct h2_req * h2_new_req(struct h2_sess *, unsigned stream, struct req *);
-h2_error h2_stream_tmo(struct h2_sess *, const struct h2_req *, vtim_real);
-void h2_del_req(struct worker *, struct h2_req *);
-void h2_kill_req(struct worker *, struct h2_sess *, struct h2_req *, h2_error);
-int h2_rxframe(struct worker *, struct h2_sess *);
+const char *h2_framename(int frame);
+h2_error h2_errcheck(const struct h2_req *r2);
+void h2_async_error(struct h2_req *r2, h2_error h2e);
+void h2_attention(struct h2_sess *h2);
+void h2_stream_setstate(struct h2_req *r2, enum h2_stream_e state);
+void h2_run(struct worker *wrk, struct h2_sess *h2);
+struct h2_req * h2_new_req(struct h2_sess *, unsigned stream, struct req **);
+void h2_kill_req(struct worker *, struct h2_sess *, struct h2_req **, h2_error);
 h2_error h2_set_setting(struct h2_sess *, const uint8_t *);
-void h2_req_body(struct req*);
 task_func_t h2_do_req;
 #ifdef TRANSPORT_MAGIC
 vtr_req_fail_f h2_req_fail;
 #endif
 
-/* cache_http2_session.c */
-void
-H2S_Lock_VSLb(const struct h2_sess *, enum VSL_tag_e, const char *, ...);
+/* cache_http2_reqbody.c */
+h2_error h2_reqbody_data(struct worker *, struct h2_sess *, struct h2_req *);
+void h2_reqbody(struct req *);
+void h2_reqbody_kick(struct h2_req *r2);
diff --git a/bin/varnishd/http2/cache_http2_deliver.c b/bin/varnishd/http2/cache_http2_deliver.c
index 4013012436..a3bc571a5e 100644
--- a/bin/varnishd/http2/cache_http2_deliver.c
+++ b/bin/varnishd/http2/cache_http2_deliver.c
@@ -73,7 +73,7 @@ V2D_Init(void)
 /**********************************************************************/
 
 static int v_matchproto_(vdp_init_f)
-h2_init(VRT_CTX, struct vdp_ctx *vdc, void **priv)
+h2_vdp_init(VRT_CTX, struct vdp_ctx *vdc, void **priv)
 {
 	struct h2_req *r2;
 
@@ -86,57 +86,67 @@ h2_init(VRT_CTX, struct vdp_ctx *vdc, void **priv)
 }
 
 static int v_matchproto_(vdp_fini_f)
-h2_fini(struct vdp_ctx *vdc, void **priv)
+h2_vdp_fini(struct vdp_ctx *vdc, void **priv)
 {
 	struct h2_req *r2;
+	h2_error h2e = NULL;
 
 	CHECK_OBJ_NOTNULL(vdc, VDP_CTX_MAGIC);
 	CHECK_OBJ_NOTNULL(vdc->wrk, WORKER_MAGIC);
 	TAKE_OBJ_NOTNULL(r2, priv, H2_REQ_MAGIC);
 
-	if (r2->error)
-		return (0);
-
 	if (vdc->retval < 0) {
-		r2->error = H2SE_INTERNAL_ERROR; /* XXX: proper error? */
-		H2_Send_Get(vdc->wrk, r2->h2sess, r2);
-		H2_Send_RST(vdc->wrk, r2->h2sess, r2, r2->stream, r2->error);
-		H2_Send_Rel(r2->h2sess, r2);
-		return (0);
+		h2e = H2SE_INTERNAL_ERROR;
+		h2_async_error(r2, h2e);
+	} else
+		h2e = h2_errcheck(r2);
+
+	if (h2e != NULL)
+		VSLb(vdc->vsl, SLT_Error, "H2: delivery error (%s)", h2e->name);
+
+	if (h2e == NULL && r2->state < H2_S_CLOSED) {
+		/* Not all VDPs will always send VDP_END (e.g. ESI). End
+		 * the stream here if necessary. */
+		H2_Send(vdc->vsl, r2, H2_F_DATA, H2FF_END_STREAM, 0, NULL);
 	}
 
-	H2_Send_Get(vdc->wrk, r2->h2sess, r2);
-	H2_Send(vdc->wrk, r2, H2_F_DATA, H2FF_DATA_END_STREAM, 0, "", NULL);
-	H2_Send_Rel(r2->h2sess, r2);
 	return (0);
 }
 
 static int v_matchproto_(vdp_bytes_f)
-h2_bytes(struct vdp_ctx *vdc, enum vdp_action act, void **priv,
+h2_vdp_bytes(struct vdp_ctx *vdc, enum vdp_action act, void **priv,
     const void *ptr, ssize_t len)
 {
 	struct h2_req *r2;
+	uint8_t flags = H2FF_NONE;
 
 	CHECK_OBJ_NOTNULL(vdc, VDP_CTX_MAGIC);
 	CAST_OBJ_NOTNULL(r2, *priv, H2_REQ_MAGIC);
-	(void)act;
+	assert(len >= 0);
 
-	if ((r2->h2sess->error || r2->error))
+	if (h2_errcheck(r2) != NULL)
 		return (-1);
-	if (len == 0)
+	vdc->bytes_done = len;
+	if (len == 0) {
+		/* No reason to send an empty frame. There is code
+		 * (notably ESI) that will pass len==0 without
+		 * VDP_END. An incomplete delivery will result in
+		 * the len==0 && VDP_END combo, deferring the final
+		 * DATA frame to the h2_vdp_fini() call. */
 		return (0);
-	H2_Send_Get(vdc->wrk, r2->h2sess, r2);
-	vdc->bytes_done = 0;
-	H2_Send(vdc->wrk, r2, H2_F_DATA, H2FF_NONE, len, ptr, &vdc->bytes_done);
-	H2_Send_Rel(r2->h2sess, r2);
+	}
+	if (act == VDP_END)
+		flags |= H2FF_END_STREAM;
+	// XXX? return (H2_Send(...));
+	H2_Send(vdc->vsl, r2, H2_F_DATA, flags, len, ptr);
 	return (0);
 }
 
 static const struct vdp h2_vdp = {
 	.name =		"H2B",
-	.init =		h2_init,
-	.bytes =	h2_bytes,
-	.fini =		h2_fini,
+	.init =		h2_vdp_init,
+	.bytes =	h2_vdp_bytes,
+	.fini =		h2_vdp_fini,
 };
 
 static inline size_t
@@ -170,6 +180,7 @@ h2_minimal_response(struct req *req, uint16_t status)
 	struct h2_req *r2;
 	size_t l;
 	uint8_t buf[6];
+	uint8_t flags;
 
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
 	CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC);
@@ -189,14 +200,10 @@ h2_minimal_response(struct req *req, uint16_t status)
 		req->err_code = status;
 
 	/* XXX return code checking once H2_Send returns anything but 0 */
-	H2_Send_Get(req->wrk, r2->h2sess, r2);
-	H2_Send(req->wrk, r2,
-	    H2_F_HEADERS,
-	    H2FF_HEADERS_END_HEADERS |
-		(status < 200 ? 0 : H2FF_HEADERS_END_STREAM),
-	    l, buf, NULL);
-	H2_Send_Rel(r2->h2sess, r2);
-	return (0);
+	flags = H2FF_END_HEADERS;
+	if (status >= 200)
+		flags |= H2FF_END_STREAM;
+	return (H2_Send(req->vsl, r2, H2_F_HEADERS, flags, l, buf));
 }
 
 static void
@@ -302,6 +309,7 @@ h2_deliver(struct req *req, int sendbody)
 	struct vsb resp[1];
 	struct vrt_ctx ctx[1];
 	uintptr_t ss;
+	uint8_t flags;
 
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
 	CHECK_OBJ_NOTNULL(req->objcore, OBJCORE_MAGIC);
@@ -332,11 +340,10 @@ h2_deliver(struct req *req, int sendbody)
 
 	r2->t_send = req->t_prev;
 
-	H2_Send_Get(req->wrk, r2->h2sess, r2);
-	H2_Send(req->wrk, r2, H2_F_HEADERS,
-	    (sendbody ? 0 : H2FF_HEADERS_END_STREAM) | H2FF_HEADERS_END_HEADERS,
-	    sz, r, &req->acct.resp_hdrbytes);
-	H2_Send_Rel(r2->h2sess, r2);
+	flags = H2FF_END_HEADERS;
+	if (!sendbody)
+		flags |= H2FF_END_STREAM;
+	H2_Send(req->vsl, r2, H2_F_HEADERS, flags, sz, r);
 
 	WS_Reset(req->ws, ss);
 
diff --git a/bin/varnishd/http2/cache_http2_hpack.c b/bin/varnishd/http2/cache_http2_hpack.c
index a90e6fde23..ae3709985e 100644
--- a/bin/varnishd/http2/cache_http2_hpack.c
+++ b/bin/varnishd/http2/cache_http2_hpack.c
@@ -260,25 +260,31 @@ h2h_addhdr(struct http *hp, struct h2h_decode *d)
 	return (0);
 }
 
-static void
-h2h_decode_init(const struct h2_sess *h2, struct ws *ws)
+void
+h2h_decode_hdr_init(struct h2_sess *h2, struct h2_req *r2)
 {
 	struct h2h_decode *d;
 
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(ws, WS_MAGIC);
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->req->http, HTTP_MAGIC);
+
+	AZ(h2->hpack_lock);
+	h2->hpack_lock = r2;
 
 	AN(h2->decode);
 	d = h2->decode;
 	INIT_OBJ(d, H2H_DECODE_MAGIC);
 	VHD_Init(d->vhd);
-	d->out_l = WS_ReserveSize(ws, cache_param->http_req_size);
+	d->out_l = WS_ReserveSize(h2->hpack_lock->req->http->ws,
+	    cache_param->http_req_size);
 	/*
 	 * Can't do any work without any buffer
 	 * space. Require non-zero size.
 	 */
 	XXXAN(d->out_l);
-	d->out = WS_Reservation(ws);
+	d->out = WS_Reservation(h2->hpack_lock->req->http->ws);
 
 	if (cache_param->h2_max_header_list_size == 0)
 		d->limit =
@@ -288,18 +294,6 @@ h2h_decode_init(const struct h2_sess *h2, struct ws *ws)
 
 	if (d->limit < h2->local_settings.max_header_list_size)
 		d->limit = INT64_MAX;
-
-	d->ws = ws;
-}
-
-void
-h2h_decode_hdr_init(const struct h2_sess *h2)
-{
-
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(h2->new_req->http, HTTP_MAGIC);
-	h2h_decode_init(h2, h2->new_req->ws);
 }
 
 /* Possible error returns:
@@ -311,32 +305,34 @@ h2h_decode_hdr_init(const struct h2_sess *h2)
  * is a stream level error.
  */
 h2_error
-h2h_decode_hdr_fini(const struct h2_sess *h2)
+h2h_decode_hdr_fini(struct h2_sess *h2)
 {
 	h2_error ret;
 	struct h2h_decode *d;
 
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 	d = h2->decode;
-	CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(h2->hpack_lock, H2_REQ_MAGIC);
 	CHECK_OBJ_NOTNULL(d, H2H_DECODE_MAGIC);
-	WS_ReleaseP(d->ws, d->out);
+	WS_ReleaseP(h2->hpack_lock->req->http->ws, d->out);
 	if (d->vhd_ret != VHD_OK) {
 		/* HPACK header block didn't finish at an instruction
 		   boundary */
-		VSLb(h2->new_req->http->vsl, SLT_BogoHeader,
+		VSLb(h2->hpack_lock->req->http->vsl, SLT_BogoHeader,
 		    "HPACK compression error/fini (%s)", VHD_Error(d->vhd_ret));
 		ret = H2CE_COMPRESSION_ERROR;
 	} else if (d->error == NULL && !d->has_scheme) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "Missing :scheme");
+		VSLb(h2->vsl, SLT_Debug, "Missing :scheme");
 		ret = H2SE_MISSING_SCHEME; //rfc7540,l,3087,3090
 	} else
 		ret = d->error;
 	FINI_OBJ(d);
 	if (ret == H2SE_REQ_SIZE) {
-		VSLb(h2->new_req->http->vsl, SLT_LostHeader,
+		VSLb(h2->hpack_lock->req->http->vsl, SLT_LostHeader,
 		    "Header list too large");
 	}
+	h2->hpack_lock = NULL;
+
 	return (ret);
 }
 
@@ -357,15 +353,15 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l)
 	const char *r, *e;
 
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC);
-	hp = h2->new_req->http;
+	CHECK_OBJ_NOTNULL(h2->hpack_lock, H2_REQ_MAGIC);
+	hp = h2->hpack_lock->req->http;
 	CHECK_OBJ_NOTNULL(hp, HTTP_MAGIC);
 	d = h2->decode;
 	CHECK_OBJ_NOTNULL(d, H2H_DECODE_MAGIC);
-	CHECK_OBJ_NOTNULL(d->ws, WS_MAGIC);
-	r = WS_Reservation(d->ws);
+	CHECK_OBJ_NOTNULL(h2->hpack_lock->req->http->ws, WS_MAGIC);
+	r = WS_Reservation(h2->hpack_lock->req->http->ws);
 	AN(r);
-	e = r + WS_ReservationSize(d->ws);
+	e = r + WS_ReservationSize(h2->hpack_lock->req->http->ws);
 
 	/* Only H2E_ENHANCE_YOUR_CALM indicates that we should continue
 	   processing. Other errors should have been returned and handled
@@ -380,7 +376,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l)
 		    d->out, d->out_l, &d->out_u);
 
 		if (d->vhd_ret < 0) {
-			H2S_Lock_VSLb(h2, SLT_BogoHeader,
+			VSLb(h2->vsl, SLT_BogoHeader,
 			    "HPACK compression error (%s)",
 			    VHD_Error(d->vhd_ret));
 			d->error = H2CE_COMPRESSION_ERROR;
@@ -440,7 +436,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l)
 		}
 
 		if (H2_ERROR_MATCH(d->error, H2SE_ENHANCE_YOUR_CALM)) {
-			d->out = WS_Reservation(d->ws);
+			d->out = WS_Reservation(h2->hpack_lock->req->http->ws);
 			d->out_l = e - d->out;
 			d->limit -= d->out_u;
 			d->out_u = 0;
@@ -452,7 +448,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l)
 	if (d->limit < 0) {
 		/* Fatal error, the client exceeded both http_req_size
 		 * and h2_max_header_list_size. */
-		H2S_Lock_VSLb(h2, SLT_SessError, "Header list too large");
+		VSLb(h2->vsl, SLT_SessError, "Header list too large");
 		return (H2CE_ENHANCE_YOUR_CALM);
 	}
 
diff --git a/bin/varnishd/http2/cache_http2_panic.c b/bin/varnishd/http2/cache_http2_panic.c
index 227097e6b4..6585485037 100644
--- a/bin/varnishd/http2/cache_http2_panic.c
+++ b/bin/varnishd/http2/cache_http2_panic.c
@@ -78,10 +78,10 @@ h2_sess_panic(struct vsb *vsb, const struct sess *sp)
 		return;
 	VSB_printf(vsb, "refcnt = %d, bogosity = %d, error = %s\n",
 	    h2->refcnt, h2->bogosity, h2_panic_error(h2->error));
-	VSB_printf(vsb,
-	    "open_streams = %d, highest_stream = %u,"
-	    " goaway_last_stream = %u,\n",
-	    h2->open_streams, h2->highest_stream, h2->goaway_last_stream);
+	VSB_printf(vsb, "open_streams = %d, highest_stream = %u,\n",
+	    h2->open_streams, h2->highest_stream);
+	VSB_printf(vsb, "tx_window = %jd, rx_window = %jd,\n",
+	    (intmax_t)h2->tx_window, (intmax_t)h2->rx_window);
 	VSB_cat(vsb, "local_settings = {");
 	h2_panic_settings(vsb, &h2->local_settings);
 	VSB_cat(vsb, "},\n");
@@ -107,10 +107,10 @@ h2_sess_panic(struct vsb *vsb, const struct sess *sp)
 
 		VSB_printf(vsb, "h2_sess = %p, scheduled = %d, error = %s,\n",
 		    r2->h2sess, r2->scheduled, h2_panic_error(r2->error));
-		VSB_printf(vsb, "t_send = %f, t_winupd = %f,\n",
-		    r2->t_send, r2->t_winupd);
-		VSB_printf(vsb, "t_window = %jd, r_window = %jd,\n",
-		    (intmax_t)r2->t_window, (intmax_t)r2->r_window);
+		VSB_printf(vsb, "t_send = %f, t_win_low = %f,\n",
+		    r2->t_send, r2->t_win_low);
+		VSB_printf(vsb, "tx_window = %jd, rx_window = %jd,\n",
+		    (intmax_t)r2->tx_window, (intmax_t)r2->rx_window);
 
 		if (!PAN_dump_struct(vsb, r2->rxbuf, H2_RXBUF_MAGIC, "rxbuf")) {
 			VSB_printf(vsb, "stvbuf = %p,\n", r2->rxbuf->stvbuf);
diff --git a/bin/varnishd/http2/cache_http2_proto.c b/bin/varnishd/http2/cache_http2_proto.c
index 254275ebd3..78bcc2196a 100644
--- a/bin/varnishd/http2/cache_http2_proto.c
+++ b/bin/varnishd/http2/cache_http2_proto.c
@@ -31,11 +31,11 @@
 
 #include "config.h"
 
-#include "cache/cache_varnishd.h"
-
+#include <poll.h>
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "cache/cache_varnishd.h"
 #include "cache/cache_transport.h"
 #include "cache/cache_filter.h"
 #include "http2/cache_http2.h"
@@ -74,11 +74,11 @@ enum h2frame {
 #include "tbl/h2_frames.h"
 };
 
-static const char *
-h2_framename(enum h2frame h2f)
+const char *
+h2_framename(int frame)
 {
 
-	switch (h2f) {
+	switch (frame) {
 #define H2_FRAME(l,u,t,f,...)	case H2F_##u: return (#u);
 #include "tbl/h2_frames.h"
 	default:
@@ -141,17 +141,32 @@ h2_connectionerror(uint32_t u)
 		return (H2NN_ERROR);
 }
 
+h2_error
+h2_errcheck(const struct h2_req *r2)	/* first error affecting this stream */
+{
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
+
+	if (r2->error != NULL)		/* the stream's own error wins */
+		return (r2->error);
+	return (r2->h2sess->error);	/* else the session's error, if any */
+}
+
 /**********************************************************************/
 
 struct h2_req *
-h2_new_req(struct h2_sess *h2, unsigned stream, struct req *req)
+h2_new_req(struct h2_sess *h2, unsigned stream, struct req **preq)
 {
+	struct req *req;
 	struct h2_req *r2;
 
-	ASSERT_RXTHR(h2);
-	if (req == NULL)
+	ASSERT_H2_SESS(h2);
+	if (preq != NULL)
+		TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC);
+	else {
 		req = Req_New(h2->sess, NULL);
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
+		CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
+	}
 
 	r2 = WS_Alloc(req->ws, sizeof *r2);
 	AN(r2);
@@ -160,42 +175,37 @@ h2_new_req(struct h2_sess *h2, unsigned stream, struct req *req)
 	r2->h2sess = h2;
 	r2->stream = stream;
 	r2->req = req;
-	if (stream)
-		r2->counted = 1;
-	r2->r_window = h2->local_settings.initial_window_size;
-	r2->t_window = h2->remote_settings.initial_window_size;
+	r2->rx_window = h2->local_settings.initial_window_size;
+	r2->tx_window = h2->remote_settings.initial_window_size;
 	req->transport_priv = r2;
-	Lck_Lock(&h2->sess->mtx);
-	if (stream)
+	if (stream > 0)
 		h2->open_streams++;
 	VTAILQ_INSERT_TAIL(&h2->streams, r2, list);
-	Lck_Unlock(&h2->sess->mtx);
 	h2->refcnt++;
 	return (r2);
 }
 
-void
-h2_del_req(struct worker *wrk, struct h2_req *r2)
+static void
+h2_del_req(struct worker *wrk, struct h2_req **pr2)
 {
+	struct h2_req *r2;
 	struct h2_sess *h2;
 	struct sess *sp;
 	struct stv_buffer *stvbuf;
+	struct req *req;
 
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	TAKE_OBJ_NOTNULL(r2, pr2, H2_REQ_MAGIC);
 	AZ(r2->scheduled);
 	h2 = r2->h2sess;
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	sp = h2->sess;
-	Lck_Lock(&sp->mtx);
 	assert(h2->refcnt > 0);
 	--h2->refcnt;
 	/* XXX: PRIORITY reshuffle */
 	VTAILQ_REMOVE(&h2->streams, r2, list);
-	if (r2->req == h2->new_req)
-		h2->new_req = NULL;
-	Lck_Unlock(&sp->mtx);
+	assert(r2->t_win_low == 0.);
 
+	AZ(r2->reqbody_waiter);
 	assert(!WS_IsReserved(r2->req->ws));
 	AZ(r2->req->ws->r);
 
@@ -207,47 +217,88 @@ h2_del_req(struct worker *wrk, struct h2_req *r2)
 		AZ(stvbuf);
 	}
 
-	Req_Cleanup(sp, wrk, r2->req);
+	req = r2->req;
+	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
+	r2->magic = 0;
+	req->transport_priv = NULL;
+
+	AZ(req->ws->r);
+	Req_Cleanup(sp, wrk, req);
 	if (FEATURE(FEATURE_BUSY_STATS_RATE))
 		WRK_AddStat(wrk);
-	Req_Release(r2->req);
+	Req_Release(req);
 }
 
 void
-h2_kill_req(struct worker *wrk, struct h2_sess *h2,
-    struct h2_req *r2, h2_error h2e)
+h2_kill_req(struct worker *wrk, struct h2_sess *h2, struct h2_req **pr2,
+    h2_error h2e)
 {
+	struct h2_req *r2;
 
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
+	TAKE_OBJ_NOTNULL(r2, pr2, H2_REQ_MAGIC);
 	AN(h2e);
-	Lck_Lock(&h2->sess->mtx);
-	VSLb(h2->vsl, SLT_Debug, "KILL st=%u state=%d sched=%d",
-	    r2->stream, r2->state, r2->scheduled);
-	if (r2->counted) {
-		assert(h2->open_streams > 0);
-		h2->open_streams--;
-		r2->counted = 0;
+
+	VSLb(h2->vsl, SLT_Debug, "KILL st=%u state=%d sched=%d error=%d",
+	    r2->stream, r2->state, r2->scheduled, h2e->val);
+
+	if (h2->error != NULL) {
+		/* The connection is in an error state. Don't send RST. */
+	} else if (r2->error == NULL && r2->state < H2_S_CLOSED) {
+		/* Notify the peer only the first time the stream is killed. */
+		H2_Send_RST(h2, r2->stream, h2e);
 	}
-	if (r2->error == NULL)
+
+	if (r2->error == NULL || H2_ERROR_MATCH(r2->error, H2SE_NO_ERROR)) {
+		/* We latch the first error set, except if it was a "no
+		 * error". */
 		r2->error = h2e;
+	}
+
+	if (r2 == h2->hpack_lock) {
+		/* We are killing the request that holds the hpack
+		 * context. This is a hard error. */
+		(void)h2h_decode_hdr_fini(h2);
+		AZ(h2->hpack_lock);
+		if (h2->error == NULL)
+			h2->error = H2CE_COMPRESSION_ERROR;
+	}
+
+	if (r2->t_win_low != 0.) {
+		assert(h2->win_low_streams > 0);
+		h2->win_low_streams--;
+		r2->t_win_low = 0.;
+	}
+
+	h2_stream_setstate(r2, H2_S_CLOSED);
+
+	Lck_Lock(&h2->sess->mtx);
 	if (r2->scheduled) {
-		if (r2->cond != NULL)
-			PTOK(pthread_cond_signal(r2->cond));
-		r2 = NULL;
+		h2_reqbody_kick(r2);
 		Lck_Unlock(&h2->sess->mtx);
 	} else {
 		Lck_Unlock(&h2->sess->mtx);
-		if (r2->state == H2_S_OPEN && h2->new_req == r2->req)
-			(void)h2h_decode_hdr_fini(h2);
+		h2_del_req(wrk, &r2);
+	}
+}
+
+static void
+h2_kill_all(struct worker *wrk, struct h2_sess *h2, h2_error h2e)
+{
+	struct h2_req *r2, *r22;
+
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	AN(h2e);
+	VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) {
+		CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+		h2_kill_req(wrk, h2, &r2, h2e);
 	}
-	if (r2 != NULL)
-		h2_del_req(wrk, r2);
 }
 
 /**********************************************************************/
 
 static void
-h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len)
+h2_rxframe_vsl(const struct h2_sess *h2, const void *ptr, size_t len)
 {
 	const uint8_t *b;
 	struct vsb *vsb;
@@ -275,18 +326,12 @@ h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len)
 	VSB_quote(vsb, b + 4, 1, VSB_QUOTE_HEX);
 	VSB_putc(vsb, ' ');
 	VSB_quote(vsb, b + 5, 4, VSB_QUOTE_HEX);
-	if (u > 0) {
-		VSB_putc(vsb, ' ');
-		VSB_quote(vsb, b + 9, len - 9, VSB_QUOTE_HEX);
-	}
 	AZ(VSB_finish(vsb));
-	Lck_Lock(&h2->sess->mtx);
 	VSLb_bin(h2->vsl, SLT_H2RxHdr, 9, b);
 	if (len > 9)
 		VSLb_bin(h2->vsl, SLT_H2RxBody, len - 9, b + 9);
 
 	VSLb(h2->vsl, SLT_Debug, "H2RXF %s", VSB_data(vsb));
-	Lck_Unlock(&h2->sess->mtx);
 	VSB_destroy(&vsb);
 }
 
@@ -297,25 +342,24 @@ h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len)
 static h2_error v_matchproto_(h2_rxframe_f)
 h2_rx_ping(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
+	uint64_t val;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	assert(r2 == h2->req0);
+	ASSERT_H2_SESS(h2);
+	AZ(r2);
 
-	if (h2->rxf_len != 8) { 			// rfc7540,l,2364,2366
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx ping with (len != 8)");
+	if (h2->rxf_len != 8) {				// rfc7540,l,2364,2366
+		VSLb(h2->vsl, SLT_SessError, "H2: rx ping with (len != 8)");
 		return (H2CE_FRAME_SIZE_ERROR);
 	}
 	AZ(h2->rxf_stream);				// rfc7540,l,2359,2362
 	if (h2->rxf_flags != 0)	{			// We never send pings
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx ping ack");
+		VSLb(h2->vsl, SLT_SessError, "H2: rx ping ack");
 		return (H2SE_PROTOCOL_ERROR);
 	}
-	H2_Send_Get(wrk, h2, r2);
-	H2_Send_Frame(wrk, h2,
-	    H2_F_PING, H2FF_PING_ACK, 8, 0, h2->rxf_data);
-	H2_Send_Rel(h2, r2);
+	_Static_assert(sizeof (val) == 8, "");
+	memcpy(&val, h2->rxf_data, sizeof val);
+	H2_Send_PING(h2, H2FF_ACK, val);
 	return (0);
 }
 
@@ -327,25 +371,25 @@ h2_rx_push_promise(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 
 	// rfc7540,l,2262,2267
-	H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx push promise");
+	VSLb(h2->vsl, SLT_SessError, "H2: rx push promise");
 	return (H2CE_PROTOCOL_ERROR);
 }
 
 /**********************************************************************
  */
 
-static h2_error
+static int
 h2_rapid_reset(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
 	vtim_real now;
 	vtim_dur d;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
 
 	if (h2->rapid_reset_limit == 0)
@@ -364,12 +408,10 @@ h2_rapid_reset(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	    h2->rapid_reset_limit);
 	h2->last_rst = now;
 
-	if (h2->rst_budget < 1.0) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: Hit RST limit. Closing session.");
-		return (H2CE_RAPID_RESET);
-	}
 	h2->rst_budget -= 1.0;
-	return (0);
+	if (h2->rst_budget > 0)
+		return (0);
+	return (1);
 }
 
 static h2_error v_matchproto_(h2_rxframe_f)
@@ -378,17 +420,38 @@ h2_rx_rst_stream(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	h2_error h2e;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 
 	if (h2->rxf_len != 4) {			// rfc7540,l,2003,2004
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx rst with (len != 4)");
+		VSLb(h2->vsl, SLT_SessError, "H2: rx rst with (len != 4)");
 		return (H2CE_FRAME_SIZE_ERROR);
 	}
 	if (r2 == NULL)
 		return (0);
-	h2e = h2_rapid_reset(wrk, h2, r2);
-	h2_kill_req(wrk, h2, r2, h2_streamerror(vbe32dec(h2->rxf_data)));
+
+	h2e = h2_streamerror(vbe32dec(h2->rxf_data));
+	AN(h2e);
+	if (h2e == H2NN_ERROR) {
+		/* The error is unknown. We don't want to return
+		 * H2NN_ERROR from this function because that will cause
+		 * us to close the connection. Map the unknown error to
+		 * H2SE_INTERNAL_ERROR as suggested by the RFC. */
+		/* rfc7540,l,2839,2841 */
+		h2e = H2SE_INTERNAL_ERROR;
+	}
+
+	/* We set `r2->error` prior to returning to prevent sending a RST in
+	 * return. */
+	if (r2->error == NULL)
+		r2->error = h2e;
+
+	if (h2_rapid_reset(wrk, h2, r2)) {
+		/* Upgrading to a connection level error. */
+		VSLb(h2->vsl, SLT_Error, "H2: Hit RST limit. Closing session.");
+		h2e = H2CE_RAPID_RESET;
+	}
+
 	return (h2e);
 }
 
@@ -398,36 +461,22 @@ h2_rx_rst_stream(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 static h2_error v_matchproto_(h2_rxframe_f)
 h2_rx_goaway(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
+	h2_error h2e;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	assert(r2 == h2->req0);
-
-	h2->goaway = 1;
-	h2->goaway_last_stream = vbe32dec(h2->rxf_data);
-	h2->error = h2_connectionerror(vbe32dec(h2->rxf_data + 4));
-	H2S_Lock_VSLb(h2, SLT_Debug, "GOAWAY %s", h2->error->name);
-	return (h2->error);
-}
-
-static void
-h2_tx_goaway(struct worker *wrk, struct h2_sess *h2, h2_error h2e)
-{
-	char b[8];
+	ASSERT_H2_SESS(h2);
+	AZ(r2);
 
-	ASSERT_RXTHR(h2);
+	h2e = h2_connectionerror(vbe32dec(h2->rxf_data + 4));
 	AN(h2e);
 
-	if (h2->goaway || !h2e->send_goaway)
-		return;
-
-	h2->goaway = 1;
-	vbe32enc(b, h2->highest_stream);
-	vbe32enc(b + 4, h2e->val);
-	H2_Send_Get(wrk, h2, h2->req0);
-	H2_Send_Frame(wrk, h2, H2_F_GOAWAY, 0, 8, 0, b);
-	H2_Send_Rel(h2, h2->req0);
+	VSLb(h2->vsl, SLT_Debug, "GOAWAY %s", h2e->name); /* XXX: Remove? */
+	if (!H2_ERROR_MATCH(h2e, H2CE_NO_ERROR)) {
+		/* XXX: Should we log something (not SLT_Error) on a
+		 * graceful shutdown? */
+		VSLb(h2->vsl, SLT_Error, "H2: rx goaway %s", h2e->name);
+	}
+	return (H2CE_NO_ERROR);
 }
 
 /**********************************************************************
@@ -439,27 +488,41 @@ h2_rx_window_update(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	uint32_t wu;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 
 	if (h2->rxf_len != 4) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx winup with (len != 4)");
+		VSLb(h2->vsl, SLT_SessError, "H2: rx winup with (len != 4)");
 		return (H2CE_FRAME_SIZE_ERROR);
 	}
 	wu = vbe32dec(h2->rxf_data) & ~(1LU<<31);
-	if (wu == 0)
-		return (H2SE_PROTOCOL_ERROR);
-	if (r2 == NULL)
-		return (0);
-	Lck_Lock(&h2->sess->mtx);
-	r2->t_window += wu;
-	if (r2 == h2->req0)
-		PTOK(pthread_cond_broadcast(h2->winupd_cond));
-	else if (r2->cond != NULL)
-		PTOK(pthread_cond_signal(r2->cond));
-	Lck_Unlock(&h2->sess->mtx);
-	if (r2->t_window >= (1LL << 31))
-		return (H2SE_FLOW_CONTROL_ERROR);
+	if (h2->rxf_stream == 0) {
+		AZ(r2);
+		if (wu == 0)
+			return (H2CE_PROTOCOL_ERROR);
+		h2->tx_window += wu;
+		if (h2->tx_window >= (1LL << 31))
+			return (H2CE_FLOW_CONTROL_ERROR);
+	} else {
+		if (wu == 0)
+			return (H2SE_PROTOCOL_ERROR);
+		if (r2 == NULL) {
+			/* Window update received for a stream we are no
+			 * longer tracking. We MUST ignore this.
+			 * rfc7540,l,2583,2586 */
+			return (0);
+		}
+		r2->tx_window += wu;
+		if (r2->tx_window >= (1LL << 31))
+			return (H2SE_FLOW_CONTROL_ERROR);
+		if (r2->t_win_low != 0.) {
+			assert(h2->win_low_streams > 0);
+			h2->win_low_streams--;
+			r2->t_win_low = 0.;
+		}
+	}
+	/* Assume we are no longer stuck on output. */
+	h2->tx_l_stuck = 0;
 	return (0);
 }
 
@@ -474,7 +537,7 @@ h2_rx_priority(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 	return (0);
 }
@@ -511,12 +574,9 @@ h2_win_adjust(const struct h2_sess *h2, uint32_t oldval, uint32_t newval)
 {
 	struct h2_req *r2;
 
-	Lck_AssertHeld(&h2->sess->mtx);
 	// rfc7540,l,2668,2674
 	VTAILQ_FOREACH(r2, &h2->streams, list) {
 		CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-		if (r2 == h2->req0)
-			continue; // rfc7540,l,2699,2699
 		switch (r2->state) {
 		case H2_S_IDLE:
 		case H2_S_OPEN:
@@ -525,7 +585,7 @@ h2_win_adjust(const struct h2_sess *h2, uint32_t oldval, uint32_t newval)
 			 * We allow a window to go negative, as per
 			 * rfc7540,l,2676,2680
 			 */
-			r2->t_window += (int64_t)newval - oldval;
+			r2->tx_window += (int64_t)newval - oldval;
 			break;
 		default:
 			break;
@@ -544,22 +604,25 @@ h2_set_setting(struct h2_sess *h2, const uint8_t *d)
 	y = vbe32dec(d + 2);
 	if (x >= H2_SETTING_TBL_LEN || h2_setting_tbl[x] == NULL) {
 		// rfc7540,l,2181,2182
-		H2S_Lock_VSLb(h2, SLT_Debug,
+		VSLb(h2->vsl, SLT_Debug,
 		    "H2SETTING unknown setting 0x%04x=%08x (ignored)", x, y);
 		return (0);
 	}
 	s = h2_setting_tbl[x];
 	AN(s);
 	if (y < s->minval || y > s->maxval) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "H2SETTING invalid %s=0x%08x",
+		VSLb(h2->vsl, SLT_Debug, "H2SETTING invalid %s=0x%08x",
 		    s->name, y);
 		AN(s->range_error);
 		if (!DO_DEBUG(DBG_H2_NOCHECK))
 			return (s->range_error);
 	}
 	Lck_Lock(&h2->sess->mtx);
-	if (s == H2_SET_INITIAL_WINDOW_SIZE)
+	if (s == H2_SET_INITIAL_WINDOW_SIZE) {
 		h2_win_adjust(h2, h2->remote_settings.initial_window_size, y);
+		/* Assume we are no longer stuck on output. */
+		h2->tx_l_stuck = 0;
+	}
 	VSLb(h2->vsl, SLT_Debug, "H2SETTING %s=0x%08x", s->name, y);
 	Lck_Unlock(&h2->sess->mtx);
 	AN(s->setfunc);
@@ -575,21 +638,20 @@ h2_rx_settings(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	h2_error retval = 0;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	assert(r2 == h2->req0);
+	ASSERT_H2_SESS(h2);
 	AZ(h2->rxf_stream);
+	AZ(r2);
 
-	if (h2->rxf_flags == H2FF_SETTINGS_ACK) {
+	if (h2->rxf_flags == H2FF_ACK) {
 		if (h2->rxf_len > 0) {			// rfc7540,l,2047,2049
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx settings ack with "
+			VSLb(h2->vsl, SLT_SessError, "H2: rx settings ack with "
 			    "(len > 0)");
 			return (H2CE_FRAME_SIZE_ERROR);
 		}
 		return (0);
 	} else {
 		if (h2->rxf_len % 6) {			// rfc7540,l,2062,2064
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx settings with "
+			VSLb(h2->vsl, SLT_SessError, "H2: rx settings with "
 			    "((len %% 6) != 0)");
 			return (H2CE_PROTOCOL_ERROR);
 		}
@@ -599,10 +661,7 @@ h2_rx_settings(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 			if (retval)
 				return (retval);
 		}
-		H2_Send_Get(wrk, h2, r2);
-		H2_Send_Frame(wrk, h2,
-		    H2_F_SETTINGS, H2FF_SETTINGS_ACK, 0, 0, NULL);
-		H2_Send_Rel(h2, r2);
+		H2_Send_SETTINGS(h2, H2FF_ACK, 0, NULL);
 	}
 	return (0);
 }
@@ -631,8 +690,7 @@ h2_do_req(struct worker *wrk, void *priv)
 		CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 		Lck_Lock(&h2->sess->mtx);
 		r2->scheduled = 0;
-		r2->state = H2_S_CLOSED;
-		r2->h2sess->do_sweep = 1;
+		h2_attention(h2);
 		Lck_Unlock(&h2->sess->mtx);
 	}
 	THR_SetRequest(NULL);
@@ -645,14 +703,20 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2,
 	h2_error h2e;
 	ssize_t cl;
 
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
+	assert(h2->hpack_lock == r2);
 	assert(r2->state == H2_S_OPEN);
 	h2e = h2h_decode_hdr_fini(h2);
-	h2->new_req = NULL;
+	AZ(h2->hpack_lock);
+
+	if (req->req_body_status == BS_NONE) {
+		/* BS_NONE implies that the HEADERS frame had flag
+		 * END_STREAM set. */
+		h2_stream_setstate(r2, H2_S_CLOS_REM);
+	}
 	if (h2e != NULL) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "HPACK/FINI %s", h2e->name);
+		VSLb(h2->vsl, SLT_Debug, "HPACK/FINI %s", h2e->name);
 		assert(!WS_IsReserved(r2->req->ws));
-		h2_del_req(wrk, r2);
 		return (h2e);
 	}
 	req->t_req = VTIM_real();
@@ -666,7 +730,7 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2,
 	cl = http_GetContentLength(req->http);
 	assert(cl >= -2);
 	if (cl == -2) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "Non-parseable Content-Length");
+		VSLb(h2->vsl, SLT_Debug, "Non-parseable Content-Length");
 		return (H2SE_PROTOCOL_ERROR);
 	}
 
@@ -689,19 +753,19 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2,
 		assert (req->req_body_status == BS_NONE);
 		r2->state = H2_S_CLOS_REM;
 		if (cl > 0) {
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx header with END_STREAM "
+			VSLb(h2->vsl, SLT_SessError, "H2: rx header with END_STREAM "
 			    "and content-length > 0");
 			return (H2CE_PROTOCOL_ERROR); //rfc7540,l,1838,1840
 		}
 	}
 
 	if (req->http->hd[HTTP_HDR_METHOD].b == NULL) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "Missing :method");
+		VSLb(h2->vsl, SLT_Debug, "Missing :method");
 		return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3087,3090
 	}
 
 	if (req->http->hd[HTTP_HDR_URL].b == NULL) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "Missing :path");
+		VSLb(h2->vsl, SLT_Debug, "Missing :path");
 		return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3087,3090
 	}
 
@@ -710,7 +774,7 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2,
 	if (*req->http->hd[HTTP_HDR_URL].b == '*' &&
 	    (Tlen(req->http->hd[HTTP_HDR_METHOD]) != 7 ||
 	    strncmp(req->http->hd[HTTP_HDR_METHOD].b, "OPTIONS", 7))) {
-		H2S_Lock_VSLb(h2, SLT_BogoHeader, "Illegal :path pseudo-header");
+		VSLb(h2->vsl, SLT_BogoHeader, "Illegal :path pseudo-header");
 		return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3068,3071
 	}
 
@@ -719,15 +783,46 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2,
 	VCL_TaskEnter(req->top->privs);
 	req->task->func = h2_do_req;
 	req->task->priv = req;
+
+	/* NB: we don't need to guard the read of h2->open_streams because
+	 * headers are handled sequentially so it cannot increase under our
+	 * feet.
+	 */
+	if (h2->open_streams > (int)h2->local_settings.max_concurrent_streams) {
+		VSLb(h2->vsl, SLT_Debug,
+		    "H2: stream %u: Hit maximum number of concurrent streams",
+		    h2->rxf_stream);
+		return (H2SE_REFUSED_STREAM);   // rfc7540,l,1200,1205
+	}
+
 	r2->scheduled = 1;
 	if (Pool_Task(wrk->pool, req->task, TASK_QUEUE_STR) != 0) {
 		r2->scheduled = 0;
-		r2->state = H2_S_CLOSED;
 		return (H2SE_REFUSED_STREAM); //rfc7540,l,3326,3329
 	}
 	return (0);
 }
 
+static h2_error
+h2_decode_headers(struct h2_sess *h2, struct h2_req *r2,
+    const void *p, size_t l)
+{
+	h2_error h2e;
+
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	assert(h2->hpack_lock == r2);
+
+	h2e = h2h_decode_bytes(h2, p, l);
+	r2->req->acct.req_hdrbytes += l;
+
+	if (h2e != NULL) {
+		VSLb(h2->vsl, SLT_Debug, "HPACK(%s) %s",
+		    h2_framename(h2->rxf_type), h2e->name);
+	}
+
+	return (h2e);
+}
+
 static h2_error v_matchproto_(h2_rxframe_f)
 h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
@@ -737,33 +832,10 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	size_t l;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
-
-	if (r2 != NULL) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers on non-idle stream");
-		return (H2CE_PROTOCOL_ERROR);	// rfc9113,l,887,891
-	}
-
-	if (h2->rxf_stream <= h2->highest_stream) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: new stream ID < highest stream");
-		return (H2CE_PROTOCOL_ERROR);	// rfc7540,l,1153,1158
-	}
-        /* NB: we don't need to guard the read of h2->open_streams
-         * because headers are handled sequentially so it cannot
-         * increase under our feet.
-         */
-        if (h2->open_streams >=
-	    (int)h2->local_settings.max_concurrent_streams) {
-		H2S_Lock_VSLb(h2, SLT_Debug,
-		    "H2: stream %u: Hit maximum number of "
-		    "concurrent streams", h2->rxf_stream);
-		return (H2SE_REFUSED_STREAM);	// rfc7540,l,1200,1205
-	}
-	h2->highest_stream = h2->rxf_stream;
-	r2 = h2_new_req(h2, h2->rxf_stream, NULL);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
 	assert(r2->state == H2_S_IDLE);
-	r2->state = H2_S_OPEN;
+	h2_stream_setstate(r2, H2_S_OPEN);
 
 	req = r2->req;
 	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
@@ -772,7 +844,6 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	VSLb(req->vsl, SLT_Begin, "req %ju rxreq", VXID(req->sp->vxid));
 	VSL(SLT_Link, req->sp->vxid, "req %ju rxreq", VXID(req->vsl->wid));
 
-	h2->new_req = req;
 	req->sp = h2->sess;
 	req->transport = &HTTP2_transport;
 
@@ -784,42 +855,42 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 	HTTP_Setup(req->http, req->ws, req->vsl, SLT_ReqMethod);
 	http_SetH(req->http, HTTP_HDR_PROTO, "HTTP/2.0");
 
-	h2h_decode_hdr_init(h2);
+	h2h_decode_hdr_init(h2, r2);
 
 	p = h2->rxf_data;
 	l = h2->rxf_len;
-	if (h2->rxf_flags & H2FF_HEADERS_PADDED) {
+	if (h2->rxf_flags & H2FF_PADDED) {
 		if (*p + 1 > l) {
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers with pad length > frame len");
+			VSLb(h2->vsl, SLT_SessError, "H2: rx headers with pad length > frame len");
 			return (H2CE_PROTOCOL_ERROR);	// rfc7540,l,1884,1887
 		}
 		l -= 1 + *p;
 		p += 1;
 	}
-	if (h2->rxf_flags & H2FF_HEADERS_PRIORITY) {
+	if (h2->rxf_flags & H2FF_PRIORITY) {
 		if (l < 5) {
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers with incorrect "
+			VSLb(h2->vsl, SLT_SessError, "H2: rx headers with incorrect "
 			    "priority data");
 			return (H2CE_PROTOCOL_ERROR);
 		}
 		l -= 5;
 		p += 5;
 	}
-	h2e = h2h_decode_bytes(h2, p, l);
-	if (h2e != NULL) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "HPACK(hdr) %s", h2e->name);
-		(void)h2h_decode_hdr_fini(h2);
-		assert(!WS_IsReserved(r2->req->ws));
-		h2_del_req(wrk, r2);
+
+	h2e = h2_decode_headers(h2, r2, p, l);
+	if (h2e != NULL)
 		return (h2e);
-	}
 
-	if (h2->rxf_flags & H2FF_HEADERS_END_STREAM)
+	if (h2->rxf_flags & H2FF_END_STREAM)
 		req->req_body_status = BS_NONE;
 
-	if (h2->rxf_flags & H2FF_HEADERS_END_HEADERS)
+	if (h2->rxf_flags & H2FF_END_HEADERS)
 		return (h2_end_headers(wrk, h2, req, r2));
-	return (0);
+
+	/* This wasn't the end of the headers. h2->hpack_lock is left as
+	 * evidence to pick up that a CONTINUATION frame is expected next
+	 * on this stream. */
+	return (NULL);
 }
 
 /**********************************************************************/
@@ -827,31 +898,28 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 static h2_error v_matchproto_(h2_rxframe_f)
 h2_rx_continuation(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
-	struct req *req;
 	h2_error h2e;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 
-	if (r2 == NULL || r2->state != H2_S_OPEN || r2->req != h2->new_req) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx unexpected CONT frame"
+	if (r2 == NULL || r2->state != H2_S_OPEN || r2 != h2->hpack_lock) {
+		VSLb(h2->vsl, SLT_SessError, "H2: rx unexpected CONT frame"
 		    " on stream %d", h2->rxf_stream);
 		return (H2CE_PROTOCOL_ERROR);	// XXX spec ?
 	}
-	req = r2->req;
-	h2e = h2h_decode_bytes(h2, h2->rxf_data, h2->rxf_len);
-	r2->req->acct.req_hdrbytes += h2->rxf_len;
-	if (h2e != NULL) {
-		H2S_Lock_VSLb(h2, SLT_Debug, "HPACK(cont) %s", h2e->name);
-		(void)h2h_decode_hdr_fini(h2);
-		assert(!WS_IsReserved(r2->req->ws));
-		h2_del_req(wrk, r2);
+	h2e = h2_decode_headers(h2, r2, h2->rxf_data, h2->rxf_len);
+	if (h2e != NULL)
 		return (h2e);
-	}
-	if (h2->rxf_flags & H2FF_HEADERS_END_HEADERS)
-		return (h2_end_headers(wrk, h2, req, r2));
-	return (0);
+
+	if (h2->rxf_flags & H2FF_END_HEADERS)
+		return (h2_end_headers(wrk, h2, r2->req, r2));
+
+	/* This wasn't the end of the headers. h2->hpack_lock is left as
+	 * evidence to pick up that a CONTINUATION frame is expected next
+	 * on this stream. */
+	return (NULL);
 }
 
 /**********************************************************************/
@@ -859,403 +927,15 @@ h2_rx_continuation(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 static h2_error v_matchproto_(h2_rxframe_f)
 h2_rx_data(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
 {
-	char buf[4];
-	ssize_t l;
-	uint64_t l2, head;
-	const uint8_t *src;
-	unsigned len;
-
-	/* XXX: Shouldn't error handling, setting of r2->error and
-	 * r2->cond signalling be handled more generally at the end of
-	 * procframe()??? */
 
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	ASSERT_RXTHR(h2);
 	CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC);
 
-	if (r2 == NULL)
-		return (0);
-
-	if (r2->state >= H2_S_CLOS_REM) {
-		r2->error = H2SE_STREAM_CLOSED;
+	if (r2 == NULL || r2->state == H2_S_CLOSED)
+		return (H2CE_PROTOCOL_ERROR); // rfc7540,l,1727,1730
+	if (r2->state >= H2_S_CLOS_REM)
 		return (H2SE_STREAM_CLOSED); // rfc7540,l,1766,1769
-	}
-
-	Lck_Lock(&h2->sess->mtx);
-	CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-
-	if (h2->error != NULL || r2->error != NULL) {
-		if (r2->cond)
-			PTOK(pthread_cond_signal(r2->cond));
-		Lck_Unlock(&h2->sess->mtx);
-		return (h2->error != NULL ? h2->error : r2->error);
-	}
-
-	/* Check padding if present */
-	src = h2->rxf_data;
-	len = h2->rxf_len;
-	if (h2->rxf_flags & H2FF_DATA_PADDED) {
-		if (*src >= len) {
-			VSLb(h2->vsl, SLT_SessError,
-			    "H2: stream %u: Padding larger than frame length",
-			    h2->rxf_stream);
-			r2->error = H2CE_PROTOCOL_ERROR;
-			if (r2->cond)
-				PTOK(pthread_cond_signal(r2->cond));
-			Lck_Unlock(&h2->sess->mtx);
-			return (H2CE_PROTOCOL_ERROR);
-		}
-		len -= 1 + *src;
-		src += 1;
-	}
-
-	/* Check against the Content-Length header if given */
-	if (r2->req->htc->content_length >= 0) {
-		if (r2->rxbuf)
-			l = r2->rxbuf->head;
-		else
-			l = 0;
-		l += len;
-		if (l > r2->req->htc->content_length ||
-		    ((h2->rxf_flags & H2FF_DATA_END_STREAM) &&
-		     l != r2->req->htc->content_length)) {
-			VSLb(h2->vsl, SLT_Debug,
-			    "H2: stream %u: Received data and Content-Length"
-			    " mismatch", h2->rxf_stream);
-			r2->error = H2SE_PROTOCOL_ERROR;
-			if (r2->cond)
-				PTOK(pthread_cond_signal(r2->cond));
-			Lck_Unlock(&h2->sess->mtx);
-			return (H2SE_PROTOCOL_ERROR);
-		}
-	}
-
-	/* Check and charge connection window. The entire frame including
-	 * padding (h2->rxf_len) counts towards the window. */
-	if (h2->rxf_len > h2->req0->r_window) {
-		VSLb(h2->vsl, SLT_SessError,
-		    "H2: stream %u: Exceeded connection receive window",
-		    h2->rxf_stream);
-		r2->error = H2CE_FLOW_CONTROL_ERROR;
-		if (r2->cond)
-			PTOK(pthread_cond_signal(r2->cond));
-		Lck_Unlock(&h2->sess->mtx);
-		return (H2CE_FLOW_CONTROL_ERROR);
-	}
-	h2->req0->r_window -= h2->rxf_len;
-	if (h2->req0->r_window < cache_param->h2_rx_window_low_water) {
-		h2->req0->r_window += cache_param->h2_rx_window_increment;
-		vbe32enc(buf, cache_param->h2_rx_window_increment);
-		Lck_Unlock(&h2->sess->mtx);
-		H2_Send_Get(wrk, h2, h2->req0);
-		H2_Send_Frame(wrk, h2, H2_F_WINDOW_UPDATE, 0, 4, 0, buf);
-		H2_Send_Rel(h2, h2->req0);
-		Lck_Lock(&h2->sess->mtx);
-	}
-
-	/* Check stream window. The entire frame including padding
-	 * (h2->rxf_len) counts towards the window. */
-	if (h2->rxf_len > r2->r_window) {
-		VSLb(h2->vsl, SLT_Debug,
-		    "H2: stream %u: Exceeded stream receive window",
-		    h2->rxf_stream);
-		r2->error = H2SE_FLOW_CONTROL_ERROR;
-		if (r2->cond)
-			PTOK(pthread_cond_signal(r2->cond));
-		Lck_Unlock(&h2->sess->mtx);
-		return (H2SE_FLOW_CONTROL_ERROR);
-	}
-
-	/* Handle zero size frame before starting to allocate buffers */
-	if (len == 0) {
-		r2->r_window -= h2->rxf_len;
-
-		/* Handle the specific corner case where the entire window
-		 * has been exhausted using nothing but padding
-		 * bytes. Since no bytes have been buffered, no bytes
-		 * would be consumed by the request thread and no stream
-		 * window updates sent. Unpaint ourselves from this corner
-		 * by sending a stream window update here. */
-		CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-		if (r2->r_window == 0 &&
-		    (r2->rxbuf == NULL || r2->rxbuf->tail == r2->rxbuf->head)) {
-			if (r2->rxbuf)
-				l = r2->rxbuf->size;
-			else
-				l = h2->local_settings.initial_window_size;
-			r2->r_window += l;
-			Lck_Unlock(&h2->sess->mtx);
-			vbe32enc(buf, l);
-			H2_Send_Get(wrk, h2, h2->req0);
-			H2_Send_Frame(wrk, h2, H2_F_WINDOW_UPDATE, 0, 4,
-			    r2->stream, buf);
-			H2_Send_Rel(h2, h2->req0);
-			Lck_Lock(&h2->sess->mtx);
-		}
-
-		if (h2->rxf_flags & H2FF_DATA_END_STREAM)
-			r2->state = H2_S_CLOS_REM;
-		if (r2->cond)
-			PTOK(pthread_cond_signal(r2->cond));
-		Lck_Unlock(&h2->sess->mtx);
-		return (0);
-	}
-
-	/* Make the buffer on demand */
-	if (r2->rxbuf == NULL) {
-		unsigned bufsize;
-		size_t bstest;
-		struct stv_buffer *stvbuf;
-		struct h2_rxbuf *rxbuf;
-
-		Lck_Unlock(&h2->sess->mtx);
-
-		bufsize = h2->local_settings.initial_window_size;
-		if (bufsize < r2->r_window) {
-			/* This will not happen because we do not have any
-			 * mechanism to change the initial window size on
-			 * a running session. But if we gain that ability,
-			 * this future proofs it. */
-			bufsize = r2->r_window;
-		}
-		assert(bufsize > 0);
-		if ((h2->rxf_flags & H2FF_DATA_END_STREAM) &&
-		    bufsize > len)
-			/* Cap the buffer size when we know this is the
-			 * single data frame. */
-			bufsize = len;
-		CHECK_OBJ_NOTNULL(stv_h2_rxbuf, STEVEDORE_MAGIC);
-		stvbuf = STV_AllocBuf(wrk, stv_h2_rxbuf,
-		    bufsize + sizeof *rxbuf);
-		if (stvbuf == NULL) {
-			Lck_Lock(&h2->sess->mtx);
-			VSLb(h2->vsl, SLT_Debug,
-			    "H2: stream %u: Failed to allocate request body"
-			    " buffer",
-			    h2->rxf_stream);
-			r2->error = H2SE_INTERNAL_ERROR;
-			if (r2->cond)
-				PTOK(pthread_cond_signal(r2->cond));
-			Lck_Unlock(&h2->sess->mtx);
-			return (H2SE_INTERNAL_ERROR);
-		}
-		rxbuf = STV_GetBufPtr(stvbuf, &bstest);
-		AN(rxbuf);
-		assert(bstest >= bufsize + sizeof *rxbuf);
-		assert(PAOK(rxbuf));
-		INIT_OBJ(rxbuf, H2_RXBUF_MAGIC);
-		rxbuf->size = bufsize;
-		rxbuf->stvbuf = stvbuf;
-
-		r2->rxbuf = rxbuf;
-
-		Lck_Lock(&h2->sess->mtx);
-	}
-
-	CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-	assert(r2->rxbuf->tail <= r2->rxbuf->head);
-	l = r2->rxbuf->head - r2->rxbuf->tail;
-	assert(l <= r2->rxbuf->size);
-	l = r2->rxbuf->size - l;
-	assert(len <= l); /* Stream window handling ensures this */
-
-	Lck_Unlock(&h2->sess->mtx);
-
-	l = len;
-	head = r2->rxbuf->head;
-	do {
-		l2 = l;
-		if ((head % r2->rxbuf->size) + l2 > r2->rxbuf->size)
-			l2 = r2->rxbuf->size - (head % r2->rxbuf->size);
-		assert(l2 > 0);
-		memcpy(&r2->rxbuf->data[head % r2->rxbuf->size], src, l2);
-		src += l2;
-		head += l2;
-		l -= l2;
-	} while (l > 0);
-
-	Lck_Lock(&h2->sess->mtx);
-
-	/* Charge stream window. The entire frame including padding
-	 * (h2->rxf_len) counts towards the window. The used padding
-	 * bytes will be included in the next connection window update
-	 * sent when the buffer bytes are consumed because that is
-	 * calculated against the available buffer space. */
-	r2->r_window -= h2->rxf_len;
-	r2->rxbuf->head += len;
-	assert(r2->rxbuf->tail <= r2->rxbuf->head);
-	if (h2->rxf_flags & H2FF_DATA_END_STREAM)
-		r2->state = H2_S_CLOS_REM;
-	if (r2->cond)
-		PTOK(pthread_cond_signal(r2->cond));
-	Lck_Unlock(&h2->sess->mtx);
-
-	return (0);
-}
-
-static enum vfp_status v_matchproto_(vfp_pull_f)
-h2_vfp_body(struct vfp_ctx *vc, struct vfp_entry *vfe, void *ptr, ssize_t *lp)
-{
-	struct h2_req *r2;
-	struct h2_sess *h2;
-	enum vfp_status retval;
-	ssize_t l, l2;
-	uint64_t tail;
-	uint8_t *dst;
-	char buf[4];
-	int i;
-
-	CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC);
-	CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC);
-	CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC);
-	h2 = r2->h2sess;
-
-	AN(ptr);
-	AN(lp);
-	assert(*lp >= 0);
-
-	Lck_Lock(&h2->sess->mtx);
-
-	r2->cond = &vc->wrk->cond;
-	while (1) {
-		CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-		if (r2->rxbuf) {
-			assert(r2->rxbuf->tail <= r2->rxbuf->head);
-			l = r2->rxbuf->head - r2->rxbuf->tail;
-		} else
-			l = 0;
-
-		if (h2->error != NULL || r2->error != NULL)
-			retval = VFP_ERROR;
-		else if (r2->state >= H2_S_CLOS_REM && l <= *lp)
-			retval = VFP_END;
-		else {
-			if (l > *lp)
-				l = *lp;
-			retval = VFP_OK;
-		}
-
-		if (retval != VFP_OK || l > 0)
-			break;
-
-		i = Lck_CondWaitTimeout(r2->cond, &h2->sess->mtx,
-		    SESS_TMO(h2->sess, timeout_idle));
-		if (i == ETIMEDOUT) {
-			retval = VFP_ERROR;
-			break;
-		}
-	}
-	r2->cond = NULL;
-
-	Lck_Unlock(&h2->sess->mtx);
-
-	if (l == 0 || retval == VFP_ERROR) {
-		*lp = 0;
-		return (retval);
-	}
-
-	*lp = l;
-	dst = ptr;
-	tail = r2->rxbuf->tail;
-	do {
-		l2 = l;
-		if ((tail % r2->rxbuf->size) + l2 > r2->rxbuf->size)
-			l2 = r2->rxbuf->size - (tail % r2->rxbuf->size);
-		assert(l2 > 0);
-		memcpy(dst, &r2->rxbuf->data[tail % r2->rxbuf->size], l2);
-		dst += l2;
-		tail += l2;
-		l -= l2;
-	} while (l > 0);
-
-	Lck_Lock(&h2->sess->mtx);
-
-	CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-	r2->rxbuf->tail = tail;
-	assert(r2->rxbuf->tail <= r2->rxbuf->head);
-
-	if (r2->r_window < cache_param->h2_rx_window_low_water &&
-	    r2->state < H2_S_CLOS_REM) {
-		/* l is free buffer space */
-		/* l2 is calculated window increment */
-		l = r2->rxbuf->size - (r2->rxbuf->head - r2->rxbuf->tail);
-		assert(r2->r_window <= l);
-		l2 = cache_param->h2_rx_window_increment;
-		if (r2->r_window + l2 > l)
-			l2 = l - r2->r_window;
-		r2->r_window += l2;
-	} else
-		l2 = 0;
-
-	Lck_Unlock(&h2->sess->mtx);
-
-	if (l2 > 0) {
-		vbe32enc(buf, l2);
-		H2_Send_Get(vc->wrk, h2, r2);
-		H2_Send_Frame(vc->wrk, h2, H2_F_WINDOW_UPDATE, 0, 4,
-		    r2->stream, buf);
-		H2_Send_Rel(h2, r2);
-	}
 
-	return (retval);
-}
-
-static void
-h2_vfp_body_fini(struct vfp_ctx *vc, struct vfp_entry *vfe)
-{
-	struct h2_req *r2;
-	struct h2_sess *h2;
-	struct stv_buffer *stvbuf = NULL;
-
-	CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC);
-	CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC);
-	CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC);
-	h2 = r2->h2sess;
-
-	if (vc->failed) {
-		CHECK_OBJ_NOTNULL(r2->req->wrk, WORKER_MAGIC);
-		H2_Send_Get(r2->req->wrk, h2, r2);
-		H2_Send_RST(r2->req->wrk, h2, r2, r2->stream,
-		    H2SE_REFUSED_STREAM);
-		H2_Send_Rel(h2, r2);
-		Lck_Lock(&h2->sess->mtx);
-		r2->error = H2SE_REFUSED_STREAM;
-		Lck_Unlock(&h2->sess->mtx);
-	}
-
-	if (r2->state >= H2_S_CLOS_REM && r2->rxbuf != NULL) {
-		Lck_Lock(&h2->sess->mtx);
-		CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
-		if (r2->rxbuf != NULL) {
-			stvbuf = r2->rxbuf->stvbuf;
-			r2->rxbuf = NULL;
-		}
-		Lck_Unlock(&h2->sess->mtx);
-		if (stvbuf != NULL) {
-			STV_FreeBuf(vc->wrk, &stvbuf);
-			AZ(stvbuf);
-		}
-	}
-}
-
-static const struct vfp h2_body = {
-	.name = "H2_BODY",
-	.pull = h2_vfp_body,
-	.fini = h2_vfp_body_fini
-};
-
-void v_matchproto_(vtr_req_body_t)
-h2_req_body(struct req *req)
-{
-	struct h2_req *r2;
-	struct vfp_entry *vfe;
-
-	CHECK_OBJ(req, REQ_MAGIC);
-	CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC);
-	vfe = VFP_Push(req->vfc, &h2_body);
-	AN(vfe);
-	vfe->priv1 = r2;
+	return (h2_reqbody_data(wrk, h2, r2));
 }
 
 /**********************************************************************/
@@ -1263,9 +943,13 @@ h2_req_body(struct req *req)
 void v_matchproto_(vtr_req_fail_f)
 h2_req_fail(struct req *req, stream_close_t reason)
 {
+	struct h2_req *r2;
+
 	assert(reason != SC_NULL);
-	assert(req->sp->fd != 0);
 	VSLb(req->vsl, SLT_Debug, "H2FAILREQ");
+
+	CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC);
+	h2_async_error(r2, H2SE_INTERNAL_ERROR);
 }
 
 /**********************************************************************/
@@ -1274,169 +958,232 @@ static enum htc_status_e v_matchproto_(htc_complete_f)
 h2_frame_complete(struct http_conn *htc)
 {
 	struct h2_sess *h2;
+	unsigned u;
+	size_t l;
 
 	CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC);
 	CAST_OBJ_NOTNULL(h2, htc->priv, H2_SESS_MAGIC);
-	if (htc->rxbuf_b + 9 > htc->rxbuf_e ||
-	    htc->rxbuf_b + 9 + (vbe32dec(htc->rxbuf_b) >> 8) > htc->rxbuf_e)
+	l = pdiff(htc->rxbuf_b, htc->rxbuf_e);
+	if (l == 0)
+		return (HTC_S_EMPTY);
+	if (l < 9)
 		return (HTC_S_MORE);
-	return (HTC_S_COMPLETE);
+	u = vbe32dec(htc->rxbuf_b) >> 8;
+	if (u > h2->local_settings.max_frame_size)
+		return (HTC_S_OVERFLOW);
+	if (l >= u + 9)
+		return (HTC_S_COMPLETE);
+
+	return (HTC_S_MORE);
 }
 
+
 /**********************************************************************/
 
-static h2_error
+static void
 h2_procframe(struct worker *wrk, struct h2_sess *h2, h2_frame h2f)
 {
-	struct h2_req *r2;
-	h2_error h2e;
+	struct h2_req *r2 = NULL;
+	h2_error h2e = NULL;
 
-	ASSERT_RXTHR(h2);
-	if (h2->rxf_stream == 0 && h2f->act_szero != 0) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on stream 0",
-		    h2f->name);
-		return (h2f->act_szero);
+	ASSERT_H2_SESS(h2);
+	if (h2->rxf_stream == 0 && h2f->act_szero != NULL) {
+		VSLb(h2->vsl, SLT_SessError,
+		    "H2: unexpected %s frame on stream 0", h2f->name);
+		h2e = h2f->act_szero;
+		goto exit;
 	}
 
-	if (h2->rxf_stream != 0 && h2f->act_snonzero != 0) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on stream %d",
+	if (h2->rxf_stream != 0 && h2f->act_snonzero != NULL) {
+		VSLb(h2->vsl, SLT_SessError,
+		    "H2: unexpected %s frame on stream %d",
 		    h2f->name, h2->rxf_stream);
-		return (h2f->act_snonzero);
+		h2e = h2f->act_snonzero;
+		goto exit;
 	}
 
 	if (h2->rxf_stream > h2->highest_stream && h2f->act_sidle != 0) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on idle stream "
-		    "%d", h2f->name, h2->rxf_stream);
-		return (h2f->act_sidle);
+		VSLb(h2->vsl, SLT_SessError,
+		    "H2: unexpected %s frame on idle stream %d",
+		    h2f->name, h2->rxf_stream);
+		h2e = h2f->act_sidle;
+		goto exit;
+	}
+
+	if (h2->expect_settings_next) {
+		if (h2f != H2_F_SETTINGS || (h2->rxf_flags & H2FF_ACK)) {
+			// rfc7540,l,579,637
+			// rfc7540,l,482,485
+			VSLb(h2->vsl, SLT_Error,
+			    "H2: unexpected %s%s frame on stream %d,"
+			    " expected preface settings",
+			    h2f->name,
+			    h2->rxf_flags & H2FF_ACK ? "(ACK)" : "",
+			    h2->rxf_stream);
+			h2e = H2CE_PROTOCOL_ERROR;
+			goto exit;
+		}
+		h2->expect_settings_next = 0;
 	}
 
 	if (h2->rxf_stream != 0 && !(h2->rxf_stream & 1)) {
 		// rfc7540,l,1140,1145
 		// rfc7540,l,1153,1158
 		/* No even streams, we don't do PUSH_PROMISE */
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: illegal stream (=%u)",
+		VSLb(h2->vsl, SLT_SessError, "H2: illegal stream (=%u)",
 		    h2->rxf_stream);
-		return (H2CE_PROTOCOL_ERROR);
+		h2e = H2CE_PROTOCOL_ERROR;
+		goto exit;
 	}
 
-	VTAILQ_FOREACH(r2, &h2->streams, list)
-		if (r2->stream == h2->rxf_stream)
-			break;
+	if (h2->hpack_lock != NULL && h2f != H2_F_CONTINUATION) {
+		VSLb(h2->vsl, SLT_SessError,
+		    "H2: expected continuation but received %s on stream %d",
+		    h2f->name, h2->rxf_stream);
+		h2e = H2CE_PROTOCOL_ERROR;	// rfc7540,l,1859,1863
+		goto exit;
+	}
 
-	if (h2->new_req != NULL && h2f != H2_F_CONTINUATION) {
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: expected continuation but "
-		    " received %s on stream %d", h2f->name, h2->rxf_stream);
-		return (H2CE_PROTOCOL_ERROR);	// rfc7540,l,1859,1863
+	if (h2f == H2_F_HEADERS && h2->rxf_stream <= h2->highest_stream) {
+		VSLb(h2->vsl, SLT_Error, "H2: new stream ID < highest stream");
+		h2e = H2CE_PROTOCOL_ERROR;      // rfc7540,l,1153,1158
+		goto exit;
+	}
+
+	if (h2->rxf_stream != 0) {
+		VTAILQ_FOREACH(r2, &h2->streams, list) {
+			if (r2->stream == h2->rxf_stream)
+				break;
+		}
+		if (r2 != NULL && r2->error != NULL) {
+			/* Ignore frames for streams once error is set. */
+			/* XXX: missing accounting? */
+			return;
+		}
+	}
+
+	if (h2f == H2_F_HEADERS) {
+		AZ(r2); /* We checked against highest_stream above. */
+		r2 = h2_new_req(h2, h2->rxf_stream, NULL);
+		CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+		h2->highest_stream = r2->stream;
 	}
 
 	h2e = h2f->rxfunc(wrk, h2, r2);
-	if (h2e == NULL)
-		return (NULL);
-	if (h2->rxf_stream == 0 || h2e->connection)
-		return (h2e);	// Connection errors one level up
 
-	H2_Send_Get(wrk, h2, h2->req0);
-	H2_Send_RST(wrk, h2, h2->req0, h2->rxf_stream, h2e);
-	H2_Send_Rel(h2, h2->req0);
-	return (NULL);
+exit:
+	if (h2e != NULL) {
+		if (h2->rxf_stream == 0 || h2e->connection)
+			h2->error = h2e;
+		if (r2 != NULL)
+			h2_kill_req(wrk, h2, &r2, h2e);
+	}
 }
 
-h2_error
-h2_stream_tmo(struct h2_sess *h2, const struct h2_req *r2, vtim_real now)
+void
+h2_stream_setstate(struct h2_req *r2, enum h2_stream_e state)
 {
-	h2_error h2e = NULL;
-
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	Lck_AssertHeld(&h2->sess->mtx);
+	ASSERT_H2_SESS(r2->h2sess);
+
+	if (r2->state >= state) {
+		/* State transitions only go from lower states to
+		 * higher. If we are already at a higher state, ignore
+		 * it. (We do not assert on state changes because change
+		 * of state is both driven by our internal progress as
+		 * well as incoming client data.) */
+		return;
+	}
 
-	/* NB: when now is NAN, it means that h2_window_timeout was hit
-	 * on a lock condwait operation.
-	 */
-	if (isnan(now))
-		AN(r2->t_winupd);
+	if (state >= H2_S_CLOSED) {
+		assert(r2->h2sess->open_streams > 0);
+		r2->h2sess->open_streams--;
+	}
 
-	if (h2->error != NULL && h2->error->connection &&
-	    !h2->error->send_goaway)
-		return (h2->error);
+	r2->state = state;
+}
 
-	if (r2->t_winupd == 0 && r2->t_send == 0)
-		return (NULL);
+static h2_error
+h2_stream_tmo(struct h2_sess *h2, const struct h2_req *r2, vtim_real now)
+{
 
-	if (isnan(now) || (r2->t_winupd != 0 &&
-	    now - r2->t_winupd > cache_param->h2_window_timeout)) {
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+
+	if (r2->t_win_low != 0 &&
+	    now - r2->t_win_low > cache_param->h2_window_timeout) {
 		VSLb(h2->vsl, SLT_Debug,
-		     "H2: stream %u: Hit h2_window_timeout", r2->stream);
-		h2e = H2SE_BROKE_WINDOW;
+		    "H2: stream %u: Hit h2_window_timeout", r2->stream);
+		if (h2->open_streams <= h2->win_low_streams) {
+			/* If all streams ran out of control flow window
+			 * credits upon triggering h2_window_timeout,
+			 * declare bankruptcy for the entire connection. */
+			return (H2CE_BANKRUPT);
+		}
+		return (H2SE_BROKE_WINDOW);
 	}
 
-	if (h2e == NULL && r2->t_send != 0 &&
+	if (r2->t_send != 0 &&
 	    now - r2->t_send > SESS_TMO(h2->sess, send_timeout)) {
 		VSLb(h2->vsl, SLT_Debug,
 		     "H2: stream %u: Hit send_timeout", r2->stream);
-		h2e = H2SE_CANCEL;
+		return (H2SE_SEND_TIMEOUT);
 	}
 
-	return (h2e);
-}
-
-static h2_error
-h2_stream_tmo_unlocked(struct h2_sess *h2, const struct h2_req *r2)
-{
-	h2_error h2e;
-
-	Lck_Lock(&h2->sess->mtx);
-	h2e = h2_stream_tmo(h2, r2, h2->sess->t_idle);
-	Lck_Unlock(&h2->sess->mtx);
-
-	return (h2e);
+	return (NULL);
 }
 
 /*
  * This is the janitorial task of cleaning up any closed & refused
  * streams, and checking if the session is timed out.
  */
-static h2_error
-h2_sweep(struct worker *wrk, struct h2_sess *h2)
+static void
+h2_sweep(struct worker *wrk, struct h2_sess *h2, vtim_real now)
 {
 	struct h2_req *r2, *r22;
-	h2_error h2e, tmo;
-	vtim_real now;
+	h2_error h2e;
+	int64_t l;
 
-	ASSERT_RXTHR(h2);
+	ASSERT_H2_SESS(h2);
 
-	h2e = h2->error;
-	now = VTIM_real();
-	if (h2e == NULL && h2->open_streams == 0 &&
-	    h2->sess->t_idle + cache_param->timeout_idle < now)
-		h2e = H2CE_NO_ERROR;
-
-	h2->do_sweep = 0;
 	VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) {
-		if (r2 == h2->req0) {
-			assert (r2->state == H2_S_IDLE);
+		if (r2->async_error != NULL) {
+			/* Request thread has set an error state. Kill it. */
+			h2e = r2->async_error;
+			r2->async_error = NULL;
+			h2_kill_req(wrk, h2, &r2, h2e);
 			continue;
 		}
+
+		if (r2->rxbuf != NULL && r2->state < H2_S_CLOS_REM &&
+		    r2->error == NULL) {
+			/* Check and expand the request body window if
+			 * necessary. */
+			CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+			assert(r2->rxbuf->tail <= r2->rxbuf->head);
+			l = r2->rxbuf->head - r2->rxbuf->tail;
+			assert(l <= r2->rxbuf->size);
+			l = r2->rxbuf->size - l;
+			if (r2->rx_window < l) {
+				l = l - r2->rx_window;
+				H2_Send_WINDOW_UPDATE(h2, r2->stream, l);
+				r2->rx_window += l;
+			}
+		}
+
 		switch (r2->state) {
 		case H2_S_CLOSED:
-			AZ(r2->scheduled);
-			h2_del_req(wrk, r2);
+			if (!r2->scheduled)
+				h2_kill_req(wrk, h2, &r2, H2SE_NO_ERROR);
 			break;
 		case H2_S_CLOS_REM:
-			if (!r2->scheduled) {
-				H2_Send_Get(wrk, h2, h2->req0);
-				H2_Send_RST(wrk, h2, h2->req0, r2->stream,
-				    H2SE_REFUSED_STREAM);
-				H2_Send_Rel(h2, h2->req0);
-				h2_del_req(wrk, r2);
-				continue;
-			}
-			/* FALLTHROUGH */
 		case H2_S_CLOS_LOC:
 		case H2_S_OPEN:
-			tmo = h2_stream_tmo_unlocked(h2, r2);
-			if (h2e == NULL)
-				h2e = tmo;
+			h2e = h2_stream_tmo(h2, r2, now);
+			if (h2e != NULL && h2e->connection)
+				h2->error = h2e;
+			else if (h2e != NULL)
+				h2_kill_req(wrk, h2, &r2, h2e);
 			break;
 		case H2_S_IDLE:
 			/* Current code make this unreachable: h2_new_req is
@@ -1448,7 +1195,6 @@ h2_sweep(struct worker *wrk, struct h2_sess *h2)
 			break;
 		}
 	}
-	return (h2e);
 }
 
 /*
@@ -1456,21 +1202,15 @@ h2_sweep(struct worker *wrk, struct h2_sess *h2)
  * if we have not received end_stream, DATA frames are expected later
  *
  * neither of these make much sense to output here
- *
- * goaway currently is always 0, see #4285
  */
 static void
 h2_htc_debug(enum htc_status_e hs, struct h2_sess *h2)
 {
 	const char *s, *r;
 
-	if (LIKELY(VSL_tag_is_masked(SLT_Debug)))
-		return;
-
 	HTC_Status(hs, &s, &r);
-	H2S_Lock_VSLb(h2, SLT_Debug, "H2: HTC %s (%s) frame=%s goaway=%d",
-	    s, r, h2->htc->rxbuf_b == h2->htc->rxbuf_e ? "complete" : "partial",
-	    h2->goaway);
+	VSLb(h2->vsl, SLT_Debug, "H2: HTC %s (%s) frame=%s", s, r,
+	    h2->htc->rxbuf_b == h2->htc->rxbuf_e ? "complete" : "partial");
 }
 
 /***********************************************************************
@@ -1488,54 +1228,61 @@ static const h2_frame h2flist[] = {
 
 #define H2FMAX (sizeof(h2flist) / sizeof(h2flist[0]))
 
-int
-h2_rxframe(struct worker *wrk, struct h2_sess *h2)
+static enum htc_status_e
+h2_rxstuff(struct h2_sess *h2)
 {
+	struct http_conn *htc;
 	enum htc_status_e hs;
-	h2_frame h2f;
-	h2_error h2e;
-	const char *s, *r;
-
-	ASSERT_RXTHR(h2);
-
-	if (h2->goaway && h2->open_streams == 0)
-		return (0);
+	size_t res;
+	ssize_t l;
 
-	h2->t1 = NAN;
-	VTCP_blocking(*h2->htc->rfd);
-	hs = HTC_RxStuff(h2->htc, h2_frame_complete, &h2->t1, NULL, NAN,
-	    VTIM_real() + 0.5, NAN, h2->local_settings.max_frame_size + 9);
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	htc = h2->htc;
+	CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC);
+	AN(htc->rfd);
+	assert(*htc->rfd > 0);
+
+	/* Set up the workspace buffer */
+	assert(htc->rxbuf_b <= htc->rxbuf_e);
+	HTC_RxPipeline(htc, htc->rxbuf_b);
+	HTC_RxInit(htc, h2->ws);
+	res = WS_ReservationSize(h2->ws);
+
+	if (res == 0) {
+		WS_Release(htc->ws, 0);
+		return (HTC_S_OVERFLOW);
+	}
 
-	h2e = NULL;
-	switch (hs) {
-	case HTC_S_EOF:
+	l = read(*htc->rfd, htc->rxbuf_e, res);
+	if (l < 0 && errno == EWOULDBLOCK)
+		hs = HTC_S_MORE;
+	else if (l < 0)
+		hs = HTC_S_CLOSE;
+	else if (l == 0) {
+		hs = HTC_S_EOF;
 		h2_htc_debug(hs, h2);
-		h2e = H2CE_NO_ERROR;
-		break;
-	case HTC_S_COMPLETE:
-		h2->sess->t_idle = VTIM_real();
-		if (h2->do_sweep)
-			h2e = h2_sweep(wrk, h2);
-		break;
-	case HTC_S_TIMEOUT:
-		//// #4279
-		// h2_htc_debug(hs, h2);
-		h2e = h2_sweep(wrk, h2);
-		break;
-	default:
-		HTC_Status(hs, &s, &r);
-		H2S_Lock_VSLb(h2, SLT_SessError, "H2: HTC %s (%s)", s, r);
-		h2e = H2CE_ENHANCE_YOUR_CALM;
+	} else {
+		h2->t1 = VTIM_real();
+		htc->rxbuf_e += l;
+		hs = h2_frame_complete(htc);
 	}
 
-	if (h2e != NULL && h2e->connection) {
-		h2->error = h2e;
-		h2_tx_goaway(wrk, h2, h2e);
-		return (0);
-	}
+	WS_ReleaseP(htc->ws, htc->rxbuf_e);
+	return (hs);
+}
 
+static enum htc_status_e
+h2_rxframe(struct worker *wrk, struct h2_sess *h2)
+{
+	enum htc_status_e hs;
+	h2_frame h2f;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	ASSERT_H2_SESS(h2);
+
+	hs = h2_frame_complete(h2->htc);
 	if (hs != HTC_S_COMPLETE)
-		return (1);
+		return (hs);
 
 	h2->rxf_len = vbe32dec(h2->htc->rxbuf_b) >> 8;
 	h2->rxf_type = h2->htc->rxbuf_b[3];
@@ -1543,21 +1290,21 @@ h2_rxframe(struct worker *wrk, struct h2_sess *h2)
 	h2->rxf_stream = vbe32dec(h2->htc->rxbuf_b + 5);
 	h2->rxf_stream &= ~(1LU<<31);			// rfc7540,l,690,692
 	h2->rxf_data = (void*)(h2->htc->rxbuf_b + 9);
-	/* XXX: later full DATA will not be rx'ed yet. */
-	HTC_RxPipeline(h2->htc, h2->htc->rxbuf_b + h2->rxf_len + 9);
 
-	h2_vsl_frame(h2, h2->htc->rxbuf_b, 9L + h2->rxf_len);
+	h2_rxframe_vsl(h2, h2->htc->rxbuf_b, 9L + h2->rxf_len);
 	h2->srq->acct.req_hdrbytes += 9;
 
+	h2->htc->rxbuf_b += h2->rxf_len + 9;
+	assert(h2->htc->rxbuf_b <= h2->htc->rxbuf_e);
+
 	if (h2->rxf_type >= H2FMAX) {
 		// rfc7540,l,679,681
-		// XXX: later, drain rest of frame
 		h2->bogosity++;
-		H2S_Lock_VSLb(h2, SLT_Debug,
+		VSLb(h2->vsl, SLT_Debug,
 		    "H2: Unknown frame type 0x%02x (ignored)",
 		    (uint8_t)h2->rxf_type);
 		h2->srq->acct.req_bodybytes += h2->rxf_len;
-		return (1);
+		return (h2_frame_complete(h2->htc));
 	}
 	h2f = h2flist[h2->rxf_type];
 
@@ -1569,17 +1316,185 @@ h2_rxframe(struct worker *wrk, struct h2_sess *h2)
 	if (h2->rxf_flags & ~h2f->flags) {
 		// rfc7540,l,687,688
 		h2->bogosity++;
-		H2S_Lock_VSLb(h2, SLT_Debug,
+		VSLb(h2->vsl, SLT_Debug,
 		    "H2: Unknown flags 0x%02x on %s (ignored)",
 		    (uint8_t)h2->rxf_flags & ~h2f->flags, h2f->name);
 		h2->rxf_flags &= h2f->flags;
 	}
 
-	h2e = h2_procframe(wrk, h2, h2f);
-	if (h2->error == NULL && h2e != NULL) {
-		h2->error = h2e;
-		h2_tx_goaway(wrk, h2, h2e);
+	if (h2->error == NULL)
+		h2_procframe(wrk, h2, h2f);
+
+	return (h2_frame_complete(h2->htc));
+}
+
+void
+h2_async_error(struct h2_req *r2, h2_error h2e)
+{
+
+	/* Report an error from a request handling thread */
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	AN(h2e);
+
+	AN(r2->scheduled);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
+	ASSERT_H2_REQ(r2->h2sess);
+
+	if (h2e->connection)
+		r2->h2sess->error = h2e;	/* ends the h2_run() loop */
+	else
+		r2->async_error = h2e;		/* consumed by h2_sweep() */
+
+	h2_attention(r2->h2sess);
+}
+
+void
+h2_attention(struct h2_sess *h2)
+{
+
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	AZ(VEFD_Signal(h2->efd));	/* signal the efd that h2_run() polls */
+}
+
+void
+h2_run(struct worker *wrk, struct h2_sess *h2)
+{
+	struct pollfd pfd[2];
+	enum htc_status_e hs;
+	const char *s, *r;
+	int i;
+	ssize_t l;
+	vtim_real now;
+	vtim_dur tmo;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+
+	assert(h2->efd->poll_fd >= 0);
+
+	enum {
+		pfd_h2 = 0,
+		pfd_ev = 1,
+	};
+	memset(pfd, 0, sizeof pfd);
+	pfd[pfd_h2].fd = h2->sess->fd;
+	pfd[pfd_h2].events = POLLIN;
+	pfd[pfd_ev].fd = h2->efd->poll_fd;
+	pfd[pfd_ev].events = POLLIN;
+
+	VTCP_nonblocking(h2->sess->fd);
+
+	now = VTIM_real();
+	h2->deadline = now + SESS_TMO(h2->sess, timeout_idle);
+
+	while (h2->error == NULL) {
+		if (H2_Send_Pending(h2))
+			pfd[pfd_h2].events = POLLIN | POLLOUT;
+		else
+			pfd[pfd_h2].events = POLLIN;
+		i = poll(pfd, 2, 1000);
+
+		/* Calculate the next deadline. The deadline is the time
+		 * at which any "blocking" poll()s in code called by this
+		 * loop (e.g. a need to flush the output to free up buffer
+		 * space) are allowed to wait before flagging error. */
+		now = VTIM_real();
+		tmo = SESS_TMO(h2->sess, timeout_idle);
+		h2->deadline = now + tmo;	/* honour per-session override */
+
+		/* Connection timeouts */
+		if (h2->error == NULL && h2->hpack_lock != NULL &&
+		    h2->hpack_lock->req->t_first + tmo < now)
+			h2->error = H2CE_COMPRESSION_ERROR;
+		else if (h2->error == NULL && h2->open_streams == 0 &&
+		    h2->sess->t_idle + tmo < now)
+			h2->error = H2CE_NO_ERROR;
+
+		if (pfd[pfd_ev].revents & POLLIN) {
+			/* Signalled for attention by a request
+			 * thread. Reset the eventfd. */
+			AZ(VEFD_Clear(h2->efd));
+		}
+
+		if (pfd[pfd_h2].revents & POLLIN) {
+			hs = h2_rxstuff(h2);
+			while (h2->error == NULL && hs == HTC_S_COMPLETE)
+				hs = h2_rxframe(wrk, h2);
+			if (h2->error == NULL && hs < 0) {
+				switch (hs) {
+				case HTC_S_EOF:
+					/* Remote close */
+					h2->error = H2CE_IO_ERROR;
+					break;
+				default:
+					HTC_Status(hs, &s, &r);
+					VSLb(h2->vsl, SLT_Error, "H2: %s", s);
+					h2->error = H2CE_PROTOCOL_ERROR;
+					break;
+				}
+			}
+		}
+
+		if (pfd[pfd_h2].revents & POLLOUT) {
+			/* We have data to send and it is possible to
+			 * send. */
+			l = H2_Send_TxStuff(h2);
+			if (l < 0 && errno != EWOULDBLOCK) {
+				VSLb(h2->vsl, SLT_Error, "H2: Send error (%s)",
+				    strerror(errno));
+				h2->error = H2CE_IO_ERROR;
+			}
+		}
+
+		h2_sweep(wrk, h2, now);
+	}
+	AN(h2->error);
+
+	/* Wake up any threads waiting to send, cancelling any queued
+	 * writes. */
+	H2_Send_Shutdown(h2);
+
+	/* Kill all streams, kicking any waitinglist stuck items */
+	h2_kill_all(wrk, h2, h2->error);
+
+	if (h2->error->send_goaway) {
+		/* Add timeout_linger to the deadline which may have
+		 * already been spent, to give some additional time to get
+		 * the GOAWAY out the door. */
+		h2->deadline += cache_param->timeout_linger;
+
+		/* Send GOAWAY, and then spend up until the last deadline
+		 * set draining the outgoing buffers. This is to be a good
+		 * citizen and make some effort on communicating the
+		 * GOAWAY. */
+		H2_Send_GOAWAY(h2, h2->highest_stream, h2->error);
+		while (H2_Send_Pending(h2)) {
+			if (H2_Send_Something(h2) < 0)
+				break;
+		}
+	}
+
+	/* We will not be sending anything more on the socket. */
+	H2_Send_Stop(h2);
+	AN(VTAILQ_EMPTY(&h2->tx_l_queue));
+
+	/* XXX: Shutdown socket? Would presumably free up kernel socket
+	 * buffers while waiting for waitinglists and the like to clean
+	 * up. */
+
+	/* Wait until all the requests have been removed */
+	pfd[pfd_h2].fd = -pfd[pfd_h2].fd; /* Disable polling on the sess fd */
+	while (h2->refcnt > 0) {
+		/* Don't use infinite timeout here. The walkaway has data
+		 * race issues, and we may need to kill a req more than
+		 * once to wake it. */
+		i = poll(pfd, 2, 250);
+
+		if (i > 0 && pfd[pfd_ev].revents & POLLIN) {
+			/* Clear the eventfd before the next sleep */
+			AZ(VEFD_Clear(h2->efd));
+		}
+		h2_kill_all(wrk, h2, h2->error);
+		h2_sweep(wrk, h2, now);
+	}
+}
diff --git a/bin/varnishd/http2/cache_http2_reqbody.c b/bin/varnishd/http2/cache_http2_reqbody.c
new file mode 100644
index 0000000000..dad4dd9e8c
--- /dev/null
+++ b/bin/varnishd/http2/cache_http2_reqbody.c
@@ -0,0 +1,421 @@
+/*-
+ * Copyright (c) 2016-2025 Varnish Software AS
+ * All rights reserved.
+ *
+ * Author: Poul-Henning Kamp <phk@phk.freebsd.dk>
+ * Author: Martin Blix Grydeland <martin@varnish-software.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "config.h"
+
+#include <pthread.h>
+#include <errno.h>
+
+#include "cache/cache_varnishd.h"
+
+#include "cache/cache_transport.h"
+#include "cache/cache_filter.h"
+#include "http2/cache_http2.h"
+#include "storage/storage.h"
+
+#include "vtim.h"
+
+struct h2_reqbody_waiter {
+	unsigned		magic;
+#define H2_REQBODY_WAITER_MAGIC	0xb6f4c52c
+	pthread_cond_t		cond;
+};
+
+static int
+h2_reqbody_wait(struct h2_req *r2, vtim_real when)
+{
+	struct h2_reqbody_waiter w;
+	int retval;
+
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
+
+	Lck_AssertHeld(&r2->h2sess->sess->mtx);
+
+	INIT_OBJ(&w, H2_REQBODY_WAITER_MAGIC);
+	PTOK(pthread_cond_init(&w.cond, NULL));
+
+	AZ(r2->reqbody_waiter);
+	r2->reqbody_waiter = &w;
+	retval = Lck_CondWaitUntil(&w.cond, &r2->h2sess->sess->mtx, when);
+	r2->reqbody_waiter = NULL;
+
+	PTOK(pthread_cond_destroy(&w.cond));
+	w.magic = 0;
+
+	return (retval);
+}
+
+void
+h2_reqbody_kick(struct h2_req *r2)
+{
+
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
+
+	Lck_AssertHeld(&r2->h2sess->sess->mtx);
+
+	CHECK_OBJ_ORNULL(r2->reqbody_waiter, H2_REQBODY_WAITER_MAGIC);
+	if (r2->reqbody_waiter != NULL)
+		PTOK(pthread_cond_signal(&r2->reqbody_waiter->cond));
+}
+
+h2_error
+h2_reqbody_data(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
+{
+	ssize_t l;
+	uint64_t l2, head;
+	const uint8_t *src;
+	unsigned len;
+
+	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+
+	ASSERT_H2_SESS(h2);
+
+	CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+
+	/* XXX: errcheck? */
+	if (h2->error != NULL || r2->error != NULL)
+		return (h2->error != NULL ? h2->error : r2->error);
+
+	/* Check padding if present; an empty frame has no pad length byte */
+	src = h2->rxf_data;
+	len = h2->rxf_len;
+	if (h2->rxf_flags & H2FF_PADDED) {
+		if (len == 0 || *src >= len) {
+			VSLb(h2->vsl, SLT_SessError,
+			    "H2: stream %u: Padding larger than frame length",
+			    h2->rxf_stream);
+			return (H2CE_PROTOCOL_ERROR);
+		}
+		len -= 1 + *src;
+		src += 1;
+	}
+
+	/* Check against the Content-Length header if given */
+	if (r2->req->htc->content_length >= 0) {
+		if (r2->rxbuf)
+			l = r2->rxbuf->head;
+		else
+			l = 0;
+		l += len;
+		if (l > r2->req->htc->content_length ||
+		    ((h2->rxf_flags & H2FF_END_STREAM) &&
+		     l != r2->req->htc->content_length)) {
+			VSLb(h2->vsl, SLT_Debug,
+			    "H2: stream %u: Received data and Content-Length"
+			    " mismatch", h2->rxf_stream);
+			return (H2SE_PROTOCOL_ERROR);
+		}
+	}
+
+	/* Check and charge connection window. The entire frame including
+	 * padding (h2->rxf_len) counts towards the window. */
+	if (h2->rxf_len > h2->rx_window) {
+		VSLb(h2->vsl, SLT_SessError,
+		    "H2: stream %u: Exceeded connection receive window",
+		    h2->rxf_stream);
+		return (H2CE_FLOW_CONTROL_ERROR);
+	}
+	h2->rx_window -= h2->rxf_len;
+	if (h2->rx_window < cache_param->h2_rx_window_low_water) {
+		/* Running low, increase the window */
+		l = cache_param->h2_rx_window_increment;
+		assert(l < (1UL << 31));
+		h2->rx_window += l;
+		H2_Send_WINDOW_UPDATE(h2, 0, l);
+	}
+
+	/* Check stream window. The entire frame including padding
+	 * (h2->rxf_len) counts towards the window. */
+	if (h2->rxf_len > r2->rx_window) {
+		VSLb(h2->vsl, SLT_Debug,
+		    "H2: stream %u: Exceeded stream receive window",
+		    h2->rxf_stream);
+		return (H2SE_FLOW_CONTROL_ERROR);
+	}
+
+	/* Handle zero size frame before starting to allocate buffers */
+	if (len == 0) {
+		r2->rx_window -= h2->rxf_len;
+
+		/* Handle the specific corner case where the entire window
+		 * has been exhausted using nothing but padding
+		 * bytes. Since no bytes have been buffered, no bytes
+		 * would be consumed by the request thread and no stream
+		 * window updates sent. Unpaint ourselves from this corner
+		 * by sending a stream window update here. */
+		CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+		if (r2->rx_window == 0 &&
+		    (r2->rxbuf == NULL || r2->rxbuf->tail == r2->rxbuf->head)) {
+			/* XXX: bogosity++? */
+			if (r2->rxbuf)
+				l = r2->rxbuf->size;
+			else
+				l = h2->local_settings.initial_window_size;
+			r2->rx_window += l;
+			H2_Send_WINDOW_UPDATE(h2, r2->stream, l);
+		}
+
+		Lck_Lock(&h2->sess->mtx);
+		if (h2->rxf_flags & H2FF_END_STREAM)
+			h2_stream_setstate(r2, H2_S_CLOS_REM);
+		h2_reqbody_kick(r2);
+		Lck_Unlock(&h2->sess->mtx);
+		return (NULL);
+	}
+
+	/* Make the buffer on demand */
+	if (r2->rxbuf == NULL) {
+		unsigned bufsize;
+		size_t bstest;
+		struct stv_buffer *stvbuf;
+		struct h2_rxbuf *rxbuf;
+
+		bufsize = h2->local_settings.initial_window_size;
+		if (bufsize < r2->rx_window) {
+			/* This will not happen because we do not have any
+			 * mechanism to change the initial window size on
+			 * a running session. But if we gain that ability,
+			 * this future proofs it. */
+			bufsize = r2->rx_window;
+		}
+		assert(bufsize > 0);
+		if ((h2->rxf_flags & H2FF_END_STREAM) &&
+		    bufsize > len) {
+			/* Cap the buffer size when we know this is the
+			 * single data frame. */
+			bufsize = len;
+		}
+		CHECK_OBJ_NOTNULL(stv_h2_rxbuf, STEVEDORE_MAGIC);
+		stvbuf = STV_AllocBuf(wrk, stv_h2_rxbuf,
+		    bufsize + sizeof *rxbuf);
+		if (stvbuf == NULL) {
+			VSLb(h2->vsl, SLT_Debug,
+			    "H2: stream %u: Failed to allocate request body"
+			    " buffer",
+			    h2->rxf_stream);
+			return (H2SE_INTERNAL_ERROR);
+		}
+		rxbuf = STV_GetBufPtr(stvbuf, &bstest);
+		AN(rxbuf);
+		assert(bstest >= bufsize + sizeof *rxbuf);
+		assert(PAOK(rxbuf));
+		INIT_OBJ(rxbuf, H2_RXBUF_MAGIC);
+		rxbuf->size = bufsize;
+		rxbuf->stvbuf = stvbuf;
+
+		r2->rxbuf = rxbuf;
+	}
+
+	CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+	assert(r2->rxbuf->tail <= r2->rxbuf->head);
+	l = r2->rxbuf->head - r2->rxbuf->tail;
+	assert(l <= r2->rxbuf->size);
+	l = r2->rxbuf->size - l;
+	assert(len <= l); /* Stream window handling ensures this */
+
+	l = len;
+	head = r2->rxbuf->head;
+	do {
+		l2 = l;
+		if ((head % r2->rxbuf->size) + l2 > r2->rxbuf->size)
+			l2 = r2->rxbuf->size - (head % r2->rxbuf->size);
+		assert(l2 > 0);
+		memcpy(&r2->rxbuf->data[head % r2->rxbuf->size], src, l2);
+		src += l2;
+		head += l2;
+		l -= l2;
+	} while (l > 0);
+
+	Lck_Lock(&h2->sess->mtx);
+	/* Charge stream window. The entire frame including padding
+	 * (h2->rxf_len) counts towards the window. The used padding
+	 * bytes will be included in the next stream window update
+	 * sent when the buffer bytes are consumed because that is
+	 * calculated against the available buffer space. */
+	r2->rx_window -= h2->rxf_len;
+	r2->rxbuf->head += len;
+	assert(r2->rxbuf->tail <= r2->rxbuf->head);
+	if (h2->rxf_flags & H2FF_END_STREAM)
+		h2_stream_setstate(r2, H2_S_CLOS_REM);
+	h2_reqbody_kick(r2);
+	Lck_Unlock(&h2->sess->mtx);
+
+	return (NULL);
+}
+
+static enum vfp_status v_matchproto_(vfp_pull_f)
+h2_vfp_body(struct vfp_ctx *vc, struct vfp_entry *vfe, void *ptr, ssize_t *lp)
+{
+	struct h2_req *r2;
+	struct h2_sess *h2;
+	enum vfp_status retval;
+	h2_error h2e = NULL;
+	ssize_t l, l2;
+	uint64_t tail;
+	uint8_t *dst;
+	int wait_error = 0;
+
+	CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC);
+	CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC);
+	CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC);
+	h2 = r2->h2sess;
+
+	ASSERT_H2_REQ(h2);
+
+	AN(ptr);
+	AN(lp);
+	assert(*lp >= 0);
+
+	Lck_Lock(&h2->sess->mtx);
+
+	while (1) {
+		CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+		if (r2->rxbuf) {
+			assert(r2->rxbuf->tail <= r2->rxbuf->head);
+			l = r2->rxbuf->head - r2->rxbuf->tail;
+		} else
+			l = 0;
+
+		h2e = h2_errcheck(r2);
+		if (h2e != NULL)
+			break;
+		else if (r2->state >= H2_S_CLOS_REM && l <= *lp)
+			retval = VFP_END;
+		else {
+			if (l > *lp)
+				l = *lp;
+			retval = VFP_OK;
+		}
+
+		if (retval != VFP_OK || l > 0)
+			break;
+
+		wait_error = h2_reqbody_wait(r2,
+		    VTIM_real() + SESS_TMO(h2->sess, timeout_idle));
+		if (wait_error == ETIMEDOUT)
+			break;
+	}
+
+	Lck_Unlock(&h2->sess->mtx);
+
+	if (h2e != NULL)
+		retval = VFP_Error(vc, "H2: Request body error (%s)", h2e->txt);
+	else if (wait_error == ETIMEDOUT)
+		retval = VFP_Error(vc, "H2: Request body timed out");
+
+	if (l == 0 || retval == VFP_ERROR) {
+		*lp = 0;
+		return (retval);
+	}
+
+	*lp = l;
+	dst = ptr;
+	tail = r2->rxbuf->tail;
+	do {
+		l2 = l;
+		if ((tail % r2->rxbuf->size) + l2 > r2->rxbuf->size)
+			l2 = r2->rxbuf->size - (tail % r2->rxbuf->size);
+		assert(l2 > 0);
+		memcpy(dst, &r2->rxbuf->data[tail % r2->rxbuf->size], l2);
+		dst += l2;
+		tail += l2;
+		l -= l2;
+	} while (l > 0);
+
+	Lck_Lock(&h2->sess->mtx);
+
+	CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+	r2->rxbuf->tail = tail;
+	assert(r2->rxbuf->tail <= r2->rxbuf->head);
+
+	if (r2->rx_window < cache_param->h2_rx_window_low_water &&
+	    r2->state < H2_S_CLOS_REM) {
+		/* Kick the session thread so it can hand out an extended
+		 * window to the peer. */
+		h2_attention(h2);
+	}
+
+	Lck_Unlock(&h2->sess->mtx);
+	return (retval);
+}
+
+static void
+h2_vfp_body_fini(struct vfp_ctx *vc, struct vfp_entry *vfe)
+{
+	struct h2_req *r2;
+	struct stv_buffer *stvbuf = NULL;
+
+	CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC);
+	CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC);
+	CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC);
+
+	ASSERT_H2_REQ(r2->h2sess);
+
+	if (vc->failed)
+		h2_async_error(r2, H2SE_REFUSED_STREAM);
+
+	CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC);
+	if (r2->state >= H2_S_CLOS_REM && r2->rxbuf != NULL) {
+		/* Free the buffer. This is safe without any locking
+		 * because the session thread will only free the buffer as
+		 * part of h2_del_req(), which won't be run as long as we
+		 * are scheduled. */
+		AN(r2->scheduled);
+		stvbuf = r2->rxbuf->stvbuf;
+		r2->rxbuf = NULL;
+		STV_FreeBuf(vc->wrk, &stvbuf);
+	}
+}
+
+static const struct vfp h2_body = {
+	.name = "H2_BODY",
+	.pull = h2_vfp_body,
+	.fini = h2_vfp_body_fini,
+};
+
+void v_matchproto_(vtr_req_body_t)
+h2_reqbody(struct req *req)
+{
+	struct h2_req *r2;
+	struct vfp_entry *vfe;
+
+	CHECK_OBJ(req, REQ_MAGIC);
+	CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC);
+	vfe = VFP_Push(req->vfc, &h2_body);
+	AN(vfe);
+	vfe->priv1 = r2;
+}
diff --git a/bin/varnishd/http2/cache_http2_send.c b/bin/varnishd/http2/cache_http2_send.c
index d4e66aab8c..62093b152e 100644
--- a/bin/varnishd/http2/cache_http2_send.c
+++ b/bin/varnishd/http2/cache_http2_send.c
@@ -32,414 +32,593 @@
 #include "config.h"
 
 #include <sys/uio.h>
+#include <stdio.h>
+#include <poll.h>
 
 #include "cache/cache_varnishd.h"
-
 #include "cache/cache_transport.h"
 #include "http2/cache_http2.h"
 
 #include "vend.h"
 #include "vtim.h"
 
-#define H2_SEND_HELD(h2, r2) (VTAILQ_FIRST(&(h2)->txqueue) == (r2))
+static void
+h2_send_vsl(struct vsl_log *vsl, const void *ptr, size_t len)
+{
+	const uint8_t *b;
+	struct vsb *vsb;
+	const char *p;
+	unsigned u;
+
+	if (VSL_tag_is_masked(SLT_H2TxHdr) &&
+	    VSL_tag_is_masked(SLT_H2TxBody))
+		return;
+
+	AN(ptr);
+	assert(len >= 9);	/* need a complete 9-byte frame header */
+	b = ptr;
 
-static h2_error
-h2_errcheck(const struct h2_req *r2, const struct h2_sess *h2)
+	vsb = VSB_new_auto();
+	AN(vsb);
+	p = h2_framename(b[3]);	/* byte 3 holds the frame type */
+	if (p != NULL)
+		VSB_cat(vsb, p);
+	else
+		VSB_quote(vsb, b + 3, 1, VSB_QUOTE_HEX);
+
+	u = vbe32dec(b) >> 8;	/* upper 24 bits: payload length */
+	VSB_printf(vsb, "[%u] ", u);
+	VSB_quote(vsb, b + 4, 1, VSB_QUOTE_HEX);	/* flags */
+	VSB_putc(vsb, ' ');
+	VSB_quote(vsb, b + 5, 4, VSB_QUOTE_HEX);	/* stream id */
+	AZ(VSB_finish(vsb));
+	VSLb_bin(vsl, SLT_H2TxHdr, 9, b);
+	if (len > 9)
+		VSLb_bin(vsl, SLT_H2TxBody, len - 9, b + 9);
+
+	VSLb(vsl, SLT_Debug, "H2TXF %s", VSB_data(vsb));
+	VSB_destroy(&vsb);
+}
+
+static void
+h2_mk_hdr(uint8_t *hdr, h2_frame ftyp, uint8_t flags,
+    uint32_t len, uint32_t stream)
 {
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 
-	if (r2->error != NULL)
-		return (r2->error);
-	if (h2->error != NULL && r2->stream > h2->goaway_last_stream)
-		return (h2->error);
-	return (NULL);
+	AN(hdr);
+	AZ(flags & ~(ftyp->flags));	/* only flags valid for this type */
+	if (stream == 0)
+		AZ(ftyp->act_szero);
+	else
+		AZ(ftyp->act_snonzero);
+	assert(len < (1U << 24));	/* length field is 24 bits wide */
+	vbe32enc(hdr, len << 8);	/* bytes 0-2; byte 3 is rewritten below */
+	hdr[3] = ftyp->type;
+	hdr[4] = flags;
+	vbe32enc(hdr + 5, stream);	/* bytes 5-8 */
 }
 
-static int
-h2_cond_wait(pthread_cond_t *cond, struct h2_sess *h2, struct h2_req *r2)
+static int64_t	/* smaller of the stream and session send windows */
+h2_win_limit(const struct h2_req *r2)
 {
-	vtim_dur tmo = 0.;
-	vtim_real now;
-	h2_error h2e;
-	int r;
 
-	AN(cond);
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
 
-	Lck_AssertHeld(&h2->sess->mtx);
+	return (vmin_t(int64_t, r2->tx_window, r2->h2sess->tx_window));
+}
 
-	if (cache_param->h2_window_timeout > 0.)
-		tmo = cache_param->h2_window_timeout;
+static void
+h2_win_charge(struct h2_req *r2, uint32_t w)
+{
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC);
 
-	r = Lck_CondWaitTimeout(cond, &h2->sess->mtx, tmo);
-	assert(r == 0 || r == ETIMEDOUT);
+	r2->tx_window -= w;		/* stream send window */
+	r2->h2sess->tx_window -= w;	/* session send window */
+}
 
-	now = VTIM_real();
+static int	/* 0: frame staged in the tx_s buffer, -1: send failed */
+h2_send_small(struct h2_sess *h2, h2_frame ftyp, uint8_t flags,
+    uint32_t stream, uint32_t len, const void *ptr)
+{
 
-	/* NB: when we grab h2_window_timeout before acquiring the session
-	 * lock we may time out, but once we wake up both send_timeout and
-	 * h2_window_timeout may have changed meanwhile. For this reason
-	 * h2_stream_tmo() may not log what timed out and we need to call
-	 * again with a magic NAN "now" that indicates to h2_stream_tmo()
-	 * that the stream reached the h2_window_timeout via the lock and
-	 * force it to log it.
-	 */
-	h2e = h2_stream_tmo(h2, r2, now);
-	if (h2e == NULL && r == ETIMEDOUT) {
-		h2e = h2_stream_tmo(h2, r2, NAN);
-		AN(h2e);
+	ASSERT_H2_SESS(h2);
+	AN(ftyp);
+	AZ(flags & ~(ftyp->flags));
+	if (stream == 0)
+		AZ(ftyp->act_szero);
+	else
+		AZ(ftyp->act_snonzero);
+	assert(len + 9 <= pdiff(h2->tx_s_start, h2->tx_s_end));
+	if (len > 0)
+		AN(ptr);
+
+	while (len + 9 > pdiff(h2->tx_s_head, h2->tx_s_end)) {
+		/* Send something (up until h2->deadline) to free up space. */
+		if (H2_Send_Something(h2) < 0)
+			return (-1);
+	}
+
+	h2_mk_hdr(h2->tx_s_head, ftyp, flags, len, stream);
+	h2->tx_s_head += 9;	/* step over the frame header */
+	if (len > 0) {
+		memcpy(h2->tx_s_head, ptr, len);
+		h2->tx_s_head += len;
 	}
+	assert(h2->tx_s_head <= h2->tx_s_end);
+	h2_send_vsl(h2->vsl, h2->tx_s_head - (9 + len), 9 + len);
 
-	if (r2->error == NULL)
-		r2->error = h2e;
+	h2->srq->acct.resp_hdrbytes += 9;	/* charged when staged */
+	if (ftyp->overhead)
+		h2->srq->acct.resp_bodybytes += len;
 
-	return (h2e != NULL ? -1 : 0);
+	return (0);
 }
 
-static void
-h2_send_get_locked(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
+int
+H2_Send_RST(struct h2_sess *h2, uint32_t stream, h2_error h2e)
 {
+	uint8_t buf[4];		/* payload: 32-bit error code */
 
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-
-	Lck_AssertHeld(&h2->sess->mtx);
-	if (&wrk->cond == h2->cond)
-		ASSERT_RXTHR(h2);
-	r2->wrk = wrk;
-	VTAILQ_INSERT_TAIL(&h2->txqueue, r2, tx_list);
-	while (!H2_SEND_HELD(h2, r2))
-		AZ(Lck_CondWait(&wrk->cond, &h2->sess->mtx));
-	r2->wrk = NULL;
+	vbe32enc(buf, h2e->val);	/* h2e must not be NULL */
+	return (h2_send_small(h2, H2_F_RST_STREAM, 0, stream,
+		sizeof buf, buf));
 }
 
-void
-H2_Send_Get(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2)
+int
+H2_Send_SETTINGS(struct h2_sess *h2, uint8_t flags, ssize_t len,
+    const uint8_t *buf)
 {
-
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-
-	Lck_Lock(&h2->sess->mtx);
-	h2_send_get_locked(wrk, h2, r2);
-	Lck_Unlock(&h2->sess->mtx);
+	if (flags & H2FF_ACK)
+		assert(len == 0);	/* an ACK carries no payload */
+	return (h2_send_small(h2, H2_F_SETTINGS, flags, 0, len, buf));
 }
 
-static void
-h2_send_rel_locked(struct h2_sess *h2, const struct h2_req *r2)
+int	/* data goes out as its 8 in-memory bytes, no byte-order swap */
+H2_Send_PING(struct h2_sess *h2, uint8_t flags, uint64_t data)
 {
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-
-	Lck_AssertHeld(&h2->sess->mtx);
-	AN(H2_SEND_HELD(h2, r2));
-	VTAILQ_REMOVE(&h2->txqueue, r2, tx_list);
-	r2 = VTAILQ_FIRST(&h2->txqueue);
-	if (r2 != NULL) {
-		CHECK_OBJ_NOTNULL(r2->wrk, WORKER_MAGIC);
-		PTOK(pthread_cond_signal(&r2->wrk->cond));
-	}
+	return (h2_send_small(h2, H2_F_PING, flags, 0, sizeof data, &data));
 }
 
-void
-H2_Send_Rel(struct h2_sess *h2, const struct h2_req *r2)
+int
+H2_Send_GOAWAY(struct h2_sess *h2, uint32_t last_stream_id, h2_error h2e)
 {
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	uint8_t buf[8];		/* last stream id, then error code */
 
-	Lck_Lock(&h2->sess->mtx);
-	h2_send_rel_locked(h2, r2);
-	Lck_Unlock(&h2->sess->mtx);
+	vbe32enc(&buf[0], last_stream_id);
+	vbe32enc(&buf[4], h2e->val);	/* h2e must not be NULL */
+	return (h2_send_small(h2, H2_F_GOAWAY, 0, 0, sizeof buf, buf));
 }
 
-static void
-h2_mk_hdr(uint8_t *hdr, h2_frame ftyp, uint8_t flags,
-    uint32_t len, uint32_t stream)
+int
+H2_Send_WINDOW_UPDATE(struct h2_sess *h2, uint32_t stream, uint32_t incr)
 {
+	uint8_t buf[4];		/* payload: 32-bit window increment */
 
-	AN(hdr);
-	assert(len < (1U << 24));
-	vbe32enc(hdr, len << 8);
-	hdr[3] = ftyp->type;
-	hdr[4] = flags;
-	vbe32enc(hdr + 5, stream);
+	vbe32enc(&buf[0], incr);	/* NOTE(review): incr is not range-checked */
+	return (h2_send_small(h2, H2_F_WINDOW_UPDATE, 0, stream,
+		sizeof buf, buf));
 }
 
-/*
- * This is the "raw" frame sender, all per-stream accounting and
- * prioritization must have happened before this is called, and
- * the session mtx must be held.
- */
+struct h2_send_large {
+	unsigned			magic;
+#define H2_SEND_LARGE_MAGIC		0x478020e3
 
-void
-H2_Send_Frame(struct worker *wrk, struct h2_sess *h2,
-    h2_frame ftyp, uint8_t flags,
-    uint32_t len, uint32_t stream, const void *ptr)
+	char				last;	/* NOTE(review): unused here */
+	char				started; /* NOTE(review): unused here */
+	char				returned; /* set on dequeue, under mtx */
+
+	uint8_t				flags;	/* flags given to H2_Send() */
+	h2_frame			ftyp;	/* type of the first frame */
+
+	VTAILQ_ENTRY(h2_send_large)	list;	/* h2->tx_l_queue linkage */
+
+	pthread_cond_t			cond;	/* H2_Send() waits on this */
+
+	struct h2_req			*r2;	/* stream doing the send */
+
+	const void			*ptr;	/* payload */
+	uint32_t			len;	/* total payload bytes */
+	uint32_t			count;	/* payload bytes framed so far */
+};
+
+int	/* 0 on success, -1 if the stream or session is in error/closed */
+H2_Send(struct vsl_log *vsl, struct h2_req *r2, h2_frame ftyp, uint8_t flags,
+    uint32_t len, const void *ptr)
 {
-	uint8_t hdr[9];
-	ssize_t s;
-	struct iovec iov[2];
+	struct h2_sess *h2;
+	struct h2_send_large large;	/* lives on this thread's stack */
+	h2_error h2e;
 
-	(void)wrk;
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	h2 = r2->h2sess;
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
 
-	AN(ftyp);
+	ASSERT_H2_REQ(h2);
+
+	assert(ftyp == H2_F_HEADERS || ftyp == H2_F_DATA);
 	AZ(flags & ~(ftyp->flags));
-	if (stream == 0)
-		AZ(ftyp->act_szero);
-	else
-		AZ(ftyp->act_snonzero);
 
-	h2_mk_hdr(hdr, ftyp, flags, len, stream);
+	h2e = h2_errcheck(r2);
+	if (h2e != NULL) {
+		VSLb(vsl, SLT_Error, "H2: send error (%s)", h2e->name);
+		return (-1);
+	}
+
+	assert(r2->state > H2_S_IDLE);
+	if (r2->state >= H2_S_CLOSED) {
+		VSLb(vsl, SLT_Error, "H2: send on closed stream");
+		return (-1);
+	}
+
+	INIT_OBJ(&large, H2_SEND_LARGE_MAGIC);
+	PTOK(pthread_cond_init(&large.cond, NULL));
+
+	large.ftyp = ftyp;
+	large.flags = flags;
+	large.r2 = r2;
+	large.ptr = ptr;
+	large.len = len;
+
 	Lck_Lock(&h2->sess->mtx);
-	VSLb_bin(h2->vsl, SLT_H2TxHdr, 9, hdr);
-	h2->srq->acct.resp_hdrbytes += 9;
-	if (ftyp->overhead)
-		h2->srq->acct.resp_bodybytes += len;
-	Lck_Unlock(&h2->sess->mtx);
 
-	memset(iov, 0, sizeof iov);
-	iov[0].iov_base = (void*)hdr;
-	iov[0].iov_len = sizeof hdr;
-	iov[1].iov_base = TRUST_ME(ptr);
-	iov[1].iov_len = len;
-	s = writev(h2->sess->fd, iov, len == 0 ? 1 : 2);
-	if (s != sizeof hdr + len) {
-		if (errno == EWOULDBLOCK) {
-			H2S_Lock_VSLb(h2, SLT_SessError,
-			     "H2: stream %u: Hit idle_send_timeout", stream);
-		}
-		else {
-			H2S_Lock_VSLb(h2, SLT_Debug,
-			    "H2: stream %u: write error s=%zd/%zu errno=%d",
-			    stream, s, sizeof hdr + len, errno);
-		}
-		/*
-		 * There is no point in being nice here, we will be unable
-		 * to send a GOAWAY once the code unrolls, so go directly
-		 * to the finale and be done with it.
-		 */
-		h2->error = H2CE_PROTOCOL_ERROR;
-	} else if (len > 0) {
-		Lck_Lock(&h2->sess->mtx);
-		VSLb_bin(h2->vsl, SLT_H2TxBody, len, ptr);
-		Lck_Unlock(&h2->sess->mtx);
+	if (!h2->tx_stopped) {
+		VTAILQ_INSERT_TAIL(&h2->tx_l_queue, &large, list);
+		h2->tx_l_stuck = 0;
+		h2_attention(h2);
+
+		/* Condvars may wake spuriously: wait until the signaller
+		 * has dequeued us from `h2->tx_l_queue` and set `returned`. */
+		while (!large.returned)
+			AZ(Lck_CondWait(&large.cond, &h2->sess->mtx));
 	}
-}
 
-static int64_t
-h2_win_limit(const struct h2_req *r2, const struct h2_sess *h2)
-{
+	h2e = h2_errcheck(r2);	/* re-check: state may have changed */
 
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(h2->req0, H2_REQ_MAGIC);
+	Lck_Unlock(&h2->sess->mtx);
+
+	PTOK(pthread_cond_destroy(&large.cond));
+	large.magic = 0;	/* invalidate the object before it goes away */
+
+	if (h2e != NULL) {
+		VSLb(vsl, SLT_Error, "H2: send error (%s)", h2e->name);
+		return (-1);
+	}
 
-	Lck_AssertHeld(&h2->sess->mtx);
-	return (vmin_t(int64_t, r2->t_window, h2->req0->t_window));
+	return (0);
 }
 
 static void
-h2_win_charge(struct h2_req *r2, const struct h2_sess *h2, uint32_t w)
+h2_send_prep_large(struct h2_sess *h2, struct h2_send_large *large)
 {
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+	struct h2_req *r2;
+	uint8_t flags;
+	ssize_t l, limit;
+	h2_frame ftyp;
+
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(h2->req0, H2_REQ_MAGIC);
+	AZ(h2->tx_l_current);	/* no other large frame may be in progress */
 
-	Lck_AssertHeld(&h2->sess->mtx);
-	r2->t_window -= w;
-	h2->req0->t_window -= w;
+	CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC);
+	AN(large->ftyp);
+	r2 = large->r2;
+	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
+
+	assert(large->ftyp == H2_F_DATA || large->ftyp == H2_F_HEADERS ||
+	    large->ftyp == H2_F_PUSH_PROMISE);
+	AN(large->ftyp->continuation);
+
+	l = large->len - large->count;	/* payload not yet framed */
+	if (l > h2->remote_settings.max_frame_size)
+		l = h2->remote_settings.max_frame_size;
+
+	if (large->ftyp->respect_window) {
+		limit = h2_win_limit(r2);
+		assert(limit > 0);
+		if (l > limit)
+			l = limit;
+		h2_win_charge(r2, l);
+		if (r2->t_win_low == 0. && r2->tx_window == 0) {
+			/* The stream send window is used up. Set a
+			 * timestamp to record when this happened, so that
+			 * we can become emo if the window isn't extended
+			 * promptly. */
+			/* XXX: This mechanism would be more effective if
+			 * we had some threshold (10% of initial window
+			 * size or something). */
+			r2->t_win_low = VTIM_real();
+			h2->win_low_streams++;
+		}
+	}
+	assert(large->count + l <= large->len);
+
+	ftyp = large->ftyp;
+	flags = large->flags;
+	AZ(flags & ~(ftyp->flags));
+
+	if (large->count > 0) {
+		/* This is a continuation. Switch frame type and mask out
+		 * the flags not defined on its continuation type. */
+		ftyp = ftyp->continuation;
+		AN(ftyp);
+		flags &= ftyp->flags;
+	}
+
+	if (large->count + l < large->len) {
+		/* We are breaking it up into smaller frames. Clear the
+		 * last marker from the flags if present. */
+		flags &= ~(ftyp->final_flags);
+	}
+
+	h2_mk_hdr(h2->tx_l_hdrbuf, ftyp, flags, l, r2->stream);
+	h2_send_vsl(h2->vsl, h2->tx_l_hdrbuf, 9);
+	h2->tx_vec[0].iov_base = h2->tx_l_hdrbuf;
+	h2->tx_vec[0].iov_len = 9;
+	if (l == 0) {
+		/* Zero payload frame is valid. Will be used on
+		 * "chunked encoding" when the end of stream is
+		 * found. */
+		h2->tx_nvec = 1;
+	} else {
+		h2->tx_vec[1].iov_base =
+		    TRUST_ME((uintptr_t)large->ptr + large->count);
+		h2->tx_vec[1].iov_len = l;
+		h2->tx_nvec = 2;
+		large->count += l;	/* counted as framed, not yet written */
+	}
+	h2->tx_l_current = large;
+
+	/* Charge the session accounting for the protocol bytes */
+	h2->srq->acct.resp_hdrbytes += 9;
+	if (ftyp->overhead)
+		h2->srq->acct.resp_bodybytes += l;
+
+	/* Charge the request accounting for HEADERS and DATA frames */
+	if (large->ftyp == H2_F_HEADERS)
+		r2->req->acct.resp_hdrbytes += l;
+	else if (large->ftyp == H2_F_DATA)
+		r2->req->acct.resp_bodybytes += l;
 }
 
-static int64_t
-h2_do_window(struct worker *wrk, struct h2_req *r2,
-    struct h2_sess *h2, int64_t wanted)
+ssize_t	/* >0: bytes written; 0: nothing eligible; -1: errno set */
+H2_Send_TxStuff(struct h2_sess *h2)
 {
-	int64_t w = 0;
+	struct h2_send_large *large, *large2;
+	ssize_t l, ltot = 0;
+	int err = 0;
+
+	ASSERT_H2_SESS(h2);
 
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	AZ(h2->tx_stopped);
+
+	if (h2->tx_nvec == 0 && h2->tx_s_head != h2->tx_s_start) {
+		/* Prioritise sending the small frames */
+		assert(h2->tx_s_start < h2->tx_s_head);
+		assert(h2->tx_s_head <= h2->tx_s_end);
+		assert(h2->tx_s_mark == h2->tx_s_start);
+		h2->tx_vec[0].iov_base = h2->tx_s_start;
+		h2->tx_vec[0].iov_len = h2->tx_s_head - h2->tx_s_start;
+		h2->tx_nvec = 1;
+		h2->tx_s_mark = h2->tx_s_head;	/* end of the span being sent */
+	} else if (h2->tx_nvec == 0) {
+		/* Construct a large frame from the queue (if possible
+		 * considering the current windows). If we ever implement
+		 * priorities, this would be the place to take them into
+		 * account. */
+		Lck_Lock(&h2->sess->mtx);
 
-	if (wanted == 0)
-		return (0);
+		VTAILQ_FOREACH_SAFE(large, &h2->tx_l_queue, list, large2) {
+			CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC);
+			CHECK_OBJ_NOTNULL(large->r2, H2_REQ_MAGIC);
+			assert(large->count <= large->len);
+			AN(large->ftyp);
+
+			if (h2_errcheck(large->r2) != NULL) {
+				VTAILQ_REMOVE(&h2->tx_l_queue, large, list);
+				large->returned = 1;
+				PTOK(pthread_cond_signal(&large->cond));
+				continue;	/* next was saved in large2 */
+			}
 
-	Lck_Lock(&h2->sess->mtx);
-	if (r2->t_window <= 0 || h2->req0->t_window <= 0) {
-		r2->t_winupd = VTIM_real();
-		h2_send_rel_locked(h2, r2);
+			if (!large->ftyp->respect_window)
+				break;
 
-		assert(h2->winup_streams >= 0);
-		h2->winup_streams++;
+			if (h2->tx_window <= 0) {
+				/* If the session window is empty, none of
+				 * the respect_window frame types can be
+				 * selected. */
+				continue;
+			}
 
-		while (r2->t_window <= 0 && h2_errcheck(r2, h2) == NULL) {
-			r2->cond = &wrk->cond;
-			(void)h2_cond_wait(r2->cond, h2, r2);
-			r2->cond = NULL;
+			if (large->r2->tx_window > 0)
+				break;
 		}
 
-		while (h2->req0->t_window <= 0 && h2_errcheck(r2, h2) == NULL)
-			(void)h2_cond_wait(h2->winupd_cond, h2, r2);
-
-		if (h2_errcheck(r2, h2) == NULL) {
-			w = vmin_t(int64_t, h2_win_limit(r2, h2), wanted);
-			h2_win_charge(r2, h2, w);
-			assert (w > 0);
+		if (large == NULL) {
+			/* Tx is unable to make progress until there has
+			 * been a window update. */
+			h2->tx_l_stuck = 1;
+		} else {
+			h2->tx_l_stuck = 0;
 		}
 
-		if (r2->error == H2SE_BROKE_WINDOW &&
-		    h2->open_streams <= h2->winup_streams) {
-			VSLb(h2->vsl, SLT_SessError, "H2: window bankrupt");
-			h2->error = r2->error = H2CE_BANKRUPT;
-		    }
+		Lck_Unlock(&h2->sess->mtx);
 
-		assert(h2->winup_streams > 0);
-		h2->winup_streams--;
+		if (large == NULL)
+			return (0);
 
-		h2_send_get_locked(wrk, h2, r2);
+		h2_send_prep_large(h2, large);
 	}
 
-	if (w == 0 && h2_errcheck(r2, h2) == NULL) {
-		assert(r2->t_window > 0);
-		assert(h2->req0->t_window > 0);
-		w = h2_win_limit(r2, h2);
-		if (w > wanted)
-			w = wanted;
-		h2_win_charge(r2, h2, w);
-		assert (w > 0);
+	assert(h2->tx_nvec > 0);
+	while (h2->tx_nvec > 0) {
+		l = writev(h2->sess->fd, h2->tx_vec, h2->tx_nvec);
+		if (l < 0) {
+			/* Save the value of errno. This is strictly not
+			 * necessary as none of the calls between here and
+			 * the return should update errno, but done for
+			 * future proofing. */
+			err = errno;
+			break;
+		}
+
+		assert(l > 0);
+		VIOV_prune(h2->tx_vec, &h2->tx_nvec, l);	/* drop sent bytes */
+		ltot += l;
 	}
-	r2->t_winupd = 0;
-	Lck_Unlock(&h2->sess->mtx);
-	return (w);
+
+	if (h2->tx_nvec == 0 && h2->tx_l_current != NULL) {
+		/* We have just finished sending a large frame. */
+		assert(h2->tx_s_mark == h2->tx_s_start);
+
+		TAKE_OBJ_NOTNULL(large, &h2->tx_l_current, H2_SEND_LARGE_MAGIC);
+		AZ(h2->tx_l_current);
+
+		AN(large->ftyp);
+
+		assert(large->count <= large->len);
+		if (large->count == large->len) {
+			if (large->flags & H2FF_END_STREAM)
+				h2_stream_setstate(large->r2, H2_S_CLOSED);
+
+			/* Signal that we are finished */
+			Lck_Lock(&h2->sess->mtx);
+			VTAILQ_REMOVE(&h2->tx_l_queue, large, list);
+			large->returned = 1;
+			PTOK(pthread_cond_signal(&large->cond));
+			Lck_Unlock(&h2->sess->mtx);
+		} else if (large->ftyp == H2_F_HEADERS ||
+		    large->ftyp == H2_F_PUSH_PROMISE) {
+			/* A CONTINUATION frame must come immediately
+			 * after the previous
+			 * HEADER|PUSH_PROMISE|CONTINUATION frame. Prepare
+			 * the `large` again, which will force that to be
+			 * the next output. */
+			h2_send_prep_large(h2, large);
+			assert(large == h2->tx_l_current);
+			assert(h2->tx_nvec > 0);
+		}
+	} else if (h2->tx_nvec == 0) {
+		/* We have just finished sending the small buffer */
+		assert(h2->tx_s_start < h2->tx_s_mark);
+		assert(h2->tx_s_mark <= h2->tx_s_head);
+		assert(h2->tx_s_head <= h2->tx_s_end);
+		memmove(h2->tx_s_start, h2->tx_s_mark,
+		    h2->tx_s_head - h2->tx_s_mark);
+		h2->tx_s_head -= h2->tx_s_mark - h2->tx_s_start;
+		h2->tx_s_mark = h2->tx_s_start;
+	}
+
+	if (ltot > 0)
+		return (ltot);
+
+	errno = err;
+	return (-1);
 }
 
-/*
- * This is the per-stream frame sender.
- * XXX: priority
- */
+int
+H2_Send_Something(struct h2_sess *h2)
+{
+	ssize_t l;
+	vtim_real now;
+	struct pollfd pfd[1];
+
+	/* Wait (at most until h2->deadline) for the socket to accept
+	 * output, then try H2_Send_TxStuff(). 0 = tried, -1 = error. */
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	ASSERT_H2_SESS(h2);
+	AZ(h2->tx_stopped);
+
+	assert(h2->sess->fd >= 0);
+	pfd->fd = h2->sess->fd;
+	pfd->events = POLLOUT;
+
+	do {
+		now = VTIM_real();
+		if (now > h2->deadline)
+			goto error;
+		l = poll(pfd, 1, VTIM_poll_tmo(h2->deadline - now));
+	} while (l < 0 && errno == EINTR);
+
+	if (l <= 0 || !(pfd->revents & POLLOUT))  /* failed poll: no revents */
+		goto error;
+
+	l = H2_Send_TxStuff(h2);
+	if (l < 0 && errno != EWOULDBLOCK)
+		goto error;
+
+	return (0);
+
+error:
+	/* Failure to send on the socket (IO error or timeout). */
+	if (h2->error == NULL)
+		h2->error = H2CE_IO_ERROR;
+	return (-1);
+}
+
+int	/* 1 if there is output that could be written now, else 0 */
+H2_Send_Pending(struct h2_sess *h2)
+{
+	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
+	ASSERT_H2_SESS(h2);
+
+	if (h2->tx_nvec > 0)	/* a partly written iovec remains */
+		return (1);
+	if (h2->tx_s_head != h2->tx_s_start)	/* small frames are staged */
+		return (1);
+	if (!VTAILQ_EMPTY(&h2->tx_l_queue) && !h2->tx_l_stuck)
+		return (1);
+	return (0);
+}
 
 static void
-h2_send(struct worker *wrk, struct h2_req *r2, h2_frame ftyp, uint8_t flags,
-    uint32_t len, const void *ptr, uint64_t *counter)
+h2_send_close(struct h2_sess *h2, unsigned stop)
 {
-	struct h2_sess *h2;
-	uint32_t mfs, tf;
-	const char *p;
-	uint8_t final_flags;
+	struct h2_send_large *large, *large2;
 
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	h2 = r2->h2sess;
 	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	assert(len == 0 || ptr != NULL);
-	AN(counter);
+	ASSERT_H2_SESS(h2);
 
-	AN(H2_SEND_HELD(h2, r2));
+	Lck_Lock(&h2->sess->mtx);
 
-	if (h2_errcheck(r2, h2) != NULL)
-		return;
+	/* A session error state should have been set prior to calling
+	 * this function. */
+	AN(h2->error);
+	AZ(h2->tx_stopped);	/* nothing may follow a stop */
 
-	AN(ftyp);
-	AZ(flags & ~(ftyp->flags));
-	if (r2->stream == 0)
-		AZ(ftyp->act_szero);
-	else
-		AZ(ftyp->act_snonzero);
+	if (stop) {
+		h2->tx_stopped = 1;	/* H2_Send() no longer queues */
 
-	Lck_Lock(&h2->sess->mtx);
-	mfs = h2->remote_settings.max_frame_size;
-	if (r2->counted && (
-	    (ftyp == H2_F_HEADERS && (flags & H2FF_HEADERS_END_STREAM)) ||
-	    (ftyp == H2_F_DATA && (flags & H2FF_DATA_END_STREAM)) ||
-	    ftyp == H2_F_RST_STREAM
-	    )) {
-		assert(h2->open_streams > 0);
-		h2->open_streams--;
-		r2->counted = 0;
+		CHECK_OBJ_ORNULL(h2->tx_l_current, H2_SEND_LARGE_MAGIC);
+		if (h2->tx_l_current != NULL) {
+			/* Abort the large frame */
+			h2->tx_l_current = NULL;
+			h2->tx_nvec = 0;	/* drop its unsent iovecs */
+		}
 	}
-	Lck_Unlock(&h2->sess->mtx);
 
-	if (ftyp->respect_window) {
-		tf = h2_do_window(wrk, r2, h2, (len > mfs) ? mfs : len);
-		if (h2_errcheck(r2, h2) != NULL)
-			return;
-		AN(H2_SEND_HELD(h2, r2));
-	} else
-		tf = mfs;
-
-	if (len <= tf) {
-		H2_Send_Frame(wrk, h2, ftyp, flags, len, r2->stream, ptr);
-		*counter += len;
-	} else {
-		AN(ptr);
-		p = ptr;
-		final_flags = ftyp->final_flags & flags;
-		flags &= ~ftyp->final_flags;
-		do {
-			AN(ftyp->continuation);
-			if (!ftyp->respect_window)
-				tf = mfs;
-			if (ftyp->respect_window && p != ptr) {
-				tf = h2_do_window(wrk, r2, h2,
-				    (len > mfs) ? mfs : len);
-				if (h2_errcheck(r2, h2) != NULL)
-					return;
-				AN(H2_SEND_HELD(h2, r2));
-			}
-			if (tf < len) {
-				H2_Send_Frame(wrk, h2, ftyp,
-				    flags, tf, r2->stream, p);
-			} else {
-				if (ftyp->respect_window)
-					assert(tf == len);
-				tf = len;
-				H2_Send_Frame(wrk, h2, ftyp, final_flags, tf,
-				    r2->stream, p);
-				flags = 0;
-			}
-			p += tf;
-			len -= tf;
-			*counter += tf;
-			ftyp = ftyp->continuation;
-			flags &= ftyp->flags;
-			final_flags &= ftyp->flags;
-		} while (h2->error == NULL && len > 0);
+	VTAILQ_FOREACH_SAFE(large, &h2->tx_l_queue, list, large2) {
+		CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC);
+		if (large == h2->tx_l_current)	/* only when stop == 0 */
+			continue;
+		VTAILQ_REMOVE(&h2->tx_l_queue, large, list);
+		large->returned = 1;	/* lets the H2_Send() waiter return */
+		PTOK(pthread_cond_signal(&large->cond));
 	}
+
+	Lck_Unlock(&h2->sess->mtx);
 }
 
 void
-H2_Send_RST(struct worker *wrk, struct h2_sess *h2, const struct h2_req *r2,
-    uint32_t stream, h2_error h2e)
+H2_Send_Shutdown(struct h2_sess *h2)
 {
-	char b[4];
-
-	CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC);
-	CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC);
-	AN(H2_SEND_HELD(h2, r2));
-	AN(h2e);
-
-	H2S_Lock_VSLb(h2, SLT_Debug, "H2: stream %u: %s", stream, h2e->txt);
-	vbe32enc(b, h2e->val);
-
-	H2_Send_Frame(wrk, h2, H2_F_RST_STREAM, 0, sizeof b, stream, b);
+	h2_send_close(h2, 0);	/* release waiters, keep current frame */
 }
 
 void
-H2_Send(struct worker *wrk, struct h2_req *r2, h2_frame ftyp, uint8_t flags,
-    uint32_t len, const void *ptr, uint64_t *counter)
+H2_Send_Stop(struct h2_sess *h2)
 {
-	uint64_t dummy_counter = 0;
-	h2_error h2e;
-
-	if (counter == NULL)
-		counter = &dummy_counter;
-
-	h2_send(wrk, r2, ftyp, flags, len, ptr, counter);
-
-	h2e = h2_errcheck(r2, r2->h2sess);
-	if (H2_ERROR_MATCH(h2e, H2SE_CANCEL))
-		H2_Send_RST(wrk, r2->h2sess, r2, r2->stream, h2e);
+	h2_send_close(h2, 1);	/* set tx_stopped, abort current frame too */
 }
diff --git a/bin/varnishd/http2/cache_http2_session.c b/bin/varnishd/http2/cache_http2_session.c
index 46b02dc094..009b4ae128 100644
--- a/bin/varnishd/http2/cache_http2_session.c
+++ b/bin/varnishd/http2/cache_http2_session.c
@@ -31,10 +31,10 @@
 
 #include "config.h"
 
-#include "cache/cache_varnishd.h"
-
+#include <poll.h>
 #include <stdio.h>
 
+#include "cache/cache_varnishd.h"
 #include "cache/cache_transport.h"
 #include "http2/cache_http2.h"
 
@@ -88,30 +88,6 @@ h2_local_settings(struct h2_settings *h2s)
 	h2s->max_header_list_size = cache_param->http_req_size;
 }
 
-void
-H2S_Lock_VSLb(const struct h2_sess *h2, enum VSL_tag_e tag, const char *fmt, ...)
-{
-	va_list ap;
-	int held = 0;
-
-	AN(h2);
-
-	if (VSL_tag_is_masked(tag))
-		return;
-
-	if (h2->highest_stream > 0) {
-		held = 1;
-		Lck_Lock(&h2->sess->mtx);
-	}
-
-	va_start(ap, fmt);
-	VSLbv(h2->vsl, tag, fmt, ap);
-	va_end(ap);
-
-	if (held)
-		Lck_Unlock(&h2->sess->mtx);
-}
-
 /**********************************************************************
  * The h2_sess struct needs many of the same things as a request,
  * WS, VSL, HTC &c,  but rather than implement all that stuff over, we
@@ -120,20 +96,20 @@ H2S_Lock_VSLb(const struct h2_sess *h2, enum VSL_tag_e tag, const char *fmt, ...
  */
 
 static struct h2_sess *
-h2_init_sess(struct sess *sp,
-    struct h2_sess *h2s, struct req *srq, struct h2h_decode *decode)
+h2_init_sess(struct sess *sp, struct h2_sess *h2s, struct req **psrq,
+    struct h2h_decode *decode)
 {
+	struct req *srq;
 	uintptr_t *up;
 	struct h2_sess *h2;
 
+	TAKE_OBJ_NOTNULL(srq, psrq, REQ_MAGIC);
+
 	/* proto_priv session attribute will always have been set up by H1
 	 * before reaching here. */
 	AZ(SES_Get_proto_priv(sp, &up));
 	assert(*up == 0);
 
-	if (srq == NULL)
-		srq = Req_New(sp, NULL);
-	AN(srq);
 	h2 = h2s;
 	AN(h2);
 	INIT_OBJ(h2, H2_SESS_MAGIC);
@@ -146,12 +122,15 @@ h2_init_sess(struct sess *sp,
 	h2->htc->rfd = &sp->fd;
 	h2->sess = sp;
 	h2->rxthr = pthread_self();
-	PTOK(pthread_cond_init(h2->winupd_cond, NULL));
 	VTAILQ_INIT(&h2->streams);
-	VTAILQ_INIT(&h2->txqueue);
 	h2_local_settings(&h2->local_settings);
 	h2->remote_settings = H2_proto_settings;
 	h2->decode = decode;
+	h2->expect_settings_next = 1;
+	VEFD_INIT(h2->efd);
+
+	h2->tx_window = h2->remote_settings.initial_window_size;
+	h2->rx_window = h2->local_settings.initial_window_size;
 
 	h2->rapid_reset = cache_param->h2_rapid_reset;
 	h2->rapid_reset_limit = cache_param->h2_rapid_reset_limit;
@@ -163,6 +142,19 @@ h2_init_sess(struct sess *sp,
 
 	AZ(VHT_Init(h2->dectbl, h2->local_settings.header_table_size));
 
+	/* Allocate a scratch space to use for staging small outgoing
+	 * frames. */
+	h2->tx_s_start = WS_Alloc(h2->ws, H2_TX_BUFSIZE);
+	AN(h2->tx_s_start);
+	h2->tx_s_end = h2->tx_s_start + H2_TX_BUFSIZE;
+	h2->tx_s_head = h2->tx_s_start;
+	h2->tx_s_mark = h2->tx_s_start;
+
+	/* Init send queue */
+	VTAILQ_INIT(&h2->tx_l_queue);
+
+	h2->htc->pipeline_snap = WS_Snapshot(h2->ws);
+
 	*up = (uintptr_t)h2;
 
 	return (h2);
@@ -180,7 +172,8 @@ h2_del_sess(struct worker *wrk, struct h2_sess *h2, stream_close_t reason)
 	AN(reason);
 
 	VHT_Fini(h2->dectbl);
-	PTOK(pthread_cond_destroy(h2->winupd_cond));
+	if (h2->efd->poll_fd >= 0)
+		VEFD_Close(h2->efd);
 	TAKE_OBJ_NOTNULL(req, &h2->srq, REQ_MAGIC);
 	assert(!WS_IsReserved(req->ws));
 	sp = h2->sess;
@@ -264,28 +257,33 @@ h2_b64url_settings(struct h2_sess *h2, struct req *req)
 
 /**********************************************************************/
 
-static int
-h2_ou_rel(struct worker *wrk, struct req *req)
+static void
+h2_ou_rel_req(struct worker *wrk, struct req **preq)
 {
+	struct req *req;	/* request taken over from the caller */
+
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
+	TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC);	/* clears the caller's pointer */
 	AZ(req->vcl);
 	Req_AcctLogCharge(wrk->stats, req);
 	Req_Release(req);
-	return (0);
 }
 
-static int
+static struct h2_req *
 h2_ou_session(struct worker *wrk, struct h2_sess *h2,
-    struct req *req)
+    struct req **preq)
 {
+	struct req *req;
 	ssize_t sz;
 	enum htc_status_e hs;
 	struct h2_req *r2;
 
+	TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC);
+
 	if (h2_b64url_settings(h2, req)) {
 		VSLb(h2->vsl, SLT_Debug, "H2: Bad HTTP-Settings");
-		return (h2_ou_rel(wrk, req));
+		h2_ou_rel_req(wrk, &req);
+		return (NULL);
 	}
 
 	sz = write(h2->sess->fd, h2_resp_101, strlen(h2_resp_101));
@@ -293,13 +291,11 @@ h2_ou_session(struct worker *wrk, struct h2_sess *h2,
 	if (sz != strlen(h2_resp_101)) {
 		VSLb(h2->vsl, SLT_Debug, "H2: Upgrade: Error writing 101"
 		    " response: %s\n", VAS_errtxt(errno));
-		return (h2_ou_rel(wrk, req));
+		h2_ou_rel_req(wrk, &req);
+		return (NULL);
 	}
 
-	http_Unset(req->http, H_Upgrade);
-	http_Unset(req->http, H_HTTP2_Settings);
-
-	/* Steal pipelined read-ahead, if any */
+	/* Copy any pipelined data from the request into the session. */
 	h2->htc->pipeline_b = req->htc->pipeline_b;
 	h2->htc->pipeline_e = req->htc->pipeline_e;
 	req->htc->pipeline_b = NULL;
@@ -309,36 +305,35 @@ h2_ou_session(struct worker *wrk, struct h2_sess *h2,
 	   do about the overflowing data is an open issue. */
 	HTC_RxInit(h2->htc, h2->ws);
 
-	/* Start req thread */
-	r2 = h2_new_req(h2, 1, req);
-	AZ(h2->highest_stream);
-	h2->highest_stream = r2->stream;
-	req->transport = &HTTP2_transport;
-	assert(req->req_step == R_STP_TRANSPORT);
-	req->task->func = h2_do_req;
-	req->task->priv = req;
-	r2->scheduled = 1;
-	r2->state = H2_S_CLOS_REM; // rfc7540,l,489,491
-	req->err_code = 0;
-	http_SetH(req->http, HTTP_HDR_PROTO, "HTTP/2.0");
-
 	/* Wait for PRISM response */
 	hs = HTC_RxStuff(h2->htc, H2_prism_complete,
 	    NULL, NULL, NAN, h2->sess->t_idle + cache_param->timeout_idle, NAN,
 	    sizeof H2_prism);
 	if (hs != HTC_S_COMPLETE) {
 		VSLb(h2->vsl, SLT_Debug, "H2: No/Bad OU PRISM (hs=%d)", hs);
-		r2->scheduled = 0;
-		h2_del_req(wrk, r2);
-		return (0);
+		h2_ou_rel_req(wrk, &req);
+		return (NULL);
 	}
-	if (Pool_Task(wrk->pool, req->task, TASK_QUEUE_REQ)) {
-		r2->scheduled = 0;
-		h2_del_req(wrk, r2);
-		VSLb(h2->vsl, SLT_Debug, "H2: No Worker-threads");
-		return (0);
-	}
-	return (1);
+
+	http_Unset(req->http, H_Upgrade);
+	http_Unset(req->http, H_HTTP2_Settings);
+
+	/* Prepare the req thread, but do not start it. The RFC requires
+	 * us to send our settings frame before any response frames, so we
+	 * delay the start of the thread until after the settings frame
+	 * has been sent. */
+	r2 = h2_new_req(h2, 1, &req);
+	AZ(req);
+	AZ(h2->highest_stream);
+	h2->highest_stream = r2->stream;
+	r2->req->transport = &HTTP2_transport;
+	assert(r2->req->req_step == R_STP_TRANSPORT);
+	r2->req->task->func = h2_do_req;
+	r2->req->task->priv = r2->req;
+	h2_stream_setstate(r2, H2_S_CLOS_REM); // rfc7540,l,489,491
+	http_SetH(r2->req->http, HTTP_HDR_PROTO, "HTTP/2.0");
+
+	return (r2);
 }
 
 /**********************************************************************
@@ -366,14 +361,15 @@ H2_OU_Sess(struct worker *wrk, struct sess *sp, struct req *req)
 static void v_matchproto_(task_func_t)
 h2_new_session(struct worker *wrk, void *arg)
 {
-	struct req *req;
+	struct req *req, *srq = NULL;
 	struct sess *sp;
 	struct h2_sess h2s;
 	struct h2_sess *h2;
-	struct h2_req *r2, *r22;
-	int again;
+	struct h2_req *r2_ou = NULL;
+	uint16_t marker;
 	uint8_t settings[48];
 	struct h2h_decode decode;
+	stream_close_t reason;
 	size_t l;
 
 	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
@@ -386,90 +382,110 @@ h2_new_session(struct worker *wrk, void *arg)
 
 	assert(req->transport == &HTTP2_transport);
 
-	assert (req->err_code == H2_PU_MARKER || req->err_code == H2_OU_MARKER);
+	marker = req->err_code;
+	assert(marker == H2_PU_MARKER || marker == H2_OU_MARKER);
+	req->err_code = 0;
+
+	if (marker == H2_PU_MARKER) {
+		/* Prior knowledge. The incoming req does not hold
+		 * anything of value and can be repurposed as the session
+		 * req (srq). */
+		srq = req;
+		req = NULL;
+	} else {
+		/* Opportunistic upgrade. The incoming req holds the first
+		 * stream's request, as received over H/1. We will need a
+		 * fresh req for srq. */
+		srq = Req_New(sp, NULL);
+	}
+	CHECK_OBJ_NOTNULL(srq, REQ_MAGIC);
 
-	h2 = h2_init_sess(sp, &h2s,
-	    req->err_code == H2_PU_MARKER ? req : NULL, &decode);
-	h2->req0 = h2_new_req(h2, 0, NULL);
+	h2 = h2_init_sess(sp, &h2s, &srq, &decode);
+	AZ(srq);
+
+	CHECK_OBJ_NOTNULL(h2->htc, HTTP_CONN_MAGIC);
 	AZ(h2->htc->priv);
 	h2->htc->priv = h2;
 
-	AZ(wrk->vsl);
-	wrk->vsl = h2->vsl;
-
-	if (req->err_code == H2_OU_MARKER && !h2_ou_session(wrk, h2, req)) {
-		assert(h2->refcnt == 1);
-		h2_del_req(wrk, h2->req0);
-		h2_del_sess(wrk, h2, SC_RX_JUNK);
+	/* Set up the eventfd for communication with request handling
+	 * threads. */
+	if (VEFD_Open(h2->efd) < 0) {
+		VSLb(h2->vsl, SLT_Error, "H2: Failed to create eventfd");
+		h2_del_sess(wrk, h2, SC_OVERLOAD);
 		wrk->vsl = NULL;
 		return;
 	}
+
+	AZ(wrk->vsl);
+	wrk->vsl = h2->vsl;
+
+	if (marker == H2_OU_MARKER) {
+		/* Deal with opportunistic upgrade. The upgrade request
+		 * was received by HTTP/1 and is held in req. The response
+		 * will be sent by H/2. Convert the req struct to an H/2
+		 * req. */
+		AN(req);
+		r2_ou = h2_ou_session(wrk, h2, &req);
+		AZ(req);
+		if (r2_ou == NULL) {
+			h2_del_sess(wrk, h2, SC_RX_JUNK);
+			wrk->vsl = NULL;
+			return;
+		}
+
+		CHECK_OBJ_NOTNULL(r2_ou, H2_REQ_MAGIC);
+		AZ(r2_ou->scheduled);
+	} else
+		VSLb(h2->vsl, SLT_Debug, "H2: Got pu PRISM");
+
 	assert(HTC_S_COMPLETE == H2_prism_complete(h2->htc));
+
+	/* Initialize the workspace rx buffer. Some read overshoot data
+	 * may be present as pipeline data. This sequence of calls
+	 * basically just resets the WS, memmove()s the pipeline data
+	 * first, and sets htc->rxbuf_[be] to the pipeline data. */
 	HTC_RxPipeline(h2->htc, h2->htc->rxbuf_b + sizeof(H2_prism));
 	HTC_RxInit(h2->htc, h2->ws);
-	AN(WS_Reservation(h2->ws));
-	VSLb(h2->vsl, SLT_Debug, "H2: Got pu PRISM");
+	WS_ReleaseP(h2->htc->ws, h2->htc->rxbuf_e);
 
 	THR_SetRequest(h2->srq);
-	AN(WS_Reservation(h2->ws));
 
+	/* Send our settings */
 	l = h2_enc_settings(&h2->local_settings, settings, sizeof (settings));
-	AN(WS_Reservation(h2->ws));
-	H2_Send_Get(wrk, h2, h2->req0);
-	AN(WS_Reservation(h2->ws));
-	H2_Send_Frame(wrk, h2,
-	    H2_F_SETTINGS, H2FF_NONE, l, 0, settings);
-	AN(WS_Reservation(h2->ws));
-	H2_Send_Rel(h2, h2->req0);
-	AN(WS_Reservation(h2->ws));
-
-	/* and off we go... */
-	h2->cond = &wrk->cond;
-
-	while (h2_rxframe(wrk, h2)) {
-		HTC_RxInit(h2->htc, h2->ws);
-		if (WS_Overflowed(h2->ws)) {
-			H2S_Lock_VSLb(h2, SLT_SessError, "H2: Empty Rx Workspace");
-			h2->error = H2CE_INTERNAL_ERROR;
-			break;
+	H2_Send_SETTINGS(h2, H2FF_NONE, l, settings);
+
+	if (r2_ou != NULL) {
+		/* Schedule the opportunistic request received over HTTP/1
+		 * as part of the upgrade. */
+		AZ(r2_ou->scheduled);
+		r2_ou->scheduled = 1;
+		if (Pool_Task(wrk->pool, r2_ou->req->task, TASK_QUEUE_REQ)) {
+			/* We failed to schedule it. Make the client go
+			 * away.
+			 *
+			 * Note: Calling h2_tx_goaway will set the
+			 * h2->goaway flag, causing h2_rxframe() below to
+			 * return failure without reading from the
+			 * socket. */
+			r2_ou->scheduled = 0;
+			VSLb(h2->vsl, SLT_Debug, "H2: No Worker-threads");
+			h2_kill_req(wrk, h2, &r2_ou, H2SE_ENHANCE_YOUR_CALM);
+			h2->error = H2CE_ENHANCE_YOUR_CALM;
 		}
-		AN(WS_Reservation(h2->ws));
+		r2_ou = NULL;
 	}
 
-	AN(h2->error);
+	/* and off we go... */
+	h2_run(wrk, h2);
 
-	/* Delete all idle streams */
-	Lck_Lock(&h2->sess->mtx);
-	VSLb(h2->vsl, SLT_Debug, "H2 CLEANUP %s", h2->error->name);
-	VTAILQ_FOREACH(r2, &h2->streams, list) {
-		if (r2->error == 0)
-			r2->error = h2->error;
-		if (r2->cond != NULL)
-			PTOK(pthread_cond_signal(r2->cond));
-	}
-	PTOK(pthread_cond_broadcast(h2->winupd_cond));
-	Lck_Unlock(&h2->sess->mtx);
-	while (1) {
-		again = 0;
-		VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) {
-			if (r2 != h2->req0) {
-				h2_kill_req(wrk, h2, r2, h2->error);
-				again++;
-			}
-		}
-		if (!again)
-			break;
-		Lck_Lock(&h2->sess->mtx);
-		VTAILQ_FOREACH(r2, &h2->streams, list)
-			VSLb(h2->vsl, SLT_Debug, "ST %u %d",
-			    r2->stream, r2->state);
-		(void)Lck_CondWaitTimeout(h2->cond, &h2->sess->mtx, .1);
-		Lck_Unlock(&h2->sess->mtx);
+	AN(h2->error);
+	reason = h2->error->reason;
+	if (reason == SC_NULL) {
+		/* XXX: It's messy that some h2_errors have reason
+		 * SC_NULL, which is just WRONG() wrt SES_Delete(). */
+		reason = SC_REM_CLOSE;
 	}
-	h2->cond = NULL;
-	assert(h2->refcnt == 1);
-	h2_del_req(wrk, h2->req0);
-	h2_del_sess(wrk, h2, h2->error->reason);
+	h2_del_sess(wrk, h2, reason);
 	wrk->vsl = NULL;
 }
 
@@ -489,7 +505,7 @@ struct transport HTTP2_transport = {
 	.deliver =		h2_deliver,
 	.minimal_response =	h2_minimal_response,
 	.new_session =		h2_new_session,
-	.req_body =		h2_req_body,
+	.req_body =		h2_reqbody,
 	.req_fail =		h2_req_fail,
 	.sess_panic =		h2_sess_panic,
 	.poll =			h2_poll,
diff --git a/bin/varnishtest/tests/f00007.vtc b/bin/varnishtest/tests/f00007.vtc
index e982548a03..7976362def 100644
--- a/bin/varnishtest/tests/f00007.vtc
+++ b/bin/varnishtest/tests/f00007.vtc
@@ -62,6 +62,7 @@ client c3 {
 	stream 1 {
 		txreq -req POST -url /3 -hdr "content-length" "1" -nostrend
 		txdata -data "A" -nostrend
+		rxwinup
 		delay 0.5
 		txdata -data "GET /FAIL HTTP/1.1\r\n\r\n"
 		rxrst
diff --git a/bin/varnishtest/tests/r02387.vtc b/bin/varnishtest/tests/r02387.vtc
index d2c9796e71..3d9dab7f45 100644
--- a/bin/varnishtest/tests/r02387.vtc
+++ b/bin/varnishtest/tests/r02387.vtc
@@ -11,8 +11,8 @@ varnish v1 -cliok "param.set feature +http2"
 varnish v1 -cliok "param.set debug +syncvsl"
 
 
-barrier b1 cond 2
-barrier b2 cond 2
+barrier b1 cond 3
+barrier b2 cond 3
 
 client c1 {
 	stream 1 {
@@ -27,7 +27,16 @@ client c1 {
 		barrier b1 sync
 		txcont -hdr "bar" "foo"
 
-	} -run
+	} -start
+
+	barrier b2 sync
+	non_fatal
+	barrier b1 sync
+
+	stream 1 -wait
+	stream 3 -wait
+	fatal
+
 	stream 0 {
 		rxgoaway
 		expect goaway.laststream == "1"
diff --git a/bin/varnishtest/tests/r02679.vtc b/bin/varnishtest/tests/r02679.vtc
index 590dfb264c..d3707151d7 100644
--- a/bin/varnishtest/tests/r02679.vtc
+++ b/bin/varnishtest/tests/r02679.vtc
@@ -22,7 +22,9 @@ client c1 {
 	stream 1 {
 		txreq -req POST -hdr "content-length" "31469" -nostrend
 		txdata -datalen 1550 -nostrend
+		rxwinup
 		txdata -datalen 16000 -nostrend
+		rxwinup
 		txdata -datalen 13919
 		rxresp
 		expect resp.status == 200
diff --git a/bin/varnishtest/tests/r02923.vtc b/bin/varnishtest/tests/r02923.vtc
index 324f20cff6..537812eb22 100644
--- a/bin/varnishtest/tests/r02923.vtc
+++ b/bin/varnishtest/tests/r02923.vtc
@@ -34,9 +34,6 @@ varnish v1 -vcl+backend {
 } -start
 
 client c1 {
-	txpri
-	stream 0 rxsettings -run
-
 	stream 1 {
 		txreq -url /sync
 		rxresp
diff --git a/bin/varnishtest/tests/r02937.vtc b/bin/varnishtest/tests/r02937.vtc
index 8a2d00d58b..11dbf32330 100644
--- a/bin/varnishtest/tests/r02937.vtc
+++ b/bin/varnishtest/tests/r02937.vtc
@@ -21,5 +21,20 @@ client c1 {
 	expect resp.http.upgrade == h2c
 	expect resp.http.connection == Upgrade
 	txpri
+
+	stream 0 {
+		rxsettings
+		rxgoaway
+		expect goaway.err == ENHANCE_YOUR_CALM
+		expect goaway.laststream == 1
+	} -start
+
+	stream 1 {
+		rxrst
+	} -start
+
+	stream 0 -wait
+	stream 1 -wait
+
 	expect_close
 } -run
diff --git a/bin/varnishtest/tests/t02003.vtc b/bin/varnishtest/tests/t02003.vtc
index fe30e82437..fab076273b 100644
--- a/bin/varnishtest/tests/t02003.vtc
+++ b/bin/varnishtest/tests/t02003.vtc
@@ -36,7 +36,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test reverse order stream numbers
 
-client c1 {
+client c2 {
 	stream 0 {
 		rxgoaway
 		expect goaway.laststream == 3
@@ -61,7 +61,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test WINDOW_UPDATE error conditions
 
-client c1 {
+client c3 {
 	stream 1 {
 		txreq -nostrend
 		txwinup -size 0
@@ -92,7 +92,7 @@ client c1 {
 	} -run
 	stream 0 -wait
 } -run
-client c1 {
+client c4 {
 	stream 0 {
 		txwinup -size 0x40000000
 		txwinup -size 0x40000000
@@ -102,7 +102,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c5 {
 	stream 1 {
 		txreq
 		rxresp
@@ -123,7 +123,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test PING error conditions
 
-client c1 {
+client c6 {
 	stream 0 {
 		txping -ack -data "FOOBAR42"
 		rxgoaway
@@ -132,7 +132,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c7 {
 	stream 0 {
 		sendhex "000008 06 80 00000001 0102030405060708"
 		rxgoaway
@@ -141,7 +141,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c8 {
 	stream 0 {
 		sendhex "000007 06 80 00000000 01020304050607"
 		rxgoaway
@@ -160,7 +160,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test PUSH_PROMISE error conditions
 
-client c1 {
+client c9 {
 	stream 0 {
 		rxgoaway
 		expect goaway.err == PROTOCOL_ERROR
@@ -173,7 +173,7 @@ client c1 {
 	stream 0 -wait
 } -run
 
-client c1 {
+client c10 {
 	stream 0 {
 		rxgoaway
 		expect goaway.err == PROTOCOL_ERROR
@@ -198,7 +198,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test RST_STREAM error conditions
 
-client c1 {
+client c11 {
 	stream 0 {
 		# RST idle stream
 		sendhex "000004 03 00 00000007 00000008"
@@ -208,7 +208,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c12 {
 	stream 0 {
 		rxgoaway
 		expect goaway.err == FRAME_SIZE_ERROR
@@ -222,7 +222,7 @@ client c1 {
 	stream 0 -wait
 } -run
 
-client c1 {
+client c13 {
 	stream 0 {
 		# RST stream zero
 		sendhex "000000 03 00 00000000 00000008"
@@ -232,8 +232,11 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+barrier b14 cond 2
+client c14 {
 	stream 0 {
+		barrier b14 sync
+		txgoaway
 		rxgoaway
 		expect goaway.err == NO_ERROR
 		expect goaway.laststream == 3
@@ -246,11 +249,15 @@ client c1 {
 		txreq -nostrend
 		txrst -err 0x666
 	} -run
+	barrier b14 sync
 	stream 0 -wait
 } -run
 
-client c1 {
+barrier b15 cond 2
+client c15 {
 	stream 0 {
+		barrier b15 sync
+		txgoaway
 		rxgoaway
 		expect goaway.err == NO_ERROR
 		expect goaway.laststream == 1
@@ -263,6 +270,7 @@ client c1 {
 		# RST_STREAM on closed stream
 		txrst
 	} -run
+	barrier b15 sync
 	stream 0 -wait
 } -run
 
@@ -277,7 +285,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test SETTING error conditions
 
-client c1 {
+client c16 {
 	stream 0 {
 		# SETTING ACK with data
 		sendhex "000001 04 01 00000000 aa"
@@ -287,7 +295,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c17 {
 	stream 0 {
 		# SETTING ACK with bad length
 		sendhex "000001 04 00 00000000 aa"
@@ -296,7 +304,7 @@ client c1 {
 		expect goaway.laststream == 0
 	} -run } -run
 
-client c1 {
+client c18 {
 	stream 0 {
 		# SETTING ACK with bad value
 		txsettings -winsize 0x80000000
@@ -306,7 +314,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c19 {
 	stream 0 {
 		# SETTING unknown value
 		sendhex "000006 04 00 00000000 ffff00000000"
@@ -326,16 +334,19 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test GOAWAY error conditions
 
-client c1 {
+client c20 {
 	stream 0 {
 		txgoaway -err 2
+		rxgoaway
+		expect goaway.err == NO_ERROR
 	} -run
 	expect_close
 } -run
 
-client c1 {
+client c21 {
 	stream 0 {
 		txgoaway -err 2222
+		rxgoaway
 	} -run
 	expect_close
 } -run
@@ -351,7 +362,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test HEADERS error conditions
 
-client c1 {
+client c22 {
 	stream 1 {
 		txreq -nostrend
 		txreq -nostrend
@@ -362,7 +373,7 @@ client c1 {
 	expect_close
 } -run
 
-client c1 {
+client c23 {
 	stream 0 {
 		sendhex 00000c
 		sendhex 01
@@ -376,7 +387,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c24 {
 	stream 0 {
 		sendhex 000012
 		sendhex 01
@@ -388,7 +399,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c25 {
 	stream 1 {
 		txreq -hdr ":bla" "foo"
 		rxrst
@@ -398,7 +409,7 @@ client c1 {
 
 
 #2349: Padding exceeds frame size
-client c1 {
+client c26 {
 	stream 1 {
 		sendhex 000001
 		sendhex 01
@@ -415,7 +426,7 @@ client c1 {
 } -run
 
 #2349: Padding equal to frame size
-client c1 {
+client c27 {
 	stream 1 {
 		sendhex 000001
 		sendhex 01
@@ -432,7 +443,7 @@ client c1 {
 } -run
 
 #2349: Integer underrun may also occur when the priority flag is set
-client c1 {
+client c28 {
 	stream 1 {
 		sendhex 000004
 		sendhex 01
@@ -458,7 +469,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test CONTINUATION error conditions
 
-client c1 {
+client c29 {
 	stream 1 {
 		txreq -nostrend
 		txcont -hdr "bar" "foo"
@@ -469,7 +480,7 @@ client c1 {
 	expect_close
 } -run
 
-client c1 {
+client c30 {
 	stream 0 {
 		sendhex 000014
 		sendhex 01
@@ -489,7 +500,7 @@ client c1 {
 	} -run
 } -run
 
-client c1 {
+client c31 {
 	stream 1 {
 		txreq -nohdrend
 		txcont -hdr "bar" "foo"
@@ -499,7 +510,7 @@ client c1 {
 } -run
 
 # 2350: Don't accept a continuation frame after stream is closed
-client c1 {
+client c32 {
 	stream 1 {
 		txreq
 		rxresp
@@ -522,25 +533,26 @@ varnish v1 -expect MEMPOOL.sess1.live == 0
 #######################################################################
 # Test DATA error conditions
 
-client c1 {
+client c33 {
 	stream 1 {
 		txdata -data "FOOBAR"
 	} -run
 	stream 0 {
 		rxgoaway
+		expect goaway.err == PROTOCOL_ERROR
 	} -run
 	expect_close
 } -run
 
-client c1 {
+client c34 {
 	stream 1 {
 		txreq
 		rxresp
 		txdata -data "FOOBAR"
 	} -run
-	stream 3 {
-		txreq
-		rxresp
+	stream 0 {
+		rxgoaway
+		expect goaway.err == PROTOCOL_ERROR
 	} -run
 } -run
 
diff --git a/bin/varnishtest/tests/t02005.vtc b/bin/varnishtest/tests/t02005.vtc
index 39737f93a6..10f54ded8d 100644
--- a/bin/varnishtest/tests/t02005.vtc
+++ b/bin/varnishtest/tests/t02005.vtc
@@ -31,7 +31,7 @@ varnish v1 -vcl+backend {
 varnish v1 -cliok "param.set debug +syncvsl"
 
 logexpect l1 -v v1 -g raw {
-	expect	* 1001 ReqAcct	"80 7 87 78 8 86"
+	expect	* 1001 ReqAcct	"160 7 167 78 16 94"
 	expect	* 1000 ReqAcct	"45 8 53 63 34 97"
 } -start
 
diff --git a/bin/varnishtest/tests/t02008.vtc b/bin/varnishtest/tests/t02008.vtc
index 75cee513fa..4a7f096d78 100644
--- a/bin/varnishtest/tests/t02008.vtc
+++ b/bin/varnishtest/tests/t02008.vtc
@@ -28,6 +28,8 @@ client c1 {
 	} -run
 	stream 0 {
 		txgoaway -err 2
+		rxgoaway
+		expect goaway.err == NO_ERROR
 	} -run
 	expect_close
 } -run
diff --git a/bin/varnishtest/tests/t02011.vtc b/bin/varnishtest/tests/t02011.vtc
index 88c64d9045..0d25eb6f2c 100644
--- a/bin/varnishtest/tests/t02011.vtc
+++ b/bin/varnishtest/tests/t02011.vtc
@@ -43,9 +43,6 @@ varnish v1 -vcl+backend {
 } -start
 
 client c1 {
-	txpri
-	stream 0 rxsettings -run
-
 	stream 1 {
 		txreq -hdr should sync
 		barrier b1 sync
diff --git a/bin/varnishtest/tests/t02015.vtc b/bin/varnishtest/tests/t02015.vtc
index 6e59dc7abc..860e79da38 100644
--- a/bin/varnishtest/tests/t02015.vtc
+++ b/bin/varnishtest/tests/t02015.vtc
@@ -14,17 +14,12 @@ varnish v1 -vcl+backend {
 } -start
 
 logexpect l1 -v v1 -g raw -q ReqAcct {
-	expect ? 1001	ReqAcct "46 0 46 69 12345 12414"
-	expect ? 1003	ReqAcct "46 0 46 74 1000 1074"
+	expect ? 1001	ReqAcct "92 0 92 69 24690 24759"
+	expect ? 1003	ReqAcct "92 0 92 74 13345 13419"
 } -start
 
 client c1 {
-	txpri
-
 	stream 0 {
-		rxsettings
-		expect settings.ack == false
-		txsettings -ack
 		txsettings -winsize 1000
 		rxsettings
 		expect settings.ack == true
diff --git a/bin/varnishtest/tests/t02016.vtc b/bin/varnishtest/tests/t02016.vtc
index 1e5a7dc8ae..7e8dd5a094 100644
--- a/bin/varnishtest/tests/t02016.vtc
+++ b/bin/varnishtest/tests/t02016.vtc
@@ -6,6 +6,8 @@ server s1 {
 } -start
 
 varnish v1 -cliok "param.set feature +http2"
+varnish v1 -cliok "param.set debug +syncvsl"
+varnish v1 -cliok "param.set timeout_idle 10"
 varnish v1 -vcl+backend {
 	sub vcl_recv {
 		if (req.url ~ "synth") {
@@ -23,12 +25,7 @@ logexpect l1 -v v1 {
 } -start
 
 client c1 {
-	txpri
-
 	stream 0 {
-		rxsettings
-		expect settings.ack == false
-		txsettings -ack
 		txsettings -winsize 1000
 		rxsettings
 		expect settings.ack == true
@@ -60,12 +57,7 @@ logexpect l2 -v v1 {
 } -start
 
 client c2 {
-	txpri
-
 	stream 0 {
-		rxsettings
-		expect settings.ack == false
-		txsettings -ack
 		txsettings -winsize 1000
 		rxsettings
 		expect settings.ack == true
@@ -101,12 +93,7 @@ logexpect l3 -v v1 {
 } -start
 
 client c3 {
-	txpri
-
 	stream 0 {
-		rxsettings
-		expect settings.ack == false
-		txsettings -ack
 		txsettings -winsize 1000
 		rxsettings
 		expect settings.ack == true
diff --git a/bin/varnishtest/tests/t02020.vtc b/bin/varnishtest/tests/t02020.vtc
index e2bcb76f43..e12a5c18e5 100644
--- a/bin/varnishtest/tests/t02020.vtc
+++ b/bin/varnishtest/tests/t02020.vtc
@@ -1,6 +1,6 @@
 varnishtest "H/2 received data frames with padding"
 
-barrier b1 sock 3
+barrier b1 sock 2
 
 server s1 {
 	rxreq
@@ -8,6 +8,7 @@ server s1 {
 	expect req.body == abcde
 	txresp
 	rxreq
+	expect req.bodylen == 81500
 	txresp
 	rxreq
 	txresp
@@ -48,31 +49,20 @@ client c2 {
 	# by unblocking the client thread stuck in vcl_recv. From that
 	# point on window updates will also be sent on the stream.
 
-	stream 0 {
-		rxwinup
-		rxwinup
-		rxwinup
-		rxwinup
-		barrier b1 sync
-	} -start
-
 	stream 3 {
-		txreq -req POST -url /3 -hdr "content-length" "131072" -nostrend
-		txdata -datalen 16300 -padlen 83 -nostrend
-		txdata -datalen 16300 -padlen 83 -nostrend
-		txdata -datalen 16300 -padlen 83 -nostrend
+		txreq -req POST -url /3 -hdr "content-length" "81500" -nostrend
+		loop 3 {
+			txdata -datalen 16300 -padlen 83 -nostrend
+			rxwinup
+			expect winup.size == 84
+		}
 		txdata -datalen 16300 -padlen 82 -nostrend
-		barrier b1 sync
-		rxwinup
-		txdata -datalen 16300 -padlen 83 -nostrend
 		rxwinup
-		txdata -datalen 16300 -padlen 83 -nostrend
-		rxwinup
-		txdata -datalen 16300 -padlen 83 -nostrend
-		rxwinup
-		txdata -datalen 16300 -padlen 83 -nostrend
+		expect winup.size == 83
+		barrier b1 sync
 		rxwinup
-		txdata -datalen 672
+		expect winup.size == 65200
+		txdata -datalen 16300 -padlen 83
 		rxresp
 		expect resp.status == 200
 	} -start
diff --git a/bin/varnishtest/tests/t02023.vtc b/bin/varnishtest/tests/t02023.vtc
index bdc722ce3a..039cc5f4ae 100644
--- a/bin/varnishtest/tests/t02023.vtc
+++ b/bin/varnishtest/tests/t02023.vtc
@@ -14,58 +14,43 @@ client c1 {
 	expect resp.status == 400
 } -run
 
-client c1 {
+client c2 {
 	txreq -req ""
 	rxresp
 	expect resp.status == 400
 } -run
 
-client c1 {
+client c3 {
 	txreq -proto ""
 	rxresp
 	expect resp.status == 400
 } -run
 
-client c1 {
+client c4 {
 	stream 1 {
 		txreq -url ""
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c5 {
 	stream 1 {
 		txreq -scheme ""
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c6 {
 	stream 1 {
 		txreq -req ""
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c7 {
 	stream 1 {
 		txreq -hdr "empty" ""
 		rxresp
@@ -80,118 +65,74 @@ client c1 {
 
 varnish v1 -vsl_catchup
 
-client c1 {
+client c8 {
 	stream 1 {
 		txreq -hdr "foo" " bar"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c9 {
 	stream 1 {
 		txreq -hdr "foo" " "
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c10 {
 	stream 1 {
 		txreq -hdr ":foo" "bar"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c11 {
 	stream 1 {
 		txreq -hdr "foo" "b\x0car"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c12 {
 	stream 1 {
 		txreq -hdr "f o" "bar"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c13 {
 	stream 1 {
 		txreq -hdr "f: o" "bar"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c14 {
 	stream 1 {
 		txreq -hdr "foo" "bar "
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c15 {
 	stream 1 {
 		txreq -hdr "foo" "	bar"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-		expect resp.status == 200
-	} -run
 } -run
 
-client c1 {
+client c16 {
 	stream 1 {
 		txreq -hdr "foo" "bar	"
 		rxrst
 		expect rst.err == PROTOCOL_ERROR
 	} -run
-	stream 3 {
-		txreq
-		rxresp
-	} -run
 } -run
diff --git a/bin/varnishtest/tests/t02027.vtc b/bin/varnishtest/tests/t02027.vtc
index 5bc7b48160..835913b3b4 100644
--- a/bin/varnishtest/tests/t02027.vtc
+++ b/bin/varnishtest/tests/t02027.vtc
@@ -10,8 +10,7 @@ varnish v1 -arg "-p feature=+http2" -arg "-p debug=+syncvsl" -vcl {
 logexpect l0 -v v1 -g vxid -q "Begin ~ sess" {
 	fail add * SessError
 	expect * * Debug           {^H2: Got pu PRISM}
-	expect 0 = Debug           {^H2: HTC eof.*frame=complete goaway=0}
-	expect 0 = Debug           {^H2 CLEANUP H2CE_NO_ERROR}
+	expect 0 = Debug           {^H2: HTC eof .* frame=complete}
 	expect 0 = ReqAcct         {^0 0 0 18 26 44}
 	expect 0 = SessClose       {^REM_CLOSE}
 	expect 0 = End
@@ -22,18 +21,17 @@ logexpect l0 -v v1 -g vxid -q "Begin ~ sess" {
 client c0 {
 	txpri
 	shutdown -write
-        stream 0 {
+	stream 0 {
 		rxsettings
-                rxgoaway
-                expect goaway.laststream == 0
-                expect goaway.err == NO_ERROR
+		rxgoaway
+		expect goaway.laststream == 0
+		expect goaway.err == NO_ERROR
 	} -run
 } -run
 
 logexpect l1 -v v1 -g vxid -q "Begin ~ sess" {
 	fail add * SessError
-	expect * * Debug           {^H2: HTC eof.*frame=complete goaway=0}
-	expect 0 = Debug           {^H2 CLEANUP H2CE_NO_ERROR}
+	expect * * Debug           {^H2: HTC eof .* frame=complete}
 	expect 9 = ReqAcct         {^27 0 27 27 26 53}
 	expect 0 = SessClose       {^REM_CLOSE}
 	expect 0 = End
@@ -46,17 +44,16 @@ client c1 {
 		txreq -nohdrend
 	} -run
 	shutdown -write
-        stream 0 {
-                rxgoaway
-                expect goaway.laststream == 1
-                expect goaway.err == NO_ERROR
+	stream 0 {
+		rxgoaway
+		expect goaway.laststream == 1
+		expect goaway.err == NO_ERROR
 	} -run
 } -run
 
 logexpect l2 -v v1 -g vxid -q "Begin ~ sess" {
 	fail add * SessError
-	expect * * Debug           {^H2: HTC eof.*frame=complete goaway=0}
-	expect 0 = Debug           {^H2 CLEANUP H2CE_NO_ERROR}
+	expect * * Debug           {^H2: HTC eof .* frame=complete}
 	expect 9 = ReqAcct         {^27 0 27 27 26 53}
 	expect 0 = SessClose       {^REM_CLOSE}
 	expect 0 = End
@@ -69,17 +66,16 @@ client c2 {
 		txreq -nostrend
 	} -run
 	shutdown -write
-        stream 0 {
-                rxgoaway
-                expect goaway.laststream == 1
-                expect goaway.err == NO_ERROR
+	stream 0 {
+		rxgoaway
+		expect goaway.laststream == 1
+		expect goaway.err == NO_ERROR
 	} -run
 } -run
 
 logexpect l3 -v v1 -g vxid -q "Begin ~ sess" {
 	fail add * SessError
-	expect * * Debug           {^H2: HTC eof.*frame=partial goaway=0}
-	expect 0 = Debug           {^H2 CLEANUP H2CE_NO_ERROR}
+	expect * * Debug           {^H2: HTC eof .* frame=partial}
 	expect 0 = ReqAcct         {^18 0 18 27 26 53}
 	expect 0 = SessClose       {^REM_CLOSE}
 	expect 0 = End
@@ -89,10 +85,10 @@ logexpect l3 -v v1 -g vxid -q "Begin ~ sess" {
 # middle of frame
 client c3 {
 	stream 1 {
-		#		   +- 01 END_STREAM
+		#                  +- 01 END_STREAM
 		#                  +- 04 END_HEADERS
-		#		   |
-		#	   len ty fl strmid
+		#                  |
+		#          len ty fl strmid
 		sendhex {
 			000024 01 05 00000001
 			00053a70617468012f00073a6d6574686f640347455400073a736368656d6504687474
@@ -101,10 +97,10 @@ client c3 {
 		#	00053a70617468012f00073a6d6574686f640347455400073a736368656d650468747470
 	} -run
 	shutdown -write
-        stream 0 {
-                rxgoaway
-                expect goaway.laststream == 0
-                expect goaway.err == NO_ERROR
+	stream 0 {
+		rxgoaway
+		expect goaway.laststream == 0
+		expect goaway.err == NO_ERROR
 	} -run
 } -run
 
diff --git a/bin/varnishtest/tests/t02028.vtc b/bin/varnishtest/tests/t02028.vtc
new file mode 100644
index 0000000000..cec3dd58f2
--- /dev/null
+++ b/bin/varnishtest/tests/t02028.vtc
@@ -0,0 +1,17 @@
+varnishtest "Bad preface: no SETTINGS frame first"
+
+varnish v1 -cliok "param.set feature +http2"
+varnish v1 -cliok "param.set debug +syncvsl"
+varnish v1 -vcl "backend default none;" -start
+
+client c1 {
+	txpri
+
+	stream 0 {
+		rxsettings
+		expect settings.ack == false
+		txsettings -ack
+		rxgoaway
+		expect goaway.err == PROTOCOL_ERROR
+	} -run
+} -run
diff --git a/configure.ac b/configure.ac
index 2483dfeac2..4ff0d3eb65 100644
--- a/configure.ac
+++ b/configure.ac
@@ -483,6 +483,8 @@ else
 	ac_cv_func_port_create=no
 fi
 
+AC_CHECK_FUNCS([eventfd])
+
 # --with-persistent-storage
 AC_ARG_WITH(persistent-storage,
     AS_HELP_STRING([--with-persistent-storage],
diff --git a/include/Makefile.am b/include/Makefile.am
index 347724d70a..a44b32ccbd 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -95,6 +95,7 @@ nobase_noinst_HEADERS = \
 	vcs_version.h \
 	vct.h \
 	vcurses.h \
+	vefd.h \
 	venc.h \
 	vend.h \
 	vev.h \
diff --git a/include/tbl/h2_error.h b/include/tbl/h2_error.h
index adfbbde422..ceffc41904 100644
--- a/include/tbl/h2_error.h
+++ b/include/tbl/h2_error.h
@@ -206,6 +206,24 @@ H2_ERROR(
 	/* reason */	SC_NULL,
 	/* descr */	"HTTP/2 header list exceeded http_req_size"
 )
+
+H2_ERROR(
+	/* name */	SEND_TIMEOUT,
+	/* val */	8, /* CANCEL */
+	/* types */	2,
+	/* goaway */	0,
+	/* reason */	SC_NULL,
+	/* descr */	"send timeout"
+)
+
+H2_ERROR(
+	/* name */	IO_ERROR,
+	/* val */	0,
+	/* types */	1,
+	/* goaway */	1,
+	/* reason */	SC_REM_CLOSE,
+	/* descr */	"socket error"
+)
 #  undef H2_CUSTOM_ERRORS
 #endif
 
diff --git a/include/tbl/h2_frames.h b/include/tbl/h2_frames.h
index 2b1e2c04f5..52a987b9b2 100644
--- a/include/tbl/h2_frames.h
+++ b/include/tbl/h2_frames.h
@@ -138,17 +138,11 @@
 #ifdef H2_FRAME_FLAGS
 /*		 lower,			upper,				flag */
   H2_FRAME_FLAGS(none,			NONE,				0x00)
-  H2_FRAME_FLAGS(data_end_stream,	DATA_END_STREAM,		0x01)
-  H2_FRAME_FLAGS(data_padded,		DATA_PADDED,			0x08)
-  H2_FRAME_FLAGS(headers_end_stream,	HEADERS_END_STREAM,		0x01)
-  H2_FRAME_FLAGS(headers_end_headers,	HEADERS_END_HEADERS,		0x04)
-  H2_FRAME_FLAGS(headers_padded,	HEADERS_PADDED,			0x08)
-  H2_FRAME_FLAGS(headers_priority,	HEADERS_PRIORITY,		0x20)
-  H2_FRAME_FLAGS(settings_ack,		SETTINGS_ACK,			0x01)
-  H2_FRAME_FLAGS(push_promise_end_headers,PUSH_PROMISE_END_HEADERS,	0x04)
-  H2_FRAME_FLAGS(push_promise_padded,	PUSH_PROMISE_PADDED,		0x08)
-  H2_FRAME_FLAGS(ping_ack,		PING_ACK,			0x01)
-  H2_FRAME_FLAGS(continuation_end_headers,CONTINUATION_END_HEADERS,	0x04)
+  H2_FRAME_FLAGS(ack,			ACK,				0x01)
+  H2_FRAME_FLAGS(end_stream,		END_STREAM,			0x01)
+  H2_FRAME_FLAGS(end_headers,		END_HEADERS,			0x04)
+  H2_FRAME_FLAGS(padded,		PADDED,				0x08)
+  H2_FRAME_FLAGS(priority,		PRIORITY,			0x20)
   #undef H2_FRAME_FLAGS
 #endif
 
diff --git a/include/vefd.h b/include/vefd.h
new file mode 100644
index 0000000000..5f41bada1e
--- /dev/null
+++ b/include/vefd.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2025 Varnish Software AS
+ * All rights reserved.
+ *
+ * Author: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer
+ *    in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+struct vefd {
+	unsigned	magic;
+#define VEFD_MAGIC	0x1548c1a6
+	int		poll_fd;	/* -1 until VEFD_Open(); eventfd if HAVE_EVENTFD */
+	int		priv_fd;	/* -1 after VEFD_INIT(); TODO(review): document use */
+};
+
+#define VEFD_INIT(vefd)				\
+	do {					\
+		INIT_OBJ(vefd, VEFD_MAGIC);	\
+		(vefd)->poll_fd = -1;		\
+		(vefd)->priv_fd = -1;		\
+	} while (0)
+
+int VEFD_Open(struct vefd *);
+int VEFD_Signal(struct vefd *);
+int VEFD_Clear(struct vefd *);
+int VEFD_Close(struct vefd *);
diff --git a/lib/libvarnish/Makefile.am b/lib/libvarnish/Makefile.am
index 2210911107..1e7e984b8b 100644
--- a/lib/libvarnish/Makefile.am
+++ b/lib/libvarnish/Makefile.am
@@ -23,6 +23,7 @@ libvarnish_la_SOURCES = \
 	vcli_serve.c \
 	vct.c \
 	venc.c \
+	vefd.c \
 	version.c \
 	vev.c \
 	vfil.c \
diff --git a/lib/libvarnish/vefd.c b/lib/libvarnish/vefd.c
new file mode 100644
index 0000000000..8be1c70311
--- /dev/null
+++ b/lib/libvarnish/vefd.c
@@ -0,0 +1,163 @@
+/*-
+ * Copyright (c) 2025 Varnish Software AS
+ * All rights reserved.
+ *
+ * Author: Dridi Boukelmoune <dridi.boukelmoune@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer
+ *    in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "config.h"
+
+#if HAVE_EVENTFD
+#  include <sys/eventfd.h>
+#else
+#  include <fcntl.h>
+#endif
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include <vdef.h>
+#include <vefd.h>
+#include <vas.h>
+#include <miniobj.h>
+
+#if HAVE_EVENTFD
+/*
+ * Create a non-blocking, close-on-exec eventfd with an initial counter
+ * of zero and store it in poll_fd; priv_fd is not used by this variant.
+ *
+ * Returns whatever eventfd(2) returned: the new descriptor on success,
+ * -1 on failure.
+ */
+int
+VEFD_Open(struct vefd *vefd)
+{
+	int efd;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd == -1);
+	assert(vefd->priv_fd == -1);
+
+	efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+	vefd->poll_fd = efd;
+	return (efd);
+}
+
+/*
+ * Raise the event by adding one to the eventfd counter.
+ *
+ * A failed or short write trips the assertion, so the only value ever
+ * returned is 0.
+ */
+int
+VEFD_Signal(struct vefd *vefd)
+{
+	int64_t incr = 1;
+	ssize_t sz;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+
+	sz = write(vefd->poll_fd, &incr, sizeof incr);
+	assert(sz == (ssize_t)sizeof incr);
+	return (0);
+}
+
+/*
+ * Reset the event by reading (and thereby zeroing) the eventfd counter.
+ *
+ * The descriptor is opened with EFD_NONBLOCK, so reading while no signal
+ * is pending fails with EAGAIN.  That case means "already clear" and
+ * returns 0, the same way the pipe-based fallback treats an empty pipe.
+ * Any other failure, or a short read, still trips the assertion.
+ *
+ * Returns 0.
+ */
+int
+VEFD_Clear(struct vefd *vefd)
+{
+	uint64_t cnt;
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+
+	r = read(vefd->poll_fd, &cnt, sizeof cnt);
+	if (r < 0 && errno == EAGAIN)
+		return (0);	/* nothing was pending */
+	assert(r == (ssize_t)sizeof cnt);
+	return (0);
+}
+
+/*
+ * Close the eventfd held in poll_fd (priv_fd is unused in this variant
+ * and must still be -1).  Always returns 0.
+ */
+int
+VEFD_Close(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+	closefd(&vefd->poll_fd);
+	return (0);
+}
+#else /* !HAVE_EVENTFD */
+/*
+ * Fallback for platforms without eventfd(2): a non-blocking,
+ * close-on-exec pipe.  poll_fd receives the read end and priv_fd the
+ * write end.
+ *
+ * Returns 0 on success and -1 if pipe(2) fails, in which case the object
+ * is left untouched.  A failing fcntl(2) trips an assertion.
+ */
+int
+VEFD_Open(struct vefd *vefd)
+{
+	int fd[2];
+	int i;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd == -1);
+	assert(vefd->priv_fd == -1);
+
+	if (pipe(fd) < 0)
+		return (-1);
+
+	for (i = 0; i < 2; i++) {
+		/*
+		 * Close-on-exec is a file descriptor flag, set with
+		 * F_SETFD/FD_CLOEXEC.  O_CLOEXEC is not a file status flag,
+		 * so passing it to F_SETFL does not set close-on-exec.
+		 */
+		AZ(fcntl(fd[i], F_SETFD, FD_CLOEXEC));
+		AZ(fcntl(fd[i], F_SETFL, O_NONBLOCK));
+	}
+	vefd->poll_fd = fd[0];
+	vefd->priv_fd = fd[1];
+	return (0);
+}
+
+/*
+ * Raise the event by writing a single NUL byte to the pipe.
+ *
+ * EAGAIN/EWOULDBLOCK from the non-blocking write end is not treated as
+ * an error.
+ *
+ * Returns 0 on success, -1 on any other write error.
+ */
+int
+VEFD_Signal(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+
+	if (write(vefd->priv_fd, "", 1) >= 0)
+		return (0);
+	if (errno == EAGAIN || errno == EWOULDBLOCK)
+		return (0);
+	return (-1);
+}
+
+/*
+ * Reset the event by draining the read end of the pipe.
+ *
+ * Reads until read(2) stops returning data.  Running dry shows up as
+ * EAGAIN/EWOULDBLOCK on the non-blocking descriptor and is the normal
+ * way out of the loop.
+ *
+ * Returns 0 once the pipe is empty, -1 on any other read error.
+ */
+int
+VEFD_Clear(struct vefd *vefd)
+{
+	char buf[64];
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+	do {
+		r = read(vefd->poll_fd, buf, sizeof buf);
+	} while (r > 0);
+	/*
+	 * errno is only meaningful when read(2) actually failed; a return
+	 * of 0 must not be judged by a stale errno from an earlier call.
+	 */
+	if (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Close both ends of the pipe: the read end in poll_fd, then the write
+ * end in priv_fd.  Always returns 0.
+ */
+int
+VEFD_Close(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+	closefd(&vefd->poll_fd);
+	closefd(&vefd->priv_fd);
+	return (0);
+}
+#endif /* HAVE_EVENTFD */