diff --git a/bin/varnishd/Makefile.am b/bin/varnishd/Makefile.am index 0d60f1b188..8ff33394bd 100644 --- a/bin/varnishd/Makefile.am +++ b/bin/varnishd/Makefile.am @@ -81,6 +81,7 @@ varnishd_SOURCES = \ http2/cache_http2_hpack.c \ http2/cache_http2_panic.c \ http2/cache_http2_proto.c \ + http2/cache_http2_reqbody.c \ http2/cache_http2_send.c \ http2/cache_http2_session.c \ mgt/mgt_child.c \ diff --git a/bin/varnishd/cache/cache_session.c b/bin/varnishd/cache/cache_session.c index c031cb665f..c0856b400b 100644 --- a/bin/varnishd/cache/cache_session.c +++ b/bin/varnishd/cache/cache_session.c @@ -258,18 +258,13 @@ HTC_Status(enum htc_status_e e, const char **name, const char **desc) void HTC_RxInit(struct http_conn *htc, struct ws *ws) { - unsigned rollback; int l; CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC); htc->ws = ws; - /* NB: HTTP/1 keep-alive triggers a rollback, so does the first - * request of a session or an h2 request where the rollback is a - * no-op in terms of workspace usage. - */ - rollback = !strcasecmp(ws->id, "req") && htc->body_status == NULL; - l = WS_Pipeline(htc->ws, htc->pipeline_b, htc->pipeline_e, rollback); + l = WS_Pipeline(htc->ws, htc->pipeline_b, htc->pipeline_e, + htc->pipeline_snap); xxxassert(l >= 0); htc->rxbuf_b = WS_Reservation(ws); @@ -410,6 +405,38 @@ HTC_RxStuff(struct http_conn *htc, htc_complete_f *func, } } +/*-------------------------------------------------------------------- + * Prune a vector of struct iovec + */ + +void +VIOV_prune(struct iovec *iov, unsigned *n, size_t l) +{ + unsigned u; + + if (l == 0) + return; + + AN(iov); + AN(n); + + u = 0; + while (l > 0) { + assert(u < *n); + if (iov[u].iov_len <= l) { + l -= iov[u].iov_len; + u++; + } else { + iov[u].iov_base = (char *)iov[u].iov_base + l; + iov[u].iov_len -= l; + break; + } + } + + memmove(iov, &iov[u], (*n - u) * sizeof *iov); + *n -= u; +} + /*-------------------------------------------------------------------- * Get a new session, preferably by recycling an 
already ready one * diff --git a/bin/varnishd/cache/cache_varnishd.h b/bin/varnishd/cache/cache_varnishd.h index 2892ef188d..9eee2580af 100644 --- a/bin/varnishd/cache/cache_varnishd.h +++ b/bin/varnishd/cache/cache_varnishd.h @@ -104,6 +104,7 @@ struct http_conn { char *rxbuf_e; char *pipeline_b; char *pipeline_e; + uintptr_t pipeline_snap; ssize_t content_length; void *priv; @@ -462,6 +463,8 @@ enum htc_status_e HTC_RxStuff(struct http_conn *, htc_complete_f *, vtim_real *t1, vtim_real *t2, vtim_real ti, vtim_real tn, vtim_dur td, int maxbytes); +void VIOV_prune(struct iovec *iov, unsigned *n, size_t l); + #define SESS_ATTR(UP, low, typ, len) \ int SES_Set_##low(const struct sess *sp, const typ *src); \ int SES_Reserve_##low(struct sess *sp, typ **dst, ssize_t *sz); @@ -567,7 +570,9 @@ WS_IsReserved(const struct ws *ws) void *WS_AtOffset(const struct ws *ws, unsigned off, unsigned len); unsigned WS_ReservationOffset(const struct ws *ws); -int WS_Pipeline(struct ws *, const void *b, const void *e, unsigned rollback); + +extern uintptr_t const ws_pipeline_rollback; +int WS_Pipeline(struct ws *, const void *b, const void *e, uintptr_t snap); /* cache_ws_common.c */ void WS_Id(const struct ws *ws, char *id); diff --git a/bin/varnishd/cache/cache_ws.c b/bin/varnishd/cache/cache_ws.c index 3f2cc5309c..7895e6161d 100644 --- a/bin/varnishd/cache/cache_ws.c +++ b/bin/varnishd/cache/cache_ws.c @@ -136,14 +136,16 @@ WS_Reset(struct ws *ws, uintptr_t pp) */ int -WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback) +WS_Pipeline(struct ws *ws, const void *b, const void *e, uintptr_t snap) { unsigned r, l; WS_Assert(ws); - if (rollback) + if (snap == ws_pipeline_rollback) WS_Rollback(ws, 0); + else if (snap != 0) + WS_Rollback(ws, snap); r = WS_ReserveAll(ws); diff --git a/bin/varnishd/cache/cache_ws_common.c b/bin/varnishd/cache/cache_ws_common.c index a23cd06af6..bcd28b1e7a 100644 --- a/bin/varnishd/cache/cache_ws_common.c +++ 
b/bin/varnishd/cache/cache_ws_common.c @@ -37,6 +37,8 @@ #include "cache_varnishd.h" +uintptr_t const ws_pipeline_rollback = (uintptr_t)&ws_pipeline_rollback; + void WS_Id(const struct ws *ws, char *id) { diff --git a/bin/varnishd/cache/cache_ws_emu.c b/bin/varnishd/cache/cache_ws_emu.c index 767839d1e2..c6a1393197 100644 --- a/bin/varnishd/cache/cache_ws_emu.c +++ b/bin/varnishd/cache/cache_ws_emu.c @@ -222,7 +222,7 @@ WS_Reset(struct ws *ws, uintptr_t pp) } int -WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback) +WS_Pipeline(struct ws *ws, const void *b, const void *e, uintptr_t snap) { void *tmp; unsigned r, l; @@ -248,8 +248,10 @@ WS_Pipeline(struct ws *ws, const void *b, const void *e, unsigned rollback) tmp = NULL; } - if (rollback) + if (snap == ws_pipeline_rollback) WS_Rollback(ws, 0); + else if (snap != 0) + WS_Rollback(ws, snap); r = WS_ReserveAll(ws); diff --git a/bin/varnishd/http1/cache_http1_fsm.c b/bin/varnishd/http1/cache_http1_fsm.c index ac1dc012cf..f755f5fc77 100644 --- a/bin/varnishd/http1/cache_http1_fsm.c +++ b/bin/varnishd/http1/cache_http1_fsm.c @@ -111,6 +111,7 @@ http1_new_session(struct worker *wrk, void *arg) sp = req->sp; CHECK_OBJ_NOTNULL(sp, SESS_MAGIC); + req->htc->pipeline_snap = ws_pipeline_rollback; HTC_RxInit(req->htc, req->ws); sz = sizeof u; diff --git a/bin/varnishd/http2/cache_http2.h b/bin/varnishd/http2/cache_http2.h index ba036b84d6..19ad5e3dbd 100644 --- a/bin/varnishd/http2/cache_http2.h +++ b/bin/varnishd/http2/cache_http2.h @@ -35,6 +35,9 @@ struct h2h_decode; struct h2_frame_s; #include "hpack/vhp.h" +#include "vefd.h" + +#define H2_TX_BUFSIZE 1024 /**********************************************************************/ @@ -140,42 +143,44 @@ struct h2_req { int counted; struct h2_sess *h2sess; struct req *req; - double t_send; - double t_winupd; - pthread_cond_t *cond; + vtim_real t_send; + vtim_real t_win_low; VTAILQ_ENTRY(h2_req) list; - int64_t t_window; - int64_t r_window; - /* Where to 
wake this stream up */ - struct worker *wrk; + int64_t tx_window; + int64_t rx_window; struct h2_rxbuf *rxbuf; + struct h2_reqbody_waiter *reqbody_waiter; + h2_error async_error; - VTAILQ_ENTRY(h2_req) tx_list; h2_error error; }; VTAILQ_HEAD(h2_req_s, h2_req); +struct h2_send_large; +VTAILQ_HEAD(h2_send_large_s, h2_send_large); + struct h2_sess { unsigned magic; #define H2_SESS_MAGIC 0xa16f7e4b + unsigned expect_settings_next; + pthread_t rxthr; - pthread_cond_t *cond; - pthread_cond_t winupd_cond[1]; struct sess *sess; int refcnt; int open_streams; - int winup_streams; + int win_low_streams; uint32_t highest_stream; - int goaway; int bogosity; - int do_sweep; - struct h2_req *req0; + struct vefd efd[1]; + + int64_t tx_window; + int64_t rx_window; struct h2_req_s streams; @@ -186,6 +191,23 @@ struct h2_sess { struct h2h_decode *decode; struct vht_table dectbl[1]; + vtim_real deadline; + + struct iovec tx_vec[2]; /* Must be 2 wide */ + unsigned tx_nvec; + + unsigned tx_stopped; + + uint8_t *tx_s_start; + uint8_t *tx_s_end; + uint8_t *tx_s_head; + uint8_t *tx_s_mark; + + struct h2_send_large_s tx_l_queue; + struct h2_send_large *tx_l_current; + uint8_t tx_l_hdrbuf[9]; + char tx_l_stuck; + unsigned rxf_len; unsigned rxf_type; unsigned rxf_flags; @@ -195,11 +217,8 @@ struct h2_sess { struct h2_settings remote_settings; struct h2_settings local_settings; - struct req *new_req; + struct h2_req *hpack_lock; vtim_real t1; // t_first for new_req - uint32_t goaway_last_stream; - - VTAILQ_HEAD(,h2_req) txqueue; h2_error error; @@ -213,7 +232,17 @@ struct h2_sess { vtim_real last_rst; }; -#define ASSERT_RXTHR(h2) do {assert(h2->rxthr == pthread_self());} while(0) +#define ASSERT_H2_SESS(h2) \ + do { \ + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); \ + assert(pthread_equal(h2->rxthr, pthread_self())); \ + } while (0) + +#define ASSERT_H2_REQ(h2) \ + do { \ + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); \ + assert(!pthread_equal(h2->rxthr, pthread_self())); \ + } while (0) /* 
http2/cache_http2_panic.c */ #ifdef TRANSPORT_MAGIC @@ -235,7 +264,6 @@ struct h2h_decode { unsigned has_scheme:1; h2_error error; enum vhd_ret_e vhd_ret; - struct ws *ws; char *out; int64_t limit; size_t out_l; @@ -244,38 +272,42 @@ struct h2h_decode { struct vhd_decode vhd[1]; }; -void h2h_decode_hdr_init(const struct h2_sess *h2); -h2_error h2h_decode_hdr_fini(const struct h2_sess *h2); +void h2h_decode_hdr_init(struct h2_sess *h2, struct h2_req *); +h2_error h2h_decode_hdr_fini(struct h2_sess *h2); h2_error h2h_decode_bytes(struct h2_sess *h2, const uint8_t *ptr, size_t len); /* cache_http2_send.c */ -void H2_Send_Get(struct worker *, struct h2_sess *, struct h2_req *); -void H2_Send_Rel(struct h2_sess *, const struct h2_req *); - -void H2_Send_Frame(struct worker *, struct h2_sess *, - h2_frame type, uint8_t flags, uint32_t len, uint32_t stream, - const void *); - -void H2_Send_RST(struct worker *wrk, struct h2_sess *h2, - const struct h2_req *r2, uint32_t stream, h2_error h2e); - -void H2_Send(struct worker *, struct h2_req *, h2_frame type, uint8_t flags, - uint32_t len, const void *, uint64_t *acct); +int H2_Send_RST(struct h2_sess *h2, uint32_t stream, h2_error h2e); +int H2_Send_SETTINGS(struct h2_sess *h2, uint8_t flags, ssize_t len, + const uint8_t *buf); +int H2_Send_PING(struct h2_sess *h2, uint8_t flags, uint64_t data); +int H2_Send_GOAWAY(struct h2_sess *h2, uint32_t last_stream_id, h2_error h2e); +int H2_Send_WINDOW_UPDATE(struct h2_sess *h2, uint32_t stream, uint32_t incr); +int H2_Send(struct vsl_log *vsl, struct h2_req *r2, h2_frame ftyp, + uint8_t flags, uint32_t len, const void *ptr); +ssize_t H2_Send_TxStuff(struct h2_sess *h2); +int H2_Send_Something(struct h2_sess *h2); +int H2_Send_Pending(struct h2_sess *h2); +void H2_Send_Shutdown(struct h2_sess *h2); +void H2_Send_Stop(struct h2_sess *h2); /* cache_http2_proto.c */ -struct h2_req * h2_new_req(struct h2_sess *, unsigned stream, struct req *); -h2_error h2_stream_tmo(struct h2_sess *, 
const struct h2_req *, vtim_real); -void h2_del_req(struct worker *, struct h2_req *); -void h2_kill_req(struct worker *, struct h2_sess *, struct h2_req *, h2_error); -int h2_rxframe(struct worker *, struct h2_sess *); +const char *h2_framename(int frame); +h2_error h2_errcheck(const struct h2_req *r2); +void h2_async_error(struct h2_req *r2, h2_error h2e); +void h2_attention(struct h2_sess *h2); +void h2_stream_setstate(struct h2_req *r2, enum h2_stream_e state); +void h2_run(struct worker *wrk, struct h2_sess *h2); +struct h2_req * h2_new_req(struct h2_sess *, unsigned stream, struct req **); +void h2_kill_req(struct worker *, struct h2_sess *, struct h2_req **, h2_error); h2_error h2_set_setting(struct h2_sess *, const uint8_t *); -void h2_req_body(struct req*); task_func_t h2_do_req; #ifdef TRANSPORT_MAGIC vtr_req_fail_f h2_req_fail; #endif -/* cache_http2_session.c */ -void -H2S_Lock_VSLb(const struct h2_sess *, enum VSL_tag_e, const char *, ...); +/* cache_http2_reqbody.c */ +h2_error h2_reqbody_data(struct worker *, struct h2_sess *, struct h2_req *); +void h2_reqbody(struct req *); +void h2_reqbody_kick(struct h2_req *r2); diff --git a/bin/varnishd/http2/cache_http2_deliver.c b/bin/varnishd/http2/cache_http2_deliver.c index 4013012436..a3bc571a5e 100644 --- a/bin/varnishd/http2/cache_http2_deliver.c +++ b/bin/varnishd/http2/cache_http2_deliver.c @@ -73,7 +73,7 @@ V2D_Init(void) /**********************************************************************/ static int v_matchproto_(vdp_init_f) -h2_init(VRT_CTX, struct vdp_ctx *vdc, void **priv) +h2_vdp_init(VRT_CTX, struct vdp_ctx *vdc, void **priv) { struct h2_req *r2; @@ -86,57 +86,67 @@ h2_init(VRT_CTX, struct vdp_ctx *vdc, void **priv) } static int v_matchproto_(vdp_fini_f) -h2_fini(struct vdp_ctx *vdc, void **priv) +h2_vdp_fini(struct vdp_ctx *vdc, void **priv) { struct h2_req *r2; + h2_error h2e = NULL; CHECK_OBJ_NOTNULL(vdc, VDP_CTX_MAGIC); CHECK_OBJ_NOTNULL(vdc->wrk, WORKER_MAGIC); TAKE_OBJ_NOTNULL(r2, 
priv, H2_REQ_MAGIC); - if (r2->error) - return (0); - if (vdc->retval < 0) { - r2->error = H2SE_INTERNAL_ERROR; /* XXX: proper error? */ - H2_Send_Get(vdc->wrk, r2->h2sess, r2); - H2_Send_RST(vdc->wrk, r2->h2sess, r2, r2->stream, r2->error); - H2_Send_Rel(r2->h2sess, r2); - return (0); + h2e = H2SE_INTERNAL_ERROR; + h2_async_error(r2, h2e); + } else + h2e = h2_errcheck(r2); + + if (h2e != NULL) + VSLb(vdc->vsl, SLT_Error, "H2: delivery error (%s)", h2e->name); + + if (h2e == NULL && r2->state < H2_S_CLOSED) { + /* Not all VDPs will always send VDP_END (e.g. ESI). End + * the stream here if necessary. */ + H2_Send(vdc->vsl, r2, H2_F_DATA, H2FF_END_STREAM, 0, NULL); } - H2_Send_Get(vdc->wrk, r2->h2sess, r2); - H2_Send(vdc->wrk, r2, H2_F_DATA, H2FF_DATA_END_STREAM, 0, "", NULL); - H2_Send_Rel(r2->h2sess, r2); return (0); } static int v_matchproto_(vdp_bytes_f) -h2_bytes(struct vdp_ctx *vdc, enum vdp_action act, void **priv, +h2_vdp_bytes(struct vdp_ctx *vdc, enum vdp_action act, void **priv, const void *ptr, ssize_t len) { struct h2_req *r2; + uint8_t flags = H2FF_NONE; CHECK_OBJ_NOTNULL(vdc, VDP_CTX_MAGIC); CAST_OBJ_NOTNULL(r2, *priv, H2_REQ_MAGIC); - (void)act; + assert(len >= 0); - if ((r2->h2sess->error || r2->error)) + if (h2_errcheck(r2) != NULL) return (-1); - if (len == 0) + vdc->bytes_done = len; + if (len == 0) { + /* No reason to send an empty frame. There is code + * (notably ESI) that will pass len==0 without + * VDP_END. An incomplete delivery will result in + * the len==0 && VDP_END combo, deferring the final + * DATA frame to the h2_vdp_fini() call. */ return (0); - H2_Send_Get(vdc->wrk, r2->h2sess, r2); - vdc->bytes_done = 0; - H2_Send(vdc->wrk, r2, H2_F_DATA, H2FF_NONE, len, ptr, &vdc->bytes_done); - H2_Send_Rel(r2->h2sess, r2); + } + if (act == VDP_END) + flags |= H2FF_END_STREAM; + // XXX? 
return (H2_Send(...)); + H2_Send(vdc->vsl, r2, H2_F_DATA, flags, len, ptr); return (0); } static const struct vdp h2_vdp = { .name = "H2B", - .init = h2_init, - .bytes = h2_bytes, - .fini = h2_fini, + .init = h2_vdp_init, + .bytes = h2_vdp_bytes, + .fini = h2_vdp_fini, }; static inline size_t @@ -170,6 +180,7 @@ h2_minimal_response(struct req *req, uint16_t status) struct h2_req *r2; size_t l; uint8_t buf[6]; + uint8_t flags; CHECK_OBJ_NOTNULL(req, REQ_MAGIC); CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC); @@ -189,14 +200,10 @@ h2_minimal_response(struct req *req, uint16_t status) req->err_code = status; /* XXX return code checking once H2_Send returns anything but 0 */ - H2_Send_Get(req->wrk, r2->h2sess, r2); - H2_Send(req->wrk, r2, - H2_F_HEADERS, - H2FF_HEADERS_END_HEADERS | - (status < 200 ? 0 : H2FF_HEADERS_END_STREAM), - l, buf, NULL); - H2_Send_Rel(r2->h2sess, r2); - return (0); + flags = H2FF_END_HEADERS; + if (status >= 200) + flags |= H2FF_END_STREAM; + return (H2_Send(req->vsl, r2, H2_F_HEADERS, flags, l, buf)); } static void @@ -302,6 +309,7 @@ h2_deliver(struct req *req, int sendbody) struct vsb resp[1]; struct vrt_ctx ctx[1]; uintptr_t ss; + uint8_t flags; CHECK_OBJ_NOTNULL(req, REQ_MAGIC); CHECK_OBJ_NOTNULL(req->objcore, OBJCORE_MAGIC); @@ -332,11 +340,10 @@ h2_deliver(struct req *req, int sendbody) r2->t_send = req->t_prev; - H2_Send_Get(req->wrk, r2->h2sess, r2); - H2_Send(req->wrk, r2, H2_F_HEADERS, - (sendbody ? 
0 : H2FF_HEADERS_END_STREAM) | H2FF_HEADERS_END_HEADERS, - sz, r, &req->acct.resp_hdrbytes); - H2_Send_Rel(r2->h2sess, r2); + flags = H2FF_END_HEADERS; + if (!sendbody) + flags |= H2FF_END_STREAM; + H2_Send(req->vsl, r2, H2_F_HEADERS, flags, sz, r); WS_Reset(req->ws, ss); diff --git a/bin/varnishd/http2/cache_http2_hpack.c b/bin/varnishd/http2/cache_http2_hpack.c index a90e6fde23..ae3709985e 100644 --- a/bin/varnishd/http2/cache_http2_hpack.c +++ b/bin/varnishd/http2/cache_http2_hpack.c @@ -260,25 +260,31 @@ h2h_addhdr(struct http *hp, struct h2h_decode *d) return (0); } -static void -h2h_decode_init(const struct h2_sess *h2, struct ws *ws) +void +h2h_decode_hdr_init(struct h2_sess *h2, struct h2_req *r2) { struct h2h_decode *d; CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(ws, WS_MAGIC); + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->req->http, HTTP_MAGIC); + + AZ(h2->hpack_lock); + h2->hpack_lock = r2; AN(h2->decode); d = h2->decode; INIT_OBJ(d, H2H_DECODE_MAGIC); VHD_Init(d->vhd); - d->out_l = WS_ReserveSize(ws, cache_param->http_req_size); + d->out_l = WS_ReserveSize(h2->hpack_lock->req->http->ws, + cache_param->http_req_size); /* * Can't do any work without any buffer * space. Require non-zero size. */ XXXAN(d->out_l); - d->out = WS_Reservation(ws); + d->out = WS_Reservation(h2->hpack_lock->req->http->ws); if (cache_param->h2_max_header_list_size == 0) d->limit = @@ -288,18 +294,6 @@ h2h_decode_init(const struct h2_sess *h2, struct ws *ws) if (d->limit < h2->local_settings.max_header_list_size) d->limit = INT64_MAX; - - d->ws = ws; -} - -void -h2h_decode_hdr_init(const struct h2_sess *h2) -{ - - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC); - CHECK_OBJ_NOTNULL(h2->new_req->http, HTTP_MAGIC); - h2h_decode_init(h2, h2->new_req->ws); } /* Possible error returns: @@ -311,32 +305,34 @@ h2h_decode_hdr_init(const struct h2_sess *h2) * is a stream level error. 
*/ h2_error -h2h_decode_hdr_fini(const struct h2_sess *h2) +h2h_decode_hdr_fini(struct h2_sess *h2) { h2_error ret; struct h2h_decode *d; CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); d = h2->decode; - CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC); + CHECK_OBJ_NOTNULL(h2->hpack_lock, H2_REQ_MAGIC); CHECK_OBJ_NOTNULL(d, H2H_DECODE_MAGIC); - WS_ReleaseP(d->ws, d->out); + WS_ReleaseP(h2->hpack_lock->req->http->ws, d->out); if (d->vhd_ret != VHD_OK) { /* HPACK header block didn't finish at an instruction boundary */ - VSLb(h2->new_req->http->vsl, SLT_BogoHeader, + VSLb(h2->hpack_lock->req->http->vsl, SLT_BogoHeader, "HPACK compression error/fini (%s)", VHD_Error(d->vhd_ret)); ret = H2CE_COMPRESSION_ERROR; } else if (d->error == NULL && !d->has_scheme) { - H2S_Lock_VSLb(h2, SLT_Debug, "Missing :scheme"); + VSLb(h2->vsl, SLT_Debug, "Missing :scheme"); ret = H2SE_MISSING_SCHEME; //rfc7540,l,3087,3090 } else ret = d->error; FINI_OBJ(d); if (ret == H2SE_REQ_SIZE) { - VSLb(h2->new_req->http->vsl, SLT_LostHeader, + VSLb(h2->hpack_lock->req->http->vsl, SLT_LostHeader, "Header list too large"); } + h2->hpack_lock = NULL; + return (ret); } @@ -357,15 +353,15 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l) const char *r, *e; CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(h2->new_req, REQ_MAGIC); - hp = h2->new_req->http; + CHECK_OBJ_NOTNULL(h2->hpack_lock, H2_REQ_MAGIC); + hp = h2->hpack_lock->req->http; CHECK_OBJ_NOTNULL(hp, HTTP_MAGIC); d = h2->decode; CHECK_OBJ_NOTNULL(d, H2H_DECODE_MAGIC); - CHECK_OBJ_NOTNULL(d->ws, WS_MAGIC); - r = WS_Reservation(d->ws); + CHECK_OBJ_NOTNULL(h2->hpack_lock->req->http->ws, WS_MAGIC); + r = WS_Reservation(h2->hpack_lock->req->http->ws); AN(r); - e = r + WS_ReservationSize(d->ws); + e = r + WS_ReservationSize(h2->hpack_lock->req->http->ws); /* Only H2E_ENHANCE_YOUR_CALM indicates that we should continue processing. 
Other errors should have been returned and handled @@ -380,7 +376,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l) d->out, d->out_l, &d->out_u); if (d->vhd_ret < 0) { - H2S_Lock_VSLb(h2, SLT_BogoHeader, + VSLb(h2->vsl, SLT_BogoHeader, "HPACK compression error (%s)", VHD_Error(d->vhd_ret)); d->error = H2CE_COMPRESSION_ERROR; @@ -440,7 +436,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l) } if (H2_ERROR_MATCH(d->error, H2SE_ENHANCE_YOUR_CALM)) { - d->out = WS_Reservation(d->ws); + d->out = WS_Reservation(h2->hpack_lock->req->http->ws); d->out_l = e - d->out; d->limit -= d->out_u; d->out_u = 0; @@ -452,7 +448,7 @@ h2h_decode_bytes(struct h2_sess *h2, const uint8_t *in, size_t in_l) if (d->limit < 0) { /* Fatal error, the client exceeded both http_req_size * and h2_max_header_list_size. */ - H2S_Lock_VSLb(h2, SLT_SessError, "Header list too large"); + VSLb(h2->vsl, SLT_SessError, "Header list too large"); return (H2CE_ENHANCE_YOUR_CALM); } diff --git a/bin/varnishd/http2/cache_http2_panic.c b/bin/varnishd/http2/cache_http2_panic.c index 227097e6b4..6585485037 100644 --- a/bin/varnishd/http2/cache_http2_panic.c +++ b/bin/varnishd/http2/cache_http2_panic.c @@ -78,10 +78,10 @@ h2_sess_panic(struct vsb *vsb, const struct sess *sp) return; VSB_printf(vsb, "refcnt = %d, bogosity = %d, error = %s\n", h2->refcnt, h2->bogosity, h2_panic_error(h2->error)); - VSB_printf(vsb, - "open_streams = %d, highest_stream = %u," - " goaway_last_stream = %u,\n", - h2->open_streams, h2->highest_stream, h2->goaway_last_stream); + VSB_printf(vsb, "open_streams = %d, highest_stream = %u,\n", + h2->open_streams, h2->highest_stream); + VSB_printf(vsb, "tx_window = %jd, rx_window = %jd,\n", + h2->tx_window, h2->rx_window); VSB_cat(vsb, "local_settings = {"); h2_panic_settings(vsb, &h2->local_settings); VSB_cat(vsb, "},\n"); @@ -107,10 +107,10 @@ h2_sess_panic(struct vsb *vsb, const struct sess *sp) VSB_printf(vsb, "h2_sess = %p, scheduled = %d, 
error = %s,\n", r2->h2sess, r2->scheduled, h2_panic_error(r2->error)); - VSB_printf(vsb, "t_send = %f, t_winupd = %f,\n", - r2->t_send, r2->t_winupd); - VSB_printf(vsb, "t_window = %jd, r_window = %jd,\n", - (intmax_t)r2->t_window, (intmax_t)r2->r_window); + VSB_printf(vsb, "t_send = %f, t_win_low = %f,\n", + r2->t_send, r2->t_win_low); + VSB_printf(vsb, "tx_window = %jd, rx_window = %jd,\n", + (intmax_t)r2->tx_window, (intmax_t)r2->rx_window); if (!PAN_dump_struct(vsb, r2->rxbuf, H2_RXBUF_MAGIC, "rxbuf")) { VSB_printf(vsb, "stvbuf = %p,\n", r2->rxbuf->stvbuf); diff --git a/bin/varnishd/http2/cache_http2_proto.c b/bin/varnishd/http2/cache_http2_proto.c index 254275ebd3..78bcc2196a 100644 --- a/bin/varnishd/http2/cache_http2_proto.c +++ b/bin/varnishd/http2/cache_http2_proto.c @@ -31,11 +31,11 @@ #include "config.h" -#include "cache/cache_varnishd.h" - +#include #include #include +#include "cache/cache_varnishd.h" #include "cache/cache_transport.h" #include "cache/cache_filter.h" #include "http2/cache_http2.h" @@ -74,11 +74,11 @@ enum h2frame { #include "tbl/h2_frames.h" }; -static const char * -h2_framename(enum h2frame h2f) +const char * +h2_framename(int frame) { - switch (h2f) { + switch (frame) { #define H2_FRAME(l,u,t,f,...) 
case H2F_##u: return (#u); #include "tbl/h2_frames.h" default: @@ -141,17 +141,32 @@ h2_connectionerror(uint32_t u) return (H2NN_ERROR); } +h2_error +h2_errcheck(const struct h2_req *r2) +{ + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); + + if (r2->error != NULL) + return (r2->error); + return (r2->h2sess->error); +} + /**********************************************************************/ struct h2_req * -h2_new_req(struct h2_sess *h2, unsigned stream, struct req *req) +h2_new_req(struct h2_sess *h2, unsigned stream, struct req **preq) { + struct req *req; struct h2_req *r2; - ASSERT_RXTHR(h2); - if (req == NULL) + ASSERT_H2_SESS(h2); + if (preq != NULL) + TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC); + else { req = Req_New(h2->sess, NULL); - CHECK_OBJ_NOTNULL(req, REQ_MAGIC); + CHECK_OBJ_NOTNULL(req, REQ_MAGIC); + } r2 = WS_Alloc(req->ws, sizeof *r2); AN(r2); @@ -160,42 +175,37 @@ h2_new_req(struct h2_sess *h2, unsigned stream, struct req *req) r2->h2sess = h2; r2->stream = stream; r2->req = req; - if (stream) - r2->counted = 1; - r2->r_window = h2->local_settings.initial_window_size; - r2->t_window = h2->remote_settings.initial_window_size; + r2->rx_window = h2->local_settings.initial_window_size; + r2->tx_window = h2->remote_settings.initial_window_size; req->transport_priv = r2; - Lck_Lock(&h2->sess->mtx); - if (stream) + if (stream > 0) h2->open_streams++; VTAILQ_INSERT_TAIL(&h2->streams, r2, list); - Lck_Unlock(&h2->sess->mtx); h2->refcnt++; return (r2); } -void -h2_del_req(struct worker *wrk, struct h2_req *r2) +static void +h2_del_req(struct worker *wrk, struct h2_req **pr2) { + struct h2_req *r2; struct h2_sess *h2; struct sess *sp; struct stv_buffer *stvbuf; + struct req *req; - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + TAKE_OBJ_NOTNULL(r2, pr2, H2_REQ_MAGIC); AZ(r2->scheduled); h2 = r2->h2sess; - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); sp = h2->sess; - Lck_Lock(&sp->mtx); 
assert(h2->refcnt > 0); --h2->refcnt; /* XXX: PRIORITY reshuffle */ VTAILQ_REMOVE(&h2->streams, r2, list); - if (r2->req == h2->new_req) - h2->new_req = NULL; - Lck_Unlock(&sp->mtx); + assert(r2->t_win_low == 0.); + AZ(r2->reqbody_waiter); assert(!WS_IsReserved(r2->req->ws)); AZ(r2->req->ws->r); @@ -207,47 +217,88 @@ h2_del_req(struct worker *wrk, struct h2_req *r2) AZ(stvbuf); } - Req_Cleanup(sp, wrk, r2->req); + req = r2->req; + CHECK_OBJ_NOTNULL(req, REQ_MAGIC); + r2->magic = 0; + req->transport_priv = NULL; + + AZ(req->ws->r); + Req_Cleanup(sp, wrk, req); if (FEATURE(FEATURE_BUSY_STATS_RATE)) WRK_AddStat(wrk); - Req_Release(r2->req); + Req_Release(req); } void -h2_kill_req(struct worker *wrk, struct h2_sess *h2, - struct h2_req *r2, h2_error h2e) +h2_kill_req(struct worker *wrk, struct h2_sess *h2, struct h2_req **pr2, + h2_error h2e) { + struct h2_req *r2; - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); + TAKE_OBJ_NOTNULL(r2, pr2, H2_REQ_MAGIC); AN(h2e); - Lck_Lock(&h2->sess->mtx); - VSLb(h2->vsl, SLT_Debug, "KILL st=%u state=%d sched=%d", - r2->stream, r2->state, r2->scheduled); - if (r2->counted) { - assert(h2->open_streams > 0); - h2->open_streams--; - r2->counted = 0; + + VSLb(h2->vsl, SLT_Debug, "KILL st=%u state=%d sched=%d error=%d", + r2->stream, r2->state, r2->scheduled, h2e->val); + + if (h2->error != NULL) { + /* The connection is in an error state. Don't send RST. */ + } else if (r2->error == NULL && r2->state < H2_S_CLOSED) { + /* Notify the peer only the first time it is killed. */ + H2_Send_RST(h2, r2->stream, h2e); } - if (r2->error == NULL) + + if (r2->error == NULL || H2_ERROR_MATCH(r2->error, H2SE_NO_ERROR)) { + /* We latch the first error set, except if it was a "no + * error". */ r2->error = h2e; + } + + if (r2 == h2->hpack_lock) { + /* We are killing the request that holds the hpack + * context. This is a hard error. 
*/ + (void)h2h_decode_hdr_fini(h2); + AZ(h2->hpack_lock); + if (h2->error == NULL) + h2->error = H2CE_COMPRESSION_ERROR; + } + + if (r2->t_win_low != 0.) { + assert(h2->win_low_streams > 0); + h2->win_low_streams--; + r2->t_win_low = 0.; + } + + h2_stream_setstate(r2, H2_S_CLOSED); + + Lck_Lock(&h2->sess->mtx); if (r2->scheduled) { - if (r2->cond != NULL) - PTOK(pthread_cond_signal(r2->cond)); - r2 = NULL; + h2_reqbody_kick(r2); Lck_Unlock(&h2->sess->mtx); } else { Lck_Unlock(&h2->sess->mtx); - if (r2->state == H2_S_OPEN && h2->new_req == r2->req) - (void)h2h_decode_hdr_fini(h2); + h2_del_req(wrk, &r2); + } +} + +static void +h2_kill_all(struct worker *wrk, struct h2_sess *h2, h2_error h2e) +{ + struct h2_req *r2, *r22; + + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + AN(h2e); + VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) { + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + h2_kill_req(wrk, h2, &r2, h2e); } - if (r2 != NULL) - h2_del_req(wrk, r2); } /**********************************************************************/ static void -h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len) +h2_rxframe_vsl(const struct h2_sess *h2, const void *ptr, size_t len) { const uint8_t *b; struct vsb *vsb; @@ -275,18 +326,12 @@ h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len) VSB_quote(vsb, b + 4, 1, VSB_QUOTE_HEX); VSB_putc(vsb, ' '); VSB_quote(vsb, b + 5, 4, VSB_QUOTE_HEX); - if (u > 0) { - VSB_putc(vsb, ' '); - VSB_quote(vsb, b + 9, len - 9, VSB_QUOTE_HEX); - } AZ(VSB_finish(vsb)); - Lck_Lock(&h2->sess->mtx); VSLb_bin(h2->vsl, SLT_H2RxHdr, 9, b); if (len > 9) VSLb_bin(h2->vsl, SLT_H2RxBody, len - 9, b + 9); VSLb(h2->vsl, SLT_Debug, "H2RXF %s", VSB_data(vsb)); - Lck_Unlock(&h2->sess->mtx); VSB_destroy(&vsb); } @@ -297,25 +342,24 @@ h2_vsl_frame(const struct h2_sess *h2, const void *ptr, size_t len) static h2_error v_matchproto_(h2_rxframe_f) h2_rx_ping(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { + uint64_t val; CHECK_OBJ_NOTNULL(wrk, 
WORKER_MAGIC); - ASSERT_RXTHR(h2); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - assert(r2 == h2->req0); + ASSERT_H2_SESS(h2); + AZ(r2); - if (h2->rxf_len != 8) { // rfc7540,l,2364,2366 - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx ping with (len != 8)"); + if (h2->rxf_len != 8) { // rfc7540,l,2364,2366 + VSLb(h2->vsl, SLT_SessError, "H2: rx ping with (len != 8)"); return (H2CE_FRAME_SIZE_ERROR); } AZ(h2->rxf_stream); // rfc7540,l,2359,2362 if (h2->rxf_flags != 0) { // We never send pings - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx ping ack"); + VSLb(h2->vsl, SLT_SessError, "H2: rx ping ack"); return (H2SE_PROTOCOL_ERROR); } - H2_Send_Get(wrk, h2, r2); - H2_Send_Frame(wrk, h2, - H2_F_PING, H2FF_PING_ACK, 8, 0, h2->rxf_data); - H2_Send_Rel(h2, r2); + _Static_assert(sizeof (val) == 8, ""); + memcpy(&val, h2->rxf_data, sizeof val); + H2_Send_PING(h2, H2FF_ACK, val); return (0); } @@ -327,25 +371,25 @@ h2_rx_push_promise(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); // rfc7540,l,2262,2267 - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx push promise"); + VSLb(h2->vsl, SLT_SessError, "H2: rx push promise"); return (H2CE_PROTOCOL_ERROR); } /********************************************************************** */ -static h2_error +static int h2_rapid_reset(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { vtim_real now; vtim_dur d; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); if (h2->rapid_reset_limit == 0) @@ -364,12 +408,10 @@ h2_rapid_reset(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) h2->rapid_reset_limit); h2->last_rst = now; - if (h2->rst_budget < 1.0) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: Hit RST limit. 
Closing session."); - return (H2CE_RAPID_RESET); - } h2->rst_budget -= 1.0; - return (0); + if (h2->rst_budget > 0) + return (0); + return (1); } static h2_error v_matchproto_(h2_rxframe_f) @@ -378,17 +420,38 @@ h2_rx_rst_stream(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) h2_error h2e; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); if (h2->rxf_len != 4) { // rfc7540,l,2003,2004 - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx rst with (len != 4)"); + VSLb(h2->vsl, SLT_SessError, "H2: rx rst with (len != 4)"); return (H2CE_FRAME_SIZE_ERROR); } if (r2 == NULL) return (0); - h2e = h2_rapid_reset(wrk, h2, r2); - h2_kill_req(wrk, h2, r2, h2_streamerror(vbe32dec(h2->rxf_data))); + + h2e = h2_streamerror(vbe32dec(h2->rxf_data)); + AN(h2e); + if (h2e == H2NN_ERROR) { + /* The error is unknown. We don't want to return + * H2NN_ERROR from this function because that will cause + * us to close the connection. Map the unknown error to + * H2SE_INTERNAL_ERROR as suggested by the RFC. */ + /* rfc7540,l,2839,2841 */ + h2e = H2SE_INTERNAL_ERROR; + } + + /* We set `r2->error` prior to returning to prevent sending a RST in + * return. */ + if (r2->error == NULL) + r2->error = h2e; + + if (h2_rapid_reset(wrk, h2, r2)) { + /* Upgrading to a connection level error. */ + VSLb(h2->vsl, SLT_Error, "H2: Hit RST limit. 
Closing session."); + h2e = H2CE_RAPID_RESET; + } + return (h2e); } @@ -398,36 +461,22 @@ h2_rx_rst_stream(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) static h2_error v_matchproto_(h2_rxframe_f) h2_rx_goaway(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { + h2_error h2e; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - assert(r2 == h2->req0); - - h2->goaway = 1; - h2->goaway_last_stream = vbe32dec(h2->rxf_data); - h2->error = h2_connectionerror(vbe32dec(h2->rxf_data + 4)); - H2S_Lock_VSLb(h2, SLT_Debug, "GOAWAY %s", h2->error->name); - return (h2->error); -} - -static void -h2_tx_goaway(struct worker *wrk, struct h2_sess *h2, h2_error h2e) -{ - char b[8]; + ASSERT_H2_SESS(h2); + AZ(r2); - ASSERT_RXTHR(h2); + h2e = h2_connectionerror(vbe32dec(h2->rxf_data + 4)); AN(h2e); - if (h2->goaway || !h2e->send_goaway) - return; - - h2->goaway = 1; - vbe32enc(b, h2->highest_stream); - vbe32enc(b + 4, h2e->val); - H2_Send_Get(wrk, h2, h2->req0); - H2_Send_Frame(wrk, h2, H2_F_GOAWAY, 0, 8, 0, b); - H2_Send_Rel(h2, h2->req0); + VSLb(h2->vsl, SLT_Debug, "GOAWAY %s", h2e->name); /* XXX: Remove? */ + if (!H2_ERROR_MATCH(h2e, H2CE_NO_ERROR)) { + /* XXX: Should we log something (not SLT_Error) on a + * graceful shutdown? 
*/ + VSLb(h2->vsl, SLT_Error, "H2: rx goaway %s", h2e->name); + } + return (H2CE_NO_ERROR); } /********************************************************************** @@ -439,27 +488,41 @@ h2_rx_window_update(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) uint32_t wu; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); if (h2->rxf_len != 4) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx winup with (len != 4)"); + VSLb(h2->vsl, SLT_SessError, "H2: rx winup with (len != 4)"); return (H2CE_FRAME_SIZE_ERROR); } wu = vbe32dec(h2->rxf_data) & ~(1LU<<31); - if (wu == 0) - return (H2SE_PROTOCOL_ERROR); - if (r2 == NULL) - return (0); - Lck_Lock(&h2->sess->mtx); - r2->t_window += wu; - if (r2 == h2->req0) - PTOK(pthread_cond_broadcast(h2->winupd_cond)); - else if (r2->cond != NULL) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - if (r2->t_window >= (1LL << 31)) - return (H2SE_FLOW_CONTROL_ERROR); + if (h2->rxf_stream == 0) { + AZ(r2); + if (wu == 0) + return (H2CE_PROTOCOL_ERROR); + h2->tx_window += wu; + if (h2->tx_window >= (1LL << 31)) + return (H2CE_FLOW_CONTROL_ERROR); + } else { + if (wu == 0) + return (H2SE_PROTOCOL_ERROR); + if (r2 == NULL) { + /* Window update received for a stream we are no + * longer tracking. We MUST ignore this. + * rfc7540,l,2583,2586 */ + return (0); + } + r2->tx_window += wu; + if (r2->tx_window >= (1LL << 31)) + return (H2SE_FLOW_CONTROL_ERROR); + if (r2->t_win_low != 0.) { + assert(h2->win_low_streams > 0); + h2->win_low_streams--; + r2->t_win_low = 0.; + } + } + /* Assume we are no longer stuck on output. 
*/ + h2->tx_l_stuck = 0; return (0); } @@ -474,7 +537,7 @@ h2_rx_priority(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); return (0); } @@ -511,12 +574,9 @@ h2_win_adjust(const struct h2_sess *h2, uint32_t oldval, uint32_t newval) { struct h2_req *r2; - Lck_AssertHeld(&h2->sess->mtx); // rfc7540,l,2668,2674 VTAILQ_FOREACH(r2, &h2->streams, list) { CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - if (r2 == h2->req0) - continue; // rfc7540,l,2699,2699 switch (r2->state) { case H2_S_IDLE: case H2_S_OPEN: @@ -525,7 +585,7 @@ h2_win_adjust(const struct h2_sess *h2, uint32_t oldval, uint32_t newval) * We allow a window to go negative, as per * rfc7540,l,2676,2680 */ - r2->t_window += (int64_t)newval - oldval; + r2->tx_window += (int64_t)newval - oldval; break; default: break; @@ -544,22 +604,25 @@ h2_set_setting(struct h2_sess *h2, const uint8_t *d) y = vbe32dec(d + 2); if (x >= H2_SETTING_TBL_LEN || h2_setting_tbl[x] == NULL) { // rfc7540,l,2181,2182 - H2S_Lock_VSLb(h2, SLT_Debug, + VSLb(h2->vsl, SLT_Debug, "H2SETTING unknown setting 0x%04x=%08x (ignored)", x, y); return (0); } s = h2_setting_tbl[x]; AN(s); if (y < s->minval || y > s->maxval) { - H2S_Lock_VSLb(h2, SLT_Debug, "H2SETTING invalid %s=0x%08x", + VSLb(h2->vsl, SLT_Debug, "H2SETTING invalid %s=0x%08x", s->name, y); AN(s->range_error); if (!DO_DEBUG(DBG_H2_NOCHECK)) return (s->range_error); } Lck_Lock(&h2->sess->mtx); - if (s == H2_SET_INITIAL_WINDOW_SIZE) + if (s == H2_SET_INITIAL_WINDOW_SIZE) { h2_win_adjust(h2, h2->remote_settings.initial_window_size, y); + /* Assume we are no longer stuck on output. 
*/ + h2->tx_l_stuck = 0; + } VSLb(h2->vsl, SLT_Debug, "H2SETTING %s=0x%08x", s->name, y); Lck_Unlock(&h2->sess->mtx); AN(s->setfunc); @@ -575,21 +638,20 @@ h2_rx_settings(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) h2_error retval = 0; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - assert(r2 == h2->req0); + ASSERT_H2_SESS(h2); AZ(h2->rxf_stream); + AZ(r2); - if (h2->rxf_flags == H2FF_SETTINGS_ACK) { + if (h2->rxf_flags == H2FF_ACK) { if (h2->rxf_len > 0) { // rfc7540,l,2047,2049 - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx settings ack with " + VSLb(h2->vsl, SLT_SessError, "H2: rx settings ack with " "(len > 0)"); return (H2CE_FRAME_SIZE_ERROR); } return (0); } else { if (h2->rxf_len % 6) { // rfc7540,l,2062,2064 - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx settings with " + VSLb(h2->vsl, SLT_SessError, "H2: rx settings with " "((len %% 6) != 0)"); return (H2CE_PROTOCOL_ERROR); } @@ -599,10 +661,7 @@ h2_rx_settings(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) if (retval) return (retval); } - H2_Send_Get(wrk, h2, r2); - H2_Send_Frame(wrk, h2, - H2_F_SETTINGS, H2FF_SETTINGS_ACK, 0, 0, NULL); - H2_Send_Rel(h2, r2); + H2_Send_SETTINGS(h2, H2FF_ACK, 0, NULL); } return (0); } @@ -631,8 +690,7 @@ h2_do_req(struct worker *wrk, void *priv) CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); Lck_Lock(&h2->sess->mtx); r2->scheduled = 0; - r2->state = H2_S_CLOSED; - r2->h2sess->do_sweep = 1; + h2_attention(h2); Lck_Unlock(&h2->sess->mtx); } THR_SetRequest(NULL); @@ -645,14 +703,20 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2, h2_error h2e; ssize_t cl; - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); + assert(h2->hpack_lock == r2); assert(r2->state == H2_S_OPEN); h2e = h2h_decode_hdr_fini(h2); - h2->new_req = NULL; + AZ(h2->hpack_lock); + + if (req->req_body_status == BS_NONE) { + /* BS_NONE implies that the HEADERS frame had flag + * END_STREAM set. 
*/ + h2_stream_setstate(r2, H2_S_CLOS_REM); + } if (h2e != NULL) { - H2S_Lock_VSLb(h2, SLT_Debug, "HPACK/FINI %s", h2e->name); + VSLb(h2->vsl, SLT_Debug, "HPACK/FINI %s", h2e->name); assert(!WS_IsReserved(r2->req->ws)); - h2_del_req(wrk, r2); return (h2e); } req->t_req = VTIM_real(); @@ -666,7 +730,7 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2, cl = http_GetContentLength(req->http); assert(cl >= -2); if (cl == -2) { - H2S_Lock_VSLb(h2, SLT_Debug, "Non-parseable Content-Length"); + VSLb(h2->vsl, SLT_Debug, "Non-parseable Content-Length"); return (H2SE_PROTOCOL_ERROR); } @@ -689,19 +753,19 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2, assert (req->req_body_status == BS_NONE); r2->state = H2_S_CLOS_REM; if (cl > 0) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx header with END_STREAM " + VSLb(h2->vsl, SLT_SessError, "H2: rx header with END_STREAM " "and content-length > 0"); return (H2CE_PROTOCOL_ERROR); //rfc7540,l,1838,1840 } } if (req->http->hd[HTTP_HDR_METHOD].b == NULL) { - H2S_Lock_VSLb(h2, SLT_Debug, "Missing :method"); + VSLb(h2->vsl, SLT_Debug, "Missing :method"); return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3087,3090 } if (req->http->hd[HTTP_HDR_URL].b == NULL) { - H2S_Lock_VSLb(h2, SLT_Debug, "Missing :path"); + VSLb(h2->vsl, SLT_Debug, "Missing :path"); return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3087,3090 } @@ -710,7 +774,7 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2, if (*req->http->hd[HTTP_HDR_URL].b == '*' && (Tlen(req->http->hd[HTTP_HDR_METHOD]) != 7 || strncmp(req->http->hd[HTTP_HDR_METHOD].b, "OPTIONS", 7))) { - H2S_Lock_VSLb(h2, SLT_BogoHeader, "Illegal :path pseudo-header"); + VSLb(h2->vsl, SLT_BogoHeader, "Illegal :path pseudo-header"); return (H2SE_PROTOCOL_ERROR); //rfc7540,l,3068,3071 } @@ -719,15 +783,46 @@ h2_end_headers(struct worker *wrk, struct h2_sess *h2, VCL_TaskEnter(req->top->privs); req->task->func = h2_do_req; req->task->priv = req; + + /* NB: we don't need to guard the read of h2->open_streams 
because + * headers are handled sequentially so it cannot increase under our + * feet. + */ + if (h2->open_streams > (int)h2->local_settings.max_concurrent_streams) { + VSLb(h2->vsl, SLT_Debug, + "H2: stream %u: Hit maximum number of concurrent streams", + h2->rxf_stream); + return (H2SE_REFUSED_STREAM); // rfc7540,l,1200,1205 + } + r2->scheduled = 1; if (Pool_Task(wrk->pool, req->task, TASK_QUEUE_STR) != 0) { r2->scheduled = 0; - r2->state = H2_S_CLOSED; return (H2SE_REFUSED_STREAM); //rfc7540,l,3326,3329 } return (0); } +static h2_error +h2_decode_headers(struct h2_sess *h2, struct h2_req *r2, + const void *p, size_t l) +{ + h2_error h2e; + + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + assert(h2->hpack_lock == r2); + + h2e = h2h_decode_bytes(h2, p, l); + r2->req->acct.req_hdrbytes += l; + + if (h2e != NULL) { + VSLb(h2->vsl, SLT_Debug, "HPACK(%s) %s", + h2_framename(h2->rxf_type), h2e->name); + } + + return (h2e); +} + static h2_error v_matchproto_(h2_rxframe_f) h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { @@ -737,33 +832,10 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) size_t l; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); - - if (r2 != NULL) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers on non-idle stream"); - return (H2CE_PROTOCOL_ERROR); // rfc9113,l,887,891 - } - - if (h2->rxf_stream <= h2->highest_stream) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: new stream ID < highest stream"); - return (H2CE_PROTOCOL_ERROR); // rfc7540,l,1153,1158 - } - /* NB: we don't need to guard the read of h2->open_streams - * because headers are handled sequentially so it cannot - * increase under our feet. 
- */ - if (h2->open_streams >= - (int)h2->local_settings.max_concurrent_streams) { - H2S_Lock_VSLb(h2, SLT_Debug, - "H2: stream %u: Hit maximum number of " - "concurrent streams", h2->rxf_stream); - return (H2SE_REFUSED_STREAM); // rfc7540,l,1200,1205 - } - h2->highest_stream = h2->rxf_stream; - r2 = h2_new_req(h2, h2->rxf_stream, NULL); + ASSERT_H2_SESS(h2); CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); assert(r2->state == H2_S_IDLE); - r2->state = H2_S_OPEN; + h2_stream_setstate(r2, H2_S_OPEN); req = r2->req; CHECK_OBJ_NOTNULL(req, REQ_MAGIC); @@ -772,7 +844,6 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) VSLb(req->vsl, SLT_Begin, "req %ju rxreq", VXID(req->sp->vxid)); VSL(SLT_Link, req->sp->vxid, "req %ju rxreq", VXID(req->vsl->wid)); - h2->new_req = req; req->sp = h2->sess; req->transport = &HTTP2_transport; @@ -784,42 +855,42 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) HTTP_Setup(req->http, req->ws, req->vsl, SLT_ReqMethod); http_SetH(req->http, HTTP_HDR_PROTO, "HTTP/2.0"); - h2h_decode_hdr_init(h2); + h2h_decode_hdr_init(h2, r2); p = h2->rxf_data; l = h2->rxf_len; - if (h2->rxf_flags & H2FF_HEADERS_PADDED) { + if (h2->rxf_flags & H2FF_PADDED) { if (*p + 1 > l) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers with pad length > frame len"); + VSLb(h2->vsl, SLT_SessError, "H2: rx headers with pad length > frame len"); return (H2CE_PROTOCOL_ERROR); // rfc7540,l,1884,1887 } l -= 1 + *p; p += 1; } - if (h2->rxf_flags & H2FF_HEADERS_PRIORITY) { + if (h2->rxf_flags & H2FF_PRIORITY) { if (l < 5) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx headers with incorrect " + VSLb(h2->vsl, SLT_SessError, "H2: rx headers with incorrect " "priority data"); return (H2CE_PROTOCOL_ERROR); } l -= 5; p += 5; } - h2e = h2h_decode_bytes(h2, p, l); - if (h2e != NULL) { - H2S_Lock_VSLb(h2, SLT_Debug, "HPACK(hdr) %s", h2e->name); - (void)h2h_decode_hdr_fini(h2); - assert(!WS_IsReserved(r2->req->ws)); - h2_del_req(wrk, r2); + + h2e = 
h2_decode_headers(h2, r2, p, l); + if (h2e != NULL) return (h2e); - } - if (h2->rxf_flags & H2FF_HEADERS_END_STREAM) + if (h2->rxf_flags & H2FF_END_STREAM) req->req_body_status = BS_NONE; - if (h2->rxf_flags & H2FF_HEADERS_END_HEADERS) + if (h2->rxf_flags & H2FF_END_HEADERS) return (h2_end_headers(wrk, h2, req, r2)); - return (0); + + /* This wasn't the end of the headers. h2->hpack_lock is left as + * evidence to pick up that a CONTINUATION frame is expected next + * on this stream. */ + return (NULL); } /**********************************************************************/ @@ -827,31 +898,28 @@ h2_rx_headers(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) static h2_error v_matchproto_(h2_rxframe_f) h2_rx_continuation(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { - struct req *req; h2_error h2e; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); - if (r2 == NULL || r2->state != H2_S_OPEN || r2->req != h2->new_req) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: rx unexpected CONT frame" + if (r2 == NULL || r2->state != H2_S_OPEN || r2 != h2->hpack_lock) { + VSLb(h2->vsl, SLT_SessError, "H2: rx unexpected CONT frame" " on stream %d", h2->rxf_stream); return (H2CE_PROTOCOL_ERROR); // XXX spec ? } - req = r2->req; - h2e = h2h_decode_bytes(h2, h2->rxf_data, h2->rxf_len); - r2->req->acct.req_hdrbytes += h2->rxf_len; - if (h2e != NULL) { - H2S_Lock_VSLb(h2, SLT_Debug, "HPACK(cont) %s", h2e->name); - (void)h2h_decode_hdr_fini(h2); - assert(!WS_IsReserved(r2->req->ws)); - h2_del_req(wrk, r2); + h2e = h2_decode_headers(h2, r2, h2->rxf_data, h2->rxf_len); + if (h2e != NULL) return (h2e); - } - if (h2->rxf_flags & H2FF_HEADERS_END_HEADERS) - return (h2_end_headers(wrk, h2, req, r2)); - return (0); + + if (h2->rxf_flags & H2FF_END_HEADERS) + return (h2_end_headers(wrk, h2, r2->req, r2)); + + /* This wasn't the end of the headers. 
h2->hpack_lock is left as + * evidence to pick up that a CONTINUATION frame is expected next + * on this stream. */ + return (NULL); } /**********************************************************************/ @@ -859,403 +927,15 @@ h2_rx_continuation(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) static h2_error v_matchproto_(h2_rxframe_f) h2_rx_data(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) { - char buf[4]; - ssize_t l; - uint64_t l2, head; - const uint8_t *src; - unsigned len; - - /* XXX: Shouldn't error handling, setting of r2->error and - * r2->cond signalling be handled more generally at the end of - * procframe()??? */ - CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - ASSERT_RXTHR(h2); CHECK_OBJ_ORNULL(r2, H2_REQ_MAGIC); - if (r2 == NULL) - return (0); - - if (r2->state >= H2_S_CLOS_REM) { - r2->error = H2SE_STREAM_CLOSED; + if (r2 == NULL || r2->state == H2_S_CLOSED) + return (H2CE_PROTOCOL_ERROR); // rfc7540,l,1727,1730 + if (r2->state >= H2_S_CLOS_REM) return (H2SE_STREAM_CLOSED); // rfc7540,l,1766,1769 - } - - Lck_Lock(&h2->sess->mtx); - CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); - - if (h2->error != NULL || r2->error != NULL) { - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (h2->error != NULL ? 
h2->error : r2->error); - } - - /* Check padding if present */ - src = h2->rxf_data; - len = h2->rxf_len; - if (h2->rxf_flags & H2FF_DATA_PADDED) { - if (*src >= len) { - VSLb(h2->vsl, SLT_SessError, - "H2: stream %u: Padding larger than frame length", - h2->rxf_stream); - r2->error = H2CE_PROTOCOL_ERROR; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (H2CE_PROTOCOL_ERROR); - } - len -= 1 + *src; - src += 1; - } - - /* Check against the Content-Length header if given */ - if (r2->req->htc->content_length >= 0) { - if (r2->rxbuf) - l = r2->rxbuf->head; - else - l = 0; - l += len; - if (l > r2->req->htc->content_length || - ((h2->rxf_flags & H2FF_DATA_END_STREAM) && - l != r2->req->htc->content_length)) { - VSLb(h2->vsl, SLT_Debug, - "H2: stream %u: Received data and Content-Length" - " mismatch", h2->rxf_stream); - r2->error = H2SE_PROTOCOL_ERROR; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (H2SE_PROTOCOL_ERROR); - } - } - - /* Check and charge connection window. The entire frame including - * padding (h2->rxf_len) counts towards the window. */ - if (h2->rxf_len > h2->req0->r_window) { - VSLb(h2->vsl, SLT_SessError, - "H2: stream %u: Exceeded connection receive window", - h2->rxf_stream); - r2->error = H2CE_FLOW_CONTROL_ERROR; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (H2CE_FLOW_CONTROL_ERROR); - } - h2->req0->r_window -= h2->rxf_len; - if (h2->req0->r_window < cache_param->h2_rx_window_low_water) { - h2->req0->r_window += cache_param->h2_rx_window_increment; - vbe32enc(buf, cache_param->h2_rx_window_increment); - Lck_Unlock(&h2->sess->mtx); - H2_Send_Get(wrk, h2, h2->req0); - H2_Send_Frame(wrk, h2, H2_F_WINDOW_UPDATE, 0, 4, 0, buf); - H2_Send_Rel(h2, h2->req0); - Lck_Lock(&h2->sess->mtx); - } - - /* Check stream window. The entire frame including padding - * (h2->rxf_len) counts towards the window. 
*/ - if (h2->rxf_len > r2->r_window) { - VSLb(h2->vsl, SLT_Debug, - "H2: stream %u: Exceeded stream receive window", - h2->rxf_stream); - r2->error = H2SE_FLOW_CONTROL_ERROR; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (H2SE_FLOW_CONTROL_ERROR); - } - - /* Handle zero size frame before starting to allocate buffers */ - if (len == 0) { - r2->r_window -= h2->rxf_len; - - /* Handle the specific corner case where the entire window - * has been exhausted using nothing but padding - * bytes. Since no bytes have been buffered, no bytes - * would be consumed by the request thread and no stream - * window updates sent. Unpaint ourselves from this corner - * by sending a stream window update here. */ - CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); - if (r2->r_window == 0 && - (r2->rxbuf == NULL || r2->rxbuf->tail == r2->rxbuf->head)) { - if (r2->rxbuf) - l = r2->rxbuf->size; - else - l = h2->local_settings.initial_window_size; - r2->r_window += l; - Lck_Unlock(&h2->sess->mtx); - vbe32enc(buf, l); - H2_Send_Get(wrk, h2, h2->req0); - H2_Send_Frame(wrk, h2, H2_F_WINDOW_UPDATE, 0, 4, - r2->stream, buf); - H2_Send_Rel(h2, h2->req0); - Lck_Lock(&h2->sess->mtx); - } - - if (h2->rxf_flags & H2FF_DATA_END_STREAM) - r2->state = H2_S_CLOS_REM; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (0); - } - - /* Make the buffer on demand */ - if (r2->rxbuf == NULL) { - unsigned bufsize; - size_t bstest; - struct stv_buffer *stvbuf; - struct h2_rxbuf *rxbuf; - - Lck_Unlock(&h2->sess->mtx); - - bufsize = h2->local_settings.initial_window_size; - if (bufsize < r2->r_window) { - /* This will not happen because we do not have any - * mechanism to change the initial window size on - * a running session. But if we gain that ability, - * this future proofs it. 
*/ - bufsize = r2->r_window; - } - assert(bufsize > 0); - if ((h2->rxf_flags & H2FF_DATA_END_STREAM) && - bufsize > len) - /* Cap the buffer size when we know this is the - * single data frame. */ - bufsize = len; - CHECK_OBJ_NOTNULL(stv_h2_rxbuf, STEVEDORE_MAGIC); - stvbuf = STV_AllocBuf(wrk, stv_h2_rxbuf, - bufsize + sizeof *rxbuf); - if (stvbuf == NULL) { - Lck_Lock(&h2->sess->mtx); - VSLb(h2->vsl, SLT_Debug, - "H2: stream %u: Failed to allocate request body" - " buffer", - h2->rxf_stream); - r2->error = H2SE_INTERNAL_ERROR; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - return (H2SE_INTERNAL_ERROR); - } - rxbuf = STV_GetBufPtr(stvbuf, &bstest); - AN(rxbuf); - assert(bstest >= bufsize + sizeof *rxbuf); - assert(PAOK(rxbuf)); - INIT_OBJ(rxbuf, H2_RXBUF_MAGIC); - rxbuf->size = bufsize; - rxbuf->stvbuf = stvbuf; - - r2->rxbuf = rxbuf; - - Lck_Lock(&h2->sess->mtx); - } - - CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC); - assert(r2->rxbuf->tail <= r2->rxbuf->head); - l = r2->rxbuf->head - r2->rxbuf->tail; - assert(l <= r2->rxbuf->size); - l = r2->rxbuf->size - l; - assert(len <= l); /* Stream window handling ensures this */ - - Lck_Unlock(&h2->sess->mtx); - - l = len; - head = r2->rxbuf->head; - do { - l2 = l; - if ((head % r2->rxbuf->size) + l2 > r2->rxbuf->size) - l2 = r2->rxbuf->size - (head % r2->rxbuf->size); - assert(l2 > 0); - memcpy(&r2->rxbuf->data[head % r2->rxbuf->size], src, l2); - src += l2; - head += l2; - l -= l2; - } while (l > 0); - - Lck_Lock(&h2->sess->mtx); - - /* Charge stream window. The entire frame including padding - * (h2->rxf_len) counts towards the window. The used padding - * bytes will be included in the next connection window update - * sent when the buffer bytes are consumed because that is - * calculated against the available buffer space. 
*/ - r2->r_window -= h2->rxf_len; - r2->rxbuf->head += len; - assert(r2->rxbuf->tail <= r2->rxbuf->head); - if (h2->rxf_flags & H2FF_DATA_END_STREAM) - r2->state = H2_S_CLOS_REM; - if (r2->cond) - PTOK(pthread_cond_signal(r2->cond)); - Lck_Unlock(&h2->sess->mtx); - - return (0); -} - -static enum vfp_status v_matchproto_(vfp_pull_f) -h2_vfp_body(struct vfp_ctx *vc, struct vfp_entry *vfe, void *ptr, ssize_t *lp) -{ - struct h2_req *r2; - struct h2_sess *h2; - enum vfp_status retval; - ssize_t l, l2; - uint64_t tail; - uint8_t *dst; - char buf[4]; - int i; - - CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC); - CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC); - CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC); - h2 = r2->h2sess; - - AN(ptr); - AN(lp); - assert(*lp >= 0); - - Lck_Lock(&h2->sess->mtx); - - r2->cond = &vc->wrk->cond; - while (1) { - CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); - if (r2->rxbuf) { - assert(r2->rxbuf->tail <= r2->rxbuf->head); - l = r2->rxbuf->head - r2->rxbuf->tail; - } else - l = 0; - - if (h2->error != NULL || r2->error != NULL) - retval = VFP_ERROR; - else if (r2->state >= H2_S_CLOS_REM && l <= *lp) - retval = VFP_END; - else { - if (l > *lp) - l = *lp; - retval = VFP_OK; - } - - if (retval != VFP_OK || l > 0) - break; - - i = Lck_CondWaitTimeout(r2->cond, &h2->sess->mtx, - SESS_TMO(h2->sess, timeout_idle)); - if (i == ETIMEDOUT) { - retval = VFP_ERROR; - break; - } - } - r2->cond = NULL; - - Lck_Unlock(&h2->sess->mtx); - - if (l == 0 || retval == VFP_ERROR) { - *lp = 0; - return (retval); - } - - *lp = l; - dst = ptr; - tail = r2->rxbuf->tail; - do { - l2 = l; - if ((tail % r2->rxbuf->size) + l2 > r2->rxbuf->size) - l2 = r2->rxbuf->size - (tail % r2->rxbuf->size); - assert(l2 > 0); - memcpy(dst, &r2->rxbuf->data[tail % r2->rxbuf->size], l2); - dst += l2; - tail += l2; - l -= l2; - } while (l > 0); - - Lck_Lock(&h2->sess->mtx); - - CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC); - r2->rxbuf->tail = tail; - assert(r2->rxbuf->tail <= r2->rxbuf->head); - - if 
(r2->r_window < cache_param->h2_rx_window_low_water && - r2->state < H2_S_CLOS_REM) { - /* l is free buffer space */ - /* l2 is calculated window increment */ - l = r2->rxbuf->size - (r2->rxbuf->head - r2->rxbuf->tail); - assert(r2->r_window <= l); - l2 = cache_param->h2_rx_window_increment; - if (r2->r_window + l2 > l) - l2 = l - r2->r_window; - r2->r_window += l2; - } else - l2 = 0; - - Lck_Unlock(&h2->sess->mtx); - - if (l2 > 0) { - vbe32enc(buf, l2); - H2_Send_Get(vc->wrk, h2, r2); - H2_Send_Frame(vc->wrk, h2, H2_F_WINDOW_UPDATE, 0, 4, - r2->stream, buf); - H2_Send_Rel(h2, r2); - } - return (retval); -} - -static void -h2_vfp_body_fini(struct vfp_ctx *vc, struct vfp_entry *vfe) -{ - struct h2_req *r2; - struct h2_sess *h2; - struct stv_buffer *stvbuf = NULL; - - CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC); - CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC); - CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC); - h2 = r2->h2sess; - - if (vc->failed) { - CHECK_OBJ_NOTNULL(r2->req->wrk, WORKER_MAGIC); - H2_Send_Get(r2->req->wrk, h2, r2); - H2_Send_RST(r2->req->wrk, h2, r2, r2->stream, - H2SE_REFUSED_STREAM); - H2_Send_Rel(h2, r2); - Lck_Lock(&h2->sess->mtx); - r2->error = H2SE_REFUSED_STREAM; - Lck_Unlock(&h2->sess->mtx); - } - - if (r2->state >= H2_S_CLOS_REM && r2->rxbuf != NULL) { - Lck_Lock(&h2->sess->mtx); - CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); - if (r2->rxbuf != NULL) { - stvbuf = r2->rxbuf->stvbuf; - r2->rxbuf = NULL; - } - Lck_Unlock(&h2->sess->mtx); - if (stvbuf != NULL) { - STV_FreeBuf(vc->wrk, &stvbuf); - AZ(stvbuf); - } - } -} - -static const struct vfp h2_body = { - .name = "H2_BODY", - .pull = h2_vfp_body, - .fini = h2_vfp_body_fini -}; - -void v_matchproto_(vtr_req_body_t) -h2_req_body(struct req *req) -{ - struct h2_req *r2; - struct vfp_entry *vfe; - - CHECK_OBJ(req, REQ_MAGIC); - CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC); - vfe = VFP_Push(req->vfc, &h2_body); - AN(vfe); - vfe->priv1 = r2; + return 
(h2_reqbody_data(wrk, h2, r2)); } /**********************************************************************/ @@ -1263,9 +943,13 @@ h2_req_body(struct req *req) void v_matchproto_(vtr_req_fail_f) h2_req_fail(struct req *req, stream_close_t reason) { + struct h2_req *r2; + assert(reason != SC_NULL); - assert(req->sp->fd != 0); VSLb(req->vsl, SLT_Debug, "H2FAILREQ"); + + CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC); + h2_async_error(r2, H2SE_INTERNAL_ERROR); } /**********************************************************************/ @@ -1274,169 +958,232 @@ static enum htc_status_e v_matchproto_(htc_complete_f) h2_frame_complete(struct http_conn *htc) { struct h2_sess *h2; + unsigned u; + size_t l; CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC); CAST_OBJ_NOTNULL(h2, htc->priv, H2_SESS_MAGIC); - if (htc->rxbuf_b + 9 > htc->rxbuf_e || - htc->rxbuf_b + 9 + (vbe32dec(htc->rxbuf_b) >> 8) > htc->rxbuf_e) + l = pdiff(htc->rxbuf_b, htc->rxbuf_e); + if (l == 0) + return (HTC_S_EMPTY); + if (l < 9) return (HTC_S_MORE); - return (HTC_S_COMPLETE); + u = vbe32dec(htc->rxbuf_b) >> 8; + if (u > h2->local_settings.max_frame_size) + return (HTC_S_OVERFLOW); + if (l >= u + 9) + return (HTC_S_COMPLETE); + + return (HTC_S_MORE); } + /**********************************************************************/ -static h2_error +static void h2_procframe(struct worker *wrk, struct h2_sess *h2, h2_frame h2f) { - struct h2_req *r2; - h2_error h2e; + struct h2_req *r2 = NULL; + h2_error h2e = NULL; - ASSERT_RXTHR(h2); - if (h2->rxf_stream == 0 && h2f->act_szero != 0) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on stream 0", - h2f->name); - return (h2f->act_szero); + ASSERT_H2_SESS(h2); + if (h2->rxf_stream == 0 && h2f->act_szero != NULL) { + VSLb(h2->vsl, SLT_SessError, + "H2: unexpected %s frame on stream 0", h2f->name); + h2e = h2f->act_szero; + goto exit; } - if (h2->rxf_stream != 0 && h2f->act_snonzero != 0) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on 
stream %d", + if (h2->rxf_stream != 0 && h2f->act_snonzero != NULL) { + VSLb(h2->vsl, SLT_SessError, + "H2: unexpected %s frame on stream %d", h2f->name, h2->rxf_stream); - return (h2f->act_snonzero); + h2e = h2f->act_snonzero; + goto exit; } if (h2->rxf_stream > h2->highest_stream && h2f->act_sidle != 0) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: unexpected %s frame on idle stream " - "%d", h2f->name, h2->rxf_stream); - return (h2f->act_sidle); + VSLb(h2->vsl, SLT_SessError, + "H2: unexpected %s frame on idle stream %d", + h2f->name, h2->rxf_stream); + h2e = h2f->act_sidle; + goto exit; + } + + if (h2->expect_settings_next) { + if (h2f != H2_F_SETTINGS || (h2->rxf_flags & H2FF_ACK)) { + // rfc7540,l,579,637 + // rfc7540,l,482,485 + VSLb(h2->vsl, SLT_Error, + "H2: unexpected %s%s frame on stream %d," + " expected preface settings", + h2f->name, + h2->rxf_flags & H2FF_ACK ? "(ACK)" : "", + h2->rxf_stream); + h2e = H2CE_PROTOCOL_ERROR; + goto exit; + } + h2->expect_settings_next = 0; } if (h2->rxf_stream != 0 && !(h2->rxf_stream & 1)) { // rfc7540,l,1140,1145 // rfc7540,l,1153,1158 /* No even streams, we don't do PUSH_PROMISE */ - H2S_Lock_VSLb(h2, SLT_SessError, "H2: illegal stream (=%u)", + VSLb(h2->vsl, SLT_SessError, "H2: illegal stream (=%u)", h2->rxf_stream); - return (H2CE_PROTOCOL_ERROR); + h2e = H2CE_PROTOCOL_ERROR; + goto exit; } - VTAILQ_FOREACH(r2, &h2->streams, list) - if (r2->stream == h2->rxf_stream) - break; + if (h2->hpack_lock != NULL && h2f != H2_F_CONTINUATION) { + VSLb(h2->vsl, SLT_SessError, + "H2: expected continuation but received %s on stream %d", + h2f->name, h2->rxf_stream); + h2e = H2CE_PROTOCOL_ERROR; // rfc7540,l,1859,1863 + goto exit; + } - if (h2->new_req != NULL && h2f != H2_F_CONTINUATION) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: expected continuation but " - " received %s on stream %d", h2f->name, h2->rxf_stream); - return (H2CE_PROTOCOL_ERROR); // rfc7540,l,1859,1863 + if (h2f == H2_F_HEADERS && h2->rxf_stream <= 
h2->highest_stream) { + VSLb(h2->vsl, SLT_Error, "H2: new stream ID < highest stream"); + h2e = H2CE_PROTOCOL_ERROR; // rfc7540,l,1153,1158 + goto exit; + } + + if (h2->rxf_stream != 0) { + VTAILQ_FOREACH(r2, &h2->streams, list) { + if (r2->stream == h2->rxf_stream) + break; + } + if (r2 != NULL && r2->error != NULL) { + /* Ignore frames for streams once error is set. */ + /* XXX: missing accounting? */ + return; + } + } + + if (h2f == H2_F_HEADERS) { + AZ(r2); /* We checked against highest_stream above. */ + r2 = h2_new_req(h2, h2->rxf_stream, NULL); + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + h2->highest_stream = r2->stream; } h2e = h2f->rxfunc(wrk, h2, r2); - if (h2e == NULL) - return (NULL); - if (h2->rxf_stream == 0 || h2e->connection) - return (h2e); // Connection errors one level up - H2_Send_Get(wrk, h2, h2->req0); - H2_Send_RST(wrk, h2, h2->req0, h2->rxf_stream, h2e); - H2_Send_Rel(h2, h2->req0); - return (NULL); +exit: + if (h2e != NULL) { + if (h2->rxf_stream == 0 || h2e->connection) + h2->error = h2e; + if (r2 != NULL) + h2_kill_req(wrk, h2, &r2, h2e); + } } -h2_error -h2_stream_tmo(struct h2_sess *h2, const struct h2_req *r2, vtim_real now) +void +h2_stream_setstate(struct h2_req *r2, enum h2_stream_e state) { - h2_error h2e = NULL; - - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - Lck_AssertHeld(&h2->sess->mtx); + ASSERT_H2_SESS(r2->h2sess); + + if (r2->state >= state) { + /* State transitions only go from lower states to + * higher. If we are already at a higher state, ignore + * it. (We do not assert on state changes because change + * of state is both driven by our internal progress as + * well as incoming client data.) */ + return; + } - /* NB: when now is NAN, it means that h2_window_timeout was hit - * on a lock condwait operation. 
- */ - if (isnan(now)) - AN(r2->t_winupd); + if (state >= H2_S_CLOSED) { + assert(r2->h2sess->open_streams > 0); + r2->h2sess->open_streams--; + } - if (h2->error != NULL && h2->error->connection && - !h2->error->send_goaway) - return (h2->error); + r2->state = state; +} - if (r2->t_winupd == 0 && r2->t_send == 0) - return (NULL); +static h2_error +h2_stream_tmo(struct h2_sess *h2, const struct h2_req *r2, vtim_real now) +{ - if (isnan(now) || (r2->t_winupd != 0 && - now - r2->t_winupd > cache_param->h2_window_timeout)) { + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + + if (r2->t_win_low != 0 && + now - r2->t_win_low > cache_param->h2_window_timeout) { VSLb(h2->vsl, SLT_Debug, - "H2: stream %u: Hit h2_window_timeout", r2->stream); - h2e = H2SE_BROKE_WINDOW; + "H2: stream %u: Hit h2_window_timeout", r2->stream); + if (h2->open_streams <= h2->win_low_streams) { + /* If all streams ran out of control flow window + * credits upon triggering h2_window_timeout, + * declare bankruptcy for the entire connection. */ + return (H2CE_BANKRUPT); + } + return (H2SE_BROKE_WINDOW); } - if (h2e == NULL && r2->t_send != 0 && + if (r2->t_send != 0 && now - r2->t_send > SESS_TMO(h2->sess, send_timeout)) { VSLb(h2->vsl, SLT_Debug, "H2: stream %u: Hit send_timeout", r2->stream); - h2e = H2SE_CANCEL; + return (H2SE_SEND_TIMEOUT); } - return (h2e); -} - -static h2_error -h2_stream_tmo_unlocked(struct h2_sess *h2, const struct h2_req *r2) -{ - h2_error h2e; - - Lck_Lock(&h2->sess->mtx); - h2e = h2_stream_tmo(h2, r2, h2->sess->t_idle); - Lck_Unlock(&h2->sess->mtx); - - return (h2e); + return (NULL); } /* * This is the janitorial task of cleaning up any closed & refused * streams, and checking if the session is timed out. 
*/ -static h2_error -h2_sweep(struct worker *wrk, struct h2_sess *h2) +static void +h2_sweep(struct worker *wrk, struct h2_sess *h2, vtim_real now) { struct h2_req *r2, *r22; - h2_error h2e, tmo; - vtim_real now; + h2_error h2e; + int64_t l; - ASSERT_RXTHR(h2); + ASSERT_H2_SESS(h2); - h2e = h2->error; - now = VTIM_real(); - if (h2e == NULL && h2->open_streams == 0 && - h2->sess->t_idle + cache_param->timeout_idle < now) - h2e = H2CE_NO_ERROR; - - h2->do_sweep = 0; VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) { - if (r2 == h2->req0) { - assert (r2->state == H2_S_IDLE); + if (r2->async_error != NULL) { + /* Request thread has set an error state. Kill it. */ + h2e = r2->async_error; + r2->async_error = NULL; + h2_kill_req(wrk, h2, &r2, h2e); continue; } + + if (r2->rxbuf != NULL && r2->state < H2_S_CLOS_REM && + r2->error == NULL) { + /* Check and expand the request body window if + * necessary. */ + CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC); + assert(r2->rxbuf->tail <= r2->rxbuf->head); + l = r2->rxbuf->head - r2->rxbuf->tail; + assert(l <= r2->rxbuf->size); + l = r2->rxbuf->size - l; + if (r2->rx_window < l) { + l = l - r2->rx_window; + H2_Send_WINDOW_UPDATE(h2, r2->stream, l); + r2->rx_window += l; + } + } + switch (r2->state) { case H2_S_CLOSED: - AZ(r2->scheduled); - h2_del_req(wrk, r2); + if (!r2->scheduled) + h2_kill_req(wrk, h2, &r2, H2SE_NO_ERROR); break; case H2_S_CLOS_REM: - if (!r2->scheduled) { - H2_Send_Get(wrk, h2, h2->req0); - H2_Send_RST(wrk, h2, h2->req0, r2->stream, - H2SE_REFUSED_STREAM); - H2_Send_Rel(h2, h2->req0); - h2_del_req(wrk, r2); - continue; - } - /* FALLTHROUGH */ case H2_S_CLOS_LOC: case H2_S_OPEN: - tmo = h2_stream_tmo_unlocked(h2, r2); - if (h2e == NULL) - h2e = tmo; + h2e = h2_stream_tmo(h2, r2, now); + if (h2e != NULL && h2e->connection) + h2->error = h2e; + else if (h2e != NULL) + h2_kill_req(wrk, h2, &r2, h2e); break; case H2_S_IDLE: /* Current code make this unreachable: h2_new_req is @@ -1448,7 +1195,6 @@ h2_sweep(struct 
worker *wrk, struct h2_sess *h2) break; } } - return (h2e); } /* @@ -1456,21 +1202,15 @@ h2_sweep(struct worker *wrk, struct h2_sess *h2) * if we have not received end_stream, DATA frames are expected later * * neither of these make much sense to output here - * - * goaway currently is always 0, see #4285 */ static void h2_htc_debug(enum htc_status_e hs, struct h2_sess *h2) { const char *s, *r; - if (LIKELY(VSL_tag_is_masked(SLT_Debug))) - return; - HTC_Status(hs, &s, &r); - H2S_Lock_VSLb(h2, SLT_Debug, "H2: HTC %s (%s) frame=%s goaway=%d", - s, r, h2->htc->rxbuf_b == h2->htc->rxbuf_e ? "complete" : "partial", - h2->goaway); + VSLb(h2->vsl, SLT_Debug, "H2: HTC %s (%s) frame=%s", s, r, + h2->htc->rxbuf_b == h2->htc->rxbuf_e ? "complete" : "partial"); } /*********************************************************************** @@ -1488,54 +1228,61 @@ static const h2_frame h2flist[] = { #define H2FMAX (sizeof(h2flist) / sizeof(h2flist[0])) -int -h2_rxframe(struct worker *wrk, struct h2_sess *h2) +static enum htc_status_e +h2_rxstuff(struct h2_sess *h2) { + struct http_conn *htc; enum htc_status_e hs; - h2_frame h2f; - h2_error h2e; - const char *s, *r; - - ASSERT_RXTHR(h2); - - if (h2->goaway && h2->open_streams == 0) - return (0); + size_t res; + ssize_t l; - h2->t1 = NAN; - VTCP_blocking(*h2->htc->rfd); - hs = HTC_RxStuff(h2->htc, h2_frame_complete, &h2->t1, NULL, NAN, - VTIM_real() + 0.5, NAN, h2->local_settings.max_frame_size + 9); + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + htc = h2->htc; + CHECK_OBJ_NOTNULL(htc, HTTP_CONN_MAGIC); + AN(htc->rfd); + assert(*htc->rfd > 0); + + /* Set up the workspace buffer */ + assert(htc->rxbuf_b <= htc->rxbuf_e); + HTC_RxPipeline(htc, htc->rxbuf_b); + HTC_RxInit(htc, h2->ws); + res = WS_ReservationSize(h2->ws); + + if (res == 0) { + WS_Release(htc->ws, 0); + return (HTC_S_OVERFLOW); + } - h2e = NULL; - switch (hs) { - case HTC_S_EOF: + l = read(*htc->rfd, htc->rxbuf_e, res); + if (l < 0 && errno == EWOULDBLOCK) + hs = HTC_S_MORE; + 
else if (l < 0) + hs = HTC_S_CLOSE; + else if (l == 0) { + hs = HTC_S_EOF; h2_htc_debug(hs, h2); - h2e = H2CE_NO_ERROR; - break; - case HTC_S_COMPLETE: - h2->sess->t_idle = VTIM_real(); - if (h2->do_sweep) - h2e = h2_sweep(wrk, h2); - break; - case HTC_S_TIMEOUT: - //// #4279 - // h2_htc_debug(hs, h2); - h2e = h2_sweep(wrk, h2); - break; - default: - HTC_Status(hs, &s, &r); - H2S_Lock_VSLb(h2, SLT_SessError, "H2: HTC %s (%s)", s, r); - h2e = H2CE_ENHANCE_YOUR_CALM; + } else { + h2->t1 = VTIM_real(); + htc->rxbuf_e += l; + hs = h2_frame_complete(htc); } - if (h2e != NULL && h2e->connection) { - h2->error = h2e; - h2_tx_goaway(wrk, h2, h2e); - return (0); - } + WS_ReleaseP(htc->ws, htc->rxbuf_e); + return (hs); +} +static enum htc_status_e +h2_rxframe(struct worker *wrk, struct h2_sess *h2) +{ + enum htc_status_e hs; + h2_frame h2f; + + CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); + ASSERT_H2_SESS(h2); + + hs = h2_frame_complete(h2->htc); if (hs != HTC_S_COMPLETE) - return (1); + return (hs); h2->rxf_len = vbe32dec(h2->htc->rxbuf_b) >> 8; h2->rxf_type = h2->htc->rxbuf_b[3]; @@ -1543,21 +1290,21 @@ h2_rxframe(struct worker *wrk, struct h2_sess *h2) h2->rxf_stream = vbe32dec(h2->htc->rxbuf_b + 5); h2->rxf_stream &= ~(1LU<<31); // rfc7540,l,690,692 h2->rxf_data = (void*)(h2->htc->rxbuf_b + 9); - /* XXX: later full DATA will not be rx'ed yet. 
*/ - HTC_RxPipeline(h2->htc, h2->htc->rxbuf_b + h2->rxf_len + 9); - h2_vsl_frame(h2, h2->htc->rxbuf_b, 9L + h2->rxf_len); + h2_rxframe_vsl(h2, h2->htc->rxbuf_b, 9L + h2->rxf_len); h2->srq->acct.req_hdrbytes += 9; + h2->htc->rxbuf_b += h2->rxf_len + 9; + assert(h2->htc->rxbuf_b <= h2->htc->rxbuf_e); + if (h2->rxf_type >= H2FMAX) { // rfc7540,l,679,681 - // XXX: later, drain rest of frame h2->bogosity++; - H2S_Lock_VSLb(h2, SLT_Debug, + VSLb(h2->vsl, SLT_Debug, "H2: Unknown frame type 0x%02x (ignored)", (uint8_t)h2->rxf_type); h2->srq->acct.req_bodybytes += h2->rxf_len; - return (1); + return (h2_frame_complete(h2->htc)); } h2f = h2flist[h2->rxf_type]; @@ -1569,17 +1316,185 @@ h2_rxframe(struct worker *wrk, struct h2_sess *h2) if (h2->rxf_flags & ~h2f->flags) { // rfc7540,l,687,688 h2->bogosity++; - H2S_Lock_VSLb(h2, SLT_Debug, + VSLb(h2->vsl, SLT_Debug, "H2: Unknown flags 0x%02x on %s (ignored)", (uint8_t)h2->rxf_flags & ~h2f->flags, h2f->name); h2->rxf_flags &= h2f->flags; } - h2e = h2_procframe(wrk, h2, h2f); - if (h2->error == NULL && h2e != NULL) { - h2->error = h2e; - h2_tx_goaway(wrk, h2, h2e); + if (h2->error == NULL) + h2_procframe(wrk, h2, h2f); + + return (h2_frame_complete(h2->htc)); +} + +void +h2_async_error(struct h2_req *r2, h2_error h2e) +{ + + /* Report an error from a request handling thread */ + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + AN(h2e); + + AN(r2->scheduled); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); + ASSERT_H2_REQ(r2->h2sess); + + if (h2e->connection) + r2->h2sess->error = h2e; + else + r2->async_error = h2e; + + h2_attention(r2->h2sess); +} + +void +h2_attention(struct h2_sess *h2) +{ + + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + AZ(VEFD_Signal(h2->efd)); +} + +void +h2_run(struct worker *wrk, struct h2_sess *h2) +{ + struct pollfd pfd[2]; + enum htc_status_e hs; + const char *s, *r; + int i; + ssize_t l; + vtim_real now; + vtim_dur tmo; + + CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + + 
assert(h2->efd->poll_fd >= 0); + + enum { + pfd_h2 = 0, + pfd_ev = 1, + }; + memset(pfd, 0, sizeof pfd); + pfd[pfd_h2].fd = h2->sess->fd; + pfd[pfd_h2].events = POLLIN; + pfd[pfd_ev].fd = h2->efd->poll_fd; + pfd[pfd_ev].events = POLLIN; + + VTCP_nonblocking(h2->sess->fd); + + now = VTIM_real(); + h2->deadline = now + cache_param->timeout_idle; + + while (h2->error == NULL) { + if (H2_Send_Pending(h2)) + pfd[pfd_h2].events = POLLIN | POLLOUT; + else + pfd[pfd_h2].events = POLLIN; + i = poll(pfd, 2, 1000); + + /* Calculate the next deadline. The deadline is the time + * at which any "blocking" poll()s in code called by this + * loop (e.g. a need to flush the output to free up buffer + * space) are allowed to wait before flagging error. */ + now = VTIM_real(); + tmo = SESS_TMO(h2->sess, timeout_idle); + h2->deadline = now + cache_param->timeout_idle; + + /* Connection timeouts */ + if (h2->error == NULL && h2->hpack_lock != NULL && + h2->hpack_lock->req->t_first + tmo < now) + h2->error = H2CE_COMPRESSION_ERROR; + else if (h2->error == NULL && h2->open_streams == 0 && + h2->sess->t_idle + tmo < now) + h2->error = H2CE_NO_ERROR; + + if (pfd[pfd_ev].revents & POLLIN) { + /* Signalled for attention by a request + * thread. Reset the eventfd. */ + AZ(VEFD_Clear(h2->efd)); + } + + if (pfd[pfd_h2].revents & POLLIN) { + hs = h2_rxstuff(h2); + while (h2->error == NULL && hs == HTC_S_COMPLETE) + hs = h2_rxframe(wrk, h2); + if (h2->error == NULL && hs < 0) { + switch (hs) { + case HTC_S_EOF: + /* Remote close */ + h2->error = H2CE_IO_ERROR; + break; + default: + HTC_Status(hs, &s, &r); + VSLb(h2->vsl, SLT_Error, "H2: %s", s); + h2->error = H2CE_PROTOCOL_ERROR; + break; + } + } + } + + if (pfd[pfd_h2].revents & POLLOUT) { + /* We have data to send and it is possible to + * send. 
*/ + l = H2_Send_TxStuff(h2); + if (l < 0 && errno != EWOULDBLOCK) { + VSLb(h2->vsl, SLT_Error, "H2: Send error (%s)", + strerror(errno)); + h2->error = H2CE_IO_ERROR; + } + } + + h2_sweep(wrk, h2, now); + } + AN(h2->error); + + /* Wake up any threads waiting to send, cancelling any queued + * writes. */ + H2_Send_Shutdown(h2); + + /* Kill all streams, kicking any waitinglist stuck items */ + h2_kill_all(wrk, h2, h2->error); + + if (h2->error->send_goaway) { + /* Add timeout_linger to the deadline which may have + * already been spent, to give some additional time to get + * the GOAWAY out the door. */ + h2->deadline += cache_param->timeout_linger; + + /* Send GOAWAY, and then spend up until the last deadline + * set draining the outgoing buffers. This is to be a good + * citizen and make some effort on communicating the + * GOAWAY. */ + H2_Send_GOAWAY(h2, h2->highest_stream, h2->error); + while (H2_Send_Pending(h2)) { + if (H2_Send_Something(h2) < 0) + break; + } } - return (h2->error != NULL ? 0 : 1); + /* We will not be sending anything more on the socket. */ + H2_Send_Stop(h2); + AN(VTAILQ_EMPTY(&h2->tx_l_queue)); + + /* XXX: Shutdown socket? Would presumably free up kernel socket + * buffers while waiting for waitinglists and the like to clean + * up. */ + + /* Wait until all the requests have been removed */ + pfd[pfd_h2].fd = -pfd[pfd_h2].fd; /* Disable polling on the sess fd */ + while (h2->refcnt > 0) { + /* Don't use infinite timeout here. The walkaway has data + * race issues, and we may need to kill a req more than + * once to wake it. 
*/ + i = poll(pfd, 2, 250); + + if (i > 0 && pfd[pfd_ev].revents & POLLIN) { + /* Clear the eventfd before the next sleep */ + AZ(VEFD_Clear(h2->efd)); + } + h2_kill_all(wrk, h2, h2->error); + h2_sweep(wrk, h2, now); + } } diff --git a/bin/varnishd/http2/cache_http2_reqbody.c b/bin/varnishd/http2/cache_http2_reqbody.c new file mode 100644 index 0000000000..dad4dd9e8c --- /dev/null +++ b/bin/varnishd/http2/cache_http2_reqbody.c @@ -0,0 +1,421 @@ +/*- + * Copyright (c) 2016-2025 Varnish Software AS + * All rights reserved. + * + * Author: Poul-Henning Kamp + * Author: Martin Blix Grydeland + * + * SPDX-License-Identifier: BSD-2-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include "config.h" + +#include +#include + +#include "cache/cache_varnishd.h" + +#include "cache/cache_transport.h" +#include "cache/cache_filter.h" +#include "http2/cache_http2.h" +#include "storage/storage.h" + +#include "vtim.h" + +struct h2_reqbody_waiter { + unsigned magic; +#define H2_REQBODY_WAITER_MAGIC 0xb6f4c52c + pthread_cond_t cond; +}; + +static int +h2_reqbody_wait(struct h2_req *r2, vtim_real when) +{ + struct h2_reqbody_waiter w; + int retval; + + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); + + Lck_AssertHeld(&r2->h2sess->sess->mtx); + + INIT_OBJ(&w, H2_REQBODY_WAITER_MAGIC); + PTOK(pthread_cond_init(&w.cond, NULL)); + + AZ(r2->reqbody_waiter); + r2->reqbody_waiter = &w; + retval = Lck_CondWaitUntil(&w.cond, &r2->h2sess->sess->mtx, when); + r2->reqbody_waiter = NULL; + + PTOK(pthread_cond_destroy(&w.cond)); + w.magic = 0; + + return (retval); +} + +void +h2_reqbody_kick(struct h2_req *r2) +{ + + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); + + Lck_AssertHeld(&r2->h2sess->sess->mtx); + + CHECK_OBJ_ORNULL(r2->reqbody_waiter, H2_REQBODY_WAITER_MAGIC); + if (r2->reqbody_waiter != NULL) + PTOK(pthread_cond_signal(&r2->reqbody_waiter->cond)); +} + +h2_error +h2_reqbody_data(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) +{ + ssize_t l; + uint64_t l2, head; + const uint8_t *src; + unsigned len; + + CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + + ASSERT_H2_SESS(h2); + + CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); + + /* XXX: errcheck? */ + if (h2->error != NULL || r2->error != NULL) + return (h2->error != NULL ? 
h2->error : r2->error); + + /* Check padding if present */ + src = h2->rxf_data; + len = h2->rxf_len; + if (h2->rxf_flags & H2FF_PADDED) { + if (*src >= len) { + VSLb(h2->vsl, SLT_SessError, + "H2: stream %u: Padding larger than frame length", + h2->rxf_stream); + return (H2CE_PROTOCOL_ERROR); + } + len -= 1 + *src; + src += 1; + } + + /* Check against the Content-Length header if given */ + if (r2->req->htc->content_length >= 0) { + if (r2->rxbuf) + l = r2->rxbuf->head; + else + l = 0; + l += len; + if (l > r2->req->htc->content_length || + ((h2->rxf_flags & H2FF_END_STREAM) && + l != r2->req->htc->content_length)) { + VSLb(h2->vsl, SLT_Debug, + "H2: stream %u: Received data and Content-Length" + " mismatch", h2->rxf_stream); + return (H2SE_PROTOCOL_ERROR); + } + } + + /* Check and charge connection window. The entire frame including + * padding (h2->rxf_len) counts towards the window. */ + if (h2->rxf_len > h2->rx_window) { + VSLb(h2->vsl, SLT_SessError, + "H2: stream %u: Exceeded connection receive window", + h2->rxf_stream); + return (H2CE_FLOW_CONTROL_ERROR); + } + h2->rx_window -= h2->rxf_len; + if (h2->rx_window < cache_param->h2_rx_window_low_water) { + /* Running low, increase the window */ + l = cache_param->h2_rx_window_increment; + assert(l < (1UL << 31)); + h2->rx_window += l; + H2_Send_WINDOW_UPDATE(h2, 0, l); + } + + /* Check stream window. The entire frame including padding + * (h2->rxf_len) counts towards the window. */ + if (h2->rxf_len > r2->rx_window) { + VSLb(h2->vsl, SLT_Debug, + "H2: stream %u: Exceeded stream receive window", + h2->rxf_stream); + return (H2SE_FLOW_CONTROL_ERROR); + } + + /* Handle zero size frame before starting to allocate buffers */ + if (len == 0) { + r2->rx_window -= h2->rxf_len; + + /* Handle the specific corner case where the entire window + * has been exhausted using nothing but padding + * bytes. 
Since no bytes have been buffered, no bytes + * would be consumed by the request thread and no stream + * window updates sent. Unpaint ourselves from this corner + * by sending a stream window update here. */ + CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); + if (r2->rx_window == 0 && + (r2->rxbuf == NULL || r2->rxbuf->tail == r2->rxbuf->head)) { + /* XXX: bogosity++? */ + if (r2->rxbuf) + l = r2->rxbuf->size; + else + l = h2->local_settings.initial_window_size; + r2->rx_window += l; + H2_Send_WINDOW_UPDATE(h2, r2->stream, l); + } + + if (h2->rxf_flags & H2FF_END_STREAM) + h2_stream_setstate(r2, H2_S_CLOS_REM); + Lck_Lock(&h2->sess->mtx); + h2_reqbody_kick(r2); + Lck_Unlock(&h2->sess->mtx); + return (0); + } + + /* Make the buffer on demand */ + if (r2->rxbuf == NULL) { + unsigned bufsize; + size_t bstest; + struct stv_buffer *stvbuf; + struct h2_rxbuf *rxbuf; + + bufsize = h2->local_settings.initial_window_size; + if (bufsize < r2->rx_window) { + /* This will not happen because we do not have any + * mechanism to change the initial window size on + * a running session. But if we gain that ability, + * this future proofs it. */ + bufsize = r2->rx_window; + } + assert(bufsize > 0); + if ((h2->rxf_flags & H2FF_END_STREAM) && + bufsize > len) { + /* Cap the buffer size when we know this is the + * single data frame. 
*/ + bufsize = len; + } + CHECK_OBJ_NOTNULL(stv_h2_rxbuf, STEVEDORE_MAGIC); + stvbuf = STV_AllocBuf(wrk, stv_h2_rxbuf, + bufsize + sizeof *rxbuf); + if (stvbuf == NULL) { + VSLb(h2->vsl, SLT_Debug, + "H2: stream %u: Failed to allocate request body" + " buffer", + h2->rxf_stream); + return (H2SE_INTERNAL_ERROR); + } + rxbuf = STV_GetBufPtr(stvbuf, &bstest); + AN(rxbuf); + assert(bstest >= bufsize + sizeof *rxbuf); + assert(PAOK(rxbuf)); + INIT_OBJ(rxbuf, H2_RXBUF_MAGIC); + rxbuf->size = bufsize; + rxbuf->stvbuf = stvbuf; + + r2->rxbuf = rxbuf; + } + + CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC); + assert(r2->rxbuf->tail <= r2->rxbuf->head); + l = r2->rxbuf->head - r2->rxbuf->tail; + assert(l <= r2->rxbuf->size); + l = r2->rxbuf->size - l; + assert(len <= l); /* Stream window handling ensures this */ + + l = len; + head = r2->rxbuf->head; + do { + l2 = l; + if ((head % r2->rxbuf->size) + l2 > r2->rxbuf->size) + l2 = r2->rxbuf->size - (head % r2->rxbuf->size); + assert(l2 > 0); + memcpy(&r2->rxbuf->data[head % r2->rxbuf->size], src, l2); + src += l2; + head += l2; + l -= l2; + } while (l > 0); + + Lck_Lock(&h2->sess->mtx); + /* Charge stream window. The entire frame including padding + * (h2->rxf_len) counts towards the window. The used padding + * bytes will be included in the next connection window update + * sent when the buffer bytes are consumed because that is + * calculated against the available buffer space. 
*/ + r2->rx_window -= h2->rxf_len; + r2->rxbuf->head += len; + assert(r2->rxbuf->tail <= r2->rxbuf->head); + if (h2->rxf_flags & H2FF_END_STREAM) + h2_stream_setstate(r2, H2_S_CLOS_REM); + h2_reqbody_kick(r2); + Lck_Unlock(&h2->sess->mtx); + + return (0); +} + +static enum vfp_status v_matchproto_(vfp_pull_f) +h2_vfp_body(struct vfp_ctx *vc, struct vfp_entry *vfe, void *ptr, ssize_t *lp) +{ + struct h2_req *r2; + struct h2_sess *h2; + enum vfp_status retval; + h2_error h2e = NULL; + ssize_t l, l2; + uint64_t tail; + uint8_t *dst; + int wait_error = 0; + + CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC); + CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC); + CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC); + h2 = r2->h2sess; + + ASSERT_H2_REQ(h2); + + AN(ptr); + AN(lp); + assert(*lp >= 0); + + Lck_Lock(&h2->sess->mtx); + + while (1) { + CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); + if (r2->rxbuf) { + assert(r2->rxbuf->tail <= r2->rxbuf->head); + l = r2->rxbuf->head - r2->rxbuf->tail; + } else + l = 0; + + h2e = h2_errcheck(r2); + if (h2e != NULL) + break; + else if (r2->state >= H2_S_CLOS_REM && l <= *lp) + retval = VFP_END; + else { + if (l > *lp) + l = *lp; + retval = VFP_OK; + } + + if (retval != VFP_OK || l > 0) + break; + + wait_error = h2_reqbody_wait(r2, + VTIM_real() + SESS_TMO(h2->sess, timeout_idle)); + if (wait_error == ETIMEDOUT) + break; + } + + Lck_Unlock(&h2->sess->mtx); + + if (h2e != NULL) + retval = VFP_Error(vc, "H2: Request body error (%s)", h2e->txt); + else if (wait_error == ETIMEDOUT) + retval = VFP_Error(vc, "H2: Request body timed out"); + + if (l == 0 || retval == VFP_ERROR) { + *lp = 0; + return (retval); + } + + *lp = l; + dst = ptr; + tail = r2->rxbuf->tail; + do { + l2 = l; + if ((tail % r2->rxbuf->size) + l2 > r2->rxbuf->size) + l2 = r2->rxbuf->size - (tail % r2->rxbuf->size); + assert(l2 > 0); + memcpy(dst, &r2->rxbuf->data[tail % r2->rxbuf->size], l2); + dst += l2; + tail += l2; + l -= l2; + } while (l > 0); + + Lck_Lock(&h2->sess->mtx); + + 
CHECK_OBJ_NOTNULL(r2->rxbuf, H2_RXBUF_MAGIC); + r2->rxbuf->tail = tail; + assert(r2->rxbuf->tail <= r2->rxbuf->head); + + if (r2->rx_window < cache_param->h2_rx_window_low_water && + r2->state < H2_S_CLOS_REM) { + /* Kick the session thread so it can hand out an extended + * window to the peer. */ + h2_attention(h2); + } + + Lck_Unlock(&h2->sess->mtx); + return (retval); +} + +static void +h2_vfp_body_fini(struct vfp_ctx *vc, struct vfp_entry *vfe) +{ + struct h2_req *r2; + struct stv_buffer *stvbuf = NULL; + + CHECK_OBJ_NOTNULL(vc, VFP_CTX_MAGIC); + CHECK_OBJ_NOTNULL(vfe, VFP_ENTRY_MAGIC); + CAST_OBJ_NOTNULL(r2, vfe->priv1, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->req, REQ_MAGIC); + + ASSERT_H2_REQ(r2->h2sess); + + if (vc->failed) + h2_async_error(r2, H2SE_REFUSED_STREAM); + + CHECK_OBJ_ORNULL(r2->rxbuf, H2_RXBUF_MAGIC); + if (r2->state >= H2_S_CLOS_REM && r2->rxbuf != NULL) { + /* Free the buffer. This is safe without any locking + * because the session thread will only free the buffer as + * part of h2_del_req(), which won't be run as long as we + * are scheduled. 
*/ + AN(r2->scheduled); + stvbuf = r2->rxbuf->stvbuf; + r2->rxbuf = NULL; + STV_FreeBuf(vc->wrk, &stvbuf); + } +} + +static const struct vfp h2_body = { + .name = "H2_BODY", + .pull = h2_vfp_body, + .fini = h2_vfp_body_fini, +}; + +void v_matchproto_(vtr_req_body_t) +h2_reqbody(struct req *req) +{ + struct h2_req *r2; + struct vfp_entry *vfe; + + CHECK_OBJ(req, REQ_MAGIC); + CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC); + vfe = VFP_Push(req->vfc, &h2_body); + AN(vfe); + vfe->priv1 = r2; +} diff --git a/bin/varnishd/http2/cache_http2_send.c b/bin/varnishd/http2/cache_http2_send.c index d4e66aab8c..62093b152e 100644 --- a/bin/varnishd/http2/cache_http2_send.c +++ b/bin/varnishd/http2/cache_http2_send.c @@ -32,414 +32,593 @@ #include "config.h" #include +#include +#include #include "cache/cache_varnishd.h" - #include "cache/cache_transport.h" #include "http2/cache_http2.h" #include "vend.h" #include "vtim.h" -#define H2_SEND_HELD(h2, r2) (VTAILQ_FIRST(&(h2)->txqueue) == (r2)) +static void +h2_send_vsl(struct vsl_log *vsl, const void *ptr, size_t len) +{ + const uint8_t *b; + struct vsb *vsb; + const char *p; + unsigned u; + + if (VSL_tag_is_masked(SLT_H2TxHdr) && + VSL_tag_is_masked(SLT_H2TxBody)) + return; + + AN(ptr); + assert(len >= 9); + b = ptr; -static h2_error -h2_errcheck(const struct h2_req *r2, const struct h2_sess *h2) + vsb = VSB_new_auto(); + AN(vsb); + p = h2_framename(b[3]); + if (p != NULL) + VSB_cat(vsb, p); + else + VSB_quote(vsb, b + 3, 1, VSB_QUOTE_HEX); + + u = vbe32dec(b) >> 8; + VSB_printf(vsb, "[%u] ", u); + VSB_quote(vsb, b + 4, 1, VSB_QUOTE_HEX); + VSB_putc(vsb, ' '); + VSB_quote(vsb, b + 5, 4, VSB_QUOTE_HEX); + AZ(VSB_finish(vsb)); + VSLb_bin(vsl, SLT_H2TxHdr, 9, b); + if (len > 9) + VSLb_bin(vsl, SLT_H2TxBody, len - 9, b + 9); + + VSLb(vsl, SLT_Debug, "H2TXF %s", VSB_data(vsb)); + VSB_destroy(&vsb); +} + +static void +h2_mk_hdr(uint8_t *hdr, h2_frame ftyp, uint8_t flags, + uint32_t len, uint32_t stream) { - CHECK_OBJ_NOTNULL(r2, 
H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - if (r2->error != NULL) - return (r2->error); - if (h2->error != NULL && r2->stream > h2->goaway_last_stream) - return (h2->error); - return (NULL); + AN(hdr); + AZ(flags & ~(ftyp->flags)); + if (stream == 0) + AZ(ftyp->act_szero); + else + AZ(ftyp->act_snonzero); + assert(len < (1U << 24)); + vbe32enc(hdr, len << 8); + hdr[3] = ftyp->type; + hdr[4] = flags; + vbe32enc(hdr + 5, stream); } -static int -h2_cond_wait(pthread_cond_t *cond, struct h2_sess *h2, struct h2_req *r2) +static int64_t +h2_win_limit(const struct h2_req *r2) { - vtim_dur tmo = 0.; - vtim_real now; - h2_error h2e; - int r; - AN(cond); - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); - Lck_AssertHeld(&h2->sess->mtx); + return (vmin_t(int64_t, r2->tx_window, r2->h2sess->tx_window)); +} - if (cache_param->h2_window_timeout > 0.) - tmo = cache_param->h2_window_timeout; +static void +h2_win_charge(struct h2_req *r2, uint32_t w) +{ + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + CHECK_OBJ_NOTNULL(r2->h2sess, H2_SESS_MAGIC); - r = Lck_CondWaitTimeout(cond, &h2->sess->mtx, tmo); - assert(r == 0 || r == ETIMEDOUT); + r2->tx_window -= w; + r2->h2sess->tx_window -= w; +} - now = VTIM_real(); +static int +h2_send_small(struct h2_sess *h2, h2_frame ftyp, uint8_t flags, + uint32_t stream, uint32_t len, const void *ptr) +{ - /* NB: when we grab h2_window_timeout before acquiring the session - * lock we may time out, but once we wake up both send_timeout and - * h2_window_timeout may have changed meanwhile. For this reason - * h2_stream_tmo() may not log what timed out and we need to call - * again with a magic NAN "now" that indicates to h2_stream_tmo() - * that the stream reached the h2_window_timeout via the lock and - * force it to log it. 
- */ - h2e = h2_stream_tmo(h2, r2, now); - if (h2e == NULL && r == ETIMEDOUT) { - h2e = h2_stream_tmo(h2, r2, NAN); - AN(h2e); + ASSERT_H2_SESS(h2); + AN(ftyp); + AZ(flags & ~(ftyp->flags)); + if (stream == 0) + AZ(ftyp->act_szero); + else + AZ(ftyp->act_snonzero); + assert(len + 9 <= pdiff(h2->tx_s_start, h2->tx_s_end)); + if (len > 0) + AN(ptr); + + while (len + 9 > pdiff(h2->tx_s_head, h2->tx_s_end)) { + /* Send something (up until h2->deadline) to free up space. */ + if (H2_Send_Something(h2) < 0) + return (-1); + } + + h2_mk_hdr(h2->tx_s_head, ftyp, flags, len, stream); + h2->tx_s_head += 9; + if (len > 0) { + memcpy(h2->tx_s_head, ptr, len); + h2->tx_s_head += len; } + assert(h2->tx_s_head <= h2->tx_s_end); + h2_send_vsl(h2->vsl, h2->tx_s_head - (9 + len), 9 + len); - if (r2->error == NULL) - r2->error = h2e; + h2->srq->acct.resp_hdrbytes += 9; + if (ftyp->overhead) + h2->srq->acct.resp_bodybytes += len; - return (h2e != NULL ? -1 : 0); + return (0); } -static void -h2_send_get_locked(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) +int +H2_Send_RST(struct h2_sess *h2, uint32_t stream, h2_error h2e) { + uint8_t buf[4]; - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - - Lck_AssertHeld(&h2->sess->mtx); - if (&wrk->cond == h2->cond) - ASSERT_RXTHR(h2); - r2->wrk = wrk; - VTAILQ_INSERT_TAIL(&h2->txqueue, r2, tx_list); - while (!H2_SEND_HELD(h2, r2)) - AZ(Lck_CondWait(&wrk->cond, &h2->sess->mtx)); - r2->wrk = NULL; + vbe32enc(buf, h2e->val); + return (h2_send_small(h2, H2_F_RST_STREAM, 0, stream, + sizeof buf, buf)); } -void -H2_Send_Get(struct worker *wrk, struct h2_sess *h2, struct h2_req *r2) +int +H2_Send_SETTINGS(struct h2_sess *h2, uint8_t flags, ssize_t len, + const uint8_t *buf) { - - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - - Lck_Lock(&h2->sess->mtx); - h2_send_get_locked(wrk, h2, r2); - 
Lck_Unlock(&h2->sess->mtx); + if (flags & H2FF_ACK) + assert(len == 0); + return (h2_send_small(h2, H2_F_SETTINGS, flags, 0, len, buf)); } -static void -h2_send_rel_locked(struct h2_sess *h2, const struct h2_req *r2) +int +H2_Send_PING(struct h2_sess *h2, uint8_t flags, uint64_t data) { - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - - Lck_AssertHeld(&h2->sess->mtx); - AN(H2_SEND_HELD(h2, r2)); - VTAILQ_REMOVE(&h2->txqueue, r2, tx_list); - r2 = VTAILQ_FIRST(&h2->txqueue); - if (r2 != NULL) { - CHECK_OBJ_NOTNULL(r2->wrk, WORKER_MAGIC); - PTOK(pthread_cond_signal(&r2->wrk->cond)); - } + return (h2_send_small(h2, H2_F_PING, flags, 0, sizeof data, &data)); } -void -H2_Send_Rel(struct h2_sess *h2, const struct h2_req *r2) +int +H2_Send_GOAWAY(struct h2_sess *h2, uint32_t last_stream_id, h2_error h2e) { - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + uint8_t buf[8]; - Lck_Lock(&h2->sess->mtx); - h2_send_rel_locked(h2, r2); - Lck_Unlock(&h2->sess->mtx); + vbe32enc(&buf[0], last_stream_id); + vbe32enc(&buf[4], h2e->val); + return (h2_send_small(h2, H2_F_GOAWAY, 0, 0, sizeof buf, buf)); } -static void -h2_mk_hdr(uint8_t *hdr, h2_frame ftyp, uint8_t flags, - uint32_t len, uint32_t stream) +int +H2_Send_WINDOW_UPDATE(struct h2_sess *h2, uint32_t stream, uint32_t incr) { + uint8_t buf[4]; - AN(hdr); - assert(len < (1U << 24)); - vbe32enc(hdr, len << 8); - hdr[3] = ftyp->type; - hdr[4] = flags; - vbe32enc(hdr + 5, stream); + vbe32enc(&buf[0], incr); + return (h2_send_small(h2, H2_F_WINDOW_UPDATE, 0, stream, + sizeof buf, buf)); } -/* - * This is the "raw" frame sender, all per-stream accounting and - * prioritization must have happened before this is called, and - * the session mtx must be held. 
- */ +struct h2_send_large { + unsigned magic; +#define H2_SEND_LARGE_MAGIC 0x478020e3 -void -H2_Send_Frame(struct worker *wrk, struct h2_sess *h2, - h2_frame ftyp, uint8_t flags, - uint32_t len, uint32_t stream, const void *ptr) + char last; + char started; + char returned; + + uint8_t flags; + h2_frame ftyp; + + VTAILQ_ENTRY(h2_send_large) list; + + pthread_cond_t cond; + + struct h2_req *r2; + + const void *ptr; + uint32_t len; + uint32_t count; +}; + +int +H2_Send(struct vsl_log *vsl, struct h2_req *r2, h2_frame ftyp, uint8_t flags, + uint32_t len, const void *ptr) { - uint8_t hdr[9]; - ssize_t s; - struct iovec iov[2]; + struct h2_sess *h2; + struct h2_send_large large; + h2_error h2e; - (void)wrk; + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + h2 = r2->h2sess; + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - AN(ftyp); + ASSERT_H2_REQ(h2); + + assert(ftyp == H2_F_HEADERS || ftyp == H2_F_DATA); AZ(flags & ~(ftyp->flags)); - if (stream == 0) - AZ(ftyp->act_szero); - else - AZ(ftyp->act_snonzero); - h2_mk_hdr(hdr, ftyp, flags, len, stream); + h2e = h2_errcheck(r2); + if (h2e != NULL) { + VSLb(vsl, SLT_Error, "H2: send error (%s)", h2e->name); + return (-1); + } + + assert(r2->state > H2_S_IDLE); + if (r2->state >= H2_S_CLOSED) { + VSLb(vsl, SLT_Error, "H2: send on closed stream"); + return (-1); + } + + INIT_OBJ(&large, H2_SEND_LARGE_MAGIC); + PTOK(pthread_cond_init(&large.cond, NULL)); + + large.ftyp = ftyp; + large.flags = flags; + large.r2 = r2; + large.ptr = ptr; + large.len = len; + Lck_Lock(&h2->sess->mtx); - VSLb_bin(h2->vsl, SLT_H2TxHdr, 9, hdr); - h2->srq->acct.resp_hdrbytes += 9; - if (ftyp->overhead) - h2->srq->acct.resp_bodybytes += len; - Lck_Unlock(&h2->sess->mtx); - memset(iov, 0, sizeof iov); - iov[0].iov_base = (void*)hdr; - iov[0].iov_len = sizeof hdr; - iov[1].iov_base = TRUST_ME(ptr); - iov[1].iov_len = len; - s = writev(h2->sess->fd, iov, len == 0 ? 
1 : 2); - if (s != sizeof hdr + len) { - if (errno == EWOULDBLOCK) { - H2S_Lock_VSLb(h2, SLT_SessError, - "H2: stream %u: Hit idle_send_timeout", stream); - } - else { - H2S_Lock_VSLb(h2, SLT_Debug, - "H2: stream %u: write error s=%zd/%zu errno=%d", - stream, s, sizeof hdr + len, errno); - } - /* - * There is no point in being nice here, we will be unable - * to send a GOAWAY once the code unrolls, so go directly - * to the finale and be done with it. - */ - h2->error = H2CE_PROTOCOL_ERROR; - } else if (len > 0) { - Lck_Lock(&h2->sess->mtx); - VSLb_bin(h2->vsl, SLT_H2TxBody, len, ptr); - Lck_Unlock(&h2->sess->mtx); + if (!h2->tx_stopped) { + VTAILQ_INSERT_TAIL(&h2->tx_l_queue, &large, list); + h2->tx_l_stuck = 0; + h2_attention(h2); + + AZ(Lck_CondWait(&large.cond, &h2->sess->mtx)); + AN(large.returned); /* Sanity check */ + /* Note: We will have been removed from the `h2->tx_l_queue` + * list by the signaller. */ } -} -static int64_t -h2_win_limit(const struct h2_req *r2, const struct h2_sess *h2) -{ + h2e = h2_errcheck(r2); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(h2->req0, H2_REQ_MAGIC); + Lck_Unlock(&h2->sess->mtx); + + PTOK(pthread_cond_destroy(&large.cond)); + large.magic = 0; + + if (h2e != NULL) { + VSLb(vsl, SLT_Error, "H2: send error (%s)", h2e->name); + return (-1); + } - Lck_AssertHeld(&h2->sess->mtx); - return (vmin_t(int64_t, r2->t_window, h2->req0->t_window)); + return (0); } static void -h2_win_charge(struct h2_req *r2, const struct h2_sess *h2, uint32_t w) +h2_send_prep_large(struct h2_sess *h2, struct h2_send_large *large) { - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + struct h2_req *r2; + uint8_t flags; + ssize_t l, limit; + h2_frame ftyp; + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(h2->req0, H2_REQ_MAGIC); + AZ(h2->tx_l_current); - Lck_AssertHeld(&h2->sess->mtx); - r2->t_window -= w; - h2->req0->t_window -= w; + CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC); + AN(large->ftyp); 
+ r2 = large->r2; + CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); + + assert(large->ftyp == H2_F_DATA || large->ftyp == H2_F_HEADERS || + large->ftyp == H2_F_PUSH_PROMISE); + AN(large->ftyp->continuation); + + l = large->len - large->count; + if (l > h2->remote_settings.max_frame_size) + l = h2->remote_settings.max_frame_size; + + if (large->ftyp->respect_window) { + limit = h2_win_limit(r2); + assert(limit > 0); + if (l > limit) + l = limit; + h2_win_charge(r2, l); + if (r2->t_win_low == 0. && r2->tx_window == 0) { + /* The send window is low. Set a timestamp to + * record when this happened, so that we can + * become emo if the window isn't extended + * promptly. */ + /* XXX: This mechanism would be more effective if + * we had some threshold (10% of initial window + * size or something. */ + r2->t_win_low = VTIM_real(); + h2->win_low_streams++; + } + } + assert(large->count + l <= large->len); + + ftyp = large->ftyp; + flags = large->flags; + AZ(flags & ~(ftyp->flags)); + + if (large->count > 0) { + /* This is a continuation. Switch frame type and mask out + * the flags not defined on its continuation type. */ + ftyp = ftyp->continuation; + AN(ftyp); + flags &= ftyp->flags; + } + + if (large->count + l < large->len) { + /* We are breaking it up into smaller frames. Clear the + * last marker from the flags if present. */ + flags &= ~(ftyp->final_flags); + } + + h2_mk_hdr(h2->tx_l_hdrbuf, ftyp, flags, l, r2->stream); + h2_send_vsl(h2->vsl, h2->tx_l_hdrbuf, 9); + h2->tx_vec[0].iov_base = h2->tx_l_hdrbuf; + h2->tx_vec[0].iov_len = 9; + if (l == 0) { + /* Zero payload frame is valid. Will be used on + * "chunked encoding" and the end of stream is + * found. 
*/ + h2->tx_nvec = 1; + } else { + h2->tx_vec[1].iov_base = + TRUST_ME((uintptr_t)large->ptr + large->count); + h2->tx_vec[1].iov_len = l; + h2->tx_nvec = 2; + large->count += l; + } + h2->tx_l_current = large; + + /* Charge the session accounting for the protocol bytes */ + h2->srq->acct.resp_hdrbytes += 9; + if (ftyp->overhead) + h2->srq->acct.resp_bodybytes += l; + + /* Charge the request accounting for HEADERS and DATA frames */ + if (large->ftyp == H2_F_HEADERS) + r2->req->acct.resp_hdrbytes += l; + else if (large->ftyp == H2_F_DATA) + r2->req->acct.resp_bodybytes += l; } -static int64_t -h2_do_window(struct worker *wrk, struct h2_req *r2, - struct h2_sess *h2, int64_t wanted) +ssize_t +H2_Send_TxStuff(struct h2_sess *h2) { - int64_t w = 0; + struct h2_send_large *large; + ssize_t l, ltot = 0; + int err = 0; + + ASSERT_H2_SESS(h2); - CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + AZ(h2->tx_stopped); + + if (h2->tx_nvec == 0 && h2->tx_s_head != h2->tx_s_start) { + /* Prioritise sending the small frames */ + assert(h2->tx_s_start < h2->tx_s_head); + assert(h2->tx_s_head <= h2->tx_s_end); + assert(h2->tx_s_mark == h2->tx_s_start); + h2->tx_vec[0].iov_base = h2->tx_s_start; + h2->tx_vec[0].iov_len = h2->tx_s_head - h2->tx_s_start; + h2->tx_nvec = 1; + h2->tx_s_mark = h2->tx_s_head; + } else if (h2->tx_nvec == 0) { + /* Construct a large frame from the queue (if possible + * considering the current windows). If we ever implement + * priorities, this would be the place to take them into + * account. 
*/ + Lck_Lock(&h2->sess->mtx); - if (wanted == 0) - return (0); + VTAILQ_FOREACH(large, &h2->tx_l_queue, list) { + CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC); + CHECK_OBJ_NOTNULL(large->r2, H2_REQ_MAGIC); + assert(large->count <= large->len); + AN(large->ftyp); + + if (h2_errcheck(large->r2) != NULL) { + VTAILQ_REMOVE(&h2->tx_l_queue, large, list); + large->returned = 1; + PTOK(pthread_cond_signal(&large->cond)); + continue; + } - Lck_Lock(&h2->sess->mtx); - if (r2->t_window <= 0 || h2->req0->t_window <= 0) { - r2->t_winupd = VTIM_real(); - h2_send_rel_locked(h2, r2); + if (!large->ftyp->respect_window) + break; - assert(h2->winup_streams >= 0); - h2->winup_streams++; + if (h2->tx_window <= 0) { + /* If the session window is empty, none of + * the respect_window frame types can be + * selected. */ + continue; + } - while (r2->t_window <= 0 && h2_errcheck(r2, h2) == NULL) { - r2->cond = &wrk->cond; - (void)h2_cond_wait(r2->cond, h2, r2); - r2->cond = NULL; + if (large->r2->tx_window > 0) + break; } - while (h2->req0->t_window <= 0 && h2_errcheck(r2, h2) == NULL) - (void)h2_cond_wait(h2->winupd_cond, h2, r2); - - if (h2_errcheck(r2, h2) == NULL) { - w = vmin_t(int64_t, h2_win_limit(r2, h2), wanted); - h2_win_charge(r2, h2, w); - assert (w > 0); + if (large == NULL) { + /* Tx is unable to make progress until there has + * been a window update. 
*/ + h2->tx_l_stuck = 1; + } else { + h2->tx_l_stuck = 0; } - if (r2->error == H2SE_BROKE_WINDOW && - h2->open_streams <= h2->winup_streams) { - VSLb(h2->vsl, SLT_SessError, "H2: window bankrupt"); - h2->error = r2->error = H2CE_BANKRUPT; - } + Lck_Unlock(&h2->sess->mtx); - assert(h2->winup_streams > 0); - h2->winup_streams--; + if (large == NULL) + return (0); - h2_send_get_locked(wrk, h2, r2); + h2_send_prep_large(h2, large); } - if (w == 0 && h2_errcheck(r2, h2) == NULL) { - assert(r2->t_window > 0); - assert(h2->req0->t_window > 0); - w = h2_win_limit(r2, h2); - if (w > wanted) - w = wanted; - h2_win_charge(r2, h2, w); - assert (w > 0); + assert(h2->tx_nvec > 0); + while (h2->tx_nvec > 0) { + l = writev(h2->sess->fd, h2->tx_vec, h2->tx_nvec); + if (l < 0) { + /* Save the value of errno. This is strictly not + * necessary as none of the calls between here and + * the return should update errno, but done for + * future proofing. */ + err = errno; + break; + } + + assert(l > 0); + VIOV_prune(h2->tx_vec, &h2->tx_nvec, l); + ltot += l; } - r2->t_winupd = 0; - Lck_Unlock(&h2->sess->mtx); - return (w); + + if (h2->tx_nvec == 0 && h2->tx_l_current != NULL) { + /* We have just finished sending a large frame. */ + assert(h2->tx_s_mark == h2->tx_s_start); + + TAKE_OBJ_NOTNULL(large, &h2->tx_l_current, H2_SEND_LARGE_MAGIC); + AZ(h2->tx_l_current); + + AN(large->ftyp); + + assert(large->count <= large->len); + if (large->count == large->len) { + if (large->flags & H2FF_END_STREAM) + h2_stream_setstate(large->r2, H2_S_CLOSED); + + /* Signal that we are finished */ + Lck_Lock(&h2->sess->mtx); + VTAILQ_REMOVE(&h2->tx_l_queue, large, list); + PTOK(pthread_cond_signal(&large->cond)); + large->returned = 1; + Lck_Unlock(&h2->sess->mtx); + } else if (large->ftyp == H2_F_HEADERS || + large->ftyp == H2_F_PUSH_PROMISE) { + /* A CONTINUATION frame must come immediately + * after the previous + * HEADER|PUSH_PROMISE|CONTINUATION frame. 
Prepare + * the `large` again, which will force that to be + * the next output. */ + h2_send_prep_large(h2, large); + assert(large == h2->tx_l_current); + assert(h2->tx_nvec > 0); + } + } else if (h2->tx_nvec == 0) { + /* We have just finished sending the small buffer */ + assert(h2->tx_s_start < h2->tx_s_mark); + assert(h2->tx_s_mark <= h2->tx_s_head); + assert(h2->tx_s_head <= h2->tx_s_end); + memmove(h2->tx_s_start, h2->tx_s_mark, + h2->tx_s_head - h2->tx_s_mark); + h2->tx_s_head -= h2->tx_s_mark - h2->tx_s_start; + h2->tx_s_mark = h2->tx_s_start; + } + + if (ltot > 0) + return (ltot); + + errno = err; + return (-1); } -/* - * This is the per-stream frame sender. - * XXX: priority - */ +int +H2_Send_Something(struct h2_sess *h2) +{ + ssize_t l; + vtim_real now; + struct pollfd pfd[1]; + + /* Block up until h2->deadline and then send something. */ + + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + ASSERT_H2_SESS(h2); + AZ(h2->tx_stopped); + + assert(h2->sess->fd >= 0); + pfd->fd = h2->sess->fd; + pfd->events = POLLOUT; + + do { + now = VTIM_real(); + if (now > h2->deadline) + goto error; + l = poll(pfd, 1, VTIM_poll_tmo(h2->deadline - now)); + } while (l < 0 && errno == EINTR); + + if (l == 0 || !(pfd->revents & POLLOUT)) + goto error; + + l = H2_Send_TxStuff(h2); + if (l < 0 && errno != EWOULDBLOCK) + goto error; + + return (0); + +error: + /* Failure to send on the socket (IO error or timeout). 
*/ + if (h2->error == NULL) + h2->error = H2CE_IO_ERROR; + return (-1); +} + +int +H2_Send_Pending(struct h2_sess *h2) +{ + CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); + ASSERT_H2_SESS(h2); + + if (h2->tx_nvec > 0) + return (1); + if (h2->tx_s_head != h2->tx_s_start) + return (1); + if (!VTAILQ_EMPTY(&h2->tx_l_queue) && !h2->tx_l_stuck) + return (1); + return (0); +} static void -h2_send(struct worker *wrk, struct h2_req *r2, h2_frame ftyp, uint8_t flags, - uint32_t len, const void *ptr, uint64_t *counter) +h2_send_close(struct h2_sess *h2, unsigned stop) { - struct h2_sess *h2; - uint32_t mfs, tf; - const char *p; - uint8_t final_flags; + struct h2_send_large *large, *large2; - CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - h2 = r2->h2sess; CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - assert(len == 0 || ptr != NULL); - AN(counter); + ASSERT_H2_SESS(h2); - AN(H2_SEND_HELD(h2, r2)); + Lck_Lock(&h2->sess->mtx); - if (h2_errcheck(r2, h2) != NULL) - return; + /* A session error state should have been set prior to calling + * this function. */ + AN(h2->error); + AZ(h2->tx_stopped); - AN(ftyp); - AZ(flags & ~(ftyp->flags)); - if (r2->stream == 0) - AZ(ftyp->act_szero); - else - AZ(ftyp->act_snonzero); + if (stop) { + h2->tx_stopped = 1; - Lck_Lock(&h2->sess->mtx); - mfs = h2->remote_settings.max_frame_size; - if (r2->counted && ( - (ftyp == H2_F_HEADERS && (flags & H2FF_HEADERS_END_STREAM)) || - (ftyp == H2_F_DATA && (flags & H2FF_DATA_END_STREAM)) || - ftyp == H2_F_RST_STREAM - )) { - assert(h2->open_streams > 0); - h2->open_streams--; - r2->counted = 0; + CHECK_OBJ_ORNULL(h2->tx_l_current, H2_SEND_LARGE_MAGIC); + if (h2->tx_l_current != NULL) { + /* Abort the large frame */ + h2->tx_l_current = NULL; + h2->tx_nvec = 0; + } } - Lck_Unlock(&h2->sess->mtx); - if (ftyp->respect_window) { - tf = h2_do_window(wrk, r2, h2, (len > mfs) ? 
mfs : len); - if (h2_errcheck(r2, h2) != NULL) - return; - AN(H2_SEND_HELD(h2, r2)); - } else - tf = mfs; - - if (len <= tf) { - H2_Send_Frame(wrk, h2, ftyp, flags, len, r2->stream, ptr); - *counter += len; - } else { - AN(ptr); - p = ptr; - final_flags = ftyp->final_flags & flags; - flags &= ~ftyp->final_flags; - do { - AN(ftyp->continuation); - if (!ftyp->respect_window) - tf = mfs; - if (ftyp->respect_window && p != ptr) { - tf = h2_do_window(wrk, r2, h2, - (len > mfs) ? mfs : len); - if (h2_errcheck(r2, h2) != NULL) - return; - AN(H2_SEND_HELD(h2, r2)); - } - if (tf < len) { - H2_Send_Frame(wrk, h2, ftyp, - flags, tf, r2->stream, p); - } else { - if (ftyp->respect_window) - assert(tf == len); - tf = len; - H2_Send_Frame(wrk, h2, ftyp, final_flags, tf, - r2->stream, p); - flags = 0; - } - p += tf; - len -= tf; - *counter += tf; - ftyp = ftyp->continuation; - flags &= ftyp->flags; - final_flags &= ftyp->flags; - } while (h2->error == NULL && len > 0); + VTAILQ_FOREACH_SAFE(large, &h2->tx_l_queue, list, large2) { + CHECK_OBJ_NOTNULL(large, H2_SEND_LARGE_MAGIC); + if (large == h2->tx_l_current) + continue; + VTAILQ_REMOVE(&h2->tx_l_queue, large, list); + large->returned = 1; + PTOK(pthread_cond_signal(&large->cond)); } + + Lck_Unlock(&h2->sess->mtx); } void -H2_Send_RST(struct worker *wrk, struct h2_sess *h2, const struct h2_req *r2, - uint32_t stream, h2_error h2e) +H2_Send_Shutdown(struct h2_sess *h2) { - char b[4]; - - CHECK_OBJ_NOTNULL(h2, H2_SESS_MAGIC); - CHECK_OBJ_NOTNULL(r2, H2_REQ_MAGIC); - AN(H2_SEND_HELD(h2, r2)); - AN(h2e); - - H2S_Lock_VSLb(h2, SLT_Debug, "H2: stream %u: %s", stream, h2e->txt); - vbe32enc(b, h2e->val); - - H2_Send_Frame(wrk, h2, H2_F_RST_STREAM, 0, sizeof b, stream, b); + h2_send_close(h2, 0); } void -H2_Send(struct worker *wrk, struct h2_req *r2, h2_frame ftyp, uint8_t flags, - uint32_t len, const void *ptr, uint64_t *counter) +H2_Send_Stop(struct h2_sess *h2) { - uint64_t dummy_counter = 0; - h2_error h2e; - - if (counter == NULL) - 
counter = &dummy_counter; - - h2_send(wrk, r2, ftyp, flags, len, ptr, counter); - - h2e = h2_errcheck(r2, r2->h2sess); - if (H2_ERROR_MATCH(h2e, H2SE_CANCEL)) - H2_Send_RST(wrk, r2->h2sess, r2, r2->stream, h2e); + h2_send_close(h2, 1); } diff --git a/bin/varnishd/http2/cache_http2_session.c b/bin/varnishd/http2/cache_http2_session.c index 46b02dc094..009b4ae128 100644 --- a/bin/varnishd/http2/cache_http2_session.c +++ b/bin/varnishd/http2/cache_http2_session.c @@ -31,10 +31,10 @@ #include "config.h" -#include "cache/cache_varnishd.h" - +#include #include +#include "cache/cache_varnishd.h" #include "cache/cache_transport.h" #include "http2/cache_http2.h" @@ -88,30 +88,6 @@ h2_local_settings(struct h2_settings *h2s) h2s->max_header_list_size = cache_param->http_req_size; } -void -H2S_Lock_VSLb(const struct h2_sess *h2, enum VSL_tag_e tag, const char *fmt, ...) -{ - va_list ap; - int held = 0; - - AN(h2); - - if (VSL_tag_is_masked(tag)) - return; - - if (h2->highest_stream > 0) { - held = 1; - Lck_Lock(&h2->sess->mtx); - } - - va_start(ap, fmt); - VSLbv(h2->vsl, tag, fmt, ap); - va_end(ap); - - if (held) - Lck_Unlock(&h2->sess->mtx); -} - /********************************************************************** * The h2_sess struct needs many of the same things as a request, * WS, VSL, HTC &c, but rather than implement all that stuff over, we @@ -120,20 +96,20 @@ H2S_Lock_VSLb(const struct h2_sess *h2, enum VSL_tag_e tag, const char *fmt, ... */ static struct h2_sess * -h2_init_sess(struct sess *sp, - struct h2_sess *h2s, struct req *srq, struct h2h_decode *decode) +h2_init_sess(struct sess *sp, struct h2_sess *h2s, struct req **psrq, + struct h2h_decode *decode) { + struct req *srq; uintptr_t *up; struct h2_sess *h2; + TAKE_OBJ_NOTNULL(srq, psrq, REQ_MAGIC); + /* proto_priv session attribute will always have been set up by H1 * before reaching here. 
*/ AZ(SES_Get_proto_priv(sp, &up)); assert(*up == 0); - if (srq == NULL) - srq = Req_New(sp, NULL); - AN(srq); h2 = h2s; AN(h2); INIT_OBJ(h2, H2_SESS_MAGIC); @@ -146,12 +122,15 @@ h2_init_sess(struct sess *sp, h2->htc->rfd = &sp->fd; h2->sess = sp; h2->rxthr = pthread_self(); - PTOK(pthread_cond_init(h2->winupd_cond, NULL)); VTAILQ_INIT(&h2->streams); - VTAILQ_INIT(&h2->txqueue); h2_local_settings(&h2->local_settings); h2->remote_settings = H2_proto_settings; h2->decode = decode; + h2->expect_settings_next = 1; + VEFD_INIT(h2->efd); + + h2->tx_window = h2->remote_settings.initial_window_size; + h2->rx_window = h2->local_settings.initial_window_size; h2->rapid_reset = cache_param->h2_rapid_reset; h2->rapid_reset_limit = cache_param->h2_rapid_reset_limit; @@ -163,6 +142,19 @@ h2_init_sess(struct sess *sp, AZ(VHT_Init(h2->dectbl, h2->local_settings.header_table_size)); + /* Allocate a scratch space to use for staging small outgoing + * frames. */ + h2->tx_s_start = WS_Alloc(h2->ws, H2_TX_BUFSIZE); + AN(h2->tx_s_start); + h2->tx_s_end = h2->tx_s_start + H2_TX_BUFSIZE; + h2->tx_s_head = h2->tx_s_start; + h2->tx_s_mark = h2->tx_s_start; + + /* Init send queue */ + VTAILQ_INIT(&h2->tx_l_queue); + + h2->htc->pipeline_snap = WS_Snapshot(h2->ws); + *up = (uintptr_t)h2; return (h2); @@ -180,7 +172,8 @@ h2_del_sess(struct worker *wrk, struct h2_sess *h2, stream_close_t reason) AN(reason); VHT_Fini(h2->dectbl); - PTOK(pthread_cond_destroy(h2->winupd_cond)); + if (h2->efd->poll_fd >= 0) + VEFD_Close(h2->efd); TAKE_OBJ_NOTNULL(req, &h2->srq, REQ_MAGIC); assert(!WS_IsReserved(req->ws)); sp = h2->sess; @@ -264,28 +257,33 @@ h2_b64url_settings(struct h2_sess *h2, struct req *req) /**********************************************************************/ -static int -h2_ou_rel(struct worker *wrk, struct req *req) +static void +h2_ou_rel_req(struct worker *wrk, struct req **preq) { + struct req *req; + CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); - CHECK_OBJ_NOTNULL(req, REQ_MAGIC); + 
TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC); AZ(req->vcl); Req_AcctLogCharge(wrk->stats, req); Req_Release(req); - return (0); } -static int +static struct h2_req * h2_ou_session(struct worker *wrk, struct h2_sess *h2, - struct req *req) + struct req **preq) { + struct req *req; ssize_t sz; enum htc_status_e hs; struct h2_req *r2; + TAKE_OBJ_NOTNULL(req, preq, REQ_MAGIC); + if (h2_b64url_settings(h2, req)) { VSLb(h2->vsl, SLT_Debug, "H2: Bad HTTP-Settings"); - return (h2_ou_rel(wrk, req)); + h2_ou_rel_req(wrk, &req); + return (NULL); } sz = write(h2->sess->fd, h2_resp_101, strlen(h2_resp_101)); @@ -293,13 +291,11 @@ h2_ou_session(struct worker *wrk, struct h2_sess *h2, if (sz != strlen(h2_resp_101)) { VSLb(h2->vsl, SLT_Debug, "H2: Upgrade: Error writing 101" " response: %s\n", VAS_errtxt(errno)); - return (h2_ou_rel(wrk, req)); + h2_ou_rel_req(wrk, &req); + return (NULL); } - http_Unset(req->http, H_Upgrade); - http_Unset(req->http, H_HTTP2_Settings); - - /* Steal pipelined read-ahead, if any */ + /* Copy any pipelined data from the request into the session. */ h2->htc->pipeline_b = req->htc->pipeline_b; h2->htc->pipeline_e = req->htc->pipeline_e; req->htc->pipeline_b = NULL; @@ -309,36 +305,35 @@ h2_ou_session(struct worker *wrk, struct h2_sess *h2, do about the overflowing data is an open issue. 
*/ HTC_RxInit(h2->htc, h2->ws); - /* Start req thread */ - r2 = h2_new_req(h2, 1, req); - AZ(h2->highest_stream); - h2->highest_stream = r2->stream; - req->transport = &HTTP2_transport; - assert(req->req_step == R_STP_TRANSPORT); - req->task->func = h2_do_req; - req->task->priv = req; - r2->scheduled = 1; - r2->state = H2_S_CLOS_REM; // rfc7540,l,489,491 - req->err_code = 0; - http_SetH(req->http, HTTP_HDR_PROTO, "HTTP/2.0"); - /* Wait for PRISM response */ hs = HTC_RxStuff(h2->htc, H2_prism_complete, NULL, NULL, NAN, h2->sess->t_idle + cache_param->timeout_idle, NAN, sizeof H2_prism); if (hs != HTC_S_COMPLETE) { VSLb(h2->vsl, SLT_Debug, "H2: No/Bad OU PRISM (hs=%d)", hs); - r2->scheduled = 0; - h2_del_req(wrk, r2); - return (0); + h2_ou_rel_req(wrk, &req); + return (NULL); } - if (Pool_Task(wrk->pool, req->task, TASK_QUEUE_REQ)) { - r2->scheduled = 0; - h2_del_req(wrk, r2); - VSLb(h2->vsl, SLT_Debug, "H2: No Worker-threads"); - return (0); - } - return (1); + + http_Unset(req->http, H_Upgrade); + http_Unset(req->http, H_HTTP2_Settings); + + /* Prepare the req thread, but do not start it. The RFC requires + * us to send our settings frame before any response frames, so we + * delay the start of the thread until after the settings frame + * has been sent. 
*/ + r2 = h2_new_req(h2, 1, &req); + AZ(req); + AZ(h2->highest_stream); + h2->highest_stream = r2->stream; + r2->req->transport = &HTTP2_transport; + assert(r2->req->req_step == R_STP_TRANSPORT); + r2->req->task->func = h2_do_req; + r2->req->task->priv = r2->req; + h2_stream_setstate(r2, H2_S_CLOS_REM); // rfc7540,l,489,491 + http_SetH(r2->req->http, HTTP_HDR_PROTO, "HTTP/2.0"); + + return (r2); } /********************************************************************** @@ -366,14 +361,15 @@ H2_OU_Sess(struct worker *wrk, struct sess *sp, struct req *req) static void v_matchproto_(task_func_t) h2_new_session(struct worker *wrk, void *arg) { - struct req *req; + struct req *req, *srq = NULL; struct sess *sp; struct h2_sess h2s; struct h2_sess *h2; - struct h2_req *r2, *r22; - int again; + struct h2_req *r2_ou = NULL; + uint16_t marker; uint8_t settings[48]; struct h2h_decode decode; + stream_close_t reason; size_t l; CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC); @@ -386,90 +382,110 @@ h2_new_session(struct worker *wrk, void *arg) assert(req->transport == &HTTP2_transport); - assert (req->err_code == H2_PU_MARKER || req->err_code == H2_OU_MARKER); + marker = req->err_code; + assert(marker == H2_PU_MARKER || marker == H2_OU_MARKER); + req->err_code = 0; + + if (marker == H2_PU_MARKER) { + /* Prior knowledge. The incoming req does not hold + * anything of value and can be repurposed as the session + * req (srq). */ + srq = req; + req = NULL; + } else { + /* Opportunistic upgrade. The incoming req holds the first + * stream H/1 received request. We will need a fresh req + * for srq. */ + srq = Req_New(sp, NULL); + } + CHECK_OBJ_NOTNULL(srq, REQ_MAGIC); - h2 = h2_init_sess(sp, &h2s, - req->err_code == H2_PU_MARKER ? 
req : NULL, &decode); - h2->req0 = h2_new_req(h2, 0, NULL); + h2 = h2_init_sess(sp, &h2s, &srq, &decode); + AZ(srq); + + CHECK_OBJ_NOTNULL(h2->htc, HTTP_CONN_MAGIC); AZ(h2->htc->priv); h2->htc->priv = h2; - AZ(wrk->vsl); - wrk->vsl = h2->vsl; - - if (req->err_code == H2_OU_MARKER && !h2_ou_session(wrk, h2, req)) { - assert(h2->refcnt == 1); - h2_del_req(wrk, h2->req0); - h2_del_sess(wrk, h2, SC_RX_JUNK); + /* Set up the eventfd for communication with request handling + * threads. */ + if (VEFD_Open(h2->efd) < 0) { + VSLb(h2->vsl, SLT_Error, "H2: Failed to create eventfd"); + h2_del_sess(wrk, h2, SC_OVERLOAD); wrk->vsl = NULL; return; } + + AZ(wrk->vsl); + wrk->vsl = h2->vsl; + + if (marker == H2_OU_MARKER) { + /* Deal with opportunistic upgrade. The upgrade request + * was received by HTTP/1 and is held in req. The response + * will be sent by H/2. Convert the req struct to an H/2 + * req. */ + AN(req); + r2_ou = h2_ou_session(wrk, h2, &req); + AZ(req); + if (r2_ou == NULL) { + h2_del_sess(wrk, h2, SC_RX_JUNK); + wrk->vsl = NULL; + return; + } + + CHECK_OBJ_NOTNULL(r2_ou, H2_REQ_MAGIC); + AZ(r2_ou->scheduled); + } else + VSLb(h2->vsl, SLT_Debug, "H2: Got pu PRISM"); + assert(HTC_S_COMPLETE == H2_prism_complete(h2->htc)); + + /* Initialize the workspace rx buffer. Some read overshoot data + * may be present as pipeline data. This sequence of calls + * basically just resets the WS, memmove()s the pipeline data + * first, and sets htc->rxbuf_[be] to the pipeline data. 
*/ HTC_RxPipeline(h2->htc, h2->htc->rxbuf_b + sizeof(H2_prism)); HTC_RxInit(h2->htc, h2->ws); - AN(WS_Reservation(h2->ws)); - VSLb(h2->vsl, SLT_Debug, "H2: Got pu PRISM"); + WS_ReleaseP(h2->htc->ws, h2->htc->rxbuf_e); THR_SetRequest(h2->srq); - AN(WS_Reservation(h2->ws)); + /* Send our settings */ l = h2_enc_settings(&h2->local_settings, settings, sizeof (settings)); - AN(WS_Reservation(h2->ws)); - H2_Send_Get(wrk, h2, h2->req0); - AN(WS_Reservation(h2->ws)); - H2_Send_Frame(wrk, h2, - H2_F_SETTINGS, H2FF_NONE, l, 0, settings); - AN(WS_Reservation(h2->ws)); - H2_Send_Rel(h2, h2->req0); - AN(WS_Reservation(h2->ws)); - - /* and off we go... */ - h2->cond = &wrk->cond; - - while (h2_rxframe(wrk, h2)) { - HTC_RxInit(h2->htc, h2->ws); - if (WS_Overflowed(h2->ws)) { - H2S_Lock_VSLb(h2, SLT_SessError, "H2: Empty Rx Workspace"); - h2->error = H2CE_INTERNAL_ERROR; - break; + H2_Send_SETTINGS(h2, H2FF_NONE, l, settings); + + if (r2_ou != NULL) { + /* Schedule the opportunistic request received over HTTP/1 + * as part of the upgrade. */ + AZ(r2_ou->scheduled); + r2_ou->scheduled = 1; + if (Pool_Task(wrk->pool, r2_ou->req->task, TASK_QUEUE_REQ)) { + /* We failed to schedule it. Make the client go + * away. + * + * Note: Calling h2_tx_goaway will set the + * h2->goaway flag, causing h2_rxframe() below to + * return failure without reading from the + * socket. */ + r2_ou->scheduled = 0; + VSLb(h2->vsl, SLT_Debug, "H2: No Worker-threads"); + h2_kill_req(wrk, h2, &r2_ou, H2SE_ENHANCE_YOUR_CALM); + h2->error = H2CE_ENHANCE_YOUR_CALM; } - AN(WS_Reservation(h2->ws)); + r2_ou = NULL; } - AN(h2->error); + /* and off we go... 
*/ + h2_run(wrk, h2); - /* Delete all idle streams */ - Lck_Lock(&h2->sess->mtx); - VSLb(h2->vsl, SLT_Debug, "H2 CLEANUP %s", h2->error->name); - VTAILQ_FOREACH(r2, &h2->streams, list) { - if (r2->error == 0) - r2->error = h2->error; - if (r2->cond != NULL) - PTOK(pthread_cond_signal(r2->cond)); - } - PTOK(pthread_cond_broadcast(h2->winupd_cond)); - Lck_Unlock(&h2->sess->mtx); - while (1) { - again = 0; - VTAILQ_FOREACH_SAFE(r2, &h2->streams, list, r22) { - if (r2 != h2->req0) { - h2_kill_req(wrk, h2, r2, h2->error); - again++; - } - } - if (!again) - break; - Lck_Lock(&h2->sess->mtx); - VTAILQ_FOREACH(r2, &h2->streams, list) - VSLb(h2->vsl, SLT_Debug, "ST %u %d", - r2->stream, r2->state); - (void)Lck_CondWaitTimeout(h2->cond, &h2->sess->mtx, .1); - Lck_Unlock(&h2->sess->mtx); + AN(h2->error); + reason = h2->error->reason; + if (reason == SC_NULL) { + /* XXX: It's messy that some h2_errors have reason + * SC_NULL, which is just WRONG() wrt to SES_Delete(). */ + reason = SC_REM_CLOSE; } - h2->cond = NULL; - assert(h2->refcnt == 1); - h2_del_req(wrk, h2->req0); - h2_del_sess(wrk, h2, h2->error->reason); + h2_del_sess(wrk, h2, reason); wrk->vsl = NULL; } @@ -489,7 +505,7 @@ struct transport HTTP2_transport = { .deliver = h2_deliver, .minimal_response = h2_minimal_response, .new_session = h2_new_session, - .req_body = h2_req_body, + .req_body = h2_reqbody, .req_fail = h2_req_fail, .sess_panic = h2_sess_panic, .poll = h2_poll, diff --git a/bin/varnishtest/tests/f00007.vtc b/bin/varnishtest/tests/f00007.vtc index e982548a03..7976362def 100644 --- a/bin/varnishtest/tests/f00007.vtc +++ b/bin/varnishtest/tests/f00007.vtc @@ -62,6 +62,7 @@ client c3 { stream 1 { txreq -req POST -url /3 -hdr "content-length" "1" -nostrend txdata -data "A" -nostrend + rxwinup delay 0.5 txdata -data "GET /FAIL HTTP/1.1\r\n\r\n" rxrst diff --git a/bin/varnishtest/tests/r02387.vtc b/bin/varnishtest/tests/r02387.vtc index d2c9796e71..3d9dab7f45 100644 --- a/bin/varnishtest/tests/r02387.vtc +++
b/bin/varnishtest/tests/r02387.vtc @@ -11,8 +11,8 @@ varnish v1 -cliok "param.set feature +http2" varnish v1 -cliok "param.set debug +syncvsl" -barrier b1 cond 2 -barrier b2 cond 2 +barrier b1 cond 3 +barrier b2 cond 3 client c1 { stream 1 { @@ -27,7 +27,16 @@ client c1 { barrier b1 sync txcont -hdr "bar" "foo" - } -run + } -start + + barrier b2 sync + non_fatal + barrier b1 sync + + stream 1 -wait + stream 3 -wait + fatal + stream 0 { rxgoaway expect goaway.laststream == "1" diff --git a/bin/varnishtest/tests/r02679.vtc b/bin/varnishtest/tests/r02679.vtc index 590dfb264c..d3707151d7 100644 --- a/bin/varnishtest/tests/r02679.vtc +++ b/bin/varnishtest/tests/r02679.vtc @@ -22,7 +22,9 @@ client c1 { stream 1 { txreq -req POST -hdr "content-length" "31469" -nostrend txdata -datalen 1550 -nostrend + rxwinup txdata -datalen 16000 -nostrend + rxwinup txdata -datalen 13919 rxresp expect resp.status == 200 diff --git a/bin/varnishtest/tests/r02923.vtc b/bin/varnishtest/tests/r02923.vtc index 324f20cff6..537812eb22 100644 --- a/bin/varnishtest/tests/r02923.vtc +++ b/bin/varnishtest/tests/r02923.vtc @@ -34,9 +34,6 @@ varnish v1 -vcl+backend { } -start client c1 { - txpri - stream 0 rxsettings -run - stream 1 { txreq -url /sync rxresp diff --git a/bin/varnishtest/tests/r02937.vtc b/bin/varnishtest/tests/r02937.vtc index 8a2d00d58b..11dbf32330 100644 --- a/bin/varnishtest/tests/r02937.vtc +++ b/bin/varnishtest/tests/r02937.vtc @@ -21,5 +21,20 @@ client c1 { expect resp.http.upgrade == h2c expect resp.http.connection == Upgrade txpri + + stream 0 { + rxsettings + rxgoaway + expect goaway.err == ENHANCE_YOUR_CALM + expect goaway.laststream == 1 + } -start + + stream 1 { + rxrst + } -start + + stream 0 -wait + stream 1 -wait + expect_close } -run diff --git a/bin/varnishtest/tests/t02003.vtc b/bin/varnishtest/tests/t02003.vtc index fe30e82437..fab076273b 100644 --- a/bin/varnishtest/tests/t02003.vtc +++ b/bin/varnishtest/tests/t02003.vtc @@ -36,7 +36,7 @@ varnish v1 -expect 
MEMPOOL.sess1.live == 0 ####################################################################### # Test reverse order stream numbers -client c1 { +client c2 { stream 0 { rxgoaway expect goaway.laststream == 3 @@ -61,7 +61,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test WINDOW_UPDATE error conditions -client c1 { +client c3 { stream 1 { txreq -nostrend txwinup -size 0 @@ -92,7 +92,7 @@ client c1 { } -run stream 0 -wait } -run -client c1 { +client c4 { stream 0 { txwinup -size 0x40000000 txwinup -size 0x40000000 @@ -102,7 +102,7 @@ client c1 { } -run } -run -client c1 { +client c5 { stream 1 { txreq rxresp @@ -123,7 +123,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test PING error conditions -client c1 { +client c6 { stream 0 { txping -ack -data "FOOBAR42" rxgoaway @@ -132,7 +132,7 @@ client c1 { } -run } -run -client c1 { +client c7 { stream 0 { sendhex "000008 06 80 00000001 0102030405060708" rxgoaway @@ -141,7 +141,7 @@ client c1 { } -run } -run -client c1 { +client c8 { stream 0 { sendhex "000007 06 80 00000000 01020304050607" rxgoaway @@ -160,7 +160,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test PUSH_PROMISE error conditions -client c1 { +client c9 { stream 0 { rxgoaway expect goaway.err == PROTOCOL_ERROR @@ -173,7 +173,7 @@ client c1 { stream 0 -wait } -run -client c1 { +client c10 { stream 0 { rxgoaway expect goaway.err == PROTOCOL_ERROR @@ -198,7 +198,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test RST_STREAM error conditions -client c1 { +client c11 { stream 0 { # RST idle stream sendhex "000004 03 00 00000007 00000008" @@ -208,7 +208,7 @@ client c1 { } -run } -run -client c1 { +client c12 { stream 0 { rxgoaway expect goaway.err == FRAME_SIZE_ERROR 
@@ -222,7 +222,7 @@ client c1 { stream 0 -wait } -run -client c1 { +client c13 { stream 0 { # RST stream zero sendhex "000000 03 00 00000000 00000008" @@ -232,8 +232,11 @@ client c1 { } -run } -run -client c1 { +barrier b14 cond 2 +client c14 { stream 0 { + barrier b14 sync + txgoaway rxgoaway expect goaway.err == NO_ERROR expect goaway.laststream == 3 @@ -246,11 +249,15 @@ client c1 { txreq -nostrend txrst -err 0x666 } -run + barrier b14 sync stream 0 -wait } -run -client c1 { +barrier b15 cond 2 +client c15 { stream 0 { + barrier b15 sync + txgoaway rxgoaway expect goaway.err == NO_ERROR expect goaway.laststream == 1 @@ -263,6 +270,7 @@ client c1 { # RST_STREAM on closed stream txrst } -run + barrier b15 sync stream 0 -wait } -run @@ -277,7 +285,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test SETTING error conditions -client c1 { +client c16 { stream 0 { # SETTING ACK with data sendhex "000001 04 01 00000000 aa" @@ -287,7 +295,7 @@ client c1 { } -run } -run -client c1 { +client c17 { stream 0 { # SETTING ACK with bad length sendhex "000001 04 00 00000000 aa" @@ -296,7 +304,7 @@ client c1 { expect goaway.laststream == 0 } -run } -run -client c1 { +client c18 { stream 0 { # SETTING ACK with bad value txsettings -winsize 0x80000000 @@ -306,7 +314,7 @@ client c1 { } -run } -run -client c1 { +client c19 { stream 0 { # SETTING unknown value sendhex "000006 04 00 00000000 ffff00000000" @@ -326,16 +334,19 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test GOAWAY error conditions -client c1 { +client c20 { stream 0 { txgoaway -err 2 + rxgoaway + expect goaway.err == NO_ERROR } -run expect_close } -run -client c1 { +client c21 { stream 0 { txgoaway -err 2222 + rxgoaway } -run expect_close } -run @@ -351,7 +362,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 
####################################################################### # Test HEADERS error conditions -client c1 { +client c22 { stream 1 { txreq -nostrend txreq -nostrend @@ -362,7 +373,7 @@ client c1 { expect_close } -run -client c1 { +client c23 { stream 0 { sendhex 00000c sendhex 01 @@ -376,7 +387,7 @@ client c1 { } -run } -run -client c1 { +client c24 { stream 0 { sendhex 000012 sendhex 01 @@ -388,7 +399,7 @@ client c1 { } -run } -run -client c1 { +client c25 { stream 1 { txreq -hdr ":bla" "foo" rxrst @@ -398,7 +409,7 @@ client c1 { #2349: Padding exceeds frame size -client c1 { +client c26 { stream 1 { sendhex 000001 sendhex 01 @@ -415,7 +426,7 @@ client c1 { } -run #2349: Padding equal to frame size -client c1 { +client c27 { stream 1 { sendhex 000001 sendhex 01 @@ -432,7 +443,7 @@ client c1 { } -run #2349: Integer underrun may also occur when the priority flag is set -client c1 { +client c28 { stream 1 { sendhex 000004 sendhex 01 @@ -458,7 +469,7 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test CONTINUATION error conditions -client c1 { +client c29 { stream 1 { txreq -nostrend txcont -hdr "bar" "foo" @@ -469,7 +480,7 @@ client c1 { expect_close } -run -client c1 { +client c30 { stream 0 { sendhex 000014 sendhex 01 @@ -489,7 +500,7 @@ client c1 { } -run } -run -client c1 { +client c31 { stream 1 { txreq -nohdrend txcont -hdr "bar" "foo" @@ -499,7 +510,7 @@ client c1 { } -run # 2350: Don't accept a continuation frame after stream is closed -client c1 { +client c32 { stream 1 { txreq rxresp @@ -522,25 +533,26 @@ varnish v1 -expect MEMPOOL.sess1.live == 0 ####################################################################### # Test DATA error conditions -client c1 { +client c33 { stream 1 { txdata -data "FOOBAR" } -run stream 0 { rxgoaway + expect goaway.err == PROTOCOL_ERROR } -run expect_close } -run -client c1 { +client c34 { stream 1 { txreq rxresp txdata -data "FOOBAR" } -run - 
stream 3 { - txreq - rxresp + stream 0 { + rxgoaway + expect goaway.err == PROTOCOL_ERROR } -run } -run diff --git a/bin/varnishtest/tests/t02005.vtc b/bin/varnishtest/tests/t02005.vtc index 39737f93a6..10f54ded8d 100644 --- a/bin/varnishtest/tests/t02005.vtc +++ b/bin/varnishtest/tests/t02005.vtc @@ -31,7 +31,7 @@ varnish v1 -vcl+backend { varnish v1 -cliok "param.set debug +syncvsl" logexpect l1 -v v1 -g raw { - expect * 1001 ReqAcct "80 7 87 78 8 86" + expect * 1001 ReqAcct "160 7 167 78 16 94" expect * 1000 ReqAcct "45 8 53 63 34 97" } -start diff --git a/bin/varnishtest/tests/t02008.vtc b/bin/varnishtest/tests/t02008.vtc index 75cee513fa..4a7f096d78 100644 --- a/bin/varnishtest/tests/t02008.vtc +++ b/bin/varnishtest/tests/t02008.vtc @@ -28,6 +28,8 @@ client c1 { } -run stream 0 { txgoaway -err 2 + rxgoaway + expect goaway.err == NO_ERROR } -run expect_close } -run diff --git a/bin/varnishtest/tests/t02011.vtc b/bin/varnishtest/tests/t02011.vtc index 88c64d9045..0d25eb6f2c 100644 --- a/bin/varnishtest/tests/t02011.vtc +++ b/bin/varnishtest/tests/t02011.vtc @@ -43,9 +43,6 @@ varnish v1 -vcl+backend { } -start client c1 { - txpri - stream 0 rxsettings -run - stream 1 { txreq -hdr should sync barrier b1 sync diff --git a/bin/varnishtest/tests/t02015.vtc b/bin/varnishtest/tests/t02015.vtc index 6e59dc7abc..860e79da38 100644 --- a/bin/varnishtest/tests/t02015.vtc +++ b/bin/varnishtest/tests/t02015.vtc @@ -14,17 +14,12 @@ varnish v1 -vcl+backend { } -start logexpect l1 -v v1 -g raw -q ReqAcct { - expect ? 1001 ReqAcct "46 0 46 69 12345 12414" - expect ? 1003 ReqAcct "46 0 46 74 1000 1074" + expect ? 1001 ReqAcct "92 0 92 69 24690 24759" + expect ? 
1003 ReqAcct "92 0 92 74 13345 13419" } -start client c1 { - txpri - stream 0 { - rxsettings - expect settings.ack == false - txsettings -ack txsettings -winsize 1000 rxsettings expect settings.ack == true diff --git a/bin/varnishtest/tests/t02016.vtc b/bin/varnishtest/tests/t02016.vtc index 1e5a7dc8ae..7e8dd5a094 100644 --- a/bin/varnishtest/tests/t02016.vtc +++ b/bin/varnishtest/tests/t02016.vtc @@ -6,6 +6,8 @@ server s1 { } -start varnish v1 -cliok "param.set feature +http2" +varnish v1 -cliok "param.set debug +syncvsl" +varnish v1 -cliok "param.set timeout_idle 10" varnish v1 -vcl+backend { sub vcl_recv { if (req.url ~ "synth") { @@ -23,12 +25,7 @@ logexpect l1 -v v1 { } -start client c1 { - txpri - stream 0 { - rxsettings - expect settings.ack == false - txsettings -ack txsettings -winsize 1000 rxsettings expect settings.ack == true @@ -60,12 +57,7 @@ logexpect l2 -v v1 { } -start client c2 { - txpri - stream 0 { - rxsettings - expect settings.ack == false - txsettings -ack txsettings -winsize 1000 rxsettings expect settings.ack == true @@ -101,12 +93,7 @@ logexpect l3 -v v1 { } -start client c3 { - txpri - stream 0 { - rxsettings - expect settings.ack == false - txsettings -ack txsettings -winsize 1000 rxsettings expect settings.ack == true diff --git a/bin/varnishtest/tests/t02020.vtc b/bin/varnishtest/tests/t02020.vtc index e2bcb76f43..e12a5c18e5 100644 --- a/bin/varnishtest/tests/t02020.vtc +++ b/bin/varnishtest/tests/t02020.vtc @@ -1,6 +1,6 @@ varnishtest "H/2 received data frames with padding" -barrier b1 sock 3 +barrier b1 sock 2 server s1 { rxreq @@ -8,6 +8,7 @@ server s1 { expect req.body == abcde txresp rxreq + expect req.bodylen == 81500 txresp rxreq txresp @@ -48,31 +49,20 @@ client c2 { # by unblocking the client thread stuck in vcl_recv. From that # point on window updates will also be sent on the stream. 
- stream 0 { - rxwinup - rxwinup - rxwinup - rxwinup - barrier b1 sync - } -start - stream 3 { - txreq -req POST -url /3 -hdr "content-length" "131072" -nostrend - txdata -datalen 16300 -padlen 83 -nostrend - txdata -datalen 16300 -padlen 83 -nostrend - txdata -datalen 16300 -padlen 83 -nostrend + txreq -req POST -url /3 -hdr "content-length" "81500" -nostrend + loop 3 { + txdata -datalen 16300 -padlen 83 -nostrend + rxwinup + expect winup.size == 84 + } txdata -datalen 16300 -padlen 82 -nostrend - barrier b1 sync - rxwinup - txdata -datalen 16300 -padlen 83 -nostrend rxwinup - txdata -datalen 16300 -padlen 83 -nostrend - rxwinup - txdata -datalen 16300 -padlen 83 -nostrend - rxwinup - txdata -datalen 16300 -padlen 83 -nostrend + expect winup.size == 83 + barrier b1 sync rxwinup - txdata -datalen 672 + expect winup.size == 65200 + txdata -datalen 16300 -padlen 83 rxresp expect resp.status == 200 } -start diff --git a/bin/varnishtest/tests/t02023.vtc b/bin/varnishtest/tests/t02023.vtc index bdc722ce3a..039cc5f4ae 100644 --- a/bin/varnishtest/tests/t02023.vtc +++ b/bin/varnishtest/tests/t02023.vtc @@ -14,58 +14,43 @@ client c1 { expect resp.status == 400 } -run -client c1 { +client c2 { txreq -req "" rxresp expect resp.status == 400 } -run -client c1 { +client c3 { txreq -proto "" rxresp expect resp.status == 400 } -run -client c1 { +client c4 { stream 1 { txreq -url "" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c5 { stream 1 { txreq -scheme "" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c6 { stream 1 { txreq -req "" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c7 { stream 1 { txreq -hdr "empty" "" rxresp @@ -80,118 +65,74 @@ client c1 { varnish v1 -vsl_catchup -client c1 { +client 
c8 { stream 1 { txreq -hdr "foo" " bar" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c9 { stream 1 { txreq -hdr "foo" " " rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c10 { stream 1 { txreq -hdr ":foo" "bar" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c11 { stream 1 { txreq -hdr "foo" "b\x0car" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c12 { stream 1 { txreq -hdr "f o" "bar" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c13 { stream 1 { txreq -hdr "f: o" "bar" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c14 { stream 1 { txreq -hdr "foo" "bar " rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c15 { stream 1 { txreq -hdr "foo" " bar" rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - expect resp.status == 200 - } -run } -run -client c1 { +client c16 { stream 1 { txreq -hdr "foo" "bar " rxrst expect rst.err == PROTOCOL_ERROR } -run - stream 3 { - txreq - rxresp - } -run } -run diff --git a/bin/varnishtest/tests/t02027.vtc b/bin/varnishtest/tests/t02027.vtc index 5bc7b48160..835913b3b4 100644 --- a/bin/varnishtest/tests/t02027.vtc +++ b/bin/varnishtest/tests/t02027.vtc @@ -10,8 +10,7 @@ varnish v1 -arg "-p feature=+http2" -arg "-p debug=+syncvsl" -vcl { logexpect l0 -v v1 -g vxid -q "Begin ~ sess" { fail add * SessError expect * * Debug {^H2: Got pu PRISM} - expect 0 = Debug {^H2: 
HTC eof.*frame=complete goaway=0} - expect 0 = Debug {^H2 CLEANUP H2CE_NO_ERROR} + expect 0 = Debug {^H2: HTC eof .* frame=complete} expect 0 = ReqAcct {^0 0 0 18 26 44} expect 0 = SessClose {^REM_CLOSE} expect 0 = End @@ -22,18 +21,17 @@ logexpect l0 -v v1 -g vxid -q "Begin ~ sess" { client c0 { txpri shutdown -write - stream 0 { + stream 0 { rxsettings - rxgoaway - expect goaway.laststream == 0 - expect goaway.err == NO_ERROR + rxgoaway + expect goaway.laststream == 0 + expect goaway.err == NO_ERROR } -run } -run logexpect l1 -v v1 -g vxid -q "Begin ~ sess" { fail add * SessError - expect * * Debug {^H2: HTC eof.*frame=complete goaway=0} - expect 0 = Debug {^H2 CLEANUP H2CE_NO_ERROR} + expect * * Debug {^H2: HTC eof .* frame=complete} expect 9 = ReqAcct {^27 0 27 27 26 53} expect 0 = SessClose {^REM_CLOSE} expect 0 = End @@ -46,17 +44,16 @@ client c1 { txreq -nohdrend } -run shutdown -write - stream 0 { - rxgoaway - expect goaway.laststream == 1 - expect goaway.err == NO_ERROR + stream 0 { + rxgoaway + expect goaway.laststream == 1 + expect goaway.err == NO_ERROR } -run } -run logexpect l2 -v v1 -g vxid -q "Begin ~ sess" { fail add * SessError - expect * * Debug {^H2: HTC eof.*frame=complete goaway=0} - expect 0 = Debug {^H2 CLEANUP H2CE_NO_ERROR} + expect * * Debug {^H2: HTC eof .* frame=complete} expect 9 = ReqAcct {^27 0 27 27 26 53} expect 0 = SessClose {^REM_CLOSE} expect 0 = End @@ -69,17 +66,16 @@ client c2 { txreq -nostrend } -run shutdown -write - stream 0 { - rxgoaway - expect goaway.laststream == 1 - expect goaway.err == NO_ERROR + stream 0 { + rxgoaway + expect goaway.laststream == 1 + expect goaway.err == NO_ERROR } -run } -run logexpect l3 -v v1 -g vxid -q "Begin ~ sess" { fail add * SessError - expect * * Debug {^H2: HTC eof.*frame=partial goaway=0} - expect 0 = Debug {^H2 CLEANUP H2CE_NO_ERROR} + expect * * Debug {^H2: HTC eof .* frame=partial} expect 0 = ReqAcct {^18 0 18 27 26 53} expect 0 = SessClose {^REM_CLOSE} expect 0 = End @@ -89,10 +85,10 
@@ logexpect l3 -v v1 -g vxid -q "Begin ~ sess" { # middle of frame client c3 { stream 1 { - # +- 01 END_STREAM + # +- 01 END_STREAM # +- 04 END_HEADERS - # | - # len ty fl strmid + # | + # len ty fl strmid sendhex { 000024 01 05 00000001 00053a70617468012f00073a6d6574686f640347455400073a736368656d6504687474 @@ -101,10 +97,10 @@ client c3 { # 00053a70617468012f00073a6d6574686f640347455400073a736368656d650468747470 } -run shutdown -write - stream 0 { - rxgoaway - expect goaway.laststream == 0 - expect goaway.err == NO_ERROR + stream 0 { + rxgoaway + expect goaway.laststream == 0 + expect goaway.err == NO_ERROR } -run } -run diff --git a/bin/varnishtest/tests/t02028.vtc b/bin/varnishtest/tests/t02028.vtc new file mode 100644 index 0000000000..cec3dd58f2 --- /dev/null +++ b/bin/varnishtest/tests/t02028.vtc @@ -0,0 +1,17 @@ +varnishtest "Bad preface: no SETTINGS frame first" + +varnish v1 -cliok "param.set feature +http2" +varnish v1 -cliok "param.set debug +syncvsl" +varnish v1 -vcl "backend default none;" -start + +client c1 { + txpri + + stream 0 { + rxsettings + expect settings.ack == false + txsettings -ack + rxgoaway + expect goaway.err == PROTOCOL_ERROR + } -run +} -run diff --git a/configure.ac b/configure.ac index 2483dfeac2..4ff0d3eb65 100644 --- a/configure.ac +++ b/configure.ac @@ -483,6 +483,8 @@ else ac_cv_func_port_create=no fi +AC_CHECK_FUNCS([eventfd]) + # --with-persistent-storage AC_ARG_WITH(persistent-storage, AS_HELP_STRING([--with-persistent-storage], diff --git a/include/Makefile.am b/include/Makefile.am index 347724d70a..a44b32ccbd 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -95,6 +95,7 @@ nobase_noinst_HEADERS = \ vcs_version.h \ vct.h \ vcurses.h \ + vefd.h \ venc.h \ vend.h \ vev.h \ diff --git a/include/tbl/h2_error.h b/include/tbl/h2_error.h index adfbbde422..ceffc41904 100644 --- a/include/tbl/h2_error.h +++ b/include/tbl/h2_error.h @@ -206,6 +206,24 @@ H2_ERROR( /* reason */ SC_NULL, /* descr */ "HTTP/2 header list 
exceeded http_req_size" ) + +H2_ERROR( + /* name */ SEND_TIMEOUT, + /* val */ 8, /* CANCEL */ + /* types */ 2, + /* goaway */ 0, + /* reason */ SC_NULL, + /* descr */ "send timeout" +) + +H2_ERROR( + /* name */ IO_ERROR, + /* val */ 0, + /* types */ 1, + /* goaway */ 1, + /* reason */ SC_REM_CLOSE, + /* descr */ "socket error" +) # undef H2_CUSTOM_ERRORS #endif diff --git a/include/tbl/h2_frames.h b/include/tbl/h2_frames.h index 2b1e2c04f5..52a987b9b2 100644 --- a/include/tbl/h2_frames.h +++ b/include/tbl/h2_frames.h @@ -138,17 +138,11 @@ #ifdef H2_FRAME_FLAGS /* lower, upper, flag */ H2_FRAME_FLAGS(none, NONE, 0x00) - H2_FRAME_FLAGS(data_end_stream, DATA_END_STREAM, 0x01) - H2_FRAME_FLAGS(data_padded, DATA_PADDED, 0x08) - H2_FRAME_FLAGS(headers_end_stream, HEADERS_END_STREAM, 0x01) - H2_FRAME_FLAGS(headers_end_headers, HEADERS_END_HEADERS, 0x04) - H2_FRAME_FLAGS(headers_padded, HEADERS_PADDED, 0x08) - H2_FRAME_FLAGS(headers_priority, HEADERS_PRIORITY, 0x20) - H2_FRAME_FLAGS(settings_ack, SETTINGS_ACK, 0x01) - H2_FRAME_FLAGS(push_promise_end_headers,PUSH_PROMISE_END_HEADERS, 0x04) - H2_FRAME_FLAGS(push_promise_padded, PUSH_PROMISE_PADDED, 0x08) - H2_FRAME_FLAGS(ping_ack, PING_ACK, 0x01) - H2_FRAME_FLAGS(continuation_end_headers,CONTINUATION_END_HEADERS, 0x04) + H2_FRAME_FLAGS(ack, ACK, 0x01) + H2_FRAME_FLAGS(end_stream, END_STREAM, 0x01) + H2_FRAME_FLAGS(end_headers, END_HEADERS, 0x04) + H2_FRAME_FLAGS(padded, PADDED, 0x08) + H2_FRAME_FLAGS(priority, PRIORITY, 0x20) #undef H2_FRAME_FLAGS #endif diff --git a/include/vefd.h b/include/vefd.h new file mode 100644 index 0000000000..5f41bada1e --- /dev/null +++ b/include/vefd.h @@ -0,0 +1,50 @@ +/*- + * Copyright (c) 2025 Varnish Software AS + * All rights reserved. + * + * Author: Dridi Boukelmoune + * + * SPDX-License-Identifier: BSD-2-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ */
+
+struct vefd {
+	unsigned		magic;
+#define VEFD_MAGIC		0x1548c1a6
+	int			poll_fd;	/* eventfd, or pipe read end */
+	int			priv_fd;	/* pipe write end, -1 with eventfd */
+};
+
+#define VEFD_INIT(vefd)					\
+	do {						\
+		INIT_OBJ(vefd, VEFD_MAGIC);		\
+		(vefd)->poll_fd = -1;			\
+		(vefd)->priv_fd = -1;			\
+	} while (0)
+
+int VEFD_Open(struct vefd *);
+int VEFD_Signal(struct vefd *);
+int VEFD_Clear(struct vefd *);
+int VEFD_Close(struct vefd *);
diff --git a/lib/libvarnish/Makefile.am b/lib/libvarnish/Makefile.am
index 2210911107..1e7e984b8b 100644
--- a/lib/libvarnish/Makefile.am
+++ b/lib/libvarnish/Makefile.am
@@ -23,6 +23,7 @@ libvarnish_la_SOURCES = \
 	vcli_serve.c \
 	vct.c \
 	venc.c \
+	vefd.c \
 	version.c \
 	vev.c \
 	vfil.c \
diff --git a/lib/libvarnish/vefd.c b/lib/libvarnish/vefd.c
new file mode 100644
index 0000000000..8be1c70311
--- /dev/null
+++ b/lib/libvarnish/vefd.c
@@ -0,0 +1,209 @@
+/*-
+ * Copyright (c) 2025 Varnish Software AS
+ * All rights reserved.
+ *
+ * Author: Dridi Boukelmoune
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include "config.h"
+
+#if HAVE_EVENTFD
+# include
+#else
+# include
+#endif
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * A vefd is a descriptor used to deliver wake-ups: VEFD_Signal() makes
+ * poll_fd readable and VEFD_Clear() consumes what was signaled.  It is
+ * backed by eventfd(2) when available, by a non-blocking pipe otherwise.
+ */
+
+#if HAVE_EVENTFD
+/*
+ * Create the eventfd.  Returns the new descriptor (>= 0) on success and
+ * -1 on failure.  The pipe variant returns 0 on success instead, so the
+ * portable check is "< 0".
+ */
+int
+VEFD_Open(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd == -1);
+	assert(vefd->priv_fd == -1);
+
+	vefd->poll_fd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+	return (vefd->poll_fd);
+}
+
+/* Add one to the eventfd counter; anything but a full write is fatal. */
+int
+VEFD_Signal(struct vefd *vefd)
+{
+	uint64_t buf = 1;	/* eventfd(2) counters are 8-byte unsigned */
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+	/* The syscall stays outside assert() so it can't be compiled out */
+	r = write(vefd->poll_fd, &buf, sizeof buf);
+	assert(r == (ssize_t)sizeof buf);
+	return (0);
+}
+
+/*
+ * Read the counter, which resets it.  The descriptor is non-blocking, so
+ * this must only be called once it has been signaled: an empty read fails
+ * and trips the assertion.
+ */
+int
+VEFD_Clear(struct vefd *vefd)
+{
+	uint64_t buf;
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+	/* The syscall stays outside assert() so it can't be compiled out */
+	r = read(vefd->poll_fd, &buf, sizeof buf);
+	assert(r == (ssize_t)sizeof buf);
+	return (0);
+}
+
+/* Close the eventfd. */
+int
+VEFD_Close(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd == -1);
+	closefd(&vefd->poll_fd);
+	return (0);
+}
+#else /* !HAVE_EVENTFD */
+/*
+ * Create the pipe: poll_fd is the read end and priv_fd the write end, both
+ * close-on-exec and non-blocking.  Returns 0 on success, -1 if pipe(2)
+ * fails.
+ */
+int
+VEFD_Open(struct vefd *vefd)
+{
+	int fd[2];
+	int i;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd == -1);
+	assert(vefd->priv_fd == -1);
+
+	if (pipe(fd) < 0)
+		return (-1);
+
+	/*
+	 * Close-on-exec is a descriptor flag and needs F_SETFD/FD_CLOEXEC:
+	 * F_SETFL only changes status flags and ignores O_CLOEXEC.
+	 */
+	for (i = 0; i < 2; i++) {
+		AZ(fcntl(fd[i], F_SETFD, FD_CLOEXEC));
+		AZ(fcntl(fd[i], F_SETFL, O_NONBLOCK));
+	}
+	vefd->poll_fd = fd[0];
+	vefd->priv_fd = fd[1];
+	return (0);
+}
+
+/*
+ * Queue one byte on the pipe.  A full pipe (EAGAIN/EWOULDBLOCK) is not an
+ * error: unread bytes are already pending for the reader.
+ */
+int
+VEFD_Signal(struct vefd *vefd)
+{
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+	r = write(vefd->priv_fd, "", 1);
+	if (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
+		return (-1);
+	return (0);
+}
+
+/*
+ * Drain the pipe until it would block.  Returns -1 only if read(2) fails
+ * with something other than EAGAIN/EWOULDBLOCK.
+ */
+int
+VEFD_Clear(struct vefd *vefd)
+{
+	char buf[64];
+	ssize_t r;
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+	do {
+		r = read(vefd->poll_fd, buf, sizeof buf);
+	} while (r > 0);
+	/* errno is only meaningful if the last read() actually failed */
+	if (r < 0 && errno != EAGAIN && errno != EWOULDBLOCK)
+		return (-1);
+	return (0);
+}
+
+/* Close both ends of the pipe. */
+int
+VEFD_Close(struct vefd *vefd)
+{
+
+	CHECK_OBJ_NOTNULL(vefd, VEFD_MAGIC);
+	assert(vefd->poll_fd >= 0);
+	assert(vefd->priv_fd >= 0);
+	assert(vefd->poll_fd != vefd->priv_fd);
+	closefd(&vefd->poll_fd);
+	closefd(&vefd->priv_fd);
+	return (0);
+}
+#endif /* HAVE_EVENTFD */