From 142c6f8a4f7bc4c639968633f0941449630dd39a Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 14 May 2026 17:51:14 -0400 Subject: [PATCH 1/4] block: add task-context bio completion infrastructure Some bio completion handlers need to run from preemptible task context, but bio_endio() may be called from IRQ context (e.g., buffer_head writeback). Callers need a way to ensure their callback eventually runs from a sleepable context. Add infrastructure for that, in two forms: 1. BIO_COMPLETE_IN_TASK, a bio flag the submitter sets when it knows in advance that its callback needs task context (e.g., dropbehind writeback). bio_endio() sees the flag and offloads completion to a worker automatically. 2. bio_complete_in_task(), a helper that completion callbacks can invoke from within bi_end_io() when the deferral decision is dynamic (e.g., fserror reporting). Both share a per-CPU batch list drained by a delayed work item on a WQ_PERCPU workqueue. Producers push the bio onto the local CPU's batch and schedule the work item, which then dispatches each bio's bi_end_io() from task context. The delayed work item uses a 1-jiffie delay to allow batches of completions to accumulate before processing. Both methods are gated on bio_in_atomic(), which returns true in any context where a sleeping bi_end_io() is unsafe, including non-preemptible task context. This logic is copied from commit c99fab6e80b7 ("erofs: fix atomic context detection when !CONFIG_DEBUG_LOCK_ALLOC"). Two CPU hotplug callbacks are used to drain remaining bios from the departing CPU's batch, while maintaining the per-CPU behavior. The CPUHP_AP_ONLINE_DYN callback disables the per-CPU delayed work while the CPU is still online, preventing it from running on an unbound worker later. CPUHP_BP_PREPARE_DYN then drains any bios added between disabling the work item and CPU offline. 
Link: https://lore.kernel.org/all/20260409160243.1008358-1-hch@lst.de/ Suggested-by: Matthew Wilcox Suggested-by: Christoph Hellwig Signed-off-by: Tal Zussman --- block/bio.c | 147 +++++++++++++++++++++++++++++++++++++- include/linux/bio.h | 32 +++++++++ include/linux/blk_types.h | 1 + 3 files changed, 179 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 5f10900b3f42a..6da8e463df68a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "blk.h" @@ -1720,6 +1721,79 @@ void bio_check_pages_dirty(struct bio *bio) } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); +/* + * Infrastructure for deferring bio completions to task-context via a per-CPU + * workqueue. Triggered either by the BIO_COMPLETE_IN_TASK bio flag (static + * decision at submit time) or by calling bio_complete_in_task() from + * bi_end_io() (dynamic decision at completion time). + */ + +struct bio_complete_batch { + local_lock_t lock; + struct bio_list list; + struct delayed_work work; + int cpu; +}; + +static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = { + .lock = INIT_LOCAL_LOCK(lock), +}; +static struct workqueue_struct *bio_complete_wq; + +static void bio_complete_work_fn(struct work_struct *w) +{ + struct delayed_work *dw = to_delayed_work(w); + struct bio_complete_batch *batch = + container_of(dw, struct bio_complete_batch, work); + + while (1) { + struct bio_list list; + struct bio *bio; + + local_lock_irq(&bio_complete_batch.lock); + list = batch->list; + bio_list_init(&batch->list); + local_unlock_irq(&bio_complete_batch.lock); + + if (bio_list_empty(&list)) + break; + + while ((bio = bio_list_pop(&list))) + bio->bi_end_io(bio); + + if (need_resched()) { + bool is_empty; + + local_lock_irq(&bio_complete_batch.lock); + is_empty = bio_list_empty(&batch->list); + local_unlock_irq(&bio_complete_batch.lock); + if (!is_empty) + mod_delayed_work_on(batch->cpu, + bio_complete_wq, + &batch->work, 0); + 
+			break;
+		}
+	}
+}
+
+void __bio_complete_in_task(struct bio *bio)
+{
+	struct bio_complete_batch *batch;
+	unsigned long flags;
+	bool was_empty;
+
+	local_lock_irqsave(&bio_complete_batch.lock, flags);
+	batch = this_cpu_ptr(&bio_complete_batch);
+	was_empty = bio_list_empty(&batch->list);
+	bio_list_add(&batch->list, bio);
+	local_unlock_irqrestore(&bio_complete_batch.lock, flags);
+
+	if (was_empty)	/* only the first bio of a batch arms the work */
+		mod_delayed_work_on(batch->cpu, bio_complete_wq,
+				    &batch->work, 1);	/* 1 jiffy: let a batch build up */
+}
+EXPORT_SYMBOL_GPL(__bio_complete_in_task);
+
 static inline bool bio_remaining_done(struct bio *bio)
 {
 	/*
@@ -1794,7 +1868,17 @@ void bio_endio(struct bio *bio)
 	}
 #endif
 
-	if (bio->bi_end_io)
+	/*
+	 * Nothing to call and nothing to defer. Checked first so that a bio
+	 * without a completion handler is never handed to the worker, which
+	 * calls ->bi_end_io() unconditionally.
+	 */
+	if (!bio->bi_end_io)
+		return;
+
+	if (bio_flagged(bio, BIO_COMPLETE_IN_TASK) && bio_in_atomic())
+		__bio_complete_in_task(bio);
+	else
 		bio->bi_end_io(bio);
 }
 EXPORT_SYMBOL(bio_endio);
@@ -1980,6 +2064,63 @@ int bioset_init(struct bio_set *bs,
 }
 EXPORT_SYMBOL(bioset_init);
 
+static int bio_complete_batch_cpu_online(unsigned int cpu)
+{
+	struct bio_complete_batch *batch =
+		per_cpu_ptr(&bio_complete_batch, cpu);
+
+	/*
+	 * Bios can be batched while the work item is disabled, e.g. when a CPU
+	 * offline attempt fails after bio_complete_batch_cpu_down_prep() ran
+	 * and is rolled back through this callback. Producers only schedule
+	 * the work on the empty -> non-empty transition, so without a kick
+	 * here those bios, and all later ones on this CPU, would never be
+	 * completed. Running the work on an empty list is harmless.
+	 */
+	if (enable_delayed_work(&batch->work))
+		mod_delayed_work_on(cpu, bio_complete_wq, &batch->work, 0);
+	return 0;
+}
+
+/*
+ * Disable this CPU's delayed work so that it cannot run on an unbound worker
+ * after the CPU is offlined.
+ */
+static int bio_complete_batch_cpu_down_prep(unsigned int cpu)
+{
+	disable_delayed_work_sync(&per_cpu(bio_complete_batch, cpu).work);
+	return 0;
+}
+
+/*
+ * Drain a dead CPU's deferred bio completions. The CPU is dead and the worker
+ * is canceled so no locking is needed.
+ */ +static int bio_complete_batch_cpu_dead(unsigned int cpu) +{ + struct bio_complete_batch *batch = + per_cpu_ptr(&bio_complete_batch, cpu); + struct bio *bio; + + while ((bio = bio_list_pop(&batch->list))) + bio->bi_end_io(bio); + + return 0; +} + +static void __init bio_complete_batch_init(int cpu) +{ + struct bio_complete_batch *batch = + per_cpu_ptr(&bio_complete_batch, cpu); + + bio_list_init(&batch->list); + INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn); + batch->cpu = cpu; + + if (!cpu_online(cpu)) + disable_delayed_work_sync(&batch->work); +} + static int __init init_bio(void) { int i; @@ -1994,6 +2115,30 @@ static int __init init_bio(void) SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } + for_each_possible_cpu(i) + bio_complete_batch_init(i); + + bio_complete_wq = alloc_workqueue("bio_complete", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); + if (!bio_complete_wq) + panic("bio: can't allocate bio_complete workqueue\n"); + + /* + * bio task-context completion draining on hot-unplugged CPUs: + * + * 1. Stop the per-CPU delayed work while the CPU is still online, so + * that it cannot run on an unbound worker later. + * 2. Drain leftover bios added between worker disabling and CPU + * offlining. + */ + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "block/bio:complete:online", + bio_complete_batch_cpu_online, + bio_complete_batch_cpu_down_prep); + cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, + "block/bio:complete:dead", + NULL, bio_complete_batch_cpu_dead); + cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); diff --git a/include/linux/bio.h b/include/linux/bio.h index dc17780d6c1e3..7858be0ea1d40 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -369,6 +369,38 @@ static inline struct bio *bio_alloc(struct block_device *bdev, void submit_bio(struct bio *bio); +/** + * bio_in_atomic - check if the current context is unsafe for bio completion + * + * Return: %true in atomic contexts (e.g. 
hard/soft IRQ, preempt-disabled);
+ * %false when a bio can be safely completed in the current context.
+ */
+static inline bool bio_in_atomic(void)
+{
+	if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+		return true;	/* nested in a preemptible-RCU read-side section */
+	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+		return true;	/* no preempt count to consult: always report atomic */
+	return !preemptible();
+}
+
+void __bio_complete_in_task(struct bio *bio);
+
+/**
+ * bio_complete_in_task - ensure a bio is completed in preemptible task context
+ * @bio: bio to complete
+ *
+ * If bio_in_atomic() reports an atomic context, hand @bio to a worker that
+ * calls its bi_end_io() later and return %true. Else just return %false.
+ */
+static inline bool bio_complete_in_task(struct bio *bio)
+{
+	if (!bio_in_atomic())
+		return false;
+	__bio_complete_in_task(bio);
+	return true;
+}
+
 extern void bio_endio(struct bio *);
 
 static inline void bio_io_error(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c0..d49d97a050d0e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -322,6 +322,7 @@ enum {
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_PLUGGING,	/* bio handled through zone write plugging */
 	BIO_EMULATES_ZONE_APPEND,	/* bio emulates a zone append operation */
+	BIO_COMPLETE_IN_TASK,	/* complete bi_end_io() in task context */
 	BIO_FLAG_LAST
 };
 
From 4f5818487341e9bfd203594203f1dc2d5454bed2 Mon Sep 17 00:00:00 2001
From: Tal Zussman
Date: Thu, 14 May 2026 17:51:15 -0400
Subject: [PATCH 2/4] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback

Set BIO_COMPLETE_IN_TASK on iomap writeback bios when a dropbehind folio
is added. This ensures that bi_end_io runs in task context, where
folio_end_dropbehind() can safely invalidate folios.

With the bio layer now handling task-context deferral generically,
IOMAP_IOEND_DONTCACHE is no longer needed, as XFS no longer needs to
route DONTCACHE ioends through its completion workqueue. Remove the flag
and its NOMERGE entry.
Without the NOMERGE, regular I/Os that get merged with a dropbehind folio will also have their completion deferred to task context. Signed-off-by: Tal Zussman Reviewed-by: Christoph Hellwig --- fs/iomap/ioend.c | 5 +++-- fs/xfs/xfs_aops.c | 4 ---- include/linux/iomap.h | 5 +---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index acf3cf98b23a9..892dbfc77ae91 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -237,8 +237,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (wpc->iomap.flags & IOMAP_F_SHARED) ioend_flags |= IOMAP_IOEND_SHARED; - if (folio_test_dropbehind(folio)) - ioend_flags |= IOMAP_IOEND_DONTCACHE; if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; @@ -255,6 +253,9 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff)) goto new_ioend; + if (folio_test_dropbehind(folio)) + bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK); + /* * Clamp io_offset and io_size to the incore EOF so that ondisk * file size updates in the ioend completion are byte-accurate. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f279055fcea03..0dcf78beae8ac 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -511,10 +511,6 @@ xfs_ioend_needs_wq_completion( if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) return true; - /* Page cache invalidation cannot be done in irq context. 
*/ - if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) - return true; - return false; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2c5685adf3a97..fef04e01116f7 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -399,16 +399,13 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, #define IOMAP_IOEND_BOUNDARY (1U << 2) /* is direct I/O */ #define IOMAP_IOEND_DIRECT (1U << 3) -/* is DONTCACHE I/O */ -#define IOMAP_IOEND_DONTCACHE (1U << 4) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \ - IOMAP_IOEND_DONTCACHE) + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) /* * Structure for writeback I/O completions. From e95ea36e488f049d7a4aa59c70010710c48d274d Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 14 May 2026 17:51:16 -0400 Subject: [PATCH 3/4] buffer: add dropbehind writeback support Add block_write_begin_iocb() which threads the kiocb through to __filemap_get_folio() so that buffer_head-based I/O can use DONTCACHE behavior. When the iocb has IOCB_DONTCACHE set, FGP_DONTCACHE is passed to mark the folio for dropbehind. The existing block_write_begin() is preserved as a wrapper that passes a NULL iocb. Set BIO_COMPLETE_IN_TASK in submit_bh_wbc() when the folio has dropbehind set, so that buffer_head writeback completions get deferred to task context. Signed-off-by: Tal Zussman Reviewed-by: Christoph Hellwig --- fs/buffer.c | 19 +++++++++++++++++-- include/linux/buffer_head.h | 3 +++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index b0b3792b1496e..d0abaf44d782e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2138,14 +2138,19 @@ EXPORT_SYMBOL(block_commit_write); * * The filesystem needs to handle block truncation upon failure. 
*/ -int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, +int block_write_begin_iocb(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; + fgf_t fgp_flags = FGP_WRITEBEGIN; struct folio *folio; int status; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + if (iocb && iocb->ki_flags & IOCB_DONTCACHE) + fgp_flags |= FGP_DONTCACHE; + + folio = __filemap_get_folio(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -2160,6 +2165,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, *foliop = folio; return status; } + +int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, + struct folio **foliop, get_block_t *get_block) +{ + return block_write_begin_iocb(NULL, mapping, pos, len, foliop, + get_block); +} EXPORT_SYMBOL(block_write_begin); int block_write_end(loff_t pos, unsigned len, unsigned copied, @@ -2715,6 +2727,9 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); + if (folio_test_dropbehind(bh->b_folio)) + bio_set_flag(bio, BIO_COMPLETE_IN_TASK); + if (IS_ENABLED(CONFIG_FS_ENCRYPTION)) buffer_set_crypto_ctx(bio, bh, GFP_NOIO); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e4939e33b4b51..4ce50882d6213 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); +int block_write_begin_iocb(const struct kiocb *iocb, + struct address_space *mapping, loff_t pos, unsigned len, + struct folio **foliop, get_block_t *get_block); int 
__block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *); From fb55aaffb223e6ed5a0b6f514c7f7efa1e0c8ce0 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 14 May 2026 17:51:17 -0400 Subject: [PATCH 4/4] block: enable RWF_DONTCACHE for block devices Block device buffered reads and writes already pass through filemap_read() and iomap_file_buffered_write() respectively, both of which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files by setting FOP_DONTCACHE in def_blk_fops. For CONFIG_BUFFER_HEAD=y paths, use block_write_begin_iocb() in blkdev_write_begin() to thread the kiocb through so that buffer_head writeback gets dropbehind support. CONFIG_BUFFER_HEAD=n paths are handled by the previously added iomap BIO_COMPLETE_IN_TASK support. This support is useful for databases that operate on raw block devices, among other userspace applications. Signed-off-by: Tal Zussman Reviewed-by: Christoph Hellwig --- block/fops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index bb6642b45937c..31b073181d872 100644 --- a/block/fops.c +++ b/block/fops.c @@ -504,7 +504,8 @@ static int blkdev_write_begin(const struct kiocb *iocb, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); + return block_write_begin_iocb(iocb, mapping, pos, len, foliop, + blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, @@ -966,7 +967,7 @@ const struct file_operations def_blk_fops = { .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, - .fop_flags = FOP_BUFFER_RASYNC, + .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE, }; static __init int blkdev_init(void)