Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 146 additions & 1 deletion block/bio.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
#include <linux/local_lock.h>

#include <trace/events/block.h>
#include "blk.h"
Expand Down Expand Up @@ -1720,6 +1721,79 @@ void bio_check_pages_dirty(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

/*
* Infrastructure for deferring bio completions to task-context via a per-CPU
* workqueue. Triggered either by the BIO_COMPLETE_IN_TASK bio flag (static
* decision at submit time) or by calling bio_complete_in_task() from
* bi_end_io() (dynamic decision at completion time).
*/

/*
 * Per-CPU queue of bios whose bi_end_io() call has been deferred to a
 * workqueue worker.
 */
struct bio_complete_batch {
/* held with IRQs off around @list accesses in the queueing and worker paths */
local_lock_t lock;
/* bios waiting for their bi_end_io() to be invoked */
struct bio_list list;
/* delayed work item; its handler is bio_complete_work_fn() */
struct delayed_work work;
/* CPU index this batch was initialised for; used as the work's target CPU */
int cpu;
};

/*
 * One batch per CPU. Only the local lock is initialised statically; the
 * list, work item and cpu fields are set up by bio_complete_batch_init().
 */
static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/* Workqueue the per-CPU work items are queued on; allocated in init_bio(). */
static struct workqueue_struct *bio_complete_wq;

static void bio_complete_work_fn(struct work_struct *w)
{
struct delayed_work *dw = to_delayed_work(w);
struct bio_complete_batch *batch =
container_of(dw, struct bio_complete_batch, work);

while (1) {
struct bio_list list;
struct bio *bio;

local_lock_irq(&bio_complete_batch.lock);
list = batch->list;
bio_list_init(&batch->list);
local_unlock_irq(&bio_complete_batch.lock);

if (bio_list_empty(&list))
break;

while ((bio = bio_list_pop(&list)))
bio->bi_end_io(bio);

if (need_resched()) {
bool is_empty;

local_lock_irq(&bio_complete_batch.lock);
is_empty = bio_list_empty(&batch->list);
local_unlock_irq(&bio_complete_batch.lock);
if (!is_empty)
mod_delayed_work_on(batch->cpu,
bio_complete_wq,
&batch->work, 0);
break;
}
}
}

void __bio_complete_in_task(struct bio *bio)
{
struct bio_complete_batch *batch;
unsigned long flags;
bool was_empty;

local_lock_irqsave(&bio_complete_batch.lock, flags);
batch = this_cpu_ptr(&bio_complete_batch);
was_empty = bio_list_empty(&batch->list);
bio_list_add(&batch->list, bio);
local_unlock_irqrestore(&bio_complete_batch.lock, flags);

if (was_empty)
mod_delayed_work_on(batch->cpu, bio_complete_wq,
&batch->work, 1);
}
EXPORT_SYMBOL_GPL(__bio_complete_in_task);

static inline bool bio_remaining_done(struct bio *bio)
{
/*
Expand Down Expand Up @@ -1794,7 +1868,9 @@ void bio_endio(struct bio *bio)
}
#endif

if (bio->bi_end_io)
if (bio_flagged(bio, BIO_COMPLETE_IN_TASK) && bio_in_atomic())
__bio_complete_in_task(bio);
else if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
Expand Down Expand Up @@ -1980,6 +2056,51 @@ int bioset_init(struct bio_set *bs,
}
EXPORT_SYMBOL(bioset_init);

static int bio_complete_batch_cpu_online(unsigned int cpu)
{
enable_delayed_work(&per_cpu(bio_complete_batch, cpu).work);
return 0;
}

/*
* Disable this CPU's delayed work so that it cannot run on an unbound worker
* after the CPU is offlined.
*/
static int bio_complete_batch_cpu_down_prep(unsigned int cpu)
{
disable_delayed_work_sync(&per_cpu(bio_complete_batch, cpu).work);
return 0;
}

/*
* Drain a dead CPU's deferred bio completions. The CPU is dead and the worker
* is canceled so no locking is needed.
*/
static int bio_complete_batch_cpu_dead(unsigned int cpu)
{
struct bio_complete_batch *batch =
per_cpu_ptr(&bio_complete_batch, cpu);
struct bio *bio;

while ((bio = bio_list_pop(&batch->list)))
bio->bi_end_io(bio);

return 0;
}

/*
 * Boot-time setup of one CPU's batch: record the CPU index, bind the delayed
 * work to bio_complete_work_fn() and start with an empty list. The work is
 * left disabled for CPUs that are not online at this point.
 */
static void __init bio_complete_batch_init(int cpu)
{
	struct bio_complete_batch *b = &per_cpu(bio_complete_batch, cpu);

	b->cpu = cpu;
	INIT_DELAYED_WORK(&b->work, bio_complete_work_fn);
	bio_list_init(&b->list);

	if (cpu_online(cpu))
		return;

	disable_delayed_work_sync(&b->work);
}

static int __init init_bio(void)
{
int i;
Expand All @@ -1994,6 +2115,30 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}

for_each_possible_cpu(i)
bio_complete_batch_init(i);

bio_complete_wq = alloc_workqueue("bio_complete",
WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bio_complete_wq)
panic("bio: can't allocate bio_complete workqueue\n");

/*
* bio task-context completion draining on hot-unplugged CPUs:
*
* 1. Stop the per-CPU delayed work while the CPU is still online, so
* that it cannot run on an unbound worker later.
* 2. Drain leftover bios added between worker disabling and CPU
* offlining.
*/
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
"block/bio:complete:online",
bio_complete_batch_cpu_online,
bio_complete_batch_cpu_down_prep);
cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
"block/bio:complete:dead",
NULL, bio_complete_batch_cpu_dead);

cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);

Expand Down
5 changes: 3 additions & 2 deletions block/fops.c
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,8 @@ static int blkdev_write_begin(const struct kiocb *iocb,
unsigned len, struct folio **foliop,
void **fsdata)
{
return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
blkdev_get_block);
}

static int blkdev_write_end(const struct kiocb *iocb,
Expand Down Expand Up @@ -966,7 +967,7 @@ const struct file_operations def_blk_fops = {
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
.fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE,
};

static __init int blkdev_init(void)
Expand Down
19 changes: 17 additions & 2 deletions fs/buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -2138,14 +2138,19 @@ EXPORT_SYMBOL(block_commit_write);
*
* The filesystem needs to handle block truncation upon failure.
*/
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
int block_write_begin_iocb(const struct kiocb *iocb,
struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
fgf_t fgp_flags = FGP_WRITEBEGIN;
struct folio *folio;
int status;

folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
fgp_flags |= FGP_DONTCACHE;

folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
Expand All @@ -2160,6 +2165,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
*foliop = folio;
return status;
}

/*
 * Wrapper around block_write_begin_iocb() for callers that have no kiocb to
 * pass: forwards every argument unchanged with a NULL iocb and returns the
 * callee's result.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		struct folio **foliop, get_block_t *get_block)
{
	const struct kiocb *no_iocb = NULL;

	return block_write_begin_iocb(no_iocb, mapping, pos, len, foliop,
				      get_block);
}
EXPORT_SYMBOL(block_write_begin);

int block_write_end(loff_t pos, unsigned len, unsigned copied,
Expand Down Expand Up @@ -2715,6 +2727,9 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,

bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

if (folio_test_dropbehind(bh->b_folio))
bio_set_flag(bio, BIO_COMPLETE_IN_TASK);

if (IS_ENABLED(CONFIG_FS_ENCRYPTION))
buffer_set_crypto_ctx(bio, bh, GFP_NOIO);

Expand Down
5 changes: 3 additions & 2 deletions fs/iomap/ioend.c
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,

if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
if (folio_test_dropbehind(folio))
ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;

Expand All @@ -255,6 +253,9 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;

if (folio_test_dropbehind(folio))
bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK);

/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
Expand Down
4 changes: 0 additions & 4 deletions fs/xfs/xfs_aops.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,6 @@ xfs_ioend_needs_wq_completion(
if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
return true;

/* Page cache invalidation cannot be done in irq context. */
if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
return true;

return false;
}

Expand Down
32 changes: 32 additions & 0 deletions include/linux/bio.h
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,38 @@ static inline struct bio *bio_alloc(struct block_device *bdev,

void submit_bio(struct bio *bio);

/**
 * bio_in_atomic - report whether a bio must not be completed right here
 *
 * Return: %true when the current context is considered atomic (for example
 * hard/soft IRQ or preemption disabled), %false when it is safe to complete
 * a bio directly.
 */
static inline bool bio_in_atomic(void)
{
	/* On preemptible kernels a non-zero RCU preempt depth counts as atomic. */
	if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
		return true;

	/*
	 * Without CONFIG_PREEMPT_COUNT always answer "atomic"; otherwise
	 * defer to preemptible().
	 */
	return !IS_ENABLED(CONFIG_PREEMPT_COUNT) || !preemptible();
}

void __bio_complete_in_task(struct bio *bio);

/**
 * bio_complete_in_task - defer a bio's completion if the context is atomic
 * @bio: bio to complete
 *
 * When bio_in_atomic() reports an atomic context, @bio is handed to
 * __bio_complete_in_task() so that its completion is offloaded to a worker
 * thread.
 *
 * Return: %true if the completion was offloaded, %false if nothing was done
 * and the caller may go on completing @bio itself.
 */
static inline bool bio_complete_in_task(struct bio *bio)
{
	bool deferred = bio_in_atomic();

	if (deferred)
		__bio_complete_in_task(bio);

	return deferred;
}

extern void bio_endio(struct bio *);

static inline void bio_io_error(struct bio *bio)
Expand Down
1 change: 1 addition & 0 deletions include/linux/blk_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};

Expand Down
3 changes: 3 additions & 0 deletions include/linux/buffer_head.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block);
int block_write_begin_iocb(const struct kiocb *iocb,
struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block);
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block);
int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
Expand Down
5 changes: 1 addition & 4 deletions include/linux/iomap.h
Original file line number Diff line number Diff line change
Expand Up @@ -399,16 +399,13 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
/* is DONTCACHE I/O */
#define IOMAP_IOEND_DONTCACHE (1U << 4)

/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \
IOMAP_IOEND_DONTCACHE)
(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)

/*
* Structure for writeback I/O completions.
Expand Down
Loading