From ecef0643f106fa2fffdbbd288c20befbd18a51c4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 3 Jun 2026 21:27:41 +0800 Subject: [PATCH 1/5] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex blkg_destroy_all() iterates q->blkg_list without holding blkcg_mutex, which can race with blkg_free_workfn() that removes blkgs from the list while holding blkcg_mutex. Add blkcg_mutex protection around the q->blkg_list iteration to prevent potential list corruption or use-after-free issues. Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bc63bd220865d..697f0de9ff708 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -575,6 +575,7 @@ static void blkg_destroy_all(struct gendisk *disk) int i; restart: + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; @@ -593,6 +594,7 @@ static void blkg_destroy_all(struct gendisk *disk) if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); cond_resched(); goto restart; } @@ -612,6 +614,7 @@ static void blkg_destroy_all(struct gendisk *disk) q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); wake_up_var(&q->root_blkg); } From 226cdc183933d1c4d94e789f60481f33fbebcb5d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 3 Jun 2026 21:27:42 +0800 Subject: [PATCH 2/5] bfq: protect q->blkg_list iteration in bfq_end_wr_async() with blkcg_mutex bfq_end_wr_async() iterates q->blkg_list while only holding bfqd->lock, but not blkcg_mutex. This can race with blkg_free_workfn() that removes blkgs from the list while holding blkcg_mutex. Add blkcg_mutex protection in bfq_end_wr() before taking bfqd->lock to ensure proper synchronization when iterating q->blkg_list. While at it, skip blkgs for which blkg_to_bfqg() returns NULL instead of passing a NULL bfq_group to bfq_end_wr_async_queues().
Signed-off-by: Yu Kuai --- block/bfq-cgroup.c | 3 ++- block/bfq-iosched.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index ac83b06687640..1e7dc1a2d2fb9 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -940,7 +940,8 @@ void bfq_end_wr_async(struct bfq_data *bfqd) list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); - bfq_end_wr_async_queues(bfqd, bfqg); + if (bfqg) + bfq_end_wr_async_queues(bfqd, bfqg); } bfq_end_wr_async_queues(bfqd, bfqd->root_group); } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 141c602d5e858..42ccfd0c6140f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2645,6 +2645,9 @@ static void bfq_end_wr(struct bfq_data *bfqd) struct bfq_queue *bfqq; int i; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + mutex_lock(&bfqd->queue->blkcg_mutex); +#endif spin_lock_irq(&bfqd->lock); for (i = 0; i < bfqd->num_actuators; i++) { @@ -2656,6 +2659,9 @@ static void bfq_end_wr(struct bfq_data *bfqd) bfq_end_wr_async(bfqd); spin_unlock_irq(&bfqd->lock); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + mutex_unlock(&bfqd->queue->blkcg_mutex); +#endif } static sector_t bfq_io_struct_pos(void *io_struct, bool request) From 99238a68f382b361b4aa95b686287f1cf10c720d Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Wed, 3 Jun 2026 21:27:43 +0800 Subject: [PATCH 3/5] blk-cgroup: fix race between policy activation and blkg destruction When switching an IO scheduler on a block device, blkcg_activate_policy() allocates blkg_policy_data (pd) for all blkgs attached to the queue. However, blkcg_activate_policy() may race with concurrent blkcg deletion, leading to use-after-free and memory leak issues. 
The use-after-free occurs in the following race: T1 (blkcg_activate_policy): - Successfully allocates pd for blkg1 (loop0->queue, blkcgA) - Fails to allocate pd for blkg2 (loop0->queue, blkcgB) - Enters the enomem rollback path to release blkg1 resources T2 (blkcg deletion): - blkcgA is deleted concurrently - blkg1 is freed via blkg_free_workfn() - blkg1->pd is freed T1 (continued): - Rollback path accesses blkg1->pd->online after pd is freed - Triggers use-after-free In addition, blkg_free_workfn() frees pd before removing the blkg from q->blkg_list. This allows blkcg_activate_policy() to allocate a new pd for a blkg that is being destroyed, leaving the newly allocated pd unreachable when the blkg is finally freed. Fix these races by extending blkcg_mutex coverage to serialize blkcg_activate_policy() rollback and blkg destruction, ensuring pd lifecycle is synchronized with blkg list visibility. Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Signed-off-by: Zheng Qixing Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 697f0de9ff708..a656e89845ffd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1614,6 +1614,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); + + mutex_lock(&q->blkcg_mutex); retry: spin_lock_irq(&q->queue_lock); @@ -1676,6 +1678,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) spin_unlock_irq(&q->queue_lock); out: + mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) From 7cbe2b2ee921167046933ef891700cb2cc9d99e0 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Wed, 3 Jun 2026 21:27:44 +0800 Subject: [PATCH 4/5] blk-cgroup: skip dying blkg in blkcg_activate_policy() When switching IO schedulers on a block 
device, blkcg_activate_policy() can race with concurrent blkcg deletion, leading to a use-after-free in rcu_accelerate_cbs. T1 is the policy activation path, T2 is the blkg destruction path; steps are listed in time order: T2: blkg_destroy T2: kill(&blkg->refcnt) // blkg->refcnt=1->0 T2: blkg_release // call_rcu(__blkg_release) T2: ... T2: blkg_free_workfn T2: ->pd_free_fn(pd) T1: elv_iosched_store T1: elevator_switch T1: ... T1: iterate blkg list T1: blkg_get(blkg) // blkg->refcnt=0->1 T2: list_del_init(&blkg->q_node) T1: blkg_put(pinned_blkg) // blkg->refcnt=1->0 T1: blkg_release // call_rcu again T1: rcu_accelerate_cbs // uaf Fix this by checking hlist_unhashed(&blkg->blkcg_node) before getting a reference to the blkg. This is the same check used in blkg_destroy() to detect if a blkg has already been destroyed. If the blkg is already unhashed, skip processing it since it's being destroyed. Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Signed-off-by: Zheng Qixing Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a656e89845ffd..1533ce9257ec6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1625,6 +1625,8 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) if (blkg->pd[pol->plid]) continue; + if (hlist_unhashed(&blkg->blkcg_node)) + continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { From 6359b8044cce5e89496ff0fcd17d69bfd2886eea Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Wed, 3 Jun 2026 21:27:45 +0800 Subject: [PATCH 5/5] blk-cgroup: factor policy pd teardown loop into helper Move the teardown sequence which offlines and frees per-policy blkg_policy_data (pd) into a helper for readability. No functional change intended. Note that the helper follows the blkcg_activate_policy() rollback variant of the loop, so blkcg_deactivate_policy() now also clears pd->online before calling pd_free_fn().
Signed-off-by: Zheng Qixing Reviewed-by: Christoph Hellwig Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- block/blk-cgroup.c | 57 ++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1533ce9257ec6..450ff7ff8ff84 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1577,6 +1577,31 @@ struct cgroup_subsys io_cgrp_subsys = { }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); +/* + * Tear down per-blkg policy data for @pol on @q. + */ +static void blkcg_policy_teardown_pds(struct request_queue *q, + const struct blkcg_policy *pol) +{ + struct blkcg_gq *blkg; + + list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; + + spin_lock(&blkcg->lock); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); + blkg->pd[pol->plid] = NULL; + } + spin_unlock(&blkcg->lock); + } +} + /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest @@ -1692,21 +1717,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) { - struct blkcg *blkcg = blkg->blkcg; - struct blkg_policy_data *pd; - - spin_lock(&blkcg->lock); - pd = blkg->pd[pol->plid]; - if (pd) { - if (pd->online && pol->pd_offline_fn) - pol->pd_offline_fn(pd); - pd->online = false; - pol->pd_free_fn(pd); - blkg->pd[pol->plid] = NULL; - } - spin_unlock(&blkcg->lock); - } + blkcg_policy_teardown_pds(q, pol); spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; @@ -1725,7 +1736,6 @@ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; - struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, 
pol)) @@ -1738,20 +1748,7 @@ void blkcg_deactivate_policy(struct gendisk *disk, spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); - - list_for_each_entry(blkg, &q->blkg_list, q_node) { - struct blkcg *blkcg = blkg->blkcg; - - spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[pol->plid]); - pol->pd_free_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; - } - spin_unlock(&blkcg->lock); - } - + blkcg_policy_teardown_pds(q, pol); spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex);