diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8f8fa14886ded..463155b0b1ff0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -247,8 +247,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + bdev_file = fs_bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->sb, fs_info->sb); if (IS_ERR(bdev_file)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev_file); @@ -325,7 +325,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, fs_info->sb); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a39460bf68a77..7e21fccd4868e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2579,7 +2579,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, fs_info->sb); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2630,7 +2630,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev_file) - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, fs_info->sb); out: btrfs_put_dev_args_from_path(&args); out_free: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a88e68f905646..6f7d7afb4d66a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -480,7 +480,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, struct block_device *bdev; int ret; - *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops); + if (holder) + *bdev_file = fs_bdev_file_open_by_path(device_path, flags, + holder, holder); + else + *bdev_file = bdev_file_open_by_path(device_path, flags, NULL, + NULL); if 
(IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); @@ -495,7 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (holder) { ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE); if (ret) { - bdev_fput(*bdev_file); + fs_bdev_file_release(*bdev_file, holder); goto error; } } @@ -503,7 +508,10 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, *disk_super = btrfs_read_disk_super(bdev, 0, false); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - bdev_fput(*bdev_file); + if (holder) + fs_bdev_file_release(*bdev_file, holder); + else + bdev_fput(*bdev_file); goto error; } @@ -727,7 +735,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, holder); return -EINVAL; } @@ -1082,7 +1090,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev_file) { - bdev_fput(device->bdev_file); + fs_bdev_file_release(device->bdev_file, fs_devices->fs_info->sb); device->bdev = NULL; device->bdev_file = NULL; fs_devices->open_devices--; @@ -1129,7 +1137,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - bdev_fput(device->bdev_file); + fs_bdev_file_release(device->bdev_file, device->fs_info->sb); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -2820,8 +2828,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, - fs_info->sb, &fs_holder_ops); + bdev_file = fs_bdev_file_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->sb, fs_info->sb); if (IS_ERR(bdev_file)) return PTR_ERR(bdev_file); @@ -3045,7 +3053,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_free_device: 
btrfs_free_device(device); error: - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, fs_info->sb); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 44da21c9d7776..5220585293dfe 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -69,6 +69,9 @@ int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb, { struct erofs_sb_info *sbi = EROFS_SB(sb); + if (erofs_is_shutdown(sb)) + return -EIO; + buf->file = NULL; if (in_metabox) { if (unlikely(!sbi->metabox_inode)) @@ -236,6 +239,9 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } up_read(&devs->rwsem); } + if (erofs_is_shutdown(sb) || + (map->m_dif && READ_ONCE(map->m_dif->dead))) + return -EIO; return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4792490161ec9..ca1ed7ce3961d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -48,6 +48,7 @@ struct erofs_device_info { erofs_blk_t blocks; erofs_blk_t uniaddr; + bool dead; /* backing device gone; fence I/O */ }; enum { @@ -104,6 +105,7 @@ struct erofs_xattr_prefix_item { struct erofs_sb_info { struct erofs_device_info dif0; struct erofs_mount_opts opt; /* options */ + unsigned long flags; /* see EROFS_SB_* */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -195,6 +197,14 @@ static inline bool erofs_is_fscache_mode(struct super_block *sb) !erofs_is_fileio_mode(EROFS_SB(sb)) && !sb->s_bdev; } +/* erofs_sb_info->flags */ +#define EROFS_SB_SHUTDOWN 0 /* primary device gone; fail all I/O */ + +static inline bool erofs_is_shutdown(struct super_block *sb) +{ + return test_bit(EROFS_SB_SHUTDOWN, &EROFS_SB(sb)->flags); +} + enum { EROFS_ZIP_CACHE_DISABLED, EROFS_ZIP_CACHE_READAHEAD, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 802add6652fda..e03cb95be96b8 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -153,8 +153,8 @@ static int erofs_init_device(struct 
erofs_buf *buf, struct super_block *sb, } else if (!sbi->devs->flatdev) { file = erofs_is_fileio_mode(sbi) ? filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : - bdev_file_open_by_path(dif->path, - BLK_OPEN_READ, sb->s_type, NULL); + fs_bdev_file_open_by_path(dif->path, + BLK_OPEN_READ, sb->s_type, sb); if (IS_ERR(file)) { if (file == ERR_PTR(-ENOTBLK)) return -EINVAL; @@ -843,11 +843,16 @@ static int erofs_fc_reconfigure(struct fs_context *fc) static int erofs_release_device_info(int id, void *ptr, void *data) { + struct super_block *sb = data; struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->file) - fput(dif->file); + if (dif->file) { + if (S_ISBLK(file_inode(dif->file)->i_mode)) + fs_bdev_file_release(dif->file, sb); + else + fput(dif->file); + } erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -855,18 +860,19 @@ static int erofs_release_device_info(int id, void *ptr, void *data) return 0; } -static void erofs_free_dev_context(struct erofs_dev_context *devs) +static void erofs_free_dev_context(struct erofs_dev_context *devs, + struct super_block *sb) { if (!devs) return; - idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_for_each(&devs->tree, &erofs_release_device_info, sb); idr_destroy(&devs->tree); kfree(devs); } -static void erofs_sb_free(struct erofs_sb_info *sbi) +static void erofs_sb_free(struct erofs_sb_info *sbi, struct super_block *sb) { - erofs_free_dev_context(sbi->devs); + erofs_free_dev_context(sbi->devs, sb); kfree(sbi->fsid); kfree_sensitive(sbi->domain_id); if (sbi->dif0.file) @@ -879,8 +885,13 @@ static void erofs_fc_free(struct fs_context *fc) { struct erofs_sb_info *sbi = fc->s_fs_info; - if (sbi) /* free here if an error occurs before transferring to sb */ - erofs_sb_free(sbi); + /* + * Freed here only if an error occurs before the sb is set up; at that + * point no block-backed device has been claimed (that happens in + * fill_super), so the NULL sb never 
reaches fs_bdev_file_release(). + */ + if (sbi) + erofs_sb_free(sbi, NULL); } static const struct fs_context_operations erofs_context_ops = { @@ -936,7 +947,7 @@ static void erofs_kill_sb(struct super_block *sb) erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); - erofs_sb_free(sbi); + erofs_sb_free(sbi, sb); sb->s_fs_info = NULL; } @@ -948,7 +959,7 @@ static void erofs_put_super(struct super_block *sb) erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); erofs_drop_internal_inodes(sbi); - erofs_free_dev_context(sbi->devs); + erofs_free_dev_context(sbi->devs, sb); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } @@ -1121,6 +1132,35 @@ static void erofs_evict_inode(struct inode *inode) clear_inode(inode); } +/* + * A blob device may back several erofs superblocks; fence only the affected + * one and keep the rest of the mount alive. The primary device falls back to + * the generic teardown (return non-zero). + */ +static int erofs_remove_bdev(struct super_block *sb, struct block_device *bdev) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + if (bdev == sb->s_bdev) + return 1; + + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode) && + file_bdev(dif->file)->bd_dev == bdev->bd_dev) + WRITE_ONCE(dif->dead, true); + } + up_read(&devs->rwsem); + return 0; +} + +static void erofs_shutdown(struct super_block *sb) +{ + set_bit(EROFS_SB_SHUTDOWN, &EROFS_SB(sb)->flags); +} + const struct super_operations erofs_sops = { .put_super = erofs_put_super, .alloc_inode = erofs_alloc_inode, @@ -1128,6 +1168,8 @@ const struct super_operations erofs_sops = { .evict_inode = erofs_evict_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, + .remove_bdev = erofs_remove_bdev, + .shutdown = erofs_shutdown, }; module_init(erofs_module_init); diff --git a/fs/erofs/zdata.c 
b/fs/erofs/zdata.c index 27ab7bd844ec7..3f2cdcf70d8de 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1698,11 +1698,15 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f, continue; } - /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { .m_pa = round_down(pcl->pos, sb->s_blocksize), }; - (void)erofs_map_dev(sb, &mdev); + if (erofs_map_dev(sb, &mdev)) { + /* the backing device is gone; fail the batch */ + q[JQ_SUBMIT]->eio = true; + qtail[JQ_SUBMIT] = &pcl->next; + continue; + } cur = mdev.m_pa; end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, @@ -1786,7 +1790,7 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f, * although background is preferred, no one is pending for submission. * don't issue decompression but drop it directly instead. */ - if (!*force_fg && !nr_bios) { + if (!*force_fg && !nr_bios && !q[JQ_SUBMIT]->eio) { kvfree(q[JQ_SUBMIT]); return; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6a77db4d3124e..8108d999008e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5793,7 +5793,7 @@ failed_mount8: __maybe_unused brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); - bdev_fput(sbi->s_journal_bdev_file); + fs_bdev_file_release(sbi->s_journal_bdev_file, sb); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5972,9 +5972,9 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, struct ext4_super_block *es; int errno; - bdev_file = bdev_file_open_by_dev(j_dev, + bdev_file = fs_bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, - sb, &fs_holder_ops); + sb, sb); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", @@ -6034,7 +6034,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, out_bh: brelse(bh); out_bdev: - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, sb); return ERR_PTR(errno); } @@ 
-6073,7 +6073,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, out_journal: ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, sb); return ERR_PTR(errno); } @@ -7492,7 +7492,7 @@ static void ext4_kill_sb(struct super_block *sb) kill_block_super(sb); if (bdev_file) - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, sb); } static struct file_system_type ext4_fs_type = { diff --git a/fs/super.c b/fs/super.c index 378e81efe643b..983c2fbf52027 100644 --- a/fs/super.c +++ b/fs/super.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include /* for the emergency remount stuff */ @@ -1411,196 +1412,379 @@ EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* - * Lock the superblock that is holder of the bdev. Returns the superblock - * pointer if we successfully locked the superblock and it is alive. Otherwise - * we return NULL and just unlock bdev->bd_holder_lock. - * - * The function must be called with bdev->bd_holder_lock and releases it. + * Filesystems claim block devices through fs_bdev_file_open_by_{dev,path}(), + * which records a {dev_t -> super_block} entry in the global @fs_bdev_supers + * table. The fs_holder_ops callbacks resolve a device event to the + * superblock(s) using that device by looking it up there rather than reading + * bdev->bd_holder, so several superblocks may share one block device -- the + * holder is then only the block layer's exclusivity token. 
*/ -static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) - __releases(&bdev->bd_holder_lock) +struct fs_bdev_holder { + dev_t dev; /* @fs_bdev_supers key */ + struct super_block *sb; + refcount_t fs_bdev_passive; /* @fs_bdev_active>0 bias + cursor pins */ + refcount_t fs_bdev_active; /* open claims for (dev, sb) */ + struct rhlist_head node; + struct rcu_head rcu; +}; + +static struct rhltable fs_bdev_supers; +static const struct rhashtable_params fs_bdev_params = { + .key_len = sizeof(dev_t), + .key_offset = offsetof(struct fs_bdev_holder, dev), + .head_offset = offsetof(struct fs_bdev_holder, node), +}; + +static int __init fs_bdev_supers_init(void) { - struct super_block *sb = bdev->bd_holder; - bool locked; + if (rhltable_init(&fs_bdev_supers, &fs_bdev_params)) + panic("VFS: Cannot initialise fs_bdev_supers\n"); + return 0; +} +fs_initcall(fs_bdev_supers_init); - lockdep_assert_held(&bdev->bd_holder_lock); - lockdep_assert_not_held(&sb->s_umount); - lockdep_assert_not_held(&bdev->bd_disk->open_mutex); +static void fs_bdev_holder_put(struct fs_bdev_holder *h) +{ + /* Unlink only once unpinned, so a cursor never resumes from a removed node. */ + if (refcount_dec_and_test(&h->fs_bdev_passive)) { + rhltable_remove(&fs_bdev_supers, &h->node, fs_bdev_params); + put_super(h->sb); + kfree_rcu(h, rcu); + } +} - /* Make sure sb doesn't go away from under us */ - spin_lock(&sb_lock); - sb->s_count++; - spin_unlock(&sb_lock); +/* + * Walk the superblocks sharing a block device the way __iterate_supers() walks + * super_blocks: fs_bdev_first()/fs_bdev_next() return each entry with its node + * pinned (refcount) so the chain link survives the RCU drop and the sleeping + * work the callbacks do between iterations; fs_bdev_next() also unpins the + * previous entry. The entry's fs_bdev_passive ref keeps @h->sb valid; callers + * take s_active and/or super_lock_shared() as needed and skip dying superblocks. 
+ * A shared per-entry list node can't replace this because mark_dead and sync + * are not mutually serialised. + */ +static struct fs_bdev_holder *fs_bdev_pin(struct rhlist_head *pos) +{ + struct fs_bdev_holder *h; - mutex_unlock(&bdev->bd_holder_lock); + /* Caller holds rcu_read_lock(). */ + for (; pos; pos = rcu_dereference_all(pos->next)) { + h = container_of(pos, struct fs_bdev_holder, node); + if (refcount_inc_not_zero(&h->fs_bdev_passive)) + return h; + } + return NULL; +} - locked = super_lock(sb, excl); +static struct fs_bdev_holder *fs_bdev_first(dev_t dev) +{ + struct fs_bdev_holder *h; - /* - * If the superblock wasn't already SB_DYING then we hold - * s_umount and can safely drop our temporary reference. - */ - put_super(sb); + rcu_read_lock(); + h = fs_bdev_pin(rhltable_lookup(&fs_bdev_supers, &dev, fs_bdev_params)); + rcu_read_unlock(); + return h; +} - if (!locked) - return NULL; +static struct fs_bdev_holder *fs_bdev_next(struct fs_bdev_holder *prev) +{ + struct fs_bdev_holder *h; - if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { - super_unlock(sb, excl); - return NULL; - } + rcu_read_lock(); + h = fs_bdev_pin(rcu_dereference_all(prev->node.next)); + rcu_read_unlock(); - return sb; + fs_bdev_holder_put(prev); + return h; +} + +static int fs_super_freeze(struct super_block *sb) +{ + if (sb->s_op->freeze_super) + return sb->s_op->freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); + return freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); +} + +static int fs_super_thaw(struct super_block *sb) +{ + if (sb->s_op->thaw_super) + return sb->s_op->thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); + return thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { - struct super_block *sb; + struct fs_bdev_holder *h; + dev_t dev = bdev->bd_dev; - sb = bdev_super_lock(bdev, false); - if (!sb) - return; + 
mutex_unlock(&bdev->bd_holder_lock); - if (sb->s_op->remove_bdev) { - int ret; + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) { + struct super_block *sb = h->sb; - ret = sb->s_op->remove_bdev(sb, bdev); - if (!ret) { - super_unlock_shared(sb); - return; + if (!super_lock_shared(sb)) + continue; + if (sb->s_root && (sb->s_flags & SB_ACTIVE)) { + if (!sb->s_op->remove_bdev || + sb->s_op->remove_bdev(sb, bdev)) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + if (sb->s_op->shutdown) + sb->s_op->shutdown(sb); + } } - /* Fallback to shutdown. */ + super_unlock_shared(sb); } - - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - if (sb->s_op->shutdown) - sb->s_op->shutdown(sb); - - super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { - struct super_block *sb; - - sb = bdev_super_lock(bdev, false); - if (!sb) - return; + struct fs_bdev_holder *h; + dev_t dev = bdev->bd_dev; - sync_filesystem(sb); - super_unlock_shared(sb); -} + mutex_unlock(&bdev->bd_holder_lock); -static struct super_block *get_bdev_super(struct block_device *bdev) -{ - bool active = false; - struct super_block *sb; + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) { + struct super_block *sb = h->sb; - sb = bdev_super_lock(bdev, true); - if (sb) { - active = atomic_inc_not_zero(&sb->s_active); - super_unlock_excl(sb); + if (!super_lock_shared(sb)) + continue; + if (sb->s_root && (sb->s_flags & SB_ACTIVE)) + sync_filesystem(sb); + super_unlock_shared(sb); } - if (!active) - return NULL; - return sb; } /** - * fs_bdev_freeze - freeze owning filesystem of block device + * fs_bdev_freeze - freeze every superblock using a block device * @bdev: block device * - * Freeze the filesystem that owns this block device if it is still - * active. - * - * A filesystem that owns multiple block devices may be frozen from each - * block device and won't be unfrozen until all block devices are - * unfrozen. 
Each block device can only freeze the filesystem once as we - * nest freezes for block devices in the block layer. + * Freeze each live superblock using @bdev. A superblock owning several block + * devices is frozen once per device and stays frozen until all are thawed; the + * block layer nests these freezes so the count stays balanced. * - * Return: If the freeze was successful zero is returned. If the freeze - * failed a negative error code is returned. + * Return: 0, or the error from the one superblock on a single-fs device. When + * several superblocks share @bdev a per-superblock failure is swallowed + * (see below), but a sync_blockdev() failure is always reported. */ static int fs_bdev_freeze(struct block_device *bdev) { - struct super_block *sb; - int error = 0; + dev_t dev = bdev->bd_dev; + struct fs_bdev_holder *h; + unsigned int count = 0; + int error = 0, err; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); - sb = get_bdev_super(bdev); - if (!sb) - return -EINVAL; + mutex_unlock(&bdev->bd_holder_lock); - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); - else - error = freeze_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) { + if (!atomic_inc_not_zero(&h->sb->s_active)) + continue; + err = fs_super_freeze(h->sb); + if (err && !error) + error = err; + deactivate_super(h->sb); + count++; + } + + /* + * When several superblocks share the device, keep it frozen even if some + * of them failed to freeze and swallow the error: rolling the rest back + * via thaw_super() can fail too, so neither is a clear win. A single + * filesystem (count == 1) still reports its error. 
+ */ + if (error && count > 1) + error = 0; if (!error) error = sync_blockdev(bdev); - deactivate_super(sb); return error; } /** - * fs_bdev_thaw - thaw owning filesystem of block device + * fs_bdev_thaw - thaw every superblock using a block device * @bdev: block device * - * Thaw the filesystem that owns this block device. + * The counterpart to fs_bdev_freeze(): thaw each live superblock using @bdev. + * A zero return does not imply a superblock is fully unfrozen; it may have been + * frozen more than once (by the kernel or via another device). * - * A filesystem that owns multiple block devices may be frozen from each - * block device and won't be unfrozen until all block devices are - * unfrozen. Each block device can only freeze the filesystem once as we - * nest freezes for block devices in the block layer. - * - * Return: If the thaw was successful zero is returned. If the thaw - * failed a negative error code is returned. If this function - * returns zero it doesn't mean that the filesystem is unfrozen - * as it may have been frozen multiple times (kernel may hold a - * freeze or might be frozen from other block devices). + * Return: 0, or the first error on a single-fs device; a shared device swallows + * per-superblock errors, as fs_bdev_freeze() does. */ static int fs_bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error; + dev_t dev = bdev->bd_dev; + struct fs_bdev_holder *h; + unsigned int count = 0; + int error = 0, err; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); - /* - * The block device may have been frozen before it was claimed by a - * filesystem. Concurrently another process might try to mount that - * frozen block device and has temporarily claimed the block device for - * that purpose causing a concurrent fs_bdev_thaw() to end up here. The - * mounter is already about to abort mounting because they still saw an - * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return - * NULL in that case. 
- */ - sb = get_bdev_super(bdev); - if (!sb) - return -EINVAL; + mutex_unlock(&bdev->bd_holder_lock); - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); - else - error = thaw_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); - deactivate_super(sb); + for (h = fs_bdev_first(dev); h; h = fs_bdev_next(h)) { + if (!atomic_inc_not_zero(&h->sb->s_active)) + continue; + err = fs_super_thaw(h->sb); + if (err && !error) + error = err; + deactivate_super(h->sb); + count++; + } + + /* Shared device: swallow per-superblock errors, like fs_bdev_freeze(). */ + if (error && count > 1) + error = 0; return error; } -const struct blk_holder_ops fs_holder_ops = { +static const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; -EXPORT_SYMBOL_GPL(fs_holder_ops); + +static int fs_bdev_register(struct file *bdev_file, struct super_block *sb) +{ + dev_t dev = file_bdev(bdev_file)->bd_dev; + struct rhlist_head *list, *pos; + struct fs_bdev_holder *h; + int err; + + /* + * A superblock may claim one device more than once (xfs with its log on + * the data device). Keep a single entry per (device, superblock) and + * count the claims in @fs_bdev_active; the entry lives until the last one + * is released. + */ + scoped_guard(rcu) { + list = rhltable_lookup(&fs_bdev_supers, &dev, fs_bdev_params); + rhl_for_each_entry_rcu(h, pos, list, node) + if (h->sb == sb && refcount_inc_not_zero(&h->fs_bdev_active)) + return 0; + } + + h = kmalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + h->dev = dev; + h->sb = sb; + refcount_set(&h->fs_bdev_passive, 1); + refcount_set(&h->fs_bdev_active, 1); + + err = rhltable_insert(&fs_bdev_supers, &h->node, fs_bdev_params); + if (err) { + kfree(h); + return err; + } + + /* The sb->s_count ref keeps @h->sb valid for as long as the entry exists. 
*/ + spin_lock(&sb_lock); + sb->s_count++; + spin_unlock(&sb_lock); + + /* + * Don't claim a frozen device. The entry is already published, so a + * freeze is seen here or finds it and waits in super_lock(). Drop our + * claim like fs_bdev_file_release() so a pinned entry can't be revived. + */ + if (atomic_read(&file_bdev(bdev_file)->bd_fsfreeze_count) > 0) { + if (refcount_dec_and_test(&h->fs_bdev_active)) + fs_bdev_holder_put(h); + return -EBUSY; + } + + return 0; +} + +/** + * fs_bdev_file_open_by_dev - claim a block device on behalf of a superblock + * @dev: block device number + * @mode: open mode + * @holder: block-layer exclusivity token (a superblock, or the file_system_type + * when the device may be shared by several superblocks of that type) + * @sb: superblock to drive fs_holder_ops events for + * + * Open @dev with &fs_holder_ops and register that @sb uses it, so device + * removal/sync/freeze/thaw are propagated to @sb (and any other superblock + * sharing @dev). Must be paired with fs_bdev_file_release(). + * + * Return: an opened block-device file or an ERR_PTR(). 
+ */ +struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + struct super_block *sb) +{ + struct file *bdev_file; + int err; + + bdev_file = bdev_file_open_by_dev(dev, mode, holder, &fs_holder_ops); + if (IS_ERR(bdev_file)) + return bdev_file; + + err = fs_bdev_register(bdev_file, sb); + if (err) { + bdev_fput(bdev_file); + return ERR_PTR(err); + } + return bdev_file; +} +EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_dev); + +struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, struct super_block *sb) +{ + struct file *bdev_file; + int err; + + bdev_file = bdev_file_open_by_path(path, mode, holder, &fs_holder_ops); + if (IS_ERR(bdev_file)) + return bdev_file; + + err = fs_bdev_register(bdev_file, sb); + if (err) { + bdev_fput(bdev_file); + return ERR_PTR(err); + } + return bdev_file; +} +EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path); + +/** + * fs_bdev_file_release - release a block device claimed for a superblock + * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}() + * @sb: superblock the device was claimed for + * + * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a + * pinning cursor defers the actual unlink). Then close the block device. + */ +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb) +{ + dev_t dev = file_bdev(bdev_file)->bd_dev; + struct fs_bdev_holder *h, *found = NULL; + struct rhlist_head *list, *pos; + + rcu_read_lock(); + list = rhltable_lookup(&fs_bdev_supers, &dev, fs_bdev_params); + rhl_for_each_entry_rcu(h, pos, list, node) { + if (h->sb != sb) + continue; + /* At most one entry per (dev, sb); the last claim drops the bias. 
*/ + if (refcount_dec_and_test(&h->fs_bdev_active)) + found = h; + break; + } + rcu_read_unlock(); + if (found) + fs_bdev_holder_put(found); + bdev_fput(bdev_file); +} +EXPORT_SYMBOL_GPL(fs_bdev_file_release); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) @@ -1609,7 +1793,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct file *bdev_file; struct block_device *bdev; - bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + bdev_file = fs_bdev_file_open_by_dev(sb->s_dev, mode, sb, sb); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); @@ -1623,20 +1807,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - bdev_fput(bdev_file); + fs_bdev_file_release(bdev_file, sb); return -EACCES; } - /* - * It is enough to check bdev was not frozen before we set - * s_bdev as freezing will wait until SB_BORN is set. 
- */ - if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { - if (fc) - warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - bdev_fput(bdev_file); - return -EBUSY; - } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; @@ -1725,7 +1899,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - bdev_fput(sb->s_bdev_file); + fs_bdev_file_release(sb->s_bdev_file, sb); } } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0cea458f13536..053f91a5f4015 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1615,7 +1615,7 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_fput(btp->bt_file); + fs_bdev_file_release(btp->bt_file, btp->bt_mount->m_super); kfree(btp); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f8de44443e81c..3046672106959 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -400,8 +400,8 @@ xfs_blkdev_get( blk_mode_t mode; mode = sb_open_mode(mp->m_super->s_flags); - *bdev_filep = bdev_file_open_by_path(name, mode, - mp->m_super, &fs_holder_ops); + *bdev_filep = fs_bdev_file_open_by_path(name, mode, + mp->m_super, mp->m_super); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; @@ -526,7 +526,7 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ if (logdev_file) - bdev_fput(logdev_file); + fs_bdev_file_release(logdev_file, mp->m_super); } return 0; @@ -538,10 +538,10 @@ xfs_open_devices( xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: if (rtdev_file) - bdev_fput(rtdev_file); + fs_bdev_file_release(rtdev_file, mp->m_super); out_close_logdev: if (logdev_file) - bdev_fput(logdev_file); + fs_bdev_file_release(logdev_file, mp->m_super); return error; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 890128cdea1ce..43d37c02febf8 100644 --- 
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -126,8 +126,6 @@ struct blk_integrity { unsigned char pi_tuple_size; }; -typedef unsigned int __bitwise blk_mode_t; - /* open for reading */ #define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) /* open for writing */ @@ -1762,13 +1760,6 @@ struct blk_holder_ops { int (*thaw)(struct block_device *bdev); }; -/* - * For filesystems using @fs_holder_ops, the @holder argument passed to - * helpers used to open and claim block devices via - * bd_prepare_to_claim() must point to a superblock. - */ -extern const struct blk_holder_ops fs_holder_ops; - /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. diff --git a/include/linux/fs.h b/include/linux/fs.h index 11559c513dfbb..e9346be8470fa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1921,8 +1921,6 @@ struct dir_context { struct io_uring_cmd; struct offset_ctx; -typedef unsigned int __bitwise fop_flags_t; - struct file_operations { struct module *owner; fop_flags_t fop_flags; diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h index f21ffbb6dea5b..721d842e3b24c 100644 --- a/include/linux/fs/super.h +++ b/include/linux/fs/super.h @@ -235,4 +235,11 @@ int freeze_super(struct super_block *super, enum freeze_holder who, int thaw_super(struct super_block *super, enum freeze_holder who, const void *freeze_owner); +struct file; +struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + struct super_block *sb); +struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode, + void *holder, struct super_block *sb); +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb); + #endif /* _LINUX_FS_SUPER_H */ diff --git a/include/linux/types.h b/include/linux/types.h index 608050dbca6a7..ef026585420b9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -163,6 +163,8 @@ typedef u32 dma_addr_t; typedef unsigned int __bitwise gfp_t; 
typedef unsigned int __bitwise slab_flags_t; typedef unsigned int __bitwise fmode_t; +typedef unsigned int __bitwise blk_mode_t; +typedef unsigned int __bitwise fop_flags_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t;