On 23.05.2025 at 20:12, Andrey Drobyshev wrote:
> On 5/20/25 1:29 PM, Fiona Ebner wrote:
> > This is in preparation to mark bdrv_drained_begin() as GRAPH_UNLOCKED.
> >
> > More granular draining is not trivially possible, because
> > bdrv_snapshot_delete() can recursively call itself.
> >
> > The return value of bdrv_all_delete_snapshot() changes from -1 to
> > -errno propagated from failed sub-calls. This is fine for the existing
> > callers of bdrv_all_delete_snapshot().
> >
> > Signed-off-by: Fiona Ebner <[email protected]>
> > ---
> >
> > Changes in v2:
> > * Use 'must be drained' instead of 'needs to be drained'.
> > * Use goto for cleanup/error handling in bdrv_all_delete_snapshot().
> > * Don't use atomics to access bs->quiesce_counter.
> >
> > block/snapshot.c | 26 +++++++++++++++-----------
> > blockdev.c | 25 +++++++++++++++++--------
> > qemu-img.c | 2 ++
> > 3 files changed, 34 insertions(+), 19 deletions(-)
> >
> > diff --git a/block/snapshot.c b/block/snapshot.c
> > index 22567f1fb9..9f300a78bd 100644
> > --- a/block/snapshot.c
> > +++ b/block/snapshot.c
> > @@ -327,7 +327,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
> >
> > /**
> > * Delete an internal snapshot by @snapshot_id and @name.
> > - * @bs: block device used in the operation
> > + * @bs: block device used in the operation, must be drained
> > * @snapshot_id: unique snapshot ID, or NULL
> > * @name: snapshot name, or NULL
> > * @errp: location to store error
> > @@ -358,6 +358,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
> >
> > GLOBAL_STATE_CODE();
> >
> > + assert(bs->quiesce_counter > 0);
> > +
> > if (!drv) {
> > error_setg(errp, "Device '%s' has no medium",
> > bdrv_get_device_name(bs));
> > @@ -368,9 +370,6 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
> > return -EINVAL;
> > }
> >
> > - /* drain all pending i/o before deleting snapshot */
> > - bdrv_drained_begin(bs);
> > -
> > if (drv->bdrv_snapshot_delete) {
> > ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp);
> > } else if (fallback_bs) {
> > @@ -382,7 +381,6 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
> > ret = -ENOTSUP;
> > }
> >
> > - bdrv_drained_end(bs);
> > return ret;
> > }
> >
> > @@ -571,19 +569,22 @@ int bdrv_all_delete_snapshot(const char *name,
> > ERRP_GUARD();
> > g_autoptr(GList) bdrvs = NULL;
> > GList *iterbdrvs;
> > + int ret = 0;
> >
> > GLOBAL_STATE_CODE();
> > - GRAPH_RDLOCK_GUARD_MAINLOOP();
> >
> > - if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) {
> > - return -1;
> > + bdrv_drain_all_begin();
> > + bdrv_graph_rdlock_main_loop();
> > +
> > + ret = bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp);
> > + if (ret < 0) {
> > + goto out;
> > }
> >
> > iterbdrvs = bdrvs;
> > while (iterbdrvs) {
> > BlockDriverState *bs = iterbdrvs->data;
> > QEMUSnapshotInfo sn1, *snapshot = &sn1;
> > - int ret = 0;
> >
> > if ((devices || bdrv_all_snapshots_includes_bs(bs)) &&
> > bdrv_snapshot_find(bs, snapshot, name) >= 0)
> > @@ -594,13 +595,16 @@ int bdrv_all_delete_snapshot(const char *name,
> > if (ret < 0) {
> > error_prepend(errp, "Could not delete snapshot '%s' on '%s': ",
> > name, bdrv_get_device_or_node_name(bs));
> > - return -1;
> > + goto out;
> > }
> >
> > iterbdrvs = iterbdrvs->next;
> > }
> >
> > - return 0;
> > +out:
> > + bdrv_graph_rdunlock_main_loop();
> > + bdrv_drain_all_end();
> > + return ret;
> > }
> >
> >
> > diff --git a/blockdev.c b/blockdev.c
> > index 21443b4514..3982f9776b 100644
> > --- a/blockdev.c
> > +++ b/blockdev.c
> > @@ -1132,39 +1132,41 @@ SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device,
> > int ret;
> >
> > GLOBAL_STATE_CODE();
> > - GRAPH_RDLOCK_GUARD_MAINLOOP();
> > +
> > + bdrv_drain_all_begin();
> > + bdrv_graph_rdlock_main_loop();
> >
> > bs = qmp_get_root_bs(device, errp);
> > if (!bs) {
> > - return NULL;
> > + goto error;
> > }
> >
> > if (!id && !name) {
> > error_setg(errp, "Name or id must be provided");
> > - return NULL;
> > + goto error;
> > }
> >
> > if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
> > errp)) {
> > - return NULL;
> > + goto error;
> > }
> >
> > ret = bdrv_snapshot_find_by_id_and_name(bs, id, name, &sn, &local_err);
> > if (local_err) {
> > error_propagate(errp, local_err);
> > - return NULL;
> > + goto error;
> > }
> > if (!ret) {
> > error_setg(errp,
> > "Snapshot with id '%s' and name '%s' does not exist on "
> > "device '%s'",
> > STR_OR_NULL(id), STR_OR_NULL(name), device);
> > - return NULL;
> > + goto error;
> > }
> >
> > bdrv_snapshot_delete(bs, id, name, &local_err);
> > if (local_err) {
> > error_propagate(errp, local_err);
> > - return NULL;
> > + goto error;
> > }
> >
> > info = g_new0(SnapshotInfo, 1);
> > @@ -1180,6 +1182,9 @@ SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device,
> > info->has_icount = true;
> > }
> >
> > +error:
> > + bdrv_graph_rdunlock_main_loop();
> > + bdrv_drain_all_end();
> > return info;
> > }
> >
> > @@ -1295,12 +1300,14 @@ static void internal_snapshot_abort(void *opaque)
> > Error *local_error = NULL;
> >
> > GLOBAL_STATE_CODE();
> > - GRAPH_RDLOCK_GUARD_MAINLOOP();
> >
> > if (!state->created) {
> > return;
> > }
> >
> > + bdrv_drain_all_begin();
> > + bdrv_graph_rdlock_main_loop();
> > +
> > if (bdrv_snapshot_delete(bs, sn->id_str, sn->name, &local_error) < 0) {
> > error_reportf_err(local_error,
> > "Failed to delete snapshot with id '%s' and "
> > @@ -1308,6 +1315,8 @@ static void internal_snapshot_abort(void *opaque)
> > sn->id_str, sn->name,
> > bdrv_get_device_name(bs));
> > }
> > + bdrv_graph_rdunlock_main_loop();
> > + bdrv_drain_all_end();
> > }
> >
>
> Okay, I've got a very simple and naive question to ask. We've got this
> pattern recurring throughout the series:
>
> > GLOBAL_STATE_CODE();
> > bdrv_drain_all_begin();
> > bdrv_graph_rdlock_main_loop();
> >
> > ...
> >
> > bdrv_graph_rdunlock_main_loop();
> > bdrv_drain_all_end();
>
> bdrv_graph_rdlock_main_loop() doesn't actually take any locks; it
> asserts that we're running in the main thread and not in a coroutine.
> bdrv_graph_rdunlock_main_loop() does the same.
> GRAPH_RDLOCK_GUARD_MAINLOOP() makes both of those calls, at the
> beginning of a function and when leaving its scope, so essentially it
> too just does assert(qemu_in_main_thread() && !qemu_in_coroutine()).
>
> Therefore:
>
> 1. Is there any real benefit to using those
> {rdlock/rdunlock}_main_loop() constructs, or are they here only for
> historical reasons?
It's the price we pay for the compiler to verify our locking rules.
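
To illustrate what that buys us, here is a minimal standalone sketch. It
is not QEMU code: every name in it is made up, only the plain clang
Thread Safety Analysis attributes are real, and the QEMU macros
(GRAPH_RDLOCK, TSA_*, ...) are merely assumed to expand to something
equivalent. Built with 'clang -Wthread-safety -c sketch.c', the first
call in caller() is flagged at compile time even though the "lock"
itself is nothing but an annotated no-op:

/*
 * Hypothetical sketch, not QEMU code.  Compile with:
 *   clang -Wthread-safety -c sketch.c
 */

/* A dummy "capability" standing in for the block graph lock. */
typedef struct __attribute__((capability("mutex"))) FakeGraphLock {
    int dummy;
} FakeGraphLock;

static FakeGraphLock fake_graph_lock;

/* Something like a GRAPH_RDLOCK function: caller must hold the reader lock. */
__attribute__((requires_shared_capability(fake_graph_lock)))
static void needs_graph_rdlock(void)
{
}

/* The "lock" is only an annotation plus an assertion, not a real lock. */
__attribute__((acquire_shared_capability(fake_graph_lock)))
__attribute__((no_thread_safety_analysis))
static void fake_rdlock_main_loop(void)
{
    /* in QEMU: assert(qemu_in_main_thread() && !qemu_in_coroutine()); */
}

__attribute__((release_shared_capability(fake_graph_lock)))
__attribute__((no_thread_safety_analysis))
static void fake_rdunlock_main_loop(void)
{
}

void caller(void)
{
    /*
     * -Wthread-safety flags this call: the function requires
     * 'fake_graph_lock' and we do not hold it yet.
     */
    needs_graph_rdlock();

    fake_rdlock_main_loop();
    needs_graph_rdlock();        /* fine: the annotations say we hold it */
    fake_rdunlock_main_loop();
}
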
> 2. Would it hurt if we kept only GRAPH_RDLOCK_GUARD_MAINLOOP() in all
> such occurrences, at least where it's obvious we can't leave the main
> thread? That would spare us from performing the same checks several
> times, similar to what's done in commit 22/24 ("block/io: remove
> duplicate GLOBAL_STATE_CODE() in bdrv_do_drained_end()").
Once bdrv_drain_all_begin() is marked GRAPH_UNLOCKED, calling it after
GRAPH_RDLOCK_GUARD_MAINLOOP() would be wrong according to TSA rules
(which don't know anything about this being only a fake lock) and the
build would fail.
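
As a concrete sketch of that failure mode (again entirely made-up names,
assuming GRAPH_UNLOCKED corresponds to clang's locks_excluded
annotation): with 'clang -Wthread-safety', wrong_order() below is
rejected, while right_order(), which drains before "taking" the fake
lock, passes.

/* Hypothetical sketch, not QEMU code.  Compile with -Wthread-safety. */

typedef struct __attribute__((capability("mutex"))) FakeGraphLock {
    int dummy;
} FakeGraphLock;

static FakeGraphLock fake_graph_lock;

__attribute__((acquire_shared_capability(fake_graph_lock)))
__attribute__((no_thread_safety_analysis))
static void fake_rdlock_main_loop(void) {}

__attribute__((release_shared_capability(fake_graph_lock)))
__attribute__((no_thread_safety_analysis))
static void fake_rdunlock_main_loop(void) {}

/* Stand-in for bdrv_drain_all_begin() once it is marked GRAPH_UNLOCKED. */
__attribute__((locks_excluded(fake_graph_lock)))
static void fake_drain_all_begin(void) {}

void wrong_order(void)
{
    fake_rdlock_main_loop();
    /*
     * -Wthread-safety complains here: the function must not be called
     * while 'fake_graph_lock' is held -- this is what would break the
     * build.
     */
    fake_drain_all_begin();
    fake_rdunlock_main_loop();
}

void right_order(void)
{
    fake_drain_all_begin();      /* drain first, only then "take" the lock */
    fake_rdlock_main_loop();
    fake_rdunlock_main_loop();
}
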
Kevin