24.01.2019 19:36, Kevin Wolf wrote:
> Am 24.01.2019 um 17:18 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 24.01.2019 17:17, Kevin Wolf wrote:
>>> Depending on the exact image layout and the storage backend (tmpfs is
>>> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
>>> save us a lot of time e.g. during a mirror block job or qemu-img convert
>>> with a fragmented source image (.bdrv_co_block_status on the protocol
>>> layer can be called for every single cluster in the extreme case).
>>>
>>> We may only cache data regions because of possible concurrent writers.
>>> This means that we can later treat a recently punched hole as data, but
>>> this is safe. We can't cache holes because then we might treat recently
>>> written data as holes, which can cause corruption.
>>>
>>> Signed-off-by: Kevin Wolf <[email protected]>
>
>>> @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void
>>> *opaque)
>>> {
>>> RawPosixAIOData *aiocb = opaque;
>>> BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
>>> + struct seek_data_cache *sdc;
>>> int ret;
>>>
>>> + /* Invalidate seek_data_cache if it overlaps */
>>> + sdc = &s->seek_data_cache;
>>> + if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> + sdc->start > aiocb->aio_offset +
>>> aiocb->aio_nbytes))
>>
>> to be presize: <= and >=
>
> Yes, you're right.
>
>>> + {
>>> + sdc->valid = false;
>>> + }
>>> +
>>> /* First try to write zeros and unmap at the same time */
>>>
>>
>>
>> Why not to drop cache on handle_aiocb_write_zeroes()? Otherwise, we'll
>> return DATA
>> for these regions which may unallocated read-as-zero, if I'm not mistaken.
>
> handle_aiocb_write_zeroes() is not allowed to unmap things, so we don't
> need to invalidate the cache there.
So, you want to say, that for fallocated regions we always return just _DATA,
without _ZERO?
If it is so, it's of course bad, it means that convert will have to copy (or at
least read
and detect zeroes by hand, if enabled) write-zeroed-without-unmap areas.
Let's check (hmm, I had to use qemu-img map inside qemu-io, patch attached for
it,
also I printed printf("%s\n", __func__) in handle_aiocb_write_zeroes_unmap and
handle_aiocb_write_zeroes):
Let's test:
]# cat test
./qemu-img create -f raw x 1M
./qemu-io -f raw x <<CMDS
write 0 1M
map
write -z 100K 100K
map
write -z -u 500K 100K
map
CMDS
rm -rf x
rm -rf x
before your patch:
]# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0523 sec (19.093 MiB/sec and 19.0927 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data":
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0165 sec (5.898 MiB/sec and 60.3974 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data":
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false,
"offset": 102400},
{ "start": 204800, "length": 843776, "depth": 0, "zero": false, "data": true,
"offset": 204800}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0001 sec (545.566 MiB/sec and 5586.5922 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data":
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false,
"offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true,
"offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false,
"offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true,
"offset": 614400}]
after your patch:
# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0768 sec (13.019 MiB/sec and 13.0195 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data":
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0166 sec (5.883 MiB/sec and 60.2410 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data":
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0002 sec (469.501 MiB/sec and 4807.6923 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data":
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false,
"offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true,
"offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false,
"offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true,
"offset": 614400}]
So, you've changed behavior of block_status after write_zeroes without UNMAP
for the worse.
Hmm, should I prepare patch for qemu-io? qemu-img map is definitely better.
>
>>> #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
>>> @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
>>> RawPosixAIOData *aiocb = opaque;
>>> int ret = -EOPNOTSUPP;
>>> BDRVRawState *s = aiocb->bs->opaque;
>>> + struct seek_data_cache *sdc;
>>>
>>> if (!s->has_discard) {
>>> return -ENOTSUP;
>>> }
>>>
>>> + /* Invalidate seek_data_cache if it overlaps */
>>> + sdc = &s->seek_data_cache;
>>> + if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> + sdc->start > aiocb->aio_offset +
>>> aiocb->aio_nbytes))
>>
>> and <= and >=
>>
>> and if add same to handle_aiocb_write_zeroes(), then it worth to
>> create helper function to invalidate cache.
>
> Ok.
>
>>> + {
>>> + sdc->valid = false;
>>> + }
>>> +
>>> if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
>>> #ifdef BLKDISCARD
>>> do {
>>> @@ -2424,6 +2448,8 @@ static int coroutine_fn
>>> raw_co_block_status(BlockDriverState *bs,
>>> int64_t *map,
>>> BlockDriverState **file)
>>> {
>>> + BDRVRawState *s = bs->opaque;
>>> + struct seek_data_cache *sdc;
>>> off_t data = 0, hole = 0;
>>> int ret;
>>>
>>> @@ -2439,6 +2465,14 @@ static int coroutine_fn
>>> raw_co_block_status(BlockDriverState *bs,
>>> return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>> }
>>>
>>> + sdc = &s->seek_data_cache;
>>> + if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
>>> + *pnum = MIN(bytes, sdc->end - offset);
>>> + *map = offset;
>>> + *file = bs;
>>> + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>> + }
>>> +
>>> ret = find_allocation(bs, offset, &data, &hole);
>>> if (ret == -ENXIO) {
>>> /* Trailing hole */
>>> @@ -2451,14 +2485,27 @@ static int coroutine_fn
>>> raw_co_block_status(BlockDriverState *bs,
>>> } else if (data == offset) {
>>> /* On a data extent, compute bytes to the end of the extent,
>>> * possibly including a partial sector at EOF. */
>>> - *pnum = MIN(bytes, hole - offset);
>>> + *pnum = hole - offset;
>>
>> hmm, why? At least you didn't mention it in commit-message..
>
> We want to cache the whole range returned by lseek(), not just whatever
> the raw_co_block_status() caller wanted to know.
>
> For the returned value, *pnum is adjusted to MIN(bytes, *pnum) below...
Oops, stupid question it was, sorry:(
>
>>> ret = BDRV_BLOCK_DATA;
>>> } else {
>>> /* On a hole, compute bytes to the beginning of the next extent.
>>> */
>>> assert(hole == offset);
>>> - *pnum = MIN(bytes, data - offset);
>>> + *pnum = data - offset;
>>> ret = BDRV_BLOCK_ZERO;
>>> }
>>> +
>>> + /* Caching allocated ranges is okay even if another process writes to
>>> the
>>> + * same file because we allow declaring things allocated even if there
>>> is a
>>> + * hole. However, we cannot cache holes without risking corruption. */
>>> + if (ret == BDRV_BLOCK_DATA) {
>>> + *sdc = (struct seek_data_cache) {
>>> + .valid = true,
>>> + .start = offset,
>>> + .end = offset + *pnum,
>>> + };
>>> + }
>>> +
>>> + *pnum = MIN(*pnum, bytes);
>
> ...here.
>
> So what we return doesn't change.
>
>>> *map = offset;
>>> *file = bs;
>>> return ret | BDRV_BLOCK_OFFSET_VALID;
>
> Kevin
>
--
Best regards,
Vladimir
From 8f52aa40514a290d7770ef2eba3ee3c8b93c1cab Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <[email protected]>
Date: Fri, 25 Jan 2019 11:48:05 +0300
Subject: [PATCH 1/2] my
---
include/qemu-io.h | 7 ++
qemu-img.c | 157 +--------------------------------------
qemu-io-cmds.c | 182 ++++++++++++++++++++++++++++++++++------------
3 files changed, 145 insertions(+), 201 deletions(-)
diff --git a/include/qemu-io.h b/include/qemu-io.h
index 7433239372..2abbbb4acf 100644
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -54,4 +54,11 @@ void qemuio_complete_command(const char *input,
void (*fn)(const char *cmd, void *opaque),
void *opaque);
+typedef enum OutputFormat {
+ OFORMAT_JSON,
+ OFORMAT_HUMAN,
+} OutputFormat;
+
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format);
+
#endif /* QEMU_IO_H */
diff --git a/qemu-img.c b/qemu-img.c
index ad04f59565..ecd708af2d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -45,6 +45,7 @@
#include "block/qapi.h"
#include "crypto/init.h"
#include "trace/control.h"
+#include "qemu-io.h"
#define QEMU_IMG_VERSION "qemu-img version " QEMU_FULL_VERSION \
"\n" QEMU_COPYRIGHT "\n"
@@ -68,11 +69,6 @@ enum {
OPTION_SHRINK = 266,
};
-typedef enum OutputFormat {
- OFORMAT_JSON,
- OFORMAT_HUMAN,
-} OutputFormat;
-
/* Default to cache=writeback as data integrity is not important for qemu-img */
#define BDRV_DEFAULT_CACHE "writeback"
@@ -2740,128 +2736,12 @@ static int img_info(int argc, char **argv)
return 0;
}
-static void dump_map_entry(OutputFormat output_format, MapEntry *e,
- MapEntry *next)
-{
- switch (output_format) {
- case OFORMAT_HUMAN:
- if (e->data && !e->has_offset) {
- error_report("File contains external, encrypted or compressed clusters.");
- exit(1);
- }
- if (e->data && !e->zero) {
- printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
- e->start, e->length,
- e->has_offset ? e->offset : 0,
- e->has_filename ? e->filename : "");
- }
- /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
- * Modify the flags here to allow more coalescing.
- */
- if (next && (!next->data || next->zero)) {
- next->data = false;
- next->zero = true;
- }
- break;
- case OFORMAT_JSON:
- printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
- " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
- (e->start == 0 ? "[" : ",\n"),
- e->start, e->length, e->depth,
- e->zero ? "true" : "false",
- e->data ? "true" : "false");
- if (e->has_offset) {
- printf(", \"offset\": %"PRId64"", e->offset);
- }
- putchar('}');
-
- if (!next) {
- printf("]\n");
- }
- break;
- }
-}
-
-static int get_block_status(BlockDriverState *bs, int64_t offset,
- int64_t bytes, MapEntry *e)
-{
- int ret;
- int depth;
- BlockDriverState *file;
- bool has_offset;
- int64_t map;
-
- /* As an optimization, we could cache the current range of unallocated
- * clusters in each file of the chain, and avoid querying the same
- * range repeatedly.
- */
-
- depth = 0;
- for (;;) {
- ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
- if (ret < 0) {
- return ret;
- }
- assert(bytes);
- if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
- break;
- }
- bs = backing_bs(bs);
- if (bs == NULL) {
- ret = 0;
- break;
- }
-
- depth++;
- }
-
- has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
-
- *e = (MapEntry) {
- .start = offset,
- .length = bytes,
- .data = !!(ret & BDRV_BLOCK_DATA),
- .zero = !!(ret & BDRV_BLOCK_ZERO),
- .offset = map,
- .has_offset = has_offset,
- .depth = depth,
- .has_filename = file && has_offset,
- .filename = file && has_offset ? file->filename : NULL,
- };
-
- return 0;
-}
-
-static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
-{
- if (curr->length == 0) {
- return false;
- }
- if (curr->zero != next->zero ||
- curr->data != next->data ||
- curr->depth != next->depth ||
- curr->has_filename != next->has_filename ||
- curr->has_offset != next->has_offset) {
- return false;
- }
- if (curr->has_filename && strcmp(curr->filename, next->filename)) {
- return false;
- }
- if (curr->has_offset && curr->offset + curr->length != next->offset) {
- return false;
- }
- return true;
-}
-
static int img_map(int argc, char **argv)
{
int c;
OutputFormat output_format = OFORMAT_HUMAN;
BlockBackend *blk;
- BlockDriverState *bs;
const char *filename, *fmt, *output;
- int64_t length;
- MapEntry curr = { .length = 0 }, next;
int ret = 0;
bool image_opts = false;
bool force_share = false;
@@ -2940,41 +2820,10 @@ static int img_map(int argc, char **argv)
if (!blk) {
return 1;
}
- bs = blk_bs(blk);
-
- if (output_format == OFORMAT_HUMAN) {
- printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
- }
- length = blk_getlength(blk);
- while (curr.start + curr.length < length) {
- int64_t offset = curr.start + curr.length;
- int64_t n;
-
- /* Probe up to 1 GiB at a time. */
- n = MIN(1 << 30, length - offset);
- ret = get_block_status(bs, offset, n, &next);
-
- if (ret < 0) {
- error_report("Could not read file metadata: %s", strerror(-ret));
- goto out;
- }
-
- if (entry_mergeable(&curr, &next)) {
- curr.length += next.length;
- continue;
- }
-
- if (curr.length > 0) {
- dump_map_entry(output_format, &curr, &next);
- }
- curr = next;
- }
-
- dump_map_entry(output_format, &curr, NULL);
-
-out:
+ ret = qemu_img_do_map(blk, output_format);
blk_unref(blk);
+
return ret < 0;
}
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2c39124036..791cedd96d 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1852,77 +1852,165 @@ static const cmdinfo_t alloc_cmd = {
.oneline = "checks if offset is allocated in the file",
};
+static void dump_map_entry(OutputFormat output_format, MapEntry *e,
+ MapEntry *next)
+{
+ switch (output_format) {
+ case OFORMAT_HUMAN:
+ if (e->data && !e->has_offset) {
+ error_report("File contains external, encrypted or compressed clusters.");
+ exit(1);
+ }
+ if (e->data && !e->zero) {
+ printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
+ e->start, e->length,
+ e->has_offset ? e->offset : 0,
+ e->has_filename ? e->filename : "");
+ }
+ /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
+ * Modify the flags here to allow more coalescing.
+ */
+ if (next && (!next->data || next->zero)) {
+ next->data = false;
+ next->zero = true;
+ }
+ break;
+ case OFORMAT_JSON:
+ printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
+ " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
+ (e->start == 0 ? "[" : ",\n"),
+ e->start, e->length, e->depth,
+ e->zero ? "true" : "false",
+ e->data ? "true" : "false");
+ if (e->has_offset) {
+ printf(", \"offset\": %"PRId64"", e->offset);
+ }
+ putchar('}');
-static int map_is_allocated(BlockDriverState *bs, int64_t offset,
- int64_t bytes, int64_t *pnum)
-{
- int64_t num;
- int num_checked;
- int ret, firstret;
-
- num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
- ret = bdrv_is_allocated(bs, offset, num_checked, &num);
- if (ret < 0) {
- return ret;
+ if (!next) {
+ printf("]\n");
+ }
+ break;
}
+}
- firstret = ret;
- *pnum = num;
-
- while (bytes > 0 && ret == firstret) {
- offset += num;
- bytes -= num;
+static int get_block_status(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, MapEntry *e)
+{
+ int ret;
+ int depth;
+ BlockDriverState *file;
+ bool has_offset;
+ int64_t map;
+
+ /* As an optimization, we could cache the current range of unallocated
+ * clusters in each file of the chain, and avoid querying the same
+ * range repeatedly.
+ */
- num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
- ret = bdrv_is_allocated(bs, offset, num_checked, &num);
- if (ret == firstret && num) {
- *pnum += num;
- } else {
+ depth = 0;
+ for (;;) {
+ ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
+ if (ret < 0) {
+ return ret;
+ }
+ assert(bytes);
+ if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
break;
}
+ bs = backing_bs(bs);
+ if (bs == NULL) {
+ ret = 0;
+ break;
+ }
+
+ depth++;
}
- return firstret;
+ has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
+
+ *e = (MapEntry) {
+ .start = offset,
+ .length = bytes,
+ .data = !!(ret & BDRV_BLOCK_DATA),
+ .zero = !!(ret & BDRV_BLOCK_ZERO),
+ .offset = map,
+ .has_offset = has_offset,
+ .depth = depth,
+ .has_filename = file && has_offset,
+ .filename = file && has_offset ? file->filename : NULL,
+ };
+
+ return 0;
}
-static int map_f(BlockBackend *blk, int argc, char **argv)
+static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
{
- int64_t offset, bytes;
- char s1[64], s2[64];
- int64_t num;
- int ret;
- const char *retstr;
+ if (curr->length == 0) {
+ return false;
+ }
+ if (curr->zero != next->zero ||
+ curr->data != next->data ||
+ curr->depth != next->depth ||
+ curr->has_filename != next->has_filename ||
+ curr->has_offset != next->has_offset) {
+ return false;
+ }
+ if (curr->has_filename && strcmp(curr->filename, next->filename)) {
+ return false;
+ }
+ if (curr->has_offset && curr->offset + curr->length != next->offset) {
+ return false;
+ }
+ return true;
+}
- offset = 0;
- bytes = blk_getlength(blk);
- if (bytes < 0) {
- error_report("Failed to query image length: %s", strerror(-bytes));
- return bytes;
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format)
+{
+ BlockDriverState *bs = blk_bs(blk);
+ MapEntry curr = { .length = 0 }, next;
+ int64_t length;
+
+ if (output_format == OFORMAT_HUMAN) {
+ printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
}
- while (bytes) {
- ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
+ length = blk_getlength(blk);
+ while (curr.start + curr.length < length) {
+ int64_t offset = curr.start + curr.length;
+ int64_t n;
+ int ret;
+
+ /* Probe up to 1 GiB at a time. */
+ n = MIN(1 << 30, length - offset);
+ ret = get_block_status(bs, offset, n, &next);
+
if (ret < 0) {
- error_report("Failed to get allocation status: %s", strerror(-ret));
+ error_report("Could not read file metadata: %s", strerror(-ret));
return ret;
- } else if (!num) {
- error_report("Unexpected end of image");
- return -EIO;
}
- retstr = ret ? " allocated" : "not allocated";
- cvtstr(num, s1, sizeof(s1));
- cvtstr(offset, s2, sizeof(s2));
- printf("%s (0x%" PRIx64 ") bytes %s at offset %s (0x%" PRIx64 ")\n",
- s1, num, retstr, s2, offset);
+ if (entry_mergeable(&curr, &next)) {
+ curr.length += next.length;
+ continue;
+ }
- offset += num;
- bytes -= num;
+ if (curr.length > 0) {
+ dump_map_entry(output_format, &curr, &next);
+ }
+ curr = next;
}
+ dump_map_entry(output_format, &curr, NULL);
+
return 0;
}
+static int map_f(BlockBackend *blk, int argc, char **argv)
+{
+ return qemu_img_do_map(blk, OFORMAT_JSON);
+}
+
static const cmdinfo_t map_cmd = {
.name = "map",
.argmin = 0,
--
2.18.0