24.01.2019 19:36, Kevin Wolf wrote:
> Am 24.01.2019 um 17:18 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 24.01.2019 17:17, Kevin Wolf wrote:
>>> Depending on the exact image layout and the storage backend (tmpfs is
>>> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
>>> save us a lot of time e.g. during a mirror block job or qemu-img convert
>>> with a fragmented source image (.bdrv_co_block_status on the protocol
>>> layer can be called for every single cluster in the extreme case).
>>>
>>> We may only cache data regions because of possible concurrent writers.
>>> This means that we can later treat a recently punched hole as data, but
>>> this is safe. We can't cache holes because then we might treat recently
>>> written data as holes, which can cause corruption.
>>>
>>> Signed-off-by: Kevin Wolf <[email protected]>
> 
>>> @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void 
>>> *opaque)
>>>    {
>>>        RawPosixAIOData *aiocb = opaque;
>>>        BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>        int ret;
>>>    
>>> +    /* Invalidate seek_data_cache if it overlaps */
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> +                        sdc->start > aiocb->aio_offset + 
>>> aiocb->aio_nbytes))
>>
>> to be presize: <= and >=
> 
> Yes, you're right.
> 
>>> +    {
>>> +        sdc->valid = false;
>>> +    }
>>> +
>>>        /* First try to write zeros and unmap at the same time */
>>>    
>>
>>
>> Why not to drop cache on handle_aiocb_write_zeroes()? Otherwise, we'll 
>> return DATA
>> for these regions which may unallocated read-as-zero, if I'm not mistaken.
> 
> handle_aiocb_write_zeroes() is not allowed to unmap things, so we don't
> need to invalidate the cache there.

So, you want to say, that for fallocated regions we always return just _DATA, 
without _ZERO?
If it is so, it's of course bad, it means that convert will have to copy (or at 
least read
and detect zeroes by hand, if enabled) write-zeroed-without-unmap areas.

Let's check (hmm, I had to use qemu-img map inside qemu-io, patch attached for 
it,
also I printed printf("%s\n", __func__) in handle_aiocb_write_zeroes_unmap and
handle_aiocb_write_zeroes):

Let's test:
]# cat test
./qemu-img create -f raw x 1M

./qemu-io -f raw x <<CMDS
write 0 1M
map
write -z 100K 100K
map
write -z -u 500K 100K
map
CMDS

rm -rf x


rm -rf x

before your patch:
]# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0523 sec (19.093 MiB/sec and 19.0927 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": 
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0165 sec (5.898 MiB/sec and 60.3974 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": 
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, 
"offset": 102400},
{ "start": 204800, "length": 843776, "depth": 0, "zero": false, "data": true, 
"offset": 204800}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0001 sec (545.566 MiB/sec and 5586.5922 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": 
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, 
"offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, 
"offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, 
"offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, 
"offset": 614400}]



after your patch:
# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0768 sec (13.019 MiB/sec and 13.0195 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": 
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0166 sec (5.883 MiB/sec and 60.2410 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": 
true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0002 sec (469.501 MiB/sec and 4807.6923 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": 
true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, 
"offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, 
"offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, 
"offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, 
"offset": 614400}]


So, you've changed behavior of block_status after write_zeroes without UNMAP 
for the worse.

Hmm, should I prepare patch for qemu-io? qemu-img map is definitely better.

> 
>>>    #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
>>> @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
>>>        RawPosixAIOData *aiocb = opaque;
>>>        int ret = -EOPNOTSUPP;
>>>        BDRVRawState *s = aiocb->bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>    
>>>        if (!s->has_discard) {
>>>            return -ENOTSUP;
>>>        }
>>>    
>>> +    /* Invalidate seek_data_cache if it overlaps */
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> +                        sdc->start > aiocb->aio_offset + 
>>> aiocb->aio_nbytes))
>>
>> and <= and >=
>>
>> and if add same to handle_aiocb_write_zeroes(), then it worth to
>> create helper function to invalidate cache.
> 
> Ok.
> 
>>> +    {
>>> +        sdc->valid = false;
>>> +    }
>>> +
>>>        if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
>>>    #ifdef BLKDISCARD
>>>            do {
>>> @@ -2424,6 +2448,8 @@ static int coroutine_fn 
>>> raw_co_block_status(BlockDriverState *bs,
>>>                                                int64_t *map,
>>>                                                BlockDriverState **file)
>>>    {
>>> +    BDRVRawState *s = bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>        off_t data = 0, hole = 0;
>>>        int ret;
>>>    
>>> @@ -2439,6 +2465,14 @@ static int coroutine_fn 
>>> raw_co_block_status(BlockDriverState *bs,
>>>            return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>        }
>>>    
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
>>> +        *pnum = MIN(bytes, sdc->end - offset);
>>> +        *map = offset;
>>> +        *file = bs;
>>> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>> +    }
>>> +
>>>        ret = find_allocation(bs, offset, &data, &hole);
>>>        if (ret == -ENXIO) {
>>>            /* Trailing hole */
>>> @@ -2451,14 +2485,27 @@ static int coroutine_fn 
>>> raw_co_block_status(BlockDriverState *bs,
>>>        } else if (data == offset) {
>>>            /* On a data extent, compute bytes to the end of the extent,
>>>             * possibly including a partial sector at EOF. */
>>> -        *pnum = MIN(bytes, hole - offset);
>>> +        *pnum = hole - offset;
>>
>> hmm, why? At least you didn't mention it in commit-message..
> 
> We want to cache the whole range returned by lseek(), not just whatever
> the raw_co_block_status() caller wanted to know.
> 
> For the returned value, *pnum is adjusted to MIN(bytes, *pnum) below...

Oops, stupid question it was, sorry:(

> 
>>>            ret = BDRV_BLOCK_DATA;
>>>        } else {
>>>            /* On a hole, compute bytes to the beginning of the next extent. 
>>>  */
>>>            assert(hole == offset);
>>> -        *pnum = MIN(bytes, data - offset);
>>> +        *pnum = data - offset;
>>>            ret = BDRV_BLOCK_ZERO;
>>>        }
>>> +
>>> +    /* Caching allocated ranges is okay even if another process writes to 
>>> the
>>> +     * same file because we allow declaring things allocated even if there 
>>> is a
>>> +     * hole. However, we cannot cache holes without risking corruption. */
>>> +    if (ret == BDRV_BLOCK_DATA) {
>>> +        *sdc = (struct seek_data_cache) {
>>> +            .valid  = true,
>>> +            .start  = offset,
>>> +            .end    = offset + *pnum,
>>> +        };
>>> +    }
>>> +
>>> +    *pnum = MIN(*pnum, bytes);
> 
> ...here.
> 
> So what we return doesn't change.
> 
>>>        *map = offset;
>>>        *file = bs;
>>>        return ret | BDRV_BLOCK_OFFSET_VALID;
> 
> Kevin
> 


-- 
Best regards,
Vladimir
From 8f52aa40514a290d7770ef2eba3ee3c8b93c1cab Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <[email protected]>
Date: Fri, 25 Jan 2019 11:48:05 +0300
Subject: [PATCH 1/2] my

---
 include/qemu-io.h |   7 ++
 qemu-img.c        | 157 +--------------------------------------
 qemu-io-cmds.c    | 182 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 145 insertions(+), 201 deletions(-)

diff --git a/include/qemu-io.h b/include/qemu-io.h
index 7433239372..2abbbb4acf 100644
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -54,4 +54,11 @@ void qemuio_complete_command(const char *input,
                              void (*fn)(const char *cmd, void *opaque),
                              void *opaque);
 
+typedef enum OutputFormat {
+    OFORMAT_JSON,
+    OFORMAT_HUMAN,
+} OutputFormat;
+
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format);
+
 #endif /* QEMU_IO_H */
diff --git a/qemu-img.c b/qemu-img.c
index ad04f59565..ecd708af2d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -45,6 +45,7 @@
 #include "block/qapi.h"
 #include "crypto/init.h"
 #include "trace/control.h"
+#include "qemu-io.h"
 
 #define QEMU_IMG_VERSION "qemu-img version " QEMU_FULL_VERSION \
                           "\n" QEMU_COPYRIGHT "\n"
@@ -68,11 +69,6 @@ enum {
     OPTION_SHRINK = 266,
 };
 
-typedef enum OutputFormat {
-    OFORMAT_JSON,
-    OFORMAT_HUMAN,
-} OutputFormat;
-
 /* Default to cache=writeback as data integrity is not important for qemu-img */
 #define BDRV_DEFAULT_CACHE "writeback"
 
@@ -2740,128 +2736,12 @@ static int img_info(int argc, char **argv)
     return 0;
 }
 
-static void dump_map_entry(OutputFormat output_format, MapEntry *e,
-                           MapEntry *next)
-{
-    switch (output_format) {
-    case OFORMAT_HUMAN:
-        if (e->data && !e->has_offset) {
-            error_report("File contains external, encrypted or compressed clusters.");
-            exit(1);
-        }
-        if (e->data && !e->zero) {
-            printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
-                   e->start, e->length,
-                   e->has_offset ? e->offset : 0,
-                   e->has_filename ? e->filename : "");
-        }
-        /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
-         * Modify the flags here to allow more coalescing.
-         */
-        if (next && (!next->data || next->zero)) {
-            next->data = false;
-            next->zero = true;
-        }
-        break;
-    case OFORMAT_JSON:
-        printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
-               " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
-               (e->start == 0 ? "[" : ",\n"),
-               e->start, e->length, e->depth,
-               e->zero ? "true" : "false",
-               e->data ? "true" : "false");
-        if (e->has_offset) {
-            printf(", \"offset\": %"PRId64"", e->offset);
-        }
-        putchar('}');
-
-        if (!next) {
-            printf("]\n");
-        }
-        break;
-    }
-}
-
-static int get_block_status(BlockDriverState *bs, int64_t offset,
-                            int64_t bytes, MapEntry *e)
-{
-    int ret;
-    int depth;
-    BlockDriverState *file;
-    bool has_offset;
-    int64_t map;
-
-    /* As an optimization, we could cache the current range of unallocated
-     * clusters in each file of the chain, and avoid querying the same
-     * range repeatedly.
-     */
-
-    depth = 0;
-    for (;;) {
-        ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
-        if (ret < 0) {
-            return ret;
-        }
-        assert(bytes);
-        if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
-            break;
-        }
-        bs = backing_bs(bs);
-        if (bs == NULL) {
-            ret = 0;
-            break;
-        }
-
-        depth++;
-    }
-
-    has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
-
-    *e = (MapEntry) {
-        .start = offset,
-        .length = bytes,
-        .data = !!(ret & BDRV_BLOCK_DATA),
-        .zero = !!(ret & BDRV_BLOCK_ZERO),
-        .offset = map,
-        .has_offset = has_offset,
-        .depth = depth,
-        .has_filename = file && has_offset,
-        .filename = file && has_offset ? file->filename : NULL,
-    };
-
-    return 0;
-}
-
-static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
-{
-    if (curr->length == 0) {
-        return false;
-    }
-    if (curr->zero != next->zero ||
-        curr->data != next->data ||
-        curr->depth != next->depth ||
-        curr->has_filename != next->has_filename ||
-        curr->has_offset != next->has_offset) {
-        return false;
-    }
-    if (curr->has_filename && strcmp(curr->filename, next->filename)) {
-        return false;
-    }
-    if (curr->has_offset && curr->offset + curr->length != next->offset) {
-        return false;
-    }
-    return true;
-}
-
 static int img_map(int argc, char **argv)
 {
     int c;
     OutputFormat output_format = OFORMAT_HUMAN;
     BlockBackend *blk;
-    BlockDriverState *bs;
     const char *filename, *fmt, *output;
-    int64_t length;
-    MapEntry curr = { .length = 0 }, next;
     int ret = 0;
     bool image_opts = false;
     bool force_share = false;
@@ -2940,41 +2820,10 @@ static int img_map(int argc, char **argv)
     if (!blk) {
         return 1;
     }
-    bs = blk_bs(blk);
-
-    if (output_format == OFORMAT_HUMAN) {
-        printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
-    }
 
-    length = blk_getlength(blk);
-    while (curr.start + curr.length < length) {
-        int64_t offset = curr.start + curr.length;
-        int64_t n;
-
-        /* Probe up to 1 GiB at a time.  */
-        n = MIN(1 << 30, length - offset);
-        ret = get_block_status(bs, offset, n, &next);
-
-        if (ret < 0) {
-            error_report("Could not read file metadata: %s", strerror(-ret));
-            goto out;
-        }
-
-        if (entry_mergeable(&curr, &next)) {
-            curr.length += next.length;
-            continue;
-        }
-
-        if (curr.length > 0) {
-            dump_map_entry(output_format, &curr, &next);
-        }
-        curr = next;
-    }
-
-    dump_map_entry(output_format, &curr, NULL);
-
-out:
+    ret = qemu_img_do_map(blk, output_format);
     blk_unref(blk);
+
     return ret < 0;
 }
 
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2c39124036..791cedd96d 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1852,77 +1852,165 @@ static const cmdinfo_t alloc_cmd = {
     .oneline    = "checks if offset is allocated in the file",
 };
 
+static void dump_map_entry(OutputFormat output_format, MapEntry *e,
+                           MapEntry *next)
+{
+    switch (output_format) {
+    case OFORMAT_HUMAN:
+        if (e->data && !e->has_offset) {
+            error_report("File contains external, encrypted or compressed clusters.");
+            exit(1);
+        }
+        if (e->data && !e->zero) {
+            printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
+                   e->start, e->length,
+                   e->has_offset ? e->offset : 0,
+                   e->has_filename ? e->filename : "");
+        }
+        /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
+         * Modify the flags here to allow more coalescing.
+         */
+        if (next && (!next->data || next->zero)) {
+            next->data = false;
+            next->zero = true;
+        }
+        break;
+    case OFORMAT_JSON:
+        printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
+               " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
+               (e->start == 0 ? "[" : ",\n"),
+               e->start, e->length, e->depth,
+               e->zero ? "true" : "false",
+               e->data ? "true" : "false");
+        if (e->has_offset) {
+            printf(", \"offset\": %"PRId64"", e->offset);
+        }
+        putchar('}');
 
-static int map_is_allocated(BlockDriverState *bs, int64_t offset,
-                            int64_t bytes, int64_t *pnum)
-{
-    int64_t num;
-    int num_checked;
-    int ret, firstret;
-
-    num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
-    ret = bdrv_is_allocated(bs, offset, num_checked, &num);
-    if (ret < 0) {
-        return ret;
+        if (!next) {
+            printf("]\n");
+        }
+        break;
     }
+}
 
-    firstret = ret;
-    *pnum = num;
-
-    while (bytes > 0 && ret == firstret) {
-        offset += num;
-        bytes -= num;
+static int get_block_status(BlockDriverState *bs, int64_t offset,
+                            int64_t bytes, MapEntry *e)
+{
+    int ret;
+    int depth;
+    BlockDriverState *file;
+    bool has_offset;
+    int64_t map;
+
+    /* As an optimization, we could cache the current range of unallocated
+     * clusters in each file of the chain, and avoid querying the same
+     * range repeatedly.
+     */
 
-        num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
-        ret = bdrv_is_allocated(bs, offset, num_checked, &num);
-        if (ret == firstret && num) {
-            *pnum += num;
-        } else {
+    depth = 0;
+    for (;;) {
+        ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
+        if (ret < 0) {
+            return ret;
+        }
+        assert(bytes);
+        if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
             break;
         }
+        bs = backing_bs(bs);
+        if (bs == NULL) {
+            ret = 0;
+            break;
+        }
+
+        depth++;
     }
 
-    return firstret;
+    has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
+
+    *e = (MapEntry) {
+        .start = offset,
+        .length = bytes,
+        .data = !!(ret & BDRV_BLOCK_DATA),
+        .zero = !!(ret & BDRV_BLOCK_ZERO),
+        .offset = map,
+        .has_offset = has_offset,
+        .depth = depth,
+        .has_filename = file && has_offset,
+        .filename = file && has_offset ? file->filename : NULL,
+    };
+
+    return 0;
 }
 
-static int map_f(BlockBackend *blk, int argc, char **argv)
+static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
 {
-    int64_t offset, bytes;
-    char s1[64], s2[64];
-    int64_t num;
-    int ret;
-    const char *retstr;
+    if (curr->length == 0) {
+        return false;
+    }
+    if (curr->zero != next->zero ||
+        curr->data != next->data ||
+        curr->depth != next->depth ||
+        curr->has_filename != next->has_filename ||
+        curr->has_offset != next->has_offset) {
+        return false;
+    }
+    if (curr->has_filename && strcmp(curr->filename, next->filename)) {
+        return false;
+    }
+    if (curr->has_offset && curr->offset + curr->length != next->offset) {
+        return false;
+    }
+    return true;
+}
 
-    offset = 0;
-    bytes = blk_getlength(blk);
-    if (bytes < 0) {
-        error_report("Failed to query image length: %s", strerror(-bytes));
-        return bytes;
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format)
+{
+    BlockDriverState *bs = blk_bs(blk);
+    MapEntry curr = { .length = 0 }, next;
+    int64_t length;
+
+    if (output_format == OFORMAT_HUMAN) {
+        printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
     }
 
-    while (bytes) {
-        ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
+    length = blk_getlength(blk);
+    while (curr.start + curr.length < length) {
+        int64_t offset = curr.start + curr.length;
+        int64_t n;
+        int ret;
+
+        /* Probe up to 1 GiB at a time.  */
+        n = MIN(1 << 30, length - offset);
+        ret = get_block_status(bs, offset, n, &next);
+
         if (ret < 0) {
-            error_report("Failed to get allocation status: %s", strerror(-ret));
+            error_report("Could not read file metadata: %s", strerror(-ret));
             return ret;
-        } else if (!num) {
-            error_report("Unexpected end of image");
-            return -EIO;
         }
 
-        retstr = ret ? "    allocated" : "not allocated";
-        cvtstr(num, s1, sizeof(s1));
-        cvtstr(offset, s2, sizeof(s2));
-        printf("%s (0x%" PRIx64 ") bytes %s at offset %s (0x%" PRIx64 ")\n",
-               s1, num, retstr, s2, offset);
+        if (entry_mergeable(&curr, &next)) {
+            curr.length += next.length;
+            continue;
+        }
 
-        offset += num;
-        bytes -= num;
+        if (curr.length > 0) {
+            dump_map_entry(output_format, &curr, &next);
+        }
+        curr = next;
     }
 
+    dump_map_entry(output_format, &curr, NULL);
+
     return 0;
 }
 
+static int map_f(BlockBackend *blk, int argc, char **argv)
+{
+    return qemu_img_do_map(blk, OFORMAT_JSON);
+}
+
 static const cmdinfo_t map_cmd = {
        .name           = "map",
        .argmin         = 0,
-- 
2.18.0

Reply via email to