Author: avg
Date: Thu Mar 15 12:05:17 2018
New Revision: 330991
URL: https://svnweb.freebsd.org/changeset/base/330991

Log:
  MFC r329363: read-behind / read-ahead support for zfs_getpages()

Modified:
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c      Thu Mar 15 11:06:04 2018        (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c      Thu Mar 15 12:05:17 2018        (r330991)
@@ -1518,6 +1518,188 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_
        dmu_buf_rele_array(dbp, numbufs, FTAG);
        return (err);
 }
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, int last_size)
+{
+       struct sf_buf *sf;
+       vm_object_t vmobj;
+       vm_page_t m;
+       dmu_buf_t **dbp;
+       dmu_buf_t *db;
+       caddr_t va;
+       int numbufs, i;
+       int bufoff, pgoff, tocpy;
+       int mi, di;
+       int err;
+
+       ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+       ASSERT(last_size <= PAGE_SIZE);
+
+       err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+           IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+       if (err != 0)
+               return (err);
+
+#ifdef DEBUG
+       IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+       if (dbp[0]->db_offset != 0 || numbufs > 1) {
+               for (i = 0; i < numbufs; i++) {
+                       ASSERT(ISP2(dbp[i]->db_size));
+                       ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+                       ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+               }
+       }
+#endif
+
+       vmobj = ma[0]->object;
+       zfs_vmobject_wlock(vmobj);
+
+       db = dbp[0];
+       for (i = 0; i < *rbehind; i++) {
+               m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
+                   VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+               if (m == NULL)
+                       break;
+               if (m->valid != 0) {
+                       ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+                       break;
+               }
+               ASSERT(m->dirty == 0);
+               ASSERT(!pmap_page_is_mapped(m));
+
+               ASSERT(db->db_size > PAGE_SIZE);
+               bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+               va = zfs_map_page(m, &sf);
+               bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+               zfs_unmap_page(sf);
+               m->valid = VM_PAGE_BITS_ALL;
+               vm_page_lock(m);
+               if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+                       vm_page_activate(m);
+               else
+                       vm_page_deactivate(m);
+               vm_page_unlock(m);
+       }
+       *rbehind = i;
+
+       bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+       pgoff = 0;
+       for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+               if (pgoff == 0) {
+                       m = ma[mi];
+                       vm_page_assert_xbusied(m);
+                       ASSERT(m->valid == 0);
+                       ASSERT(m->dirty == 0);
+                       ASSERT(!pmap_page_is_mapped(m));
+                       va = zfs_map_page(m, &sf);
+               }
+               if (bufoff == 0)
+                       db = dbp[di];
+
+               ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+                   db->db_offset + bufoff);
+
+               /*
+                * We do not need to clamp the copy size by the file
+                * size as the last block is zero-filled beyond the
+                * end of file anyway.
+                */
+               tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+               bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+               pgoff += tocpy;
+               ASSERT(pgoff <= PAGESIZE);
+               if (pgoff == PAGESIZE) {
+                       zfs_unmap_page(sf);
+                       m->valid = VM_PAGE_BITS_ALL;
+                       ASSERT(mi < count);
+                       mi++;
+                       pgoff = 0;
+               }
+
+               bufoff += tocpy;
+               ASSERT(bufoff <= db->db_size);
+               if (bufoff == db->db_size) {
+                       ASSERT(di < numbufs);
+                       di++;
+                       bufoff = 0;
+               }
+       }
+
+#ifdef DEBUG
+       /*
+        * Three possibilities:
+        * - last requested page ends at a buffer boundary and, thus,
+        *   all pages and buffers have been iterated;
+        * - all requested pages are filled, but the last buffer
+        *   has not been exhausted;
+        *   the read-ahead is possible only in this case;
+        * - all buffers have been read, but the last page has not been
+        *   fully filled;
+        *   this is only possible if the file has only a single buffer
+        *   with a size that is not a multiple of the page size.
+        */
+       if (mi == count) {
+               ASSERT(di >= numbufs - 1);
+               IMPLY(*rahead != 0, di == numbufs - 1);
+               IMPLY(*rahead != 0, bufoff != 0);
+               ASSERT(pgoff == 0);
+       }
+       if (di == numbufs) {
+               ASSERT(mi >= count - 1);
+               ASSERT(*rahead == 0);
+               IMPLY(pgoff == 0, mi == count);
+               if (pgoff != 0) {
+                       ASSERT(mi == count - 1);
+                       ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+               }
+       }
+#endif
+       if (pgoff != 0) {
+               bzero(va + pgoff, PAGESIZE - pgoff);
+               zfs_unmap_page(sf);
+               m->valid = VM_PAGE_BITS_ALL;
+       }
+
+       for (i = 0; i < *rahead; i++) {
+               m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
+                   VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
+               if (m == NULL)
+                       break;
+               if (m->valid != 0) {
+                       ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+                       break;
+               }
+               ASSERT(m->dirty == 0);
+               ASSERT(!pmap_page_is_mapped(m));
+
+               ASSERT(db->db_size > PAGE_SIZE);
+               bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+               tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+               va = zfs_map_page(m, &sf);
+               bcopy((char *)db->db_data + bufoff, va, tocpy);
+               if (tocpy < PAGESIZE) {
+                       ASSERT(i == *rahead - 1);
+                       ASSERT((db->db_size & PAGE_MASK) != 0);
+                       bzero(va + tocpy, PAGESIZE - tocpy);
+               }
+               zfs_unmap_page(sf);
+               m->valid = VM_PAGE_BITS_ALL;
+               vm_page_lock(m);
+               if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+                       vm_page_activate(m);
+               else
+                       vm_page_deactivate(m);
+               vm_page_unlock(m);
+       }
+       *rahead = i;
+       zfs_vmobject_wunlock(vmobj);
+
+       dmu_buf_rele_array(dbp, numbufs, FTAG);
+       return (0);
+}
 #endif /* illumos */
 #endif /* _KERNEL */
 

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  Thu Mar 15 11:06:04 2018        (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h  Thu Mar 15 12:05:17 2018        (r330991)
@@ -753,6 +753,8 @@ int dmu_write_pages(objset_t *os, uint64_t object, uin
 #else
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, int last_size);
 #endif
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c        Thu Mar 15 11:06:04 2018        (r330990)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c        Thu Mar 15 12:05:17 2018        (r330991)
@@ -4515,81 +4515,85 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int fla
 }
 
 static int
-zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
        znode_t *zp = VTOZ(vp);
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        objset_t *os = zp->z_zfsvfs->z_os;
-       vm_page_t mlast;
+       rl_t *rl;
        vm_object_t object;
-       caddr_t va;
-       struct sf_buf *sf;
-       off_t startoff, endoff;
-       int i, error;
-       vm_pindex_t reqstart, reqend;
-       int lsize, size;
+       off_t start, end, obj_size;
+       uint_t blksz;
+       int pgsin_b, pgsin_a;
+       int error;
 
-       object = m[0]->object;
-       error = 0;
-
        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(zp);
 
-       zfs_vmobject_wlock(object);
-       if (m[count - 1]->valid != 0 && --count == 0) {
-               zfs_vmobject_wunlock(object);
-               goto out;
+       start = IDX_TO_OFF(ma[0]->pindex);
+       end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+       /*
+        * Lock a range covering all required and optional pages.
+        * Note that we need to handle the case of the block size growing.
+        */
+       for (;;) {
+               blksz = zp->z_blksz;
+               rl = zfs_range_lock(zp, rounddown(start, blksz),
+                   roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+               if (blksz == zp->z_blksz)
+                       break;
+               zfs_range_unlock(rl);
        }
 
-       mlast = m[count - 1];
-
-       if (IDX_TO_OFF(mlast->pindex) >=
-           object->un_pager.vnp.vnp_size) {
-               zfs_vmobject_wunlock(object);
+       object = ma[0]->object;
+       zfs_vmobject_wlock(object);
+       obj_size = object->un_pager.vnp.vnp_size;
+       zfs_vmobject_wunlock(object);
+       if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+               zfs_range_unlock(rl);
                ZFS_EXIT(zfsvfs);
                return (zfs_vm_pagerret_bad);
        }
 
-       PCPU_INC(cnt.v_vnodein);
-       PCPU_ADD(cnt.v_vnodepgsin, count);
+       pgsin_b = 0;
+       if (rbehind != NULL) {
+               pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+               pgsin_b = MIN(*rbehind, pgsin_b);
+       }
 
-       lsize = PAGE_SIZE;
-       if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
-               lsize = object->un_pager.vnp.vnp_size -
-                   IDX_TO_OFF(mlast->pindex);
-       zfs_vmobject_wunlock(object);
-
-       for (i = 0; i < count; i++) {
-               size = PAGE_SIZE;
-               if (i == count - 1)
-                       size = lsize;
-               va = zfs_map_page(m[i], &sf);
-               error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
-                   size, va, DMU_READ_PREFETCH);
-               if (size != PAGE_SIZE)
-                       bzero(va + size, PAGE_SIZE - size);
-               zfs_unmap_page(sf);
-               if (error != 0)
-                       goto out;
+       pgsin_a = 0;
+       if (rahead != NULL) {
+               pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+               if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+                       pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+               pgsin_a = MIN(*rahead, pgsin_a);
        }
 
-       zfs_vmobject_wlock(object);
-       for (i = 0; i < count; i++)
-               m[i]->valid = VM_PAGE_BITS_ALL;
-       zfs_vmobject_wunlock(object);
+       /*
+        * NB: we need to pass the exact byte size of the data that we expect
+        * to read after accounting for the file size.  This is required because
+        * ZFS will panic if we request DMU to read beyond the end of the last
+        * allocated block.
+        */
+       error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+           MIN(end, obj_size) - (end - PAGE_SIZE));
 
-out:
+       zfs_range_unlock(rl);
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
        ZFS_EXIT(zfsvfs);
-       if (error == 0) {
-               if (rbehind)
-                       *rbehind = 0;
-               if (rahead)
-                       *rahead = 0;
-               return (zfs_vm_pagerret_ok);
-       } else
+
+       if (error != 0)
                return (zfs_vm_pagerret_error);
+
+       PCPU_INC(cnt.v_vnodein);
+       PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
+       if (rbehind != NULL)
+               *rbehind = pgsin_b;
+       if (rahead != NULL)
+               *rahead = pgsin_a;
+       return (zfs_vm_pagerret_ok);
 }
 
 static int
_______________________________________________
[email protected] mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "[email protected]"

Reply via email to