From: Matthew Wilcox <mawil...@microsoft.com>

Shrink page_frag_cache from 24 bytes to 8 bytes (a single pointer to the
currently-in-use struct page) by using the page's refcount directly
(instead of maintaining a bias) and storing our progress through the
page in the bits currently used for page->index.  Since we store the
page directly, we no longer need to mirror its pfmemalloc state in the
cache.

On the downside, we now call page_address() on every allocation, and we
do an atomic_inc() rather than a non-atomic decrement, but we should
touch the same number of cachelines, and the code is both shorter and
simpler.
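
For illustration only (not part of this patch): with the shrunken
cache the caller-visible pattern stays the same -- each fragment pins
the page with one reference, and page_frag_free() (not modified here)
drops it again.  The 'cache' variable below is just an example:

	struct page_frag_cache cache = { .page = NULL };

	/* Carve a 256-byte fragment out of the cache's current page;
	 * this takes one reference on that page.
	 */
	void *buf = page_frag_alloc(&cache, 256, GFP_ATOMIC);
	if (buf) {
		/* ... use the fragment ... */
		page_frag_free(buf);	/* drop the fragment's page reference */
	}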

Signed-off-by: Matthew Wilcox <mawil...@microsoft.com>
---
 include/linux/mm_types.h |  17 +-----
 mm/page_alloc.c          | 135 ++++++++++++++++++++++++-----------------------
 net/core/skbuff.c        |   4 +-
 3 files changed, 74 insertions(+), 82 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c5dea402501..f922cb62bd91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -90,6 +90,7 @@ struct page {
        union {
                pgoff_t index;          /* Our offset within mapping. */
                void *freelist;         /* sl[aou]b first free object */
+               unsigned int offset;    /* page_frag highwater mark */
                /* page_deferred_list().prev    -- second tail page */
        };
 
@@ -219,22 +220,8 @@ struct page {
 #endif
 } _struct_page_alignment;
 
-#define PAGE_FRAG_CACHE_MAX_SIZE       __ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER      get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 struct page_frag_cache {
-       void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-       __u16 offset;
-       __u16 size;
-#else
-       __u32 offset;
-#endif
-       /* we maintain a pagecount bias, so that we dont dirty cache line
-        * containing page->_refcount every time we allocate a fragment.
-        */
-       unsigned int            pagecnt_bias;
-       bool pfmemalloc;
+       struct page *page;
 };
 
 typedef unsigned long vm_flags_t;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a9c14214ed2..f8a176aab287 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4319,34 +4319,72 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
+ * The page fragment allocator is simple, yet effective.  It allocates
+ * pages from the page allocator, then hands out fragments of those
+ * pages to its callers.  It makes no effort to track which parts of
+ * the page remain in use, always allocating fresh memory.  The page
+ * reference count is used to keep track of whether any fragment is
+ * still in use; when all fragments in a page have been freed, the
+ * entire page is returned to the page allocator.
  *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * The page fragment allocator performs no locking.  The caller is
+ * expected to ensure that two callers cannot simultaneously allocate
+ * from the same page_frag_cache.  Freeing is atomic and is permitted
+ * to happen simultaneously with other frees or an allocation.
+ *
+ * The allocator uses the struct page to store its state.  The 'offset'
+ * field in struct page is used to track how far through the page the
+ * allocation has proceeded.  The 'refcount' field is used to track
+ * how many fragments have been allocated from this page.  All other
+ * fields in struct page may be used by the owner of the page_frag_cache.
+ * The refcount is incremented by one while the page is still actively being
+ * allocated from; this prevents it from being freed prematurely.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-                                            gfp_t gfp_mask)
+
+#define PAGE_FRAG_ALLOC_SIZE   (64 * 1024)
+#define PAGE_FRAG_ORDER                get_order(PAGE_FRAG_ALLOC_SIZE)
+
+static noinline
+struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+               unsigned int size, gfp_t gfp_mask)
 {
+       struct page *old = nc->page;
        struct page *page = NULL;
-       gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-       gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-                   __GFP_NOMEMALLOC;
-       page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-                               PAGE_FRAG_CACHE_MAX_ORDER);
-       nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-       if (unlikely(!page))
-               page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 
-       nc->va = page ? page_address(page) : NULL;
+       if (size > PAGE_FRAG_ALLOC_SIZE)
+               return NULL;
+
+       /*
+        * If all the previous allocations from this page have already been
+        * freed, reuse the page if it can satisfy this allocation.
+        */
+       if (old && page_ref_count(old) == 1) {
+               unsigned int offset = PAGE_SIZE << compound_order(old);
+
+               if (offset > size) {
+                       old->offset = offset;
+                       return old;
+               }
+       }
+
+       if (PAGE_FRAG_ORDER > 0) {
+               gfp_t gfp = gfp_mask | __GFP_COMP | __GFP_NOWARN |
+                               __GFP_NORETRY | __GFP_NOMEMALLOC;
+
+               page = alloc_pages_node(NUMA_NO_NODE, gfp, PAGE_FRAG_ORDER);
+               if (unlikely(!page) && size > PAGE_SIZE)
+                       return NULL;
+       }
 
+       if (unlikely(!page))
+               page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+       if (unlikely(!page))
+               return NULL;
+
+       if (old)
+               put_page(old);
+       nc->page = page;
+       page->offset = PAGE_SIZE << compound_order(page);
        return page;
 }
 
@@ -4366,56 +4404,23 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *nc,
-                     unsigned int fragsz, gfp_t gfp_mask)
+                     unsigned int size, gfp_t gfp_mask)
 {
-       unsigned int size = PAGE_SIZE;
-       struct page *page;
-       int offset;
+       struct page *page = nc->page;
+       unsigned int offset = page ? page->offset : 0;
 
-       if (unlikely(!nc->va)) {
-refill:
-               page = __page_frag_cache_refill(nc, gfp_mask);
+       if (unlikely(!page || offset < size)) {
+               page = __page_frag_cache_refill(nc, size, gfp_mask);
                if (!page)
                        return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-               /* if size can vary use size else just use PAGE_SIZE */
-               size = nc->size;
-#endif
-               /* Even if we own the page, we do not use atomic_set().
-                * This would break get_page_unless_zero() users.
-                */
-               page_ref_add(page, size - 1);
-
-               /* reset page count bias and offset to start of new frag */
-               nc->pfmemalloc = page_is_pfmemalloc(page);
-               nc->pagecnt_bias = size;
-               nc->offset = size;
-       }
-
-       offset = nc->offset - fragsz;
-       if (unlikely(offset < 0)) {
-               page = virt_to_page(nc->va);
-
-               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-                       goto refill;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-               /* if size can vary use size else just use PAGE_SIZE */
-               size = nc->size;
-#endif
-               /* OK, page count is 0, we can safely set it */
-               set_page_count(page, size);
-
-               /* reset page count bias and offset to start of new frag */
-               nc->pagecnt_bias = size;
-               offset = size - fragsz;
+               offset = page->offset;
        }
 
-       nc->pagecnt_bias--;
-       nc->offset = offset;
+       page_ref_inc(page);
+       offset -= size;
+       page->offset = offset;
 
-       return nc->va + offset;
+       return page_address(page) + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..59df4db31aed 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -412,7 +412,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
        nc = this_cpu_ptr(&netdev_alloc_cache);
        data = page_frag_alloc(nc, len, gfp_mask);
-       pfmemalloc = nc->pfmemalloc;
+       pfmemalloc = page_is_pfmemalloc(nc->page);
 
        local_irq_restore(flags);
 
@@ -486,7 +486,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
        }
 
        /* use OR instead of assignment to avoid clearing of bits in mask */
-       if (nc->page.pfmemalloc)
+       if (page_is_pfmemalloc(nc->page.page))
                skb->pfmemalloc = 1;
        skb->head_frag = 1;
 
-- 
2.16.2
