On Mon, Dec 14, 2009 at 01:58:34PM +0100, Otto Moerbeek wrote:

> On Mon, Dec 14, 2009 at 01:54:50PM +0100, Otto Moerbeek wrote:
> 
> > On Mon, Dec 14, 2009 at 07:44:31AM -0500, Ted Unangst wrote:
> > 
> > > On Mon, Dec 14, 2009 at 3:58 AM, Otto Moerbeek <o...@drijf.net> wrote:
> > > >> Apart from the random page addresses obtained from mmap(2), malloc(3)
> > > >> itself also randomizes cache and chunk operations. It uses a nibble of
> > > >> randomness per call, so optimize that to not waste half the random
> > > >> bits.
> > > >>
> > > >> Please test, should be a bit faster.
> > > 
> > > I don't really like this.  It looks complicated (hard to judge).  arc4
> > > is fast enough that I don't think we need to worry about conserving
> > > bits, and I'd be surprised if it actually were faster, considering how
> > > much work you're doing per nibble compared to the work of just running
> > > arc4.  However fast it may be, I feel much safer with the current code.
> > 
> > Every call to arc4random_buf() causes a getpid() call, which is a
> > syscall with all the overhead. This saves half of those. 
> 
> Hit "send" too fast....
> 
> Yes, it is a bit more complicated, but I think not too much.
> 
>       -Otto

This is an alternative version from guenther@, which is a bit simpler
and generates smaller code (as seen on i386). I intend to commit this
soon; the numbers from Joachim Schipper show a small but significant
speedup.
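
To illustrate: as noted above, every arc4random_buf() call costs a
getpid() syscall, so the patch buffers 512 random bytes and hands them
out a nibble (4 bits) at a time, so one refill now serves 1024 calls
instead of 512. Here is the new code pulled out of the diff as a
standalone sketch; the counter initialization is my glue and replaces
the rbytes_init() call that the patch does from omalloc_init(), and
arc4random_buf(3) is in <stdlib.h> on OpenBSD (libbsd elsewhere):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

static u_char rbytes[512];		/* random bytes, as in malloc.c */
/* start at the limit so the first call triggers a fill */
static size_t rnibblesused = 2 * sizeof(rbytes);

static void
rbytes_init(void)
{
	arc4random_buf(rbytes, sizeof(rbytes));
	rnibblesused = 0;
}

static u_char
getrnibble(void)
{
	u_char x;

	if (rnibblesused >= 2 * sizeof(rbytes))
		rbytes_init();
	x = rbytes[rnibblesused++ / 2];
	/* low nibble on a byte's first use, high nibble on its second */
	return (rnibblesused & 1 ? x & 0xf : x >> 4);
}

int
main(void)
{
	int i;

	/* print a few nibbles; each value is in 0..15 */
	for (i = 0; i < 8; i++)
		printf("%d\n", getrnibble());
	return 0;
}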

        -Otto

Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.122
diff -u -p -r1.122 malloc.c
--- malloc.c    7 Dec 2009 18:47:38 -0000       1.122
+++ malloc.c    15 Dec 2009 19:15:33 -0000
@@ -66,7 +66,7 @@
 
 #define MALLOC_MAXCHUNK                (1 << (MALLOC_PAGESHIFT-1))
 #define MALLOC_MAXCACHE                256
-#define MALLOC_DELAYED_CHUNKS  16      /* should be power of 2 */
+#define MALLOC_DELAYED_CHUNKS  15      /* max of getrnibble() */
 /*
  * When the P option is active, we move allocations between half a page
  * and a whole page towards the end, subject to alignment constraints.
@@ -112,7 +112,7 @@ struct dir_info {
                                        /* free pages cache */
        struct region_info free_regions[MALLOC_MAXCACHE];
                                        /* delayed free chunk slots */
-       void *delayed_chunks[MALLOC_DELAYED_CHUNKS];
+       void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];
 #ifdef MALLOC_STATS
        size_t inserts;
        size_t insert_collisions;
@@ -185,9 +185,9 @@ static int  malloc_active;          /* status of 
 static size_t  malloc_guarded;         /* bytes used for guards */
 static size_t  malloc_used;            /* bytes allocated */
 
-static size_t rbytesused;              /* random bytes used */
+static size_t rnibblesused;            /* random nibbles used */
 static u_char rbytes[512];             /* random bytes */
-static u_char getrbyte(void);
+static u_char getrnibble(void);
 
 extern char    *__progname;
 
@@ -380,6 +380,24 @@ wrterror(char *p)
                abort();
 }
 
+static void
+rbytes_init(void)
+{
+       arc4random_buf(rbytes, sizeof(rbytes));
+       rnibblesused = 0;
+}
+
+static inline u_char
+getrnibble(void)
+{
+       u_char x;
+
+       if (rnibblesused >= 2 * sizeof(rbytes))
+               rbytes_init();
+       x = rbytes[rnibblesused++ / 2];
+       return (rnibblesused & 1 ? x & 0xf : x >> 4);
+}
+
 /*
  * Cache maintenance. We keep at most malloc_cache pages cached.
  * If the cache is becoming full, unmap pages in the cache for real,
@@ -410,7 +428,7 @@ unmap(struct dir_info *d, void *p, size_
        rsz = mopts.malloc_cache - d->free_regions_size;
        if (psz > rsz)
                tounmap = psz - rsz;
-       offset = getrbyte();
+       offset = getrnibble();
        for (i = 0; tounmap > 0 && i < mopts.malloc_cache; i++) {
                r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
                if (r->p != NULL) {
@@ -491,7 +509,7 @@ map(struct dir_info *d, size_t sz, int z
                /* zero fill not needed */
                return p;
        }
-       offset = getrbyte();
+       offset = getrnibble();
        for (i = 0; i < mopts.malloc_cache; i++) {
                r = &d->free_regions[(i + offset) & (mopts.malloc_cache - 1)];
                if (r->p != NULL) {
@@ -538,21 +556,6 @@ map(struct dir_info *d, size_t sz, int z
        return p;
 }
 
-static void
-rbytes_init(void)
-{
-       arc4random_buf(rbytes, sizeof(rbytes));
-       rbytesused = 0;
-}
-
-static u_char
-getrbyte(void)
-{
-       if (rbytesused >= sizeof(rbytes))
-               rbytes_init();
-       return rbytes[rbytesused++];
-}
-
 /*
  * Initialize a dir_info, which should have been cleared by caller
  */
@@ -1012,7 +1015,7 @@ malloc_bytes(struct dir_info *d, size_t 
        }
 
        /* advance a random # of positions */
-       i = (getrbyte() & (MALLOC_DELAYED_CHUNKS - 1)) % bp->free;
+       i = getrnibble() % bp->free;
        while (i > 0) {
                u += u;
                k++;
@@ -1275,7 +1278,7 @@ ofree(void *p)
                if (mopts.malloc_junk && sz > 0)
                        memset(p, SOME_FREEJUNK, sz);
                if (!mopts.malloc_freeprot) {
-                       i = getrbyte() & (MALLOC_DELAYED_CHUNKS - 1);
+                       i = getrnibble();
                        tmp = p;
                        p = g_pool->delayed_chunks[i];
                        g_pool->delayed_chunks[i] = tmp;

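For the curious, the delayed_chunks side as a toy program (mine, not
part of the patch): a nibble covers 0..15, which is why
MALLOC_DELAYED_CHUNKS becomes 15 and the array grows one slot, so a
raw nibble indexes all 16 slots with no masking. As in the ofree()
hunk, the freed pointer goes into a random slot and whatever sat
there before comes out to be freed for real, keeping chunk reuse
order unpredictable. arc4random_uniform(16) stands in for
getrnibble() here only to keep the toy self-contained; both yield
0..15.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MALLOC_DELAYED_CHUNKS	15	/* max of getrnibble() */

static void *delayed_chunks[MALLOC_DELAYED_CHUNKS + 1];

/* stash p in a random slot; return the pointer it displaced */
static void *
delay_free(void *p)
{
	uint32_t i = arc4random_uniform(MALLOC_DELAYED_CHUNKS + 1);
	void *tmp = delayed_chunks[i];

	delayed_chunks[i] = p;
	return tmp;
}

int
main(void)
{
	uintptr_t i;

	/* feed in fake chunk addresses; they come back out scrambled */
	for (i = 1; i <= 8; i++)
		printf("freed for real: %p\n", delay_free((void *)i));
	return 0;
}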