This patch to libgo copies the memory hashing code from the Go 1.7
runtime.  This is particularly important because we earlier copied the
hashmap code from Go 1.7, which changed hash table sizes from prime
numbers to powers of two.  The memory hashing code used before this
patch was fine for prime table sizes, but with powers of two it tended
to hash many values to the same bucket, making maps much, much slower
than they should be.
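
A rough illustration of the problem (not code from this patch): with a
power-of-two table the bucket index is just the low bits of the hash,
bucket = h & (nbuckets - 1), so a hash that leaves regular patterns in
the low bits piles keys into a few buckets, while a prime modulus also
mixes in the high bits.  A minimal C sketch with a deliberately weak
hash:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uintptr_t nbuckets = 16;      /* power-of-two table size */
  uintptr_t i;

  for (i = 0; i < 8; i++)
    {
      /* Weak "identity-style" hash: the key value itself, where the
         keys happen to be multiples of 256, so the low bits are
         always zero.  Masking sends every key to bucket 0; a prime
         modulus would still spread them out.  */
      uintptr_t h = i * 256;
      printf ("key %lu -> bucket %lu (mask) vs %lu (mod 13)\n",
              (unsigned long) i,
              (unsigned long) (h & (nbuckets - 1)),
              (unsigned long) (h % 13));
    }
  return 0;
}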

I rewrote the AES hashing code from gc assembler to C code using
intrinsics.  The resulting code generates the same hash value for the
same input as the gc code--the match itself doesn't matter, but testing
for it ensures that the C code does something useful.
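
For reference, the core of the C version is the aesenc instruction used
as a cheap mixing step rather than for encryption.  A minimal sketch of
one scramble round, assuming x86_64 with AES-NI (the function name and
the reduction to 64 bits are just for illustration; the real code is in
libgo/runtime/aeshash.c below):

#include <stdint.h>
#include <wmmintrin.h>  /* _mm_aesenc_si128 */

/* One AES round as a hash mixing step: ShiftRows, SubBytes,
   MixColumns, then XOR with the "round key" (here the value
   itself).  This is hashing, not encryption.  */
__attribute__ ((target ("aes")))
uint64_t
scramble64 (uint64_t seed)
{
  __m128i v = _mm_cvtsi64_si128 ((long long) seed);
  v = _mm_aesenc_si128 (v, v);
  return (uint64_t) _mm_cvtsi128_si64 (v);
}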

I changed mips64pe32le to mips64p32le in the configure script--a typo
noticed during CL review.

Bootstrapped and ran the Go testsuite on x86_64-pc-linux-gnu, both with
and without the AES hashing code.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===================================================================
--- gcc/go/gofrontend/MERGE     (revision 243444)
+++ gcc/go/gofrontend/MERGE     (working copy)
@@ -1,4 +1,4 @@
-2442fca7be8a4f51ddc91070fa69ef66e24593ac
+78e3527fcaf4ffd33b22e39a56e5d076844302be
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/types.cc
===================================================================
--- gcc/go/gofrontend/types.cc  (revision 243321)
+++ gcc/go/gofrontend/types.cc  (working copy)
@@ -1648,7 +1648,7 @@ Type::type_functions(Gogo* gogo, Named_t
   const char* equal_fnname;
   if (this->compare_is_identity(gogo))
     {
-      hash_fnname = "__go_type_hash_identity";
+      hash_fnname = "runtime.memhash";
       equal_fnname = "__go_type_equal_identity";
     }
   else
Index: libgo/Makefile.am
===================================================================
--- libgo/Makefile.am   (revision 243084)
+++ libgo/Makefile.am   (working copy)
@@ -422,6 +422,7 @@ endif
 endif
 
 runtime_files = \
+       runtime/aeshash.c \
        runtime/go-assert.c \
        runtime/go-breakpoint.c \
        runtime/go-caller.c \
Index: libgo/configure.ac
===================================================================
--- libgo/configure.ac  (revision 243084)
+++ libgo/configure.ac  (working copy)
@@ -197,7 +197,7 @@ AC_SUBST(USE_DEJAGNU)
 # supported by the gofrontend and all architectures supported by the
 # gc toolchain.
 # N.B. Keep in sync with gcc/testsuite/go.test/go-test.exp (go-set-goarch).
-ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 
mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64pe32le ppc 
ppc64 ppc64le s390 s390x sparc sparc64"
+ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 
mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64p32le ppc 
ppc64 ppc64le s390 s390x sparc sparc64"
 
 # All known GOARCH_FAMILY values.
 ALLGOARCHFAMILY="I386 ALPHA AMD64 ARM ARM64 IA64 M68K MIPS MIPS64 PPC PPC64 
S390 S390X SPARC SPARC64"
Index: libgo/go/runtime/alg.go
===================================================================
--- libgo/go/runtime/alg.go     (revision 243084)
+++ libgo/go/runtime/alg.go     (working copy)
@@ -23,12 +23,29 @@ import (
 //go:linkname efacevaleq runtime.efacevaleq
 //go:linkname eqstring runtime.eqstring
 //go:linkname cmpstring runtime.cmpstring
+//
+// Temporary to be called from C code.
+//go:linkname alginit runtime.alginit
 
 const (
        c0 = uintptr((8-sys.PtrSize)/4*2860486313 + (sys.PtrSize-4)/4*33054211828000289)
        c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503)
 )
 
+var useAeshash bool
+
+// in C code
+func aeshashbody(p unsafe.Pointer, h, s uintptr, sched []byte) uintptr
+
+func aeshash(p unsafe.Pointer, h, s uintptr) uintptr {
+       return aeshashbody(p, h, s, aeskeysched[:])
+}
+
+func aeshashstr(p unsafe.Pointer, h uintptr) uintptr {
+       ps := (*stringStruct)(p)
+       return aeshashbody(unsafe.Pointer(ps.str), h, uintptr(ps.len), aeskeysched[:])
+}
+
 func interhash(p unsafe.Pointer, h uintptr, size uintptr) uintptr {
        a := (*iface)(p)
        tab := a.tab
@@ -198,7 +215,35 @@ func cmpstring(x, y string) int {
 
 // Force the creation of function descriptors for equality and hash
 // functions.  These will be referenced directly by the compiler.
+var _ = memhash
 var _ = interhash
 var _ = interequal
 var _ = nilinterhash
 var _ = nilinterequal
+
+const hashRandomBytes = sys.PtrSize / 4 * 64
+
+// used in asm_{386,amd64}.s to seed the hash function
+var aeskeysched [hashRandomBytes]byte
+
+// used in hash{32,64}.go to seed the hash function
+var hashkey [4]uintptr
+
+func alginit() {
+       // Install aes hash algorithm if we have the instructions we need
+       if (GOARCH == "386" || GOARCH == "amd64") &&
+               GOOS != "nacl" &&
+               cpuid_ecx&(1<<25) != 0 && // aes (aesenc)
+               cpuid_ecx&(1<<9) != 0 && // sse3 (pshufb)
+               cpuid_ecx&(1<<19) != 0 { // sse4.1 (pinsr{d,q})
+               useAeshash = true
+               // Initialize with random data so hash collisions will be hard to engineer.
+               getRandomData(aeskeysched[:])
+               return
+       }
+       getRandomData((*[len(hashkey) * sys.PtrSize]byte)(unsafe.Pointer(&hashkey))[:])
+       hashkey[0] |= 1 // make sure these numbers are odd
+       hashkey[1] |= 1
+       hashkey[2] |= 1
+       hashkey[3] |= 1
+}
Index: libgo/go/runtime/hash32.go
===================================================================
--- libgo/go/runtime/hash32.go  (revision 0)
+++ libgo/go/runtime/hash32.go  (working copy)
@@ -0,0 +1,94 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hashing algorithm inspired by
+//   xxhash: https://code.google.com/p/xxhash/
+// cityhash: https://code.google.com/p/cityhash/
+
+// +build 386 arm armbe m68k mipso32 mipsn32 mips mipsle ppc s390 sparc
+
+package runtime
+
+import "unsafe"
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname memhash runtime.memhash
+
+const (
+       // Constants for multiplication: four random odd 32-bit numbers.
+       m1 = 3168982561
+       m2 = 3339683297
+       m3 = 832293441
+       m4 = 2336365089
+)
+
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
+       if GOARCH == "386" && GOOS != "nacl" && useAeshash {
+               return aeshash(p, seed, s)
+       }
+       h := uint32(seed + s*hashkey[0])
+tail:
+       switch {
+       case s == 0:
+       case s < 4:
+               h ^= uint32(*(*byte)(p))
+               h ^= uint32(*(*byte)(add(p, s>>1))) << 8
+               h ^= uint32(*(*byte)(add(p, s-1))) << 16
+               h = rotl_15(h*m1) * m2
+       case s == 4:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+       case s <= 8:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-4))
+               h = rotl_15(h*m1) * m2
+       case s <= 16:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, 4))
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-8))
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-4))
+               h = rotl_15(h*m1) * m2
+       default:
+               v1 := h
+               v2 := uint32(seed * hashkey[1])
+               v3 := uint32(seed * hashkey[2])
+               v4 := uint32(seed * hashkey[3])
+               for s >= 16 {
+                       v1 ^= readUnaligned32(p)
+                       v1 = rotl_15(v1*m1) * m2
+                       p = add(p, 4)
+                       v2 ^= readUnaligned32(p)
+                       v2 = rotl_15(v2*m2) * m3
+                       p = add(p, 4)
+                       v3 ^= readUnaligned32(p)
+                       v3 = rotl_15(v3*m3) * m4
+                       p = add(p, 4)
+                       v4 ^= readUnaligned32(p)
+                       v4 = rotl_15(v4*m4) * m1
+                       p = add(p, 4)
+                       s -= 16
+               }
+               h = v1 ^ v2 ^ v3 ^ v4
+               goto tail
+       }
+       h ^= h >> 17
+       h *= m3
+       h ^= h >> 13
+       h *= m4
+       h ^= h >> 16
+       return uintptr(h)
+}
+
+// Note: in order to get the compiler to issue rotl instructions, we
+// need to constant fold the shift amount by hand.
+// TODO: convince the compiler to issue rotl instructions after inlining.
+func rotl_15(x uint32) uint32 {
+       return (x << 15) | (x >> (32 - 15))
+}
Index: libgo/go/runtime/hash64.go
===================================================================
--- libgo/go/runtime/hash64.go  (revision 0)
+++ libgo/go/runtime/hash64.go  (working copy)
@@ -0,0 +1,94 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hashing algorithm inspired by
+//   xxhash: https://code.google.com/p/xxhash/
+// cityhash: https://code.google.com/p/cityhash/
+
+// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x alpha arm64be ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64
+
+package runtime
+
+import "unsafe"
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname memhash runtime.memhash
+
+const (
+       // Constants for multiplication: four random odd 64-bit numbers.
+       m1 = 16877499708836156737
+       m2 = 2820277070424839065
+       m3 = 9497967016996688599
+       m4 = 15839092249703872147
+)
+
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
+       if GOARCH == "amd64" && GOOS != "nacl" && useAeshash {
+               return aeshash(p, seed, s)
+       }
+       h := uint64(seed + s*hashkey[0])
+tail:
+       switch {
+       case s == 0:
+       case s < 4:
+               h ^= uint64(*(*byte)(p))
+               h ^= uint64(*(*byte)(add(p, s>>1))) << 8
+               h ^= uint64(*(*byte)(add(p, s-1))) << 16
+               h = rotl_31(h*m1) * m2
+       case s <= 8:
+               h ^= uint64(readUnaligned32(p))
+               h ^= uint64(readUnaligned32(add(p, s-4))) << 32
+               h = rotl_31(h*m1) * m2
+       case s <= 16:
+               h ^= readUnaligned64(p)
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-8))
+               h = rotl_31(h*m1) * m2
+       case s <= 32:
+               h ^= readUnaligned64(p)
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, 8))
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-16))
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-8))
+               h = rotl_31(h*m1) * m2
+       default:
+               v1 := h
+               v2 := uint64(seed * hashkey[1])
+               v3 := uint64(seed * hashkey[2])
+               v4 := uint64(seed * hashkey[3])
+               for s >= 32 {
+                       v1 ^= readUnaligned64(p)
+                       v1 = rotl_31(v1*m1) * m2
+                       p = add(p, 8)
+                       v2 ^= readUnaligned64(p)
+                       v2 = rotl_31(v2*m2) * m3
+                       p = add(p, 8)
+                       v3 ^= readUnaligned64(p)
+                       v3 = rotl_31(v3*m3) * m4
+                       p = add(p, 8)
+                       v4 ^= readUnaligned64(p)
+                       v4 = rotl_31(v4*m4) * m1
+                       p = add(p, 8)
+                       s -= 32
+               }
+               h = v1 ^ v2 ^ v3 ^ v4
+               goto tail
+       }
+
+       h ^= h >> 29
+       h *= m3
+       h ^= h >> 32
+       return uintptr(h)
+}
+
+// Note: in order to get the compiler to issue rotl instructions, we
+// need to constant fold the shift amount by hand.
+// TODO: convince the compiler to issue rotl instructions after inlining.
+func rotl_31(x uint64) uint64 {
+       return (x << 31) | (x >> (64 - 31))
+}
Index: libgo/go/runtime/os_gccgo.go
===================================================================
--- libgo/go/runtime/os_gccgo.go        (revision 0)
+++ libgo/go/runtime/os_gccgo.go        (working copy)
@@ -0,0 +1,23 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+       "unsafe"
+)
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+func getRandomData(r []byte) {
+       if startupRandomData != nil {
+               n := copy(r, startupRandomData)
+               extendRandom(r, n)
+               return
+       }
+       fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+       n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+       closefd(fd)
+       extendRandom(r, int(n))
+}
Index: libgo/go/runtime/runtime2.go
===================================================================
--- libgo/go/runtime/runtime2.go        (revision 243084)
+++ libgo/go/runtime/runtime2.go        (working copy)
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+       "runtime/internal/sys"
        "unsafe"
 )
 
@@ -668,7 +669,6 @@ type forcegcstate struct {
 // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
 var startupRandomData []byte
 
-/*
 // extendRandom extends the random numbers in r[:n] to the whole slice r.
 // Treats n<0 as n==0.
 func extendRandom(r []byte, n int) {
@@ -689,7 +689,6 @@ func extendRandom(r []byte, n int) {
                }
        }
 }
-*/
 
 // deferred subroutine calls
 // This is the gccgo version.
@@ -770,11 +769,12 @@ var (
 
        sched schedt
 
-//     newprocs    int32
+       //      newprocs    int32
+
+       // Information about what cpu features are available.
+       // Set on startup.
+       cpuid_ecx uint32
 
-// Information about what cpu features are available.
-// Set on startup in asm_{x86,amd64}.s.
-//     cpuid_ecx         uint32
 //     cpuid_edx         uint32
 //     cpuid_ebx7        uint32
 //     lfenceBeforeRdtsc bool
Index: libgo/go/runtime/stubs.go
===================================================================
--- libgo/go/runtime/stubs.go   (revision 243084)
+++ libgo/go/runtime/stubs.go   (working copy)
@@ -248,6 +248,12 @@ func funcPC(f interface{}) uintptr {
        return **(**uintptr)(i.data)
 }
 
+// For gccgo, to communicate from the C code to the Go code.
+//go:linkname setCpuidECX runtime.setCpuidECX
+func setCpuidECX(v uint32) {
+       cpuid_ecx = v
+}
+
 // typedmemmove copies a typed value.
 // For gccgo for now.
 //go:nosplit
Index: libgo/go/runtime/unaligned1.go
===================================================================
--- libgo/go/runtime/unaligned1.go      (revision 0)
+++ libgo/go/runtime/unaligned1.go      (working copy)
@@ -0,0 +1,17 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le s390x ppc s390 arm64be
+
+package runtime
+
+import "unsafe"
+
+func readUnaligned32(p unsafe.Pointer) uint32 {
+       return *(*uint32)(p)
+}
+
+func readUnaligned64(p unsafe.Pointer) uint64 {
+       return *(*uint64)(p)
+}
Index: libgo/go/runtime/unaligned2.go
===================================================================
--- libgo/go/runtime/unaligned2.go      (revision 0)
+++ libgo/go/runtime/unaligned2.go      (working copy)
@@ -0,0 +1,20 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm mips64 mips64le armbe m68k mipso32 mipsn32 mips mipsle sparc alpha ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64
+
+package runtime
+
+import "unsafe"
+
+// Note: These routines perform the read with an unspecified endianness.
+func readUnaligned32(p unsafe.Pointer) uint32 {
+       q := (*[4]byte)(p)
+       return uint32(q[0]) + uint32(q[1])<<8 + uint32(q[2])<<16 + uint32(q[3])<<24
+}
+
+func readUnaligned64(p unsafe.Pointer) uint64 {
+       q := (*[8]byte)(p)
+       return uint64(q[0]) + uint64(q[1])<<8 + uint64(q[2])<<16 + uint64(q[3])<<24 + uint64(q[4])<<32 + uint64(q[5])<<40 + uint64(q[6])<<48 + uint64(q[7])<<56
+}
Index: libgo/runtime/aeshash.c
===================================================================
--- libgo/runtime/aeshash.c     (revision 0)
+++ libgo/runtime/aeshash.c     (working copy)
@@ -0,0 +1,583 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hash code using AES intrinsics.
+
+#include "runtime.h"
+
+uintptr aeshashbody(void*, uintptr, uintptr, Slice)
+       __asm__(GOSYM_PREFIX "runtime.aeshashbody");
+
+uintptr aeshashbody(void*, uintptr, uintptr, Slice)
+       __attribute__((no_split_stack));
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+
+// Force appropriate CPU level.  We won't call here unless the CPU
+// supports it.
+
+#pragma GCC target("ssse3", "aes")
+
+#ifdef __x86_64__
+
+// aeshashbody implements a hash function using AES instructions
+// available in recent x86 processors. Note this is not encryption,
+// just hashing.
+//
+// This is written to produce exactly the same results as the gc
+// implementation, not because that matters, but just to ensure that
+// this does something reasonable.
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       __m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
+       __m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;
+
+       // Start with hash seed.
+       mseed = _mm_cvtsi64_si128(seed);
+       // Get 16 bits of length.
+       mseed = _mm_insert_epi16(mseed, size, 4);
+       // Repeat length 4 times total.
+       mseed = _mm_shufflehi_epi16(mseed, 0);
+       // Save unscrambled seed.
+       mseed2 = mseed;
+       // XOR in per-process seed.
+       mseed ^= _mm_loadu_si128(aeskeysched.__values);
+       // Scramble seed.
+       mseed = _mm_aesenc_si128(mseed, mseed);
+
+       if (size <= 16) {
+               if (size == 0) {
+                       // Return scrambled input seed.
+                       return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
+               } else if (size < 16) {
+                       if ((((uintptr)(p) + 16) & 0xff0) != 0) {
+                               static const uint64 masks[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x00000000000000ff, 0x0000000000000000,
+                                   0x000000000000ffff, 0x0000000000000000,
+                                   0x0000000000ffffff, 0x0000000000000000,
+                                   0x00000000ffffffff, 0x0000000000000000,
+                                   0x000000ffffffffff, 0x0000000000000000,
+                                   0x0000ffffffffffff, 0x0000000000000000,
+                                   0x00ffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x00000000000000ff,
+                                   0xffffffffffffffff, 0x000000000000ffff,
+                                   0xffffffffffffffff, 0x0000000000ffffff,
+                                   0xffffffffffffffff, 0x00000000ffffffff,
+                                   0xffffffffffffffff, 0x000000ffffffffff,
+                                   0xffffffffffffffff, 0x0000ffffffffffff,
+                                   0xffffffffffffffff, 0x00ffffffffffffff
+                                 };
+
+                               // 16 bytes loaded at p won't cross a page
+                               // boundary, so we can load directly.
+                               mval = _mm_loadu_si128(p);
+                               mval &= *(const __m128i*)(&masks[size*2]);
+                       } else {
+                               static const uint64 shifts[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0xffffffffffffff0f, 0xffffffffffffffff,
+                                   0xffffffffffff0f0e, 0xffffffffffffffff,
+                                   0xffffffffff0f0e0d, 0xffffffffffffffff,
+                                   0xffffffff0f0e0d0c, 0xffffffffffffffff,
+                                   0xffffff0f0e0d0c0b, 0xffffffffffffffff,
+                                   0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
+                                   0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
+                                   0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
+                                   0x0e0d0c0b0a090807, 0xffffffffffffff0f,
+                                   0x0d0c0b0a09080706, 0xffffffffffff0f0e,
+                                   0x0c0b0a0908070605, 0xffffffffff0f0e0d,
+                                   0x0b0a090807060504, 0xffffffff0f0e0d0c,
+                                   0x0a09080706050403, 0xffffff0f0e0d0c0b,
+                                   0x0908070605040302, 0xffff0f0e0d0c0b0a,
+                                   0x0807060504030201, 0xff0f0e0d0c0b0a09,
+                                 };
+
+                               // address ends in 1111xxxx. Might be
+                               // up against a page boundary, so load
+                               // ending at last byte.  Then shift
+                               // bytes down using pshufb.
+                               mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
+                               mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
+                       }
+               } else {
+                       mval = _mm_loadu_si128(p);
+               }
+
+               // XOR data with seed.
+               mval ^= mseed;
+               // Scramble combo 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 32) {
+               // Make second starting seed.
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               // Load data to be hashed.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
+               // XOR with seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               // Combine results.
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 64) {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 128) {
+               // Make 7 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed5 = mseed2;
+               mseed6 = mseed2;
+               mseed7 = mseed2;
+               mseed8 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
+               mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
+               mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
+               mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+               mseed5 = _mm_aesenc_si128(mseed5, mseed5);
+               mseed6 = _mm_aesenc_si128(mseed6, mseed6);
+               mseed7 = _mm_aesenc_si128(mseed7, mseed7);
+               mseed8 = _mm_aesenc_si128(mseed8, mseed8);
+
+               // Load data.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + 48));
+               mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // XOR with seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+               mval5 ^= mseed5;
+               mval6 ^= mseed6;
+               mval7 ^= mseed7;
+               mval8 ^= mseed8;
+
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               // Combine results.
+               mval ^= mval5;
+               mval2 ^= mval6;
+               mval3 ^= mval7;
+               mval4 ^= mval8;
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else {
+               // Make 7 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed5 = mseed2;
+               mseed6 = mseed2;
+               mseed7 = mseed2;
+               mseed8 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
+               mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
+               mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
+               mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+               mseed5 = _mm_aesenc_si128(mseed5, mseed5);
+               mseed6 = _mm_aesenc_si128(mseed6, mseed6);
+               mseed7 = _mm_aesenc_si128(mseed7, mseed7);
+               mseed8 = _mm_aesenc_si128(mseed8, mseed8);
+
+               // Start with last (possibly overlapping) block.
+               mval = _mm_loadu_si128((void*)((char*)p + size - 128));
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
+               mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // XOR in seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+               mval5 ^= mseed5;
+               mval6 ^= mseed6;
+               mval7 ^= mseed7;
+               mval8 ^= mseed8;
+
+               // Compute number of remaining 128-byte blocks.
+               size--;
+               size >>= 7;
+               do {
+                       // Scramble state.
+                       mval = _mm_aesenc_si128(mval, mval);
+                       mval2 = _mm_aesenc_si128(mval2, mval2);
+                       mval3 = _mm_aesenc_si128(mval3, mval3);
+                       mval4 = _mm_aesenc_si128(mval4, mval4);
+                       mval5 = _mm_aesenc_si128(mval5, mval5);
+                       mval6 = _mm_aesenc_si128(mval6, mval6);
+                       mval7 = _mm_aesenc_si128(mval7, mval7);
+                       mval8 = _mm_aesenc_si128(mval8, mval8);
+
+                       // Scramble state, XOR in a block.
+                       mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
+                       mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
+                       mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
+                       mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
+                       mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
+                       mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
+                       mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
+                       mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));
+
+                       p = (void*)((char*)p + 128);
+               } while (--size > 0);
+
+               // 3 more scrambles to finish.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval ^= mval5;
+               mval2 ^= mval6;
+               mval3 ^= mval7;
+               mval4 ^= mval8;
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       }
+}
+
+#else // !defined(__x86_64__)
+
+// The 32-bit version of aeshashbody.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       __m128i mseed, mseed2, mseed3, mseed4;
+       __m128i mval, mval2, mval3, mval4;
+
+       // Start with hash seed.
+       mseed = _mm_cvtsi32_si128(seed);
+       // Get 16 bits of length.
+       mseed = _mm_insert_epi16(mseed, size, 4);
+       // Replace size with its low 2 bytes repeated 4 times.
+       mseed = _mm_shufflehi_epi16(mseed, 0);
+       // Save unscrambled seed.
+       mseed2 = mseed;
+       // XOR in per-process seed.
+       mseed ^= _mm_loadu_si128(aeskeysched.__values);
+       // Scramble seed.
+       mseed = _mm_aesenc_si128(mseed, mseed);
+
+       if (size <= 16) {
+               if (size == 0) {
+                       // Return scrambled input seed.
+                       return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
+               } else if (size < 16) {
+                       if ((((uintptr)(p) + 16) & 0xff0) != 0) {
+                               static const uint64 masks[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x00000000000000ff, 0x0000000000000000,
+                                   0x000000000000ffff, 0x0000000000000000,
+                                   0x0000000000ffffff, 0x0000000000000000,
+                                   0x00000000ffffffff, 0x0000000000000000,
+                                   0x000000ffffffffff, 0x0000000000000000,
+                                   0x0000ffffffffffff, 0x0000000000000000,
+                                   0x00ffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x00000000000000ff,
+                                   0xffffffffffffffff, 0x000000000000ffff,
+                                   0xffffffffffffffff, 0x0000000000ffffff,
+                                   0xffffffffffffffff, 0x00000000ffffffff,
+                                   0xffffffffffffffff, 0x000000ffffffffff,
+                                   0xffffffffffffffff, 0x0000ffffffffffff,
+                                   0xffffffffffffffff, 0x00ffffffffffffff
+                                 };
+
+                               // 16 bytes loaded at p won't cross a page
+                               // boundary, so we can load it directly.
+                               mval = _mm_loadu_si128(p);
+                               mval &= *(const __m128i*)(&masks[size*2]);
+                       } else {
+                               static const uint64 shifts[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0xffffffffffffff0f, 0xffffffffffffffff,
+                                   0xffffffffffff0f0e, 0xffffffffffffffff,
+                                   0xffffffffff0f0e0d, 0xffffffffffffffff,
+                                   0xffffffff0f0e0d0c, 0xffffffffffffffff,
+                                   0xffffff0f0e0d0c0b, 0xffffffffffffffff,
+                                   0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
+                                   0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
+                                   0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
+                                   0x0e0d0c0b0a090807, 0xffffffffffffff0f,
+                                   0x0d0c0b0a09080706, 0xffffffffffff0f0e,
+                                   0x0c0b0a0908070605, 0xffffffffff0f0e0d,
+                                   0x0b0a090807060504, 0xffffffff0f0e0d0c,
+                                   0x0a09080706050403, 0xffffff0f0e0d0c0b,
+                                   0x0908070605040302, 0xffff0f0e0d0c0b0a,
+                                   0x0807060504030201, 0xff0f0e0d0c0b0a09,
+                                 };
+
+                               // address ends in 1111xxxx. Might be
+                               // up against a page boundary, so load
+                               // ending at last byte.  Then shift
+                               // bytes down using pshufb.
+                               mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
+                               mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
+                       }
+               } else {
+                       mval = _mm_loadu_si128(p);
+               }
+
+               // Scramble input, XOR in seed.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               return _mm_cvtsi128_si32(mval);
+       } else if (size <= 32) {
+               // Make second starting seed.
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               // Load data to be hashed.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+
+               // Combine results.
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       } else if (size <= 64) {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval3 = _mm_aesenc_si128(mval3, mseed3);
+               mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       } else {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               // Start with last (possibly overlapping) block.
+               mval = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // Scramble state once.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval3 = _mm_aesenc_si128(mval3, mseed3);
+               mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+               // Compute number of remaining 64-byte blocks.
+               size--;
+               size >>= 6;
+               do {
+                       // Scramble state, XOR in a block.
+                       mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
+                       mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
+                       mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
+                       mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
+
+                       // Scramble state.
+                       mval = _mm_aesenc_si128(mval, mval);
+                       mval2 = _mm_aesenc_si128(mval2, mval2);
+                       mval3 = _mm_aesenc_si128(mval3, mval3);
+                       mval4 = _mm_aesenc_si128(mval4, mval4);
+
+                       p = (void*)((char*)p + 64);
+               } while (--size > 0);
+
+               // 2 more scrambles to finish.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       }
+}
+
+#endif // !defined(__x86_64__)
+
+#else // !defined(__i386__) && !defined(__x86_64__)
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       // We should never get here on a non-x86 system.
+       runtime_throw("impossible call to aeshashbody");
+}
+
+#endif // !defined(__i386__) && !defined(__x86_64__)
Index: libgo/runtime/go-libmain.c
===================================================================
--- libgo/runtime/go-libmain.c  (revision 243084)
+++ libgo/runtime/go-libmain.c  (working copy)
@@ -61,6 +61,7 @@ initfn (int argc, char **argv, char** en
 
   runtime_isarchive = true;
 
+  runtime_cpuinit ();
   runtime_initsig(true);
 
   a = (struct args *) malloc (sizeof *a);
Index: libgo/runtime/go-main.c
===================================================================
--- libgo/runtime/go-main.c     (revision 243084)
+++ libgo/runtime/go-main.c     (working copy)
@@ -47,6 +47,7 @@ main (int argc, char **argv)
   runtime_isstarted = true;
 
   __go_end = (uintptr)_end;
+  runtime_cpuinit ();
   runtime_check ();
   runtime_args (argc, (byte **) argv);
   runtime_osinit ();
Index: libgo/runtime/go-type-identity.c
===================================================================
--- libgo/runtime/go-type-identity.c    (revision 243084)
+++ libgo/runtime/go-type-identity.c    (working copy)
@@ -9,44 +9,14 @@
 #include "runtime.h"
 #include "go-type.h"
 
-/* An identity hash function for a type.  This is used for types where
-   we can simply use the type value itself as a hash code.  This is
-   true of, e.g., integers and pointers.  */
-/* The hash functions for types that can compare as identity are
-   written in Go.  */
 
-uintptr_t
-__go_type_hash_identity (const void *key, uintptr_t seed, uintptr_t key_size)
-{
-  uintptr_t ret;
-  uintptr_t i;
-  const unsigned char *p;
-
-  if (key_size <= 8)
-    {
-      union
-      {
-       uint64 v;
-       unsigned char a[8];
-      } u;
-      u.v = 0;
-#ifdef WORDS_BIGENDIAN
-      __builtin_memcpy (&u.a[8 - key_size], key, key_size);
-#else
-      __builtin_memcpy (&u.a[0], key, key_size);
-#endif
-      if (sizeof (uintptr_t) >= 8)
-       return (uintptr_t) u.v ^ seed;
-      else
-       return (uintptr_t) ((u.v >> 32) ^ (u.v & 0xffffffff)) ^ seed;
-    }
-
-  ret = seed;
-  for (i = 0, p = (const unsigned char *) key; i < key_size; i++, p++)
-    ret = ret * 33 + *p;
-  return ret;
-}
+extern uintptr runtime_memhash(void *, uintptr, uintptr)
+  __asm__ (GOSYM_PREFIX "runtime.memhash");
 
 const FuncVal __go_type_hash_identity_descriptor =
-  { (void *) __go_type_hash_identity };
+  { (void *) runtime_memhash };
 
 /* An identity equality function for a type.  This is used for types
    where we can check for equality by checking that the values have
Index: libgo/runtime/go-type.h
===================================================================
--- libgo/runtime/go-type.h     (revision 243084)
+++ libgo/runtime/go-type.h     (working copy)
@@ -362,7 +362,6 @@ extern _Bool
 __go_type_descriptors_equal(const struct __go_type_descriptor*,
                            const struct __go_type_descriptor*);
 
-extern uintptr_t __go_type_hash_identity (const void *, uintptr_t, uintptr_t);
 extern const FuncVal __go_type_hash_identity_descriptor;
 extern _Bool __go_type_equal_identity (const void *, const void *, uintptr_t);
 extern const FuncVal __go_type_equal_identity_descriptor;
Index: libgo/runtime/proc.c
===================================================================
--- libgo/runtime/proc.c        (revision 243084)
+++ libgo/runtime/proc.c        (working copy)
@@ -455,7 +455,8 @@ runtime_schedinit(void)
        // runtime_symtabinit();
        runtime_mallocinit();
        mcommoninit(m);
-       
+       runtime_alginit(); // maps must not be used before this call
+
        // Initialize the itable value for newErrorCString,
        // so that the next time it gets called, possibly
        // in a fault during a garbage collection, it will not
Index: libgo/runtime/runtime.h
===================================================================
--- libgo/runtime/runtime.h     (revision 243424)
+++ libgo/runtime/runtime.h     (working copy)
@@ -265,6 +265,8 @@ struct __go_func_type;
 void   runtime_args(int32, byte**)
   __asm__ (GOSYM_PREFIX "runtime.args");
 void   runtime_osinit();
+void   runtime_alginit(void)
+  __asm__ (GOSYM_PREFIX "runtime.alginit");
 void   runtime_goargs(void)
   __asm__ (GOSYM_PREFIX "runtime.goargs");
 void   runtime_goenvs(void);
@@ -592,3 +594,7 @@ extern void *getitab(const struct __go_t
                     const struct __go_type_descriptor *,
                     _Bool)
   __asm__ (GOSYM_PREFIX "runtime.getitab");
+
+extern void runtime_cpuinit(void);
+extern void setCpuidECX(uint32)
+  __asm__ (GOSYM_PREFIX "runtime.setCpuidECX");
Index: libgo/runtime/runtime_c.c
===================================================================
--- libgo/runtime/runtime_c.c   (revision 243084)
+++ libgo/runtime/runtime_c.c   (working copy)
@@ -6,6 +6,10 @@
 #include <signal.h>
 #include <unistd.h>
 
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
 #include "config.h"
 
 #include "runtime.h"
@@ -204,3 +208,18 @@ go_errno()
 {
   return (intgo)errno;
 }
+
+// CPU-specific initialization.
+// Fetch CPUID info on x86.
+
+void
+runtime_cpuinit()
+{
+#if defined(__i386__) || defined(__x86_64__)
+       unsigned int eax, ebx, ecx, edx;
+
+       if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+               setCpuidECX(ecx);
+       }
+#endif
+}
