On Wed, Mar 4, 2015 at 4:19 PM, Matt Turner <matts...@gmail.com> wrote:
> The SSSE3 swizzling code was written for fast uploads to the GPU and
> assumed the destination was always 16-byte aligned. When we began using
> this code for fast downloads as well, we didn't do anything to account
> for the fact that the destination pointer given by glReadPixels() or
> glGetTexImage() is not guaranteed to be suitably aligned.
>
> With SSSE3 enabled (at compile-time), some applications would crash when
> an SSE aligned-store instruction tried to store to an unaligned
> destination (or an assertion that the destination is aligned would
> trigger).
>
> To remedy this, tell intel_get_memcpy() whether we're uploading or
> downloading so that it can select whether to assume the destination or
> source is aligned, respectively.
>
> Cc: 10.5 <mesa-sta...@lists.freedesktop.org>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89416
> ---
> I don't know what benchmarks I should run to test this code?
>
>  src/mesa/drivers/dri/i965/intel_pixel_read.c   |   3 +-
>  src/mesa/drivers/dri/i965/intel_tex_image.c    |   3 +-
>  src/mesa/drivers/dri/i965/intel_tex_subimage.c |   3 +-
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 156 +++++++++++++++++++------
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.h |  15 ++-
>  5 files changed, 138 insertions(+), 42 deletions(-)
>
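As an aside for anyone following along: the crash described above is just the
architected behaviour of SSE aligned stores. A minimal standalone illustration
of the distinction (not part of this patch; a hypothetical helper using plain
SSE2 intrinsics, nothing i965-specific):

   #include <emmintrin.h>
   #include <stdint.h>

   /* An aligned store (MOVDQA) faults if the pointer is not 16-byte
    * aligned; the unaligned form (MOVDQU) accepts any pointer. */
   static void
   store16(uint8_t *dst, __m128i v)
   {
      if (((uintptr_t)dst & 0xf) == 0)
         _mm_store_si128((__m128i *)dst, v);   /* requires 16-byte alignment */
      else
         _mm_storeu_si128((__m128i *)dst, v);  /* safe for any alignment */
   }

That is exactly the dst-aligned vs. src-aligned split the patch encodes in the
two copy variants below.
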
> diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> index df22a63..0972121 100644
> --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
> +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
> @@ -139,7 +139,8 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
>         rb->Format == MESA_FORMAT_R8G8B8X8_UNORM)
>        return false;
>
> -   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp))
> +   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp,
> +                         INTEL_DOWNLOAD))
>        return false;
>
>     if (!irb->mt ||
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
> index da42fdd..a3312d2 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_image.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
> @@ -408,7 +408,8 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
>         texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)
>        return false;
>
> -   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
> +   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
> +                         INTEL_UPLOAD))

This should be INTEL_DOWNLOAD -- glGetTexImage() reads back from the GPU, so
it's the destination that may be unaligned here.

>        return false;
>
>     /* If this is a nontrivial texture view, let another path handle it instead. */
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> index 4262f71..909ff25 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
> @@ -118,7 +118,8 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
>         packing->Invert)
>        return false;
>
> -   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
> +   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
> +                         INTEL_UPLOAD))
>        return false;
>
>     /* If this is a nontrivial texture view, let another path handle it instead. */
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index f2b35cb..c43383b 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -60,42 +60,79 @@ static const uint32_t ytile_span = 16;
>  static const uint8_t rgba8_permutation[16] =
>     { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
>
> -/* NOTE: dst must be 16 byte aligned */
> -#define rgba8_copy_16(dst, src) \
> -   *(__m128i *)(dst) = _mm_shuffle_epi8(        \
> -      (__m128i) _mm_loadu_ps((float *)(src)),   \
> -      *(__m128i *) rgba8_permutation             \
> -   )
> +/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
> +#define rgba8_copy_16_aligned_dst(dst, src)                            \
> +   _mm_store_si128((__m128i *)(dst),                                   \
> +                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
> +                                    *(__m128i *) rgba8_permutation))
> +
> +/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
> +#define rgba8_copy_16_aligned_src(dst, src)                            \
> +   _mm_storeu_si128((__m128i *)(dst),                                  \
> +                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
> +                                     *(__m128i *) rgba8_permutation))
>  #endif
>
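One more aside in case the byte shuffle isn't obvious to everyone: each byte
of rgba8_permutation selects which source byte lands in that output position,
so { 2,1,0,3, ... } swaps bytes 0 and 2 (R and B) of every pixel and leaves G
and A alone. A throwaway standalone check, not part of the patch (build with
-mssse3):

   #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */
   #include <stdint.h>
   #include <stdio.h>

   static const uint8_t rgba8_permutation[16] =
      { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

   int main(void)
   {
      uint8_t src[16], dst[16];

      /* Four RGBA pixels: R=0x11, G=0x22, B=0x33, A=0x44. */
      for (int i = 0; i < 16; i += 4) {
         src[i+0] = 0x11; src[i+1] = 0x22; src[i+2] = 0x33; src[i+3] = 0x44;
      }

      __m128i v = _mm_loadu_si128((__m128i *)src);
      v = _mm_shuffle_epi8(v, _mm_loadu_si128((__m128i *)rgba8_permutation));
      _mm_storeu_si128((__m128i *)dst, v);

      /* Prints "33 22 11 44" four times: R and B swapped in each pixel. */
      for (int i = 0; i < 16; i++)
         printf("%02x ", dst[i]);
      printf("\n");
      return 0;
   }
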
>  /**
> - * Copy RGBA to BGRA - swap R and B.
> + * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
>   */
>  static inline void *
> -rgba8_copy(void *dst, const void *src, size_t bytes)
> +rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
>  {
>     uint8_t *d = dst;
>     uint8_t const *s = src;
>
>  #ifdef __SSSE3__
> -   /* Fast copying for tile spans.
> -    *
> -    * As long as the destination texture is 16 aligned,
> -    * any 16 or 64 spans we get here should also be 16 aligned.
> -    */
> -
>     if (bytes == 16) {
>        assert(!(((uintptr_t)dst) & 0xf));
> -      rgba8_copy_16(d+ 0, s+ 0);
> +      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
>        return dst;
>     }
>
>     if (bytes == 64) {
>        assert(!(((uintptr_t)dst) & 0xf));
> -      rgba8_copy_16(d+ 0, s+ 0);
> -      rgba8_copy_16(d+16, s+16);
> -      rgba8_copy_16(d+32, s+32);
> -      rgba8_copy_16(d+48, s+48);
> +      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
> +      rgba8_copy_16_aligned_dst(d+16, s+16);
> +      rgba8_copy_16_aligned_dst(d+32, s+32);
> +      rgba8_copy_16_aligned_dst(d+48, s+48);
> +      return dst;
> +   }
> +#endif
> +
> +   while (bytes >= 4) {
> +      d[0] = s[2];
> +      d[1] = s[1];
> +      d[2] = s[0];
> +      d[3] = s[3];
> +      d += 4;
> +      s += 4;
> +      bytes -= 4;
> +   }
> +   return dst;
> +}
> +
> +/**
> + * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
> + */
> +static inline void *
> +rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
> +{
> +   uint8_t *d = dst;
> +   uint8_t const *s = src;
> +
> +#ifdef __SSSE3__
> +   if (bytes == 16) {
> +      assert(!(((uintptr_t)src) & 0xf));
> +      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
> +      return dst;
> +   }
> +
> +   if (bytes == 64) {
> +      assert(!(((uintptr_t)src) & 0xf));
> +      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
> +      rgba8_copy_16_aligned_src(d+16, s+16);
> +      rgba8_copy_16_aligned_src(d+32, s+32);
> +      rgba8_copy_16_aligned_src(d+48, s+48);
>        return dst;
>     }
>  #endif
> @@ -357,16 +394,26 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>        if (mem_copy == memcpy)
>           return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
>                                   dst, src, src_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
> +         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
>           return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
> -                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);

linear_to_xtiled will always be called with an aligned destination, so you
only need the one case.

>     } else {
>        if (mem_copy == memcpy)
>           return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
>                                   dst, src, src_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
>           return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> -                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
> +         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);
>     }
>     linear_to_xtiled(x0, x1, x2, x3, y0, y1,
>                      dst, src, src_pitch, swizzle_bit, mem_copy);
> @@ -393,16 +440,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>        if (mem_copy == memcpy)
>           return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
>                                   dst, src, src_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
> +         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
>           return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
> -                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);

Same for linear_to_ytiled.

>     } else {
>        if (mem_copy == memcpy)
>           return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
>                                   dst, src, src_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
>           return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> -                                 dst, src, src_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
> +         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, src_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);
>     }
>     linear_to_ytiled(x0, x1, x2, x3, y0, y1,
>                      dst, src, src_pitch, swizzle_bit, mem_copy);
> @@ -429,16 +486,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>        if (mem_copy == memcpy)
>           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
>                                   dst, src, dst_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
> +         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
>           return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
> -                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);
>     } else {
>        if (mem_copy == memcpy)
>           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
>                                   dst, src, dst_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
> +         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
>           return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
> -                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);

xtiled_to_linear will only ever be called with an aligned source.

>     }
>     xtiled_to_linear(x0, x1, x2, x3, y0, y1,
>                      dst, src, dst_pitch, swizzle_bit, mem_copy);
> @@ -465,16 +532,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>        if (mem_copy == memcpy)
>           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
>                                   dst, src, dst_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
>           return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> -                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
> +         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);

Same for ytiled_to_linear.
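Concretely, just so we're talking about the same thing (a sketch of the shape
I mean, based on the code in this patch, not a demand on the exact form): the
download wrappers only need memcpy and the src-aligned variant, e.g.

      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy);
      else if (mem_copy == rgba8_copy_aligned_src)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy_aligned_src);

and likewise only rgba8_copy_aligned_dst in the linear_to_*tiled wrappers. As
far as I understand it, the whole point of these _faster wrappers is to hand
the compiler a compile-time-constant mem_copy it can inline, so dead branches
only add noise.
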
Kill off the unneeded cases and fix the typo, and this patch is
Reviewed-by: Jason Ekstrand <jason.ekstr...@intel.com>

>     } else {
>        if (mem_copy == memcpy)
>           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
>                                   dst, src, dst_pitch, swizzle_bit, memcpy);
> -      else if (mem_copy == rgba8_copy)
> +      else if (mem_copy == rgba8_copy_aligned_dst)
> +         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_dst);
> +      else if (mem_copy == rgba8_copy_aligned_src)
>           return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
> -                                 dst, src, dst_pitch, swizzle_bit, rgba8_copy);
> +                                 dst, src, dst_pitch, swizzle_bit,
> +                                 rgba8_copy_aligned_src);
>     }
>     ytiled_to_linear(x0, x1, x2, x3, y0, y1,
>                      dst, src, dst_pitch, swizzle_bit, mem_copy);
> @@ -684,7 +761,8 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
>   * \return true if the format and type combination are valid
>   */
>  bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> -                      GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp)
> +                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
> +                      enum intel_memcpy_direction direction)
>  {
>     if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
>         !(format == GL_RGBA || format == GL_BGRA))
>        return false;
> @@ -700,7 +778,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
>        if (format == GL_BGRA) {
>           *mem_copy = memcpy;
>        } else if (format == GL_RGBA) {
> -         *mem_copy = rgba8_copy;
> +         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
> +                                               : rgba8_copy_aligned_src;
>        }
>     } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
>                (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM)) {
> @@ -709,7 +788,8 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
>           /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
>            * use the same function.
>            */
> -         *mem_copy = rgba8_copy;
> +         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
> +                                               : rgba8_copy_aligned_src;
>        } else if (format == GL_RGBA) {
>           *mem_copy = memcpy;
>        }
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> index 3ff0d71..9dc1088 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
> @@ -55,7 +55,20 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
>                  uint32_t tiling,
>                  mem_copy_fn mem_copy);
>
> +/* Tells intel_get_memcpy() whether the memcpy() is
> + *
> + * - an upload to the GPU with an aligned destination and a potentially
> + *   unaligned source; or
> + * - a download from the GPU with an aligned source and a potentially
> + *   unaligned destination.
> + */
> +enum intel_memcpy_direction {
> +   INTEL_UPLOAD,
> +   INTEL_DOWNLOAD
> +};
> +
>  bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
> -                      GLenum type, mem_copy_fn* mem_copy, uint32_t* cpp);
> +                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
> +                      enum intel_memcpy_direction direction);
>
>  #endif /* INTEL_TILED_MEMCPY */
> --
> 2.0.5
>
>