[Mesa-dev] [PATCH] gallivm: fix lp_build_compare_ext

2015-07-03 Thread sroland
From: Roland Scheidegger 

The expansion should always be to the same width as the input arguments,
no matter what, since these functions should work with any bit width of
the arguments (the sext is a no-op on any sane simd architecture).
Thus, fix the caller which expected differently.

This fixes https://bugs.freedesktop.org/show_bug.cgi?id=91222 (not tested
otherwise)
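
To illustrate the width rule with a scalar sketch in plain C (this is not
the actual gallivm IR, just an illustration): a compare of n-bit-wide
elements yields an n-bit all-ones/all-zeros mask, so the result of a double
(64 bit) compare has to be truncated explicitly for a 32 bit destination,
which is what the dset_emit_cpu hunk below now does.

   #include <stdint.h>

   /* 64 bit wide compare: the mask width matches the argument width */
   static int64_t cmp64(double a, double b)
   {
      return a == b ? -1 : 0;    /* all ones or all zeros, 64 bits */
   }

   /* storing to a 32 bit destination needs an explicit truncation */
   static int32_t mask_to_32bit(int64_t mask)
   {
      return (int32_t)mask;      /* all-ones stays all-ones */
   }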
---
 src/gallium/auxiliary/gallivm/lp_bld_logic.c   | 2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c 
b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index f724cfa..80b53e5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -81,7 +81,7 @@ lp_build_compare_ext(struct gallivm_state *gallivm,
  boolean ordered)
 {
LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, 
lp_type_int_vec(32, 32 * type.length));
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
LLVMValueRef zeros = LLVMConstNull(int_vec_type);
LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
LLVMValueRef cond;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 1f2af85..0ad78b0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1961,8 +1961,11 @@ dset_emit_cpu(
struct lp_build_emit_data * emit_data,
unsigned pipe_func)
 {
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
LLVMValueRef cond = lp_build_cmp(&bld_base->dbl_bld, pipe_func,
 emit_data->args[0], emit_data->args[1]);
+   /* arguments were 64 bit but store as 32 bit */
+   cond = LLVMBuildTrunc(builder, cond, bld_base->int_bld.int_vec_type, "");
emit_data->output[emit_data->chan] = cond;
 }
 
-- 
1.9.1



[Mesa-dev] [PATCH 2/4] radeon/r200: mark state atoms as dirty after blits

2015-07-11 Thread sroland
From: Roland Scheidegger 

The blit path submits lots of packets which are usually handled by state
atoms, so these atoms must be marked dirty afterwards.
Not sure if this fixes anything, but it was a concern raised in bug 51658
(with this, all issues there seen as actual bugs should be fixed, with the
exception of the patch to upload unused texenv state atoms, which I just
don't understand).
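
For reference, a minimal sketch of the state atom idea in plain C (not the
actual radeon structures - all names here are made up): atoms cache their
packets and only get re-emitted when marked dirty, hence anything emitting
packets behind their back has to dirty the affected atoms.

   #include <stdbool.h>

   struct atom {
      bool dirty;
      void (*emit)(struct atom *a);   /* submits the cached packets */
   };

   static void emit_dirty_state(struct atom *atoms, int n)
   {
      int i;
      for (i = 0; i < n; i++) {
         if (atoms[i].dirty) {
            atoms[i].emit(&atoms[i]);
            atoms[i].dirty = false;   /* clean until the state changes again */
         }
      }
   }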
---
 src/mesa/drivers/dri/r200/r200_blit.c | 16 
 src/mesa/drivers/dri/radeon/radeon_blit.c |  8 
 2 files changed, 24 insertions(+)

diff --git a/src/mesa/drivers/dri/r200/r200_blit.c 
b/src/mesa/drivers/dri/r200/r200_blit.c
index 3adc694..0e6afa0 100644
--- a/src/mesa/drivers/dri/r200/r200_blit.c
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -547,5 +547,21 @@ unsigned r200_blit(struct gl_context *ctx,
 
 radeonFlush(ctx);
 
+/* We submitted those packets outside our state atom mechanism. Thus
+ * make sure the atoms are resubmitted the next time. */
+r200->hw.cst.dirty = GL_TRUE;
+r200->hw.ctx.dirty = GL_TRUE;
+r200->hw.vap.dirty = GL_TRUE;
+r200->hw.msk.dirty = GL_TRUE;
+r200->hw.pix[0].dirty = GL_TRUE;
+r200->hw.pix[1].dirty = GL_TRUE;
+r200->hw.pix[2].dirty = GL_TRUE;
+r200->hw.pix[3].dirty = GL_TRUE;
+r200->hw.sci.dirty = GL_TRUE;
+r200->hw.set.dirty = GL_TRUE;
+r200->hw.tex[0].dirty = GL_TRUE;
+r200->hw.vte.dirty = GL_TRUE;
+r200->hw.vtx.dirty = GL_TRUE;
+
 return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c 
b/src/mesa/drivers/dri/radeon/radeon_blit.c
index 0de1751..028e8b0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_blit.c
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -425,5 +425,13 @@ unsigned r100_blit(struct gl_context *ctx,
 
 radeonFlush(ctx);
 
+/* We submitted those packets outside our state atom mechanism. Thus
+ * make sure they are all resubmitted the next time. */
+r100->hw.ctx.dirty = GL_TRUE;
+r100->hw.msk.dirty = GL_TRUE;
+r100->hw.set.dirty = GL_TRUE;
+r100->hw.tex[0].dirty = GL_TRUE;
+r100->hw.txr[0].dirty = GL_TRUE;
+
 return GL_TRUE;
 }
-- 
2.1.4



[Mesa-dev] [PATCH 1/4] r200: fix fbo rendering by disabling optimized texture format chooser

2015-07-11 Thread sroland
From: Roland Scheidegger 

It is rather unfortunate that we don't know if a texture is going to be used
as a render target later, and we lack the means to do something about a chosen
format which we can't render to directly, so disable this optimization and
always choose a renderable format for rgba8 textures.
This addresses an issue raised in (old) bug 51658 with gnome-shell; don't know
if that's still applicable, but it might fix other things as well.
---
 src/mesa/drivers/dri/radeon/radeon_texture.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c 
b/src/mesa/drivers/dri/radeon/radeon_texture.c
index edfd48b..95496b1 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -224,7 +224,19 @@ static mesa_format 
radeonChooseTexFormat(radeonContextPtr rmesa,
const GLuint ui = 1;
const GLubyte littleEndian = *((const GLubyte *)&ui);
 
-   if (fbo)
+
+   /* Unfortunately, regardless the fbo flag, we might still be asked to
+* attach a texture to a fbo later, which then won't succeed if we chose
+* one which isn't renderable. And unlike more exotic formats, apps 
aren't
+* really prepared for the incomplete framebuffer this results in 
(they'd
+* have to retry with same internalFormat even, just different
+* srcFormat/srcType, which can't really be expected anyway).
+* Ideally, we'd defer format selection until later (if the texture is
+* used as a rt it's likely there's never data uploaded to it before 
attached
+* to a fbo), but this isn't really possible, so for now just always use
+* a renderable format.
+*/
+   if (1||fbo)
return _radeon_texformat_argb;
 
if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-- 
2.1.4



[Mesa-dev] [PATCH 3/4] radeon: fix some potential big endian issues

2015-07-11 Thread sroland
From: Roland Scheidegger 

The formats chosen (both by the texture format chooser and by fbo storage
allocation) are different for big endian, not just for rgba8 but also for
lower bit width formats (why, I don't actually know). Even the function to
test for renderable formats used different formats; however, the actual
colorbuffer setup did not. And the blitter did not take that into account
either.
Untested (what could possibly go wrong...).
---
 src/mesa/drivers/dri/radeon/radeon_blit.c   | 83 +++--
 src/mesa/drivers/dri/radeon/radeon_state_init.c |  3 +
 src/mesa/drivers/dri/radeon/radeon_tex.h| 35 +++
 src/mesa/drivers/dri/radeon/radeon_texstate.c   | 44 -
 4 files changed, 76 insertions(+), 89 deletions(-)

diff --git a/src/mesa/drivers/dri/radeon/radeon_blit.c 
b/src/mesa/drivers/dri/radeon/radeon_blit.c
index 028e8b0..50296e2 100644
--- a/src/mesa/drivers/dri/radeon/radeon_blit.c
+++ b/src/mesa/drivers/dri/radeon/radeon_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_blit.h"
+#include "radeon_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
   int reg, int count)
@@ -40,19 +41,36 @@ static inline uint32_t cmdpacket0(struct radeon_screen 
*rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r100_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-/* XXX others?  BE/LE? */
-switch (mesa_format) {
-case MESA_FORMAT_B8G8R8A8_UNORM:
-case MESA_FORMAT_B8G8R8X8_UNORM:
-case MESA_FORMAT_B5G6R5_UNORM:
-case MESA_FORMAT_B4G4R4A4_UNORM:
-case MESA_FORMAT_B5G5R5A1_UNORM:
-case MESA_FORMAT_A_UNORM8:
-case MESA_FORMAT_L_UNORM8:
-case MESA_FORMAT_I_UNORM8:
+/* XXX others?  */
+if (_mesa_little_endian()) {
+   switch (mesa_format) {
+   case MESA_FORMAT_B8G8R8A8_UNORM:
+   case MESA_FORMAT_B8G8R8X8_UNORM:
+   case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A_UNORM8:
+   case MESA_FORMAT_L_UNORM8:
+   case MESA_FORMAT_I_UNORM8:
break;
-default:
+   default:
+   return 0;
+   }
+}
+else {
+   switch (mesa_format) {
+   case MESA_FORMAT_A8R8G8B8_UNORM:
+   case MESA_FORMAT_X8R8G8B8_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
+   case MESA_FORMAT_A_UNORM8:
+   case MESA_FORMAT_L_UNORM8:
+   case MESA_FORMAT_I_UNORM8:
+   break;
+   default:
return 0;
+   }
 }
 
 /* Rendering to small buffer doesn't work.
@@ -106,40 +124,8 @@ static void inline emit_tx_setup(struct r100_context *r100,
 assert(height <= 2048);
 assert(offset % 32 == 0);
 
-/* XXX others?  BE/LE? */
-switch (mesa_format) {
-case MESA_FORMAT_B8G8R8A8_UNORM:
-   txformat |= RADEON_TXFORMAT_ARGB | RADEON_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_A8B8G8R8_UNORM:
-txformat |= RADEON_TXFORMAT_RGBA | 
RADEON_TXFORMAT_ALPHA_IN_MAP;
-break;
-case MESA_FORMAT_B8G8R8X8_UNORM:
-   txformat |= RADEON_TXFORMAT_ARGB;
-   break;
-case MESA_FORMAT_B5G6R5_UNORM:
-   txformat |= RADEON_TXFORMAT_RGB565;
-   break;
-case MESA_FORMAT_B4G4R4A4_UNORM:
-   txformat |= RADEON_TXFORMAT_ARGB | RADEON_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_B5G5R5A1_UNORM:
-   txformat |= RADEON_TXFORMAT_ARGB1555 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_A_UNORM8:
-case MESA_FORMAT_I_UNORM8:
-   txformat |= RADEON_TXFORMAT_I8 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_L_UNORM8:
-txformat |= RADEON_TXFORMAT_I8;
-break;
-case MESA_FORMAT_L8A8_UNORM:
-txformat |= RADEON_TXFORMAT_AI88 | RADEON_TXFORMAT_ALPHA_IN_MAP;
-break;
-default:
-   break;
-}
-
+txformat |= tx_table[mesa_format].format;
+
 if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
offset |= RADEON_TXO_MACRO_TILE;
 if (bo->flags & RADEON_BO_FLAGS_MICRO_TILE)
@@ -186,17 +172,24 @@ static inline void emit_cb_setup(struct r100_context 
*r100,
 
 /* XXX others?  BE/LE? */
 switch (mesa_format) {
+/* le */
 case MESA_FORMAT_B8G8R8A8_UNORM:
 case MESA_FORMAT_B8G8R8X8_UNORM:
+/* be */
+case MESA_FORMAT_A8R8G8B8_UNORM:
+case MESA_FORMAT_X8R8G8B8_UNORM:
dst_format = RADEON_COLOR_FORMAT_ARGB;
break;
 case MESA_FORMAT_B5G6R5_UNORM:
+case MESA_FORMAT_R5G6B5_UNORM:
dst_format = RADEON_COLOR_FORMAT_RGB565;
break;
 case MESA_FORMAT_B4G4R4A4_UNORM:
+case MESA_FORMAT_A4R4G4B4_UNORM:
dst_format = RADEON

[Mesa-dev] [PATCH 4/4] r200: fix some potential big endian issues

2015-07-11 Thread sroland
From: Roland Scheidegger 

The formats chosen (both by the texture format chooser and by fbo storage
allocation) are different for big endian, not just for rgba8 but also for
lower bit width formats (why, I don't actually know). Even the function to
test for renderable formats used different formats; however, the actual
colorbuffer setup did not. And the blitter did not take that into account
either.
Untested (what could possibly go wrong...).
Same as for r100.
---
 src/mesa/drivers/dri/r200/r200_blit.c   | 118 +++-
 src/mesa/drivers/dri/r200/r200_context.h|   4 -
 src/mesa/drivers/dri/r200/r200_state_init.c |   5 +-
 src/mesa/drivers/dri/r200/r200_tex.h|  64 +++
 src/mesa/drivers/dri/r200/r200_texstate.c   |  71 -
 5 files changed, 133 insertions(+), 129 deletions(-)

diff --git a/src/mesa/drivers/dri/r200/r200_blit.c 
b/src/mesa/drivers/dri/r200/r200_blit.c
index 0e6afa0..0cab84b 100644
--- a/src/mesa/drivers/dri/r200/r200_blit.c
+++ b/src/mesa/drivers/dri/r200/r200_blit.c
@@ -28,6 +28,7 @@
 #include "radeon_common.h"
 #include "r200_context.h"
 #include "r200_blit.h"
+#include "r200_tex.h"
 
 static inline uint32_t cmdpacket0(struct radeon_screen *rscrn,
   int reg, int count)
@@ -40,22 +41,42 @@ static inline uint32_t cmdpacket0(struct radeon_screen 
*rscrn,
 /* common formats supported as both textures and render targets */
 unsigned r200_check_blit(mesa_format mesa_format, uint32_t dst_pitch)
 {
-/* XXX others?  BE/LE? */
-switch (mesa_format) {
-case MESA_FORMAT_B8G8R8A8_UNORM:
-case MESA_FORMAT_B8G8R8X8_UNORM:
-case MESA_FORMAT_B5G6R5_UNORM:
-case MESA_FORMAT_B4G4R4A4_UNORM:
-case MESA_FORMAT_B5G5R5A1_UNORM:
-case MESA_FORMAT_A_UNORM8:
-case MESA_FORMAT_L_UNORM8:
-case MESA_FORMAT_I_UNORM8:
-/* swizzled */
-case MESA_FORMAT_A8B8G8R8_UNORM:
-case MESA_FORMAT_R8G8B8A8_UNORM:
+/* XXX others? */
+if (_mesa_little_endian()) {
+   switch (mesa_format) {
+   case MESA_FORMAT_B8G8R8A8_UNORM:
+   case MESA_FORMAT_B8G8R8X8_UNORM:
+   case MESA_FORMAT_B5G6R5_UNORM:
+   case MESA_FORMAT_B4G4R4A4_UNORM:
+   case MESA_FORMAT_B5G5R5A1_UNORM:
+   case MESA_FORMAT_A_UNORM8:
+   case MESA_FORMAT_L_UNORM8:
+   case MESA_FORMAT_I_UNORM8:
+   /* swizzled */
+   case MESA_FORMAT_A8B8G8R8_UNORM:
+   case MESA_FORMAT_R8G8B8A8_UNORM:
break;
-default:
+   default:
return 0;
+   }
+}
+else {
+   switch (mesa_format) {
+   case MESA_FORMAT_A8R8G8B8_UNORM:
+   case MESA_FORMAT_X8R8G8B8_UNORM:
+   case MESA_FORMAT_R5G6B5_UNORM:
+   case MESA_FORMAT_A4R4G4B4_UNORM:
+   case MESA_FORMAT_A1R5G5B5_UNORM:
+   case MESA_FORMAT_A_UNORM8:
+   case MESA_FORMAT_L_UNORM8:
+   case MESA_FORMAT_I_UNORM8:
+   /* swizzled */
+   case MESA_FORMAT_R8G8B8A8_UNORM:
+   case MESA_FORMAT_A8B8G8R8_UNORM:
+  break;
+   default:
+  return 0;
+   }
 }
 
 /* Rendering to small buffer doesn't work.
@@ -112,41 +133,11 @@ static void inline emit_tx_setup(struct r200_context 
*r200,
 assert(height <= 2048);
 assert(offset % 32 == 0);
 
-/* XXX others?  BE/LE? */
-switch (src_mesa_format) {
-case MESA_FORMAT_B8G8R8A8_UNORM:
-   txformat |= R200_TXFORMAT_ARGB | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_A8B8G8R8_UNORM:
-   txformat |= R200_TXFORMAT_RGBA | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_R8G8B8A8_UNORM:
-   txformat |= R200_TXFORMAT_ABGR | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_B8G8R8X8_UNORM:
-   txformat |= R200_TXFORMAT_ARGB;
-   break;
-case MESA_FORMAT_B5G6R5_UNORM:
-   txformat |= R200_TXFORMAT_RGB565;
-   break;
-case MESA_FORMAT_B4G4R4A4_UNORM:
-   txformat |= R200_TXFORMAT_ARGB | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_B5G5R5A1_UNORM:
-   txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_A_UNORM8:
-case MESA_FORMAT_I_UNORM8:
-   txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-case MESA_FORMAT_L_UNORM8:
-   txformat |= R200_TXFORMAT_I8;
-   break;
-case MESA_FORMAT_L8A8_UNORM:
-   txformat |= R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP;
-   break;
-default:
-   break;
+if (_mesa_little_endian()) {
+   txformat |= tx_table_le[src_mesa_format].format;
+}
+else {
+   txformat |= tx_table_be[src_mesa_format].format;
 }
 
 if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
@@ -155,11 +146,19 @@ static void inline emit_tx_setup(struct r200_context 
*r200,
offset |= R200_TXO_MICRO_TILE;
 
 switch (dst_mesa_format) {
+/* le *

[Mesa-dev] [PATCH] mesa: fix up some texture error checks

2015-07-16 Thread sroland
From: Roland Scheidegger 

In particular, we were incorrectly accepting s3tc (and lots of other formats)
for CompressedTexSubImage3D (but not CompressedTexImage3D) calls with 3d
targets. At this time, the only allowed formats for these calls are the bptc
ones, since none of the format-specific extensions allow 3d targets (astc hdr
would). Also, fix up a bug in _mesa_target_can_be_compressed - the 3d target
needs to be allowed for bptc formats.
---
 src/mesa/main/teximage.c   | 41 -
 src/mesa/main/texstorage.c |  1 +
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 3d85615..4389082 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1793,8 +1793,6 @@ GLboolean
 _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
GLenum intFormat)
 {
-   (void) intFormat;  /* not used yet */
-
switch (target) {
case GL_TEXTURE_2D:
case GL_PROXY_TEXTURE_2D:
@@ -1814,6 +1812,16 @@ _mesa_target_can_be_compressed(const struct gl_context 
*ctx, GLenum target,
case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY:
case GL_TEXTURE_CUBE_MAP_ARRAY:
   return ctx->Extensions.ARB_texture_cube_map_array;
+   case GL_TEXTURE_3D:
+  switch (intFormat) {
+  case GL_COMPRESSED_RGBA_BPTC_UNORM:
+  case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+  case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+  case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+ return TRUE;
+  default:
+ return FALSE;
+  }
default:
   return GL_FALSE;
}
@@ -4575,32 +4583,23 @@ compressed_subtexture_target_check(struct gl_context 
*ctx, GLenum target,
*one of the EAC, ETC2, or RGTC formats and either border is
*non-zero, or the effective target for the texture is not
*TEXTURE_2D_ARRAY."
+   * Instead of listing all these, just list those which are allowed,
+   * which is (at this time) only bptc. Otherwise we'd say s3tc (and more)
+   * are valid here, which they are not, but of course not mentioned by
+   * core spec.
*/
   if (target != GL_TEXTURE_2D_ARRAY) {
  bool invalidformat;
  switch (format) {
 /* These came from _mesa_is_compressed_format in glformats.c. */
-/* EAC formats */
-case GL_COMPRESSED_RGBA8_ETC2_EAC:
-case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-case GL_COMPRESSED_R11_EAC:
-case GL_COMPRESSED_RG11_EAC:
-case GL_COMPRESSED_SIGNED_R11_EAC:
-case GL_COMPRESSED_SIGNED_RG11_EAC:
-/* ETC2 formats */
-case GL_COMPRESSED_RGB8_ETC2:
-case GL_COMPRESSED_SRGB8_ETC2:
-case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-/* RGTC formats */
-case GL_COMPRESSED_RED_RGTC1:
-case GL_COMPRESSED_SIGNED_RED_RGTC1:
-case GL_COMPRESSED_RG_RGTC2:
-case GL_COMPRESSED_SIGNED_RG_RGTC2:
-   invalidformat = true;
+case GL_COMPRESSED_RGBA_BPTC_UNORM:
+case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
+case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
+case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
+   invalidformat = false;
break;
 default:
-   invalidformat = false;
+   invalidformat = true;
  }
  if (invalidformat) {
 _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 53cb2c0..aa8fa3e 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -309,6 +309,7 @@ tex_storage_error_check(struct gl_context *ctx,
   GL_INVALID_ENUM : GL_INVALID_OPERATION,
   "glTex%sStorage%dD(internalformat = %s)", suffix, dims,
   _mesa_lookup_enum_by_nr(internalformat));
+  return GL_TRUE;
}
 
/* levels check */
-- 
1.9.1



[Mesa-dev] [PATCH 4/4] gallivm: fix tex offsets with mirror repeat linear

2015-10-22 Thread sroland
From: Roland Scheidegger 

Can't see why anyone would ever want to use this, but it was clearly broken.
This fixes the piglit texwrap offset test using this combination.
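
In scalar terms, the order change below amounts to the following (just an
illustration, the real code emits gallivm IR):

   /* before: the offset was added in texel space after mirroring,
    * so it never went through the mirror function itself */
   coord = mirror(coord) * length - 0.5f + (float)offset;

   /* after: apply the offset in normalized coords first,
    * so it gets mirrored like the coord itself */
   coord = mirror(coord + (float)offset / length) * length - 0.5f;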
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 125505e..26bfa0d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -405,16 +405,17 @@ lp_build_sample_wrap_linear(struct 
lp_build_sample_context *bld,
   break;
 
case PIPE_TEX_WRAP_MIRROR_REPEAT:
+  if (offset) {
+ offset = lp_build_int_to_float(coord_bld, offset);
+ offset = lp_build_div(coord_bld, offset, length_f);
+ coord = lp_build_add(coord_bld, coord, offset);
+  }
   /* compute mirror function */
   coord = lp_build_coord_mirror(bld, coord);
 
   /* scale coord to length */
   coord = lp_build_mul(coord_bld, coord, length_f);
   coord = lp_build_sub(coord_bld, coord, half);
-  if (offset) {
- offset = lp_build_int_to_float(coord_bld, offset);
- coord = lp_build_add(coord_bld, coord, offset);
-  }
 
   /* convert to int, compute lerp weight */
   lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
-- 
2.1.4



[Mesa-dev] [PATCH 2/4] softpipe: fix using non-zero layer in non-array view from array resource

2015-10-22 Thread sroland
From: Roland Scheidegger 

For vertex/geometry shader sampling, this is the same as for llvmpipe - just
use the original resource target.
For fragment shader sampling though (which does not use first-layer based mip
offsets), adjust the sampling code to use first_layer in the non-array cases.
While here, also fix up some code which looked wrong wrt buffer texel fetch
(no piglit change).
---
 src/gallium/drivers/softpipe/sp_state_sampler.c |  8 +++
 src/gallium/drivers/softpipe/sp_tex_sample.c| 32 +
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c 
b/src/gallium/drivers/softpipe/sp_state_sampler.c
index d7a3360..23ec4ef 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -214,10 +214,10 @@ prepare_shader_sampling(
   row_stride[j] = sp_tex->stride[j];
   img_stride[j] = sp_tex->img_stride[j];
}
-   if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-   view->target == PIPE_TEXTURE_2D_ARRAY ||
-   view->target == PIPE_TEXTURE_CUBE ||
-   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+   if (tex->target == PIPE_TEXTURE_1D_ARRAY ||
+   tex->target == PIPE_TEXTURE_2D_ARRAY ||
+   tex->target == PIPE_TEXTURE_CUBE ||
+   tex->target == PIPE_TEXTURE_CUBE_ARRAY) {
   num_layers = view->u.tex.last_layer - 
view->u.tex.first_layer + 1;
   for (j = first_level; j <= last_level; j++) {
  mip_offsets[j] += view->u.tex.first_layer *
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 8a09350..e3e28a3 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -1033,6 +1033,7 @@ img_filter_2d_linear_repeat_POT(const struct 
sp_sampler_view *sp_sview,
   
addr.value = 0;
addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
/* Can we fetch all four at once:
 */
@@ -1081,6 +1082,7 @@ img_filter_2d_nearest_repeat_POT(const struct 
sp_sampler_view *sp_sview,
 
addr.value = 0;
addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
out = get_texel_2d_no_border(sp_sview, addr, x0, y0);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1111,6 +1113,7 @@ img_filter_2d_nearest_clamp_POT(const struct 
sp_sampler_view *sp_sview,
 
addr.value = 0;
addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
x0 = util_ifloor(u);
if (x0 < 0) 
@@ -1154,7 +1157,8 @@ img_filter_1d_nearest(const struct sp_sampler_view 
*sp_sview,
 
sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
 
-   out = get_texel_2d(sp_sview, sp_samp, addr, x, 0);
+   out = get_texel_1d_array(sp_sview, sp_samp, addr, x,
+sp_sview->base.u.tex.first_layer);
for (c = 0; c < TGSI_QUAD_SIZE; c++)
   rgba[TGSI_NUM_CHANNELS*c] = out[c];
 
@@ -1215,6 +1219,7 @@ img_filter_2d_nearest(const struct sp_sampler_view 
*sp_sview,
  
addr.value = 0;
addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
@@ -1396,8 +1401,10 @@ img_filter_1d_linear(const struct sp_sampler_view 
*sp_sview,
 
sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
 
-   tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, 0);
-   tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, 0);
+   tx0 = get_texel_1d_array(sp_sview, sp_samp, addr, x0,
+sp_sview->base.u.tex.first_layer);
+   tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1,
+sp_sview->base.u.tex.first_layer);
 
/* interpolate R, G, B, A */
for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1523,6 +1530,7 @@ img_filter_2d_linear(const struct sp_sampler_view 
*sp_sview,
 
addr.value = 0;
addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
@@ -3252,10 +3260,22 @@ sp_get_texels(const struct sp_sampler_view *sp_sview,
 
switch (sp_sview->base.target) {
case PIPE_BUFFER:
+  for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ const int x = CLAMP(v_i[j] + offset[0] +
+ sp_sview->base.u.buf.first_element,
+ sp_sview->base.u.buf.first_element,
+ sp_sview->base.u.buf.last_element);
+ tx = get_texel_2d_no_border(sp_sview, addr, x, 0);
+ for (c = 0; c < 

[Mesa-dev] [PATCH 1/4] llvmpipe: fix using non-zero layer in non-array view from array resource

2015-10-22 Thread sroland
From: Roland Scheidegger 

Just need to use the resource target, not the view target, when calculating
first-layer based mip offsets. (This is a gl-specific problem since d3d10
does not distinguish between non-array and array resources at either the
resource or the view level, only at the shader level.)
Fixes new piglit arb_texture_view sampling-2d-array-as-2d-layer test.
---
 src/gallium/drivers/llvmpipe/lp_setup.c | 8 
 src/gallium/drivers/llvmpipe/lp_state_sampler.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index 4c8167a..1778b13 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -854,10 +854,10 @@ lp_setup_set_fragment_sampler_views(struct 
lp_setup_context *setup,
  jit_tex->img_stride[j] = lp_tex->img_stride[j];
   }
 
-  if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-  view->target == PIPE_TEXTURE_2D_ARRAY ||
-  view->target == PIPE_TEXTURE_CUBE ||
-  view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+  if (res->target == PIPE_TEXTURE_1D_ARRAY ||
+  res->target == PIPE_TEXTURE_2D_ARRAY ||
+  res->target == PIPE_TEXTURE_CUBE ||
+  res->target == PIPE_TEXTURE_CUBE_ARRAY) {
  /*
   * For array textures, we don't have first_layer, instead
   * adjust last_layer (stored as depth) plus the mip level 
offsets
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index b205f02..1e05587 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -275,10 +275,10 @@ prepare_shader_sampling(
   row_stride[j] = lp_tex->row_stride[j];
   img_stride[j] = lp_tex->img_stride[j];
}
-   if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-   view->target == PIPE_TEXTURE_2D_ARRAY ||
-   view->target == PIPE_TEXTURE_CUBE ||
-   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+   if (tex->target == PIPE_TEXTURE_1D_ARRAY ||
+   tex->target == PIPE_TEXTURE_2D_ARRAY ||
+   tex->target == PIPE_TEXTURE_CUBE ||
+   tex->target == PIPE_TEXTURE_CUBE_ARRAY) {
   num_layers = view->u.tex.last_layer - 
view->u.tex.first_layer + 1;
   for (j = first_level; j <= last_level; j++) {
  mip_offsets[j] += view->u.tex.first_layer *
-- 
2.1.4



[Mesa-dev] [PATCH 3/4] gallivm: fix sampling with texture offsets in SoA path

2015-10-22 Thread sroland
From: Roland Scheidegger 

When using nearest filtering and clamp / clamp to edge wrapping, results
could be wrong for negative offsets. Fix this by adding the offset before
doing the conversion to int coords (we could also use floor instead of the
trunc int conversion, but that is probably more complex on "typical" cpus).

This fixes the piglit texwrap offset failures with this filter/wrap combo
(which only leaves the linear/mirror repeat combination broken).
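
In scalar terms (illustration only - coord is already scaled to texel space
at this point in the code):

   /* before: the offset was added after the int truncation */
   icoord = itrunc(coord) + offset;

   /* after: the offset is added while still in float, then truncated */
   icoord = itrunc(coord + (float)offset);

   /* clamp to [0, length - 1] afterwards, as before */

itrunc is fine here despite rounding towards zero, since negative results
get clamped to 0 anyway (which is also why floor isn't strictly needed).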
---
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index b5c06b6..125505e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -567,12 +567,13 @@ lp_build_sample_wrap_nearest(struct 
lp_build_sample_context *bld,
  coord = lp_build_mul(coord_bld, coord, length_f);
   }
 
+  if (offset) {
+ offset = lp_build_int_to_float(coord_bld, offset);
+ coord = lp_build_add(coord_bld, coord, offset);
+  }
   /* floor */
   /* use itrunc instead since we clamp to 0 anyway */
   icoord = lp_build_itrunc(coord_bld, coord);
-  if (offset) {
- icoord = lp_build_add(int_coord_bld, icoord, offset);
-  }
 
   /* clamp to [0, length - 1]. */
   icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@@ -2586,6 +2587,10 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
   derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
}
+   /*
+* We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
+* so AoS path could be used. Not sure it's worth the trouble...
+*/
 
min_img_filter = derived_sampler_state.min_img_filter;
mag_img_filter = derived_sampler_state.mag_img_filter;
-- 
2.1.4



[Mesa-dev] [PATCH] gallivm: disable f16c when not using AVX

2015-10-23 Thread sroland
From: Roland Scheidegger 

The f16c intrinsic can only be emitted when AVX is used. So when we disable
AVX due to forcing 128bit vectors, we must not use this intrinsic (depending
on the llvm version this worked previously, because llvm used AVX even when
we didn't tell it to; however, I've seen this fail with llvm 3.3 since
718249843b915decf8fccec92e466ac1a6219934, which seems to have the side effect
of disabling avx in llvm, albeit it really only touches sse flags).
Possibly one day we should actually try to use avx even with 128bit vectors...
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 017d075..e6eede8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -427,6 +427,7 @@ lp_build_init(void)
*/
   util_cpu_caps.has_avx = 0;
   util_cpu_caps.has_avx2 = 0;
+  util_cpu_caps.has_f16c = 0;
}
 
 #ifdef PIPE_ARCH_PPC_64
-- 
2.1.4



[Mesa-dev] [PATCH 1/2] radeon: fix bgrx8/xrgb8 blits

2015-11-12 Thread sroland
From: Roland Scheidegger 

Since d21320f6258b2e1780a15c1ca718963d8a15ca18 the same txformat table entries
are used for "normal" texturing as well as for blits. However, I forgot to put
in an entry for the bgrx8 (le) and xrgb8 (be) formats - the normal texturing
path can't hit them because the radeon tex format chooser will never choose
them, but we get that format from the dri buffers (at least I assume we got
it from there). This caused lots of piglit regressions (and probably lots of
trouble outside piglit too).
This fixes bug https://bugs.freedesktop.org/show_bug.cgi?id=92900.

Tested-by: Ian Romanick 
Cc: "11.0" 
---
 src/mesa/drivers/dri/radeon/radeon_tex.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h 
b/src/mesa/drivers/dri/radeon/radeon_tex.h
index f8ec432..37c2fa0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -63,6 +63,8 @@ static const struct tx_table tx_table[] =
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { RADEON_TXFORMAT_RGBA | 
RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { RADEON_TXFORMAT_ARGB | 
RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB | 
RADEON_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8X8_UNORM ] = { RADEON_TXFORMAT_ARGB, 0 },
+   [ MESA_FORMAT_X8R8G8B8_UNORM ] = { RADEON_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { RADEON_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { RADEON_TXFORMAT_RGB565, 0 },
-- 
2.1.4



[Mesa-dev] [PATCH 2/2] r200: fix bgrx8/xrgb8 blits

2015-11-12 Thread sroland
From: Roland Scheidegger 

Since 779cabfc7d022de8b7b9bc7fdac0caffa8646c51 the same txformat table entries
are used for "normal" texturing as well as for blits. However, I forgot to put
in an entry for the bgrx8 (le) and xrgb8 (be) formats - the normal texturing
path can't hit them because the radeon tex format chooser will never choose
them, but we get that format from the dri buffers (at least I assume we got
it from there).
This is untested but essentially addressing the same bug as for radeon.
(I don't think that the second entry per le/be table is actually necessary,
but shouldn't hurt...)

Cc: "11.0" 
---
 src/mesa/drivers/dri/r200/r200_tex.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/r200/r200_tex.h 
b/src/mesa/drivers/dri/r200/r200_tex.h
index a8c31b7..14f5e71 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -63,7 +63,9 @@ static const struct tx_table tx_table_be[] =
[ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_ABGR | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_RGBA | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8X8_UNORM ] = { R200_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_X8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { 0xffffffff, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
@@ -91,7 +93,9 @@ static const struct tx_table tx_table_le[] =
[ MESA_FORMAT_A8B8G8R8_UNORM ] = { R200_TXFORMAT_RGBA | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_R8G8B8A8_UNORM ] = { R200_TXFORMAT_ABGR | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
[ MESA_FORMAT_B8G8R8A8_UNORM ] = { R200_TXFORMAT_ARGB | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_B8G8R8X8_UNORM ] = { R200_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_A8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB | 
R200_TXFORMAT_ALPHA_IN_MAP, 0 },
+   [ MESA_FORMAT_X8R8G8B8_UNORM ] = { R200_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_BGR_UNORM8 ] = { R200_TXFORMAT_ARGB, 0 },
[ MESA_FORMAT_B5G6R5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
[ MESA_FORMAT_R5G6B5_UNORM ] = { R200_TXFORMAT_RGB565, 0 },
-- 
2.1.4



[Mesa-dev] [PATCH] gallium/docs: fix docs wrt ARL/ARR/FLR

2015-01-29 Thread sroland
From: Roland Scheidegger 

Since the address reg holds integer values, ARL/ARR do an implicit
float-to-int conversion, so clarify that. Thus it is also incorrect to say
that FLR really does the same as ARL.
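
For example, with src.x = 1.7: ARL yields dst.x = (int) floor(1.7) = 1 and
ARR yields dst.x = (int) round(1.7) = 2, both stored as integers, whereas
FLR produces the float 1.0 - so the (int) conversion added here is exactly
what distinguishes the address ops from FLR.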
---
 src/gallium/docs/source/tgsi.rst | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index ff322e8..84b0ed6 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -48,13 +48,13 @@ used.
 
 .. math::
 
-  dst.x = \lfloor src.x\rfloor
+  dst.x = (int) \lfloor src.x\rfloor
 
-  dst.y = \lfloor src.y\rfloor
+  dst.y = (int) \lfloor src.y\rfloor
 
-  dst.z = \lfloor src.z\rfloor
+  dst.z = (int) \lfloor src.z\rfloor
 
-  dst.w = \lfloor src.w\rfloor
+  dst.w = (int) \lfloor src.w\rfloor
 
 
 .. opcode:: MOV - Move
@@ -313,8 +313,6 @@ This instruction replicates its result.
 
 .. opcode:: FLR - Floor
 
-This is identical to :opcode:`ARL`.
-
 .. math::
 
   dst.x = \lfloor src.x\rfloor
@@ -637,13 +635,13 @@ This instruction replicates its result.
 
 .. math::
 
-  dst.x = round(src.x)
+  dst.x = (int) round(src.x)
 
-  dst.y = round(src.y)
+  dst.y = (int) round(src.y)
 
-  dst.z = round(src.z)
+  dst.z = (int) round(src.z)
 
-  dst.w = round(src.w)
+  dst.w = (int) round(src.w)
 
 
 .. opcode:: SSG - Set Sign
-- 
1.9.1



[Mesa-dev] [PATCH] mesa: don't enable NV_fragment_program_option with swrast

2015-02-14 Thread sroland
From: Roland Scheidegger 

Since dropping some NV_fragment_program opcodes (commits
868f95f1da74cf6dd7468cba1b56664aad585ccb, 
a3688d686f147f4252d19b298ae26d4ac72c2e08)
we can no longer parse all opcodes necessary for this extension, leading
to bugs (https://bugs.freedesktop.org/show_bug.cgi?id=86980).
Hence don't announce support for it in swrast.
(The rest of the infrastructure, ranging from dead opcodes (TXP_NV) to
parsing of condition codes, along with the NV_fragment_program_option
variable in the context, could probably be nuked too; this is just a minimal
fix, appropriate for stable as well.)
---
 src/mesa/main/extensions.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 220b220..c94009f 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -511,7 +511,6 @@ _mesa_enable_sw_extensions(struct gl_context *ctx)
ctx->Extensions.NV_point_sprite = GL_TRUE;
ctx->Extensions.NV_texture_env_combine4 = GL_TRUE;
ctx->Extensions.NV_texture_rectangle = GL_TRUE;
-   ctx->Extensions.NV_fragment_program_option = GL_TRUE;
ctx->Extensions.EXT_gpu_program_parameters = GL_TRUE;
ctx->Extensions.OES_standard_derivatives = GL_TRUE;
ctx->Extensions.TDFX_texture_compression_FXT1 = GL_TRUE;
-- 
1.9.1



[Mesa-dev] [PATCH 2/2] gallium/auxiliary: optimize rgb9e5 helper some more

2015-08-09 Thread sroland
From: Roland Scheidegger 

I used this as a testing ground for investigating some compiler bits
initially (e.g. lrint calls etc.), and figured I could do much better in
the end, just for fun...
This is mathematically equivalent, but uses some tricks to avoid doubles
and also replaces some float math with ints. Good for another performance
doubling or so. As a side note, some quick tests show that llvm's loop
vectorizer would be able to properly vectorize this version (which it
failed to do earlier due to the doubles, producing a mess), giving another
3 times performance increase with sse2 (more with sse4.1), but this may
not apply to mesa.
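
The raw-bit clamp relies on the usual IEEE754 ordering trick; here is a
standalone C sketch of just that part (assuming IEEE754 single precision
floats, which the helper requires anyway):

   #include <stdint.h>
   #include <string.h>

   /* positive finite floats compare like their (unsigned) bit patterns,
    * and anything above 0x7f800000 (the pattern of +inf) is either a NaN
    * or has the sign bit set - i.e. everything we clamp to zero */
   static uint32_t clamp_raw(float x, float maxval)
   {
      uint32_t f, m;
      memcpy(&f, &x, sizeof f);
      memcpy(&m, &maxval, sizeof m);
      if (f > 0x7f800000u)
         return 0;               /* negatives and NaNs */
      return f >= m ? m : f;     /* too large (incl. +inf) clamps to max */
   }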
---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 87 ++--
 1 file changed, 42 insertions(+), 45 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h 
b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index d11bfa8..21feba7 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -74,62 +74,59 @@ typedef union {
} field;
 } rgb9e5;
 
-static inline float rgb9e5_ClampRange(float x)
-{
-   if (x > 0.0f) {
-  if (x >= MAX_RGB9E5) {
- return MAX_RGB9E5;
-  } else {
- return x;
-  }
-   } else {
-  /* NaN gets here too since comparisons with NaN always fail! */
-  return 0.0f;
-   }
-}
 
-/* Ok, FloorLog2 is not correct for the denorm and zero values, but we
-   are going to do a max of this value with the minimum rgb9e5 exponent
-   that will hide these problem cases. */
-static inline int rgb9e5_FloorLog2(float x)
+static inline int rgb9e5_ClampRange(float x)
 {
float754 f;
-
+   float754 max;
f.value = x;
-   return (f.field.biasedexponent - 127);
+   max.value = MAX_RGB9E5;
+
+   if (f.raw > 0x7f800000)
+  /* catches neg, NaNs */
+  return 0;
+   else if (f.raw >= max.raw)
+  return max.raw;
+   else
+  return f.raw;
 }
 
 static inline unsigned float3_to_rgb9e5(const float rgb[3])
 {
rgb9e5 retval;
-   float maxrgb;
-   int rm, gm, bm;
-   float rc, gc, bc;
-   int exp_shared, maxm;
+   int rm, gm, bm, exp_shared;
float754 revdenom = {0};
-
-   rc = rgb9e5_ClampRange(rgb[0]);
-   gc = rgb9e5_ClampRange(rgb[1]);
-   bc = rgb9e5_ClampRange(rgb[2]);
-
-   maxrgb = MAX3(rc, gc, bc);
-   exp_shared = MAX2(-RGB9E5_EXP_BIAS - 1, rgb9e5_FloorLog2(maxrgb)) + 1 + 
RGB9E5_EXP_BIAS;
+   float754 rc, bc, gc, maxrgb;
+
+   rc.raw = rgb9e5_ClampRange(rgb[0]);
+   gc.raw = rgb9e5_ClampRange(rgb[1]);
+   bc.raw = rgb9e5_ClampRange(rgb[2]);
+   maxrgb.raw = MAX3(rc.raw, gc.raw, bc.raw);
+
+   /*
+* Compared to what the spec suggests, instead of conditionally adjusting
+* the exponent after the fact do it here by doing the equivalent of +0.5 -
+* the int add will spill over into the exponent in this case.
+*/
+   maxrgb.raw += maxrgb.raw & (1 << (23-9));
+   exp_shared = MAX2((maxrgb.raw >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
+1 + RGB9E5_EXP_BIAS - 127;
+   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS -
+  RGB9E5_MANTISSA_BITS) + 1;
assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   assert(exp_shared >= 0);
-   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - 
RGB9E5_MANTISSA_BITS);
-
-   maxm = (int) (maxrgb * revdenom.value + 0.5);
-   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
-  revdenom.value *= 0.5f;
-  exp_shared += 1;
-  assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
-   } else {
-  assert(maxm <= MAX_RGB9E5_MANTISSA);
-   }
-
-   rm = (int) (rc * revdenom.value + 0.5);
-   gm = (int) (gc * revdenom.value + 0.5);
-   bm = (int) (bc * revdenom.value + 0.5);
+
+   /*
+* The spec uses strict round-up behavior (d3d10 disagrees, but in any case
+* must match what is done above for figuring out exponent).
+* We avoid the doubles ((int) rc * revdenom + 0.5) by doing the rounding
+* ourselves (revdenom was adjusted by +1, above).
+*/
+   rm = (int) (rc.value * revdenom.value);
+   gm = (int) (gc.value * revdenom.value);
+   bm = (int) (bc.value * revdenom.value);
+   rm = (rm & 1) + (rm >> 1);
+   gm = (gm & 1) + (gm >> 1);
+   bm = (bm & 1) + (bm >> 1);
 
assert(rm <= MAX_RGB9E5_MANTISSA);
assert(gm <= MAX_RGB9E5_MANTISSA);
-- 
1.9.1



[Mesa-dev] [PATCH 1/2] gallium/auxiliary: optimize rgb9e5 helper a bit

2015-08-09 Thread sroland
From: Roland Scheidegger 

This code (lifted straight from the extension) was doing things the most
inefficient way you could think of.
This drops some of the more expensive float operations, in particular
- int-cast floors (pointless, values always positive)
- 2 raised to (signed) integers (replace with simple exponent manipulation),
  getting rid of a misguided comment in the process (implement with table...)
- float division (replace with mul by the reciprocal of those exponents)
This is like 3 times faster (measured for float3_to_rgb9e5), though it depends
(e.g. llvm is clever enough to replace exp2 with ldexp whereas gcc is not,
division is not too bad on cpus with early-exit divs).
Note that we keep the double math for now (float x + 0.5), as the results
may otherwise differ.
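
The exponent manipulation boils down to constructing 2^e directly; a minimal
standalone sketch (assuming IEEE754 single precision and an exponent within
the normal range, which holds for the values used here):

   #include <assert.h>
   #include <stdint.h>

   typedef union {
      float value;
      uint32_t raw;
   } float32_bits;

   /* equivalent to exp2f(e) for normal-range e, without calling libm */
   static float fast_exp2i(int e)
   {
      float32_bits f;
      assert(e >= -126 && e <= 127);       /* normals only, no denorms */
      f.raw = (uint32_t)(e + 127) << 23;   /* sign = 0, mantissa = 0 */
      return f.value;
   }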
---
 src/gallium/auxiliary/util/u_format_rgb9e5.h | 35 ++--
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_rgb9e5.h 
b/src/gallium/auxiliary/util/u_format_rgb9e5.h
index 59fc291..d11bfa8 100644
--- a/src/gallium/auxiliary/util/u_format_rgb9e5.h
+++ b/src/gallium/auxiliary/util/u_format_rgb9e5.h
@@ -21,7 +21,8 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-/* Copied from EXT_texture_shared_exponent and edited. */
+/* Copied from EXT_texture_shared_exponent and edited, getting rid of
+ * expensive float math bits too. */
 
 #ifndef RGB9E5_H
 #define RGB9E5_H
@@ -39,7 +40,6 @@
 #define RGB9E5_MANTISSA_VALUES   (1<= 0);
-   /* This exp2 function could be replaced by a table. */
-   denom = exp2(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS);
+   revdenom.field.biasedexponent = 127 - (exp_shared - RGB9E5_EXP_BIAS - 
RGB9E5_MANTISSA_BITS);
 
-   maxm = (int) floor(maxrgb / denom + 0.5);
-   if (maxm == MAX_RGB9E5_MANTISSA+1) {
-  denom *= 2;
+   maxm = (int) (maxrgb * revdenom.value + 0.5);
+   if (maxm == MAX_RGB9E5_MANTISSA + 1) {
+  revdenom.value *= 0.5f;
   exp_shared += 1;
   assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
} else {
   assert(maxm <= MAX_RGB9E5_MANTISSA);
}
 
-   rm = (int) floor(rc / denom + 0.5);
-   gm = (int) floor(gc / denom + 0.5);
-   bm = (int) floor(bc / denom + 0.5);
+   rm = (int) (rc * revdenom.value + 0.5);
+   gm = (int) (gc * revdenom.value + 0.5);
+   bm = (int) (bc * revdenom.value + 0.5);
 
assert(rm <= MAX_RGB9E5_MANTISSA);
assert(gm <= MAX_RGB9E5_MANTISSA);
@@ -151,15 +150,15 @@ static inline void rgb9e5_to_float3(unsigned rgb, float 
retval[3])
 {
rgb9e5 v;
int exponent;
-   float scale;
+   float754 scale = {0};
 
v.raw = rgb;
exponent = v.field.biasedexponent - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS;
-   scale = exp2f(exponent);
+   scale.field.biasedexponent = exponent + 127;
 
-   retval[0] = v.field.r * scale;
-   retval[1] = v.field.g * scale;
-   retval[2] = v.field.b * scale;
+   retval[0] = v.field.r * scale.value;
+   retval[1] = v.field.g * scale.value;
+   retval[2] = v.field.b * scale.value;
 }
 
 #endif
-- 
1.9.1



[Mesa-dev] [PATCH] draw: initialize shader inputs

2016-10-11 Thread sroland
From: Roland Scheidegger 

This should make the code more robust if a shader tries to use inputs which
aren't defined by the vertex element layout (which usually shouldn't happen).

No piglit change.
---
 src/gallium/auxiliary/draw/draw_llvm.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 87951fa..4270a8f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1705,6 +1705,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
   lp_build_printf(gallivm, " --- io %d = %p, loop counter %d\n",
   io_itr, io, lp_loop.counter);
 #endif
+
+  for (j = draw->pt.nr_vertex_elements; j < PIPE_MAX_SHADER_INPUTS; j++) {
+ for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
+inputs[j][i] = lp_build_zero(gallivm, vs_type);
+ }
+  }
+
   for (i = 0; i < vector_length; ++i) {
  LLVMValueRef vert_index =
 LLVMBuildAdd(builder,
-- 
2.7.4



[Mesa-dev] [PATCH] draw: improve vertex fetch

2016-10-11 Thread sroland
From: Roland Scheidegger 

The per-element fetch has quite some calculations which are constant;
these can be moved outside both the per-element and the main shader loop
(llvm can mostly figure out on its own that they are constant, however
this can have a significant compile time cost).
Similarly, it turns out easier to swap the fetch loops (outer loop per
attrib, inner loop filling up the per vertex elements - this way the
aos->soa conversion can also be done per attrib and not just at the end,
though again this doesn't really make much of a difference in the generated
code). (This would also make it possible to vectorize the calculations
leading to the fetches.)
There's also some minimal change simplifying the overflow math slightly.
All in all, the generated code seems to look slightly simpler (depending
on the actual vs), but more importantly I've seen up to a 4 times reduction
in compile times for some vs (albeit with an old (3.3) llvm version - newer
versions tend to do alias analysis faster).

No changes with piglit.
---
 src/gallium/auxiliary/draw/draw_llvm.c | 188 +++--
 .../auxiliary/gallivm/lp_bld_arit_overflow.c   |  24 +++
 .../auxiliary/gallivm/lp_bld_arit_overflow.h   |   6 +
 3 files changed, 133 insertions(+), 85 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 4270a8f..c1f23f0 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -659,80 +659,37 @@ generate_vs(struct draw_llvm_variant *variant,
 static void
 generate_fetch(struct gallivm_state *gallivm,
struct draw_context *draw,
-   LLVMValueRef vbuffers_ptr,
+   const struct util_format_description *format_desc,
+   LLVMValueRef vb_stride,
+   LLVMValueRef stride_fixed,
+   LLVMValueRef map_ptr,
+   LLVMValueRef buffer_size_adj,
+   LLVMValueRef ofbit,
LLVMValueRef *res,
-   struct pipe_vertex_element *velem,
-   LLVMValueRef vbuf,
-   LLVMValueRef index,
-   LLVMValueRef instance_id,
-   LLVMValueRef start_instance)
+   LLVMValueRef index)
 {
-   const struct util_format_description *format_desc =
-  util_format_description(velem->src_format);
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices =
-  LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
-   velem->vertex_buffer_index, 0);
-   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
-   &indices, 1, "");
-   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
-   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
-   LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
-   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
LLVMValueRef stride;
LLVMValueRef buffer_overflowed;
-   LLVMValueRef needed_buffer_size;
LLVMValueRef temp_ptr =
   lp_build_alloca(gallivm,
   lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
-   LLVMValueRef ofbit = NULL;
struct lp_build_if_state if_ctx;
 
-   if (velem->instance_divisor) {
-  /* Index is equal to the start instance plus the number of current 
-   * instance divided by the divisor. In this case we compute it as:
-   * index = start_instance + (instance_id  / divisor)
-   */
-  LLVMValueRef current_instance;
-  current_instance = LLVMBuildUDiv(builder, instance_id,
-   lp_build_const_int32(gallivm, 
velem->instance_divisor),
-   "instance_divisor");
-  index = lp_build_uadd_overflow(gallivm, start_instance,
- current_instance, &ofbit);
-   }
-
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
-   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
-   stride = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
-   needed_buffer_size = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm,
-   util_format_get_blocksize(velem->src_format)),
-  &ofbit);
+   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
 
buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
- needed_buffer_size, buffer_size,
+ stride, buffer_size_adj,
  "buffer_overflowed");
buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
-#if 0
-   lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
- 

[Mesa-dev] [PATCH] draw: improved handling of undefined inputs

2016-10-13 Thread sroland
From: Roland Scheidegger 

Previous attempts to zero initialize all inputs were not really optimal
(though no performance impact was measurable). In fact this is not really
necessary, since we know the max number of inputs used.
Instead, just generate the fetch for up to the max number of inputs used by
the shader, directly replacing inputs for which there is no vertex element
with zero.
This also cleans up key generation, which previously would have stored some
garbage for these elements.
Also, replace the assertion which indicated such bogus usage with a
debug_printf (the whole point of initializing the undefined inputs was to
make this case safe to handle).
---
 src/gallium/auxiliary/draw/draw_llvm.c | 53 --
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 4270a8f..3b56856 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -689,6 +689,11 @@ generate_fetch(struct gallivm_state *gallivm,
LLVMValueRef ofbit = NULL;
struct lp_build_if_state if_ctx;
 
+   if (velem->src_format == PIPE_FORMAT_NONE) {
+  *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
+  return;
+   }
+
if (velem->instance_divisor) {
   /* Index is equal to the start instance plus the number of current 
* instance divided by the divisor. In this case we compute it as:
@@ -1706,12 +1711,6 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
   io_itr, io, lp_loop.counter);
 #endif
 
-  for (j = draw->pt.nr_vertex_elements; j < PIPE_MAX_SHADER_INPUTS; j++) {
- for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
-inputs[j][i] = lp_build_zero(gallivm, vs_type);
- }
-  }
-
   for (i = 0; i < vector_length; ++i) {
  LLVMValueRef vert_index =
 LLVMBuildAdd(builder,
@@ -1765,7 +1764,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
 gallivm->builder, true_index_array, true_index,
 lp_build_const_int32(gallivm, i), "");
 
- for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
+ for (j = 0; j < key->nr_vertex_elements; ++j) {
 struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
 LLVMValueRef vb_index =
lp_build_const_int32(gallivm, velem->vertex_buffer_index);
@@ -1776,7 +1775,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
  }
   }
   convert_to_soa(gallivm, aos_attribs, inputs,
- draw->pt.nr_vertex_elements, vs_type);
+ key->nr_vertex_elements, vs_type);
 
   /* In the paths with elts vertex id has to be unaffected by the
* index bias and because indices inside our elements array have
@@ -1873,15 +1872,6 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char 
*store)
 
key->clamp_vertex_color = llvm->draw->rasterizer->clamp_vertex_color; /**/
 
-   /* Presumably all variants of the shader should have the same
-* number of vertex elements - ie the number of shader inputs.
-* NOTE: we NEED to store the needed number of needed inputs
-* here, not the number of provided elements to match keysize
-* (and the offset of sampler state in the key).
-*/
-   key->nr_vertex_elements = 
llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_INPUT] + 1;
-   assert(key->nr_vertex_elements <= llvm->draw->pt.nr_vertex_elements);
-
/* will have to rig this up properly later */
key->clip_xy = llvm->draw->clip_xy;
key->clip_z = llvm->draw->clip_z;
@@ -1907,13 +1897,34 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char 
*store)
   key->nr_sampler_views = key->nr_samplers;
}
 
-   draw_sampler = draw_llvm_variant_key_samplers(key);
-
+   /* Presumably all variants of the shader should have the same
+* number of vertex elements - ie the number of shader inputs.
+* NOTE: we NEED to store the needed number of needed inputs
+* here, not the number of provided elements to match keysize
+* (and the offset of sampler state in the key).
+* If we have excess number of vertex elements, this is valid,
+* but the excess ones don't matter.
+* If we don't have enough vertex elements (which looks not really
+* valid but we'll handle it gracefully) fill out missing ones with
+* zero (we'll recognize these later by PIPE_FORMAT_NONE).
+*/
+   key->nr_vertex_elements =
+  llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_INPUT] + 1;
+
+   if (llvm->draw->pt.nr_vertex_elements < key->nr_vertex_elements) {
+  debug_printf("draw: vs with %d inputs but only have %d vertex 
elements\n",
+   key->nr_vertex_elements, llvm->draw->pt.nr_vertex_elements);
+  memset(key->vertex_element, 0,
+ sizeof(struct pipe_vertex_element) * key->nr_v

[Mesa-dev] [PATCH] gallivm: print out time for jitting functions with GALLIVM_DEBUG=perf

2016-10-13 Thread sroland
From: Roland Scheidegger 

Compilation to actual machine code can easily take as much time as the
optimization passes on the IR, if not more, so print out this time too.
---
 src/gallium/auxiliary/gallivm/lp_bld_init.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c 
b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 7114cde..d1b2369 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -659,13 +659,24 @@ gallivm_jit_function(struct gallivm_state *gallivm,
 {
void *code;
func_pointer jit_func;
+   int64_t time_begin = 0;
 
assert(gallivm->compiled);
assert(gallivm->engine);
 
+   if (gallivm_debug & GALLIVM_DEBUG_PERF)
+  time_begin = os_time_get();
+
code = LLVMGetPointerToGlobal(gallivm->engine, func);
assert(code);
jit_func = pointer_to_func(code);
 
+   if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+  int64_t time_end = os_time_get();
+  int time_msec = (int)(time_end - time_begin) / 1000;
+  debug_printf("   jitting func %s took %d msec\n",
+   LLVMGetValueName(func), time_msec);
+   }
+
return jit_func;
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: improve vertex fetch (v2)

2016-10-14 Thread sroland
From: Roland Scheidegger 

The per-element fetch has quite a few calculations which are constant;
these can be moved outside both the per-element as well as the main
shader loop (llvm can mostly figure out they're constant on its own,
however this can have a significant compile time cost).
Similarly, it looks easier to swap the fetch loops (outer loop per
attrib, inner loop filling up the per-vertex elements) - this way the
aos->soa conversion can also be done per attrib and not just at the end,
though again this doesn't really make much of a difference in the
generated code. (This would also make it possible to vectorize the
calculations leading to the fetches.)
There's also some minimal change simplifying the overflow math slightly.
All in all, the generated code seems to look slightly simpler (depending
on the actual vs), but more importantly I've seen a significant reduction
in compile times for some vs (albeit with an old (3.3) llvm version, and the
time reduction is only really for the optimizations run on the IR).
v2: adapt to other draw change.

No changes with piglit.
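
A rough sketch of the loop swap in plain C (illustrative stand-ins for
the real fetch machinery, which of course emits LLVM IR instead):

   enum { MAX_ELTS = 32 };

   extern unsigned constant_offset_math(unsigned attrib);
   extern float fetch_attrib(const void *map, unsigned index,
                             unsigned offset);

   void fetch_all_attribs(float aos[][MAX_ELTS], const void *map[],
                          const unsigned index[],
                          unsigned nr_vertex_elements,
                          unsigned vector_length)
   {
      /* outer loop per attrib, inner loop per vertex */
      for (unsigned j = 0; j < nr_vertex_elements; ++j) {
         /* hoisted: constant math runs once per attrib, not once per
          * vertex/attrib pair */
         unsigned offset = constant_offset_math(j);
         for (unsigned i = 0; i < vector_length; ++i)
            aos[i][j] = fetch_attrib(map[j], index[i], offset);
         /* aos->soa conversion could now also happen here, per attrib */
      }
   }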
---
 src/gallium/auxiliary/draw/draw_llvm.c | 190 +++--
 .../auxiliary/gallivm/lp_bld_arit_overflow.c   |  24 +++
 .../auxiliary/gallivm/lp_bld_arit_overflow.h   |   6 +
 3 files changed, 134 insertions(+), 86 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 3b56856..2f82d9d 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -659,85 +659,42 @@ generate_vs(struct draw_llvm_variant *variant,
 static void
 generate_fetch(struct gallivm_state *gallivm,
struct draw_context *draw,
-   LLVMValueRef vbuffers_ptr,
+   const struct util_format_description *format_desc,
+   LLVMValueRef vb_stride,
+   LLVMValueRef stride_fixed,
+   LLVMValueRef map_ptr,
+   LLVMValueRef buffer_size_adj,
+   LLVMValueRef ofbit,
LLVMValueRef *res,
-   struct pipe_vertex_element *velem,
-   LLVMValueRef vbuf,
-   LLVMValueRef index,
-   LLVMValueRef instance_id,
-   LLVMValueRef start_instance)
+   LLVMValueRef index)
 {
-   const struct util_format_description *format_desc =
-  util_format_description(velem->src_format);
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef indices =
-  LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
-   velem->vertex_buffer_index, 0);
-   LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
-   &indices, 1, "");
-   LLVMValueRef vb_stride = draw_jit_vbuffer_stride(gallivm, vbuf);
-   LLVMValueRef vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vbuf);
-   LLVMValueRef map_ptr = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
-   LLVMValueRef buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
LLVMValueRef stride;
LLVMValueRef buffer_overflowed;
-   LLVMValueRef needed_buffer_size;
LLVMValueRef temp_ptr =
   lp_build_alloca(gallivm,
   lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
-   LLVMValueRef ofbit = NULL;
struct lp_build_if_state if_ctx;
 
-   if (velem->src_format == PIPE_FORMAT_NONE) {
+   if (format_desc->format == PIPE_FORMAT_NONE) {
   *res = lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
   return;
}
 
-   if (velem->instance_divisor) {
-  /* Index is equal to the start instance plus the number of current 
-   * instance divided by the divisor. In this case we compute it as:
-   * index = start_instance + (instance_id  / divisor)
-   */
-  LLVMValueRef current_instance;
-  current_instance = LLVMBuildUDiv(builder, instance_id,
-   lp_build_const_int32(gallivm, 
velem->instance_divisor),
-   "instance_divisor");
-  index = lp_build_uadd_overflow(gallivm, start_instance,
- current_instance, &ofbit);
-   }
-
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
-   stride = lp_build_uadd_overflow(gallivm, stride, vb_buffer_offset, &ofbit);
-   stride = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm, velem->src_offset), &ofbit);
-   needed_buffer_size = lp_build_uadd_overflow(
-  gallivm, stride,
-  lp_build_const_int32(gallivm,
-   util_format_get_blocksize(velem->src_format)),
-  &ofbit);
+   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
 
buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
- needed_buffer_size, buffer_size,
+

[Mesa-dev] [PATCH 1/2] llvmpipe: fix depth clamping wrt reversed near/far values

2016-08-14 Thread sroland
From: Roland Scheidegger 

This wasn't handled before (the result was that, if clamping actually
happened, whatever value got clamped always ended up as the near value
in this case).
Fix this by using the util helper for that (the math is otherwise "mostly"
the same - there could be differences due to float rounding, but I don't
even know which one would be more correct).
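
The helper presumably boils down to the old computation plus a min/max so
a negative scale[2] (reversed near/far) works - a sketch, not the verbatim
util code (translate_z/scale_z as in pipe_viewport_state):

   static void
   viewport_zmin_zmax(float translate_z, float scale_z, boolean halfz,
                      float *zmin, float *zmax)
   {
      float a, b;
      if (halfz) {
         a = translate_z;            /* [0,1] convention */
         b = translate_z + scale_z;
      } else {
         a = translate_z - scale_z;  /* [-1,1] convention */
         b = translate_z + scale_z;
      }
      /* with reversed near/far we get a > b - min/max sorts that out */
      *zmin = a < b ? a : b;
      *zmax = a < b ? b : a;
   }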
---
 src/gallium/drivers/llvmpipe/lp_setup.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index 0a45db9..7cb4f7b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -39,6 +39,7 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
+#include "util/u_viewport.h"
 #include "draw/draw_pipe.h"
 #include "os/os_time.h"
 #include "lp_context.h"
@@ -771,15 +772,8 @@ lp_setup_set_viewports(struct lp_setup_context *setup,
for (i = 0; i < num_viewports; i++) {
   float min_depth;
   float max_depth;
-
-  if (lp->rasterizer->clip_halfz == 0) {
- float half_depth = viewports[i].scale[2];
- min_depth = viewports[i].translate[2] - half_depth;
- max_depth = min_depth + half_depth * 2.0f;
-  } else {
- min_depth = viewports[i].translate[2];
- max_depth = min_depth + viewports[i].scale[2];
-  }
+  util_viewport_zmin_zmax(&viewports[i], lp->rasterizer->clip_halfz,
+  &min_depth, &max_depth);
 
   if (setup->viewports[i].min_depth != min_depth ||
   setup->viewports[i].max_depth != max_depth) {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] llvmpipe: fix issues with depth clamp

2016-08-14 Thread sroland
From: Roland Scheidegger 

We only did depth clamp when the value was written from the fs.
This is very wrong both for d3d10 and GL, and only passed the
corresponding piglit test due to pure luck (it no longer does
with the enhanced test).
Also, interpolation always clamped values to 1.0, even though values above
1.0 can legitimately occur if depth clip is disabled, so fix that as well
(untested).
There is one unresolved issue left, d3d10 always does depth clamping,
whereas GL does not (but does [0,1] clamp instead for fs depth outputs)
- this information isn't in any gallium state object, leave it as-is
for now (though it looks like llvmpipe misses the [0,1] clamp as well).
This (with the previous patch) fixes piglit depth-clamp-range test.
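
For reference, a sketch of the per-fragment depth handling this aims for
(zmin/zmax being the viewport depth range; the d3d10 vs GL difference is
exactly the unresolved issue mentioned above):

   static float
   clamp_frag_depth(float z, float zmin, float zmax,
                    boolean depth_clamp, boolean z_written_by_fs)
   {
      if (depth_clamp) {
         /* d3d10: always; GL: only with DEPTH_CLAMP enabled */
         z = z < zmin ? zmin : z;
         z = z > zmax ? zmax : z;
      } else if (z_written_by_fs) {
         /* the GL [0,1] clamp for fs depth outputs (the one llvmpipe
          * apparently misses as well) */
         z = z < 0.0f ? 0.0f : z;
         z = z > 1.0f ? 1.0f : z;
      }
      return z;
   }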
---
 src/gallium/drivers/llvmpipe/lp_bld_interp.c |  20 -
 src/gallium/drivers/llvmpipe/lp_bld_interp.h |   2 +
 src/gallium/drivers/llvmpipe/lp_state_fs.c   | 115 ---
 3 files changed, 88 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c 
b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 8e4f029..87a5417 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -338,10 +338,15 @@ attribs_update_simple(struct lp_build_interp_soa_context 
*bld,
break;
 }
 
-if ((attrib == 0) && (chan == 2)){
+if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
/* FIXME: Depth values can exceed 1.0, due to the fact that
 * setup interpolation coefficients refer to (0,0) which causes
-* precision loss. So we must clamp to 1.0 here to avoid 
artifacts
+* precision loss. So we must clamp to 1.0 here to avoid 
artifacts.
+* Note though values outside [0,1] are perfectly valid with
+* depth clip disabled.
+* XXX: If depth clip is disabled but we force depth clamp
+* we may get values larger than 1.0 in the fs (but not in
+* depth test). Not sure if that's an issue...
 */
a = lp_build_min(coeff_bld, a, coeff_bld->one);
 }
@@ -633,10 +638,15 @@ attribs_update(struct lp_build_interp_soa_context *bld,
}
 #endif
 
-   if (attrib == 0 && chan == 2) {
+   if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
   /* FIXME: Depth values can exceed 1.0, due to the fact that
* setup interpolation coefficients refer to (0,0) which 
causes
-   * precision loss. So we must clamp to 1.0 here to avoid 
artifacts
+   * precision loss. So we must clamp to 1.0 here to avoid 
artifacts.
+   * Note though values outside [0,1] are perfectly valid with
+   * depth clip disabled..
+   * XXX: If depth clip is disabled but we force depth clamp
+   * we may get values larger than 1.0 in the fs (but not in
+   * depth test). Not sure if that's an issue...
*/
   a = lp_build_min(coeff_bld, a, coeff_bld->one);
}
@@ -677,6 +687,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context 
*bld,
  unsigned num_inputs,
  const struct lp_shader_input *inputs,
  boolean pixel_center_integer,
+ boolean depth_clamp,
  LLVMBuilderRef builder,
  struct lp_type type,
  LLVMValueRef a0_ptr,
@@ -738,6 +749,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context 
*bld,
} else {
   bld->pos_offset = 0.5;
}
+   bld->depth_clamp = depth_clamp;
 
pos_init(bld, x0, y0);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h 
b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 9029d2a..1b9ef5e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -85,6 +85,7 @@ struct lp_build_interp_soa_context
unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITE_MASK_x */
enum lp_interp interp[1 + PIPE_MAX_SHADER_INPUTS];
boolean simple_interp;
+   boolean depth_clamp;
 
double pos_offset;
 
@@ -116,6 +117,7 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context 
*bld,
  unsigned num_inputs,
  const struct lp_shader_input *inputs,
  boolean pixel_center_integer,
+ boolean depth_clamp,
  LLVMBuilderRef builder,
  struct lp_type type,
  LLVMValueRef a0_ptr,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b110202..3428eed 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.

[Mesa-dev] [PATCH] gallivm: Use native packs and unpacks for the lerps

2016-10-17 Thread sroland
From: Roland Scheidegger 

For the texturing packs, things looked pretty terrible. For every
lerp, we were repacking the values, and while those look sort of cheap
with 128bit, with 256bit we end up with 2 of them instead of just 1 but
worse, plus 2 extracts too (the unpack, however, works fine with a
single instruction, albeit only with llvm 3.8 - the vpmovzxbw).

Ideally we'd use a more clever pack for llvmpipe backend conversion too
since we actually use the "wrong" shuffle (which is more work) when doing
the fs twiddle just so we end up with the wrong order for being able to
do native pack when converting from 2x8f -> 1x16b. But this requires some
refactoring, since the untwiddle is separate from conversion.

This is only used for avx2 256bit pack/unpack for now.

Improves openarena scores by 8% or so, though overall it's still pretty
disappointing how much faster 256bit vectors are even with avx2 (or
rather, aren't...). And, of course, eliminating the needless
packs/unpacks in the first place would eliminate most of that advantage
(not quite all) from this patch.
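
To illustrate what "native" buys: avx2 pack/unpack instructions work per
128bit lane, so the element order differs from the logical whole-vector
order - a sketch for a 16x16bit -> 2x 8x32bit unpack (lane boundary
marked with |; assumed from the per-lane punpck/pmovzx behavior):

   /* logical (lp_build_unpack2):
    *   lo = e0  e1  e2  e3  e4  e5  e6  e7
    *   hi = e8  e9  e10 e11 e12 e13 e14 e15
    * native in-lane (lp_build_unpack2_native on avx2):
    *   lo = e0  e1  e2  e3  | e8  e9  e10 e11
    *   hi = e4  e5  e6  e7  | e12 e13 e14 e15
    * Since the lerp math is purely elementwise, the "wrong" order
    * cancels out when lp_build_pack2_native consumes it again.
    */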
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c |  14 +--
 src/gallium/auxiliary/gallivm/lp_bld_pack.c | 139 ++--
 src/gallium/auxiliary/gallivm/lp_bld_pack.h |  16 
 3 files changed, 156 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index f5cacc4..3ea0734 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1046,14 +1046,14 @@ lp_build_mul(struct lp_build_context *bld,
   struct lp_type wide_type = lp_wider_type(type);
   LLVMValueRef al, ah, bl, bh, abl, abh, ab;
 
-  lp_build_unpack2(bld->gallivm, type, wide_type, a, &al, &ah);
-  lp_build_unpack2(bld->gallivm, type, wide_type, b, &bl, &bh);
+  lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
+  lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
 
   /* PMULLW, PSRLW, PADDW */
   abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
   abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
 
-  ab = lp_build_pack2(bld->gallivm, wide_type, type, abl, abh);
+  ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
 
   return ab;
}
@@ -1350,9 +1350,9 @@ lp_build_lerp(struct lp_build_context *bld,
 
   lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
 
-  lp_build_unpack2(bld->gallivm, type, wide_type, x,  &xl,  &xh);
-  lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
-  lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
+  lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
+  lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
+  lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
 
   /*
* Lerp both halves.
@@ -1363,7 +1363,7 @@ lp_build_lerp(struct lp_build_context *bld,
   resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
   resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
 
-  res = lp_build_pack2(bld->gallivm, wide_type, type, resl, resh);
+  res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
} else {
   res = lp_build_lerp_simple(bld, x, v0, v1, flags);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c 
b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index b0e76e6..e8d4fcd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -346,10 +346,10 @@ lp_build_interleave2(struct gallivm_state *gallivm,
  */
 LLVMValueRef
 lp_build_interleave2_half(struct gallivm_state *gallivm,
- struct lp_type type,
- LLVMValueRef a,
- LLVMValueRef b,
- unsigned lo_hi)
+  struct lp_type type,
+  LLVMValueRef a,
+  LLVMValueRef b,
+  unsigned lo_hi)
 {
if (type.length * type.width == 256) {
   LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, 
type.length, lo_hi);
@@ -359,11 +359,13 @@ lp_build_interleave2_half(struct gallivm_state *gallivm,
}
 }
 
+
 /**
  * Double the bit width.
  *
  * This will only change the number of bits the values are represented, not the
  * values themselves.
+ *
  */
 void
 lp_build_unpack2(struct gallivm_state *gallivm,
@@ -394,6 +396,65 @@ lp_build_unpack2(struct gallivm_state *gallivm,
 #ifdef PIPE_ARCH_LITTLE_ENDIAN
*dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
*dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
+
+#else
+   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
+   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
+#endif
+
+   /* Cast the result 

[Mesa-dev] [PATCH] draw: use vectorized calculations for fetch

2016-10-31 Thread sroland
From: Roland Scheidegger 

Instead of doing all the math with scalars, use vectors. This means the
overflow math needs to be done manually, albeit that's only really
problematic for the stride/index mul; the rest has been pretty much
moved outside the shader loop (albeit the mul could actually be optimized
away too), where things are still scalar. Because llvm is a complete fail
with the zero-extend widening mul, we roll our own...
To eliminate control flow in the main shader loop fetch, provide fake
buffers (so index 0 is always valid to fetch).
Still uses aos fetch though in the end - mostly because some more code
would be needed to handle unaligned fetches in that path, and because for
most formats it won't make a difference anyway (we generate some truly
horrendous code for things like R16G16_something for instance).

Instanced fetch however stays roughly the same as before, except that
the same element is no longer fetched multiple times (I've seen a ~3x
reduction in main shader loop size, apparently due to llvm not being able
to deduce it's really all the same with a couple of instanced elements).

Also, for elts gathering, use vectorized code as well - provide a fake
elt buffer if there's no valid one bound.

The generated shaders are smaller and faster to compile (not entirely sure
about execution speed, but generally unless there are just single vertices
to handle I would expect it to be faster - there are more opportunities
for future improvements by using soa fetch).

No piglit change.
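
A minimal scalar sketch of the branchless fetch idea (the real code does
this per SIMD channel with selects; names illustrative):

   static float
   fetch_branchless(const float *map, unsigned index, unsigned max_index)
   {
      /* no if/else in the loop: out-of-bounds lanes get redirected to
       * index 0, which is always safe thanks to the fake buffer */
      unsigned valid = index <= max_index;
      float val = map[valid ? index : 0];
      /* bogus lanes can then be masked/zeroed afterwards if needed */
      return valid ? val : 0.0f;
   }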
---
 src/gallium/auxiliary/draw/draw_context.c |   5 +
 src/gallium/auxiliary/draw/draw_llvm.c| 562 ++
 2 files changed, 416 insertions(+), 151 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 56abcff..0271fe3 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -787,6 +787,7 @@ draw_set_indexes(struct draw_context *draw,
  const void *elements, unsigned elem_size,
  unsigned elem_buffer_space)
 {
+   static const unsigned fake_index_buf = 0;
assert(elem_size == 0 ||
   elem_size == 1 ||
   elem_size == 2 ||
@@ -797,6 +798,10 @@ draw_set_indexes(struct draw_context *draw,
   draw->pt.user.eltMax = elem_buffer_space / elem_size;
else
   draw->pt.user.eltMax = 0;
+
+   if (draw->pt.user.eltMax == 0) {
+  draw->pt.user.elts = &fake_index_buf;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 2f82d9d..6ff824a 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -33,6 +33,8 @@
 
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_arit_overflow.h"
+#include "gallivm/lp_bld_bitarit.h"
+#include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_swizzle.h"
@@ -55,6 +57,7 @@
 #include "util/u_pointer.h"
 #include "util/u_string.h"
 #include "util/simple_list.h"
+#include "util/u_cpu_detect.h"
 
 
 #define DEBUG_STORE 0
@@ -656,34 +659,284 @@ generate_vs(struct draw_llvm_variant *variant,
}
 }
 
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+   LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+   LLVMValueRef dst_soa[TGSI_NUM_CHANNELS],
+   const struct lp_type soa_type)
+{
+   unsigned j, k;
+   struct lp_type aos_channel_type = soa_type;
+
+   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
+
+   debug_assert(TGSI_NUM_CHANNELS == 4);
+   debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
+
+   aos_channel_type.length >>= 1;
+
+   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+  LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+  assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+  for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
+  }
+
+  aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, 
pixels_per_channel);
+   }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
+
+static LLVMValueRef
+lp_build_umul_32_lohi(struct lp_build_context *bld,
+  LLVMValueRef a,
+  LLVMValueRef b,
+  LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+   assert(bld->type.sign == 0);
+   assert(bld->type.length == 4 || bld->type.length == 8);
+
+   /*
+* XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+* is atrocious (even if the high bits weren't required), trying to handle
+* real 64bit i

[Mesa-dev] [PATCH 1/2] draw: fix undefined input handling some more...

2016-11-02 Thread sroland
From: Roland Scheidegger 

Previous fixes were incomplete - some code still iterated through the number
of elements provided by velem layout instead of the number stored in the key
(which is the same as the number defined by the vs). It also actually
accessed the elements from the layout directly instead of those in the key.
This mismatch could still cause crashes.
(Besides, it is a very good idea to only use data stored in the key anyway.)
---
 src/gallium/auxiliary/draw/draw_llvm.c | 77 ++
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 2f82d9d..d5fc1c2 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1658,10 +1658,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
/*
 * Pre-calculate everything which is constant per shader invocation.
 */
-   for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
+   for (j = 0; j < key->nr_vertex_elements; ++j) {
   LLVMValueRef vb_buffer_offset, buffer_size;
   LLVMValueRef vb_info, vbuffer_ptr;
-  struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
+  struct pipe_vertex_element *velem = &key->vertex_element[j];
   LLVMValueRef vb_index =
  lp_build_const_int32(gallivm, velem->vertex_buffer_index);
   LLVMValueRef bsize = lp_build_const_int32(gallivm,
@@ -1669,41 +1669,44 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
   LLVMValueRef src_offset = lp_build_const_int32(gallivm,
  velem->src_offset);
 
-  vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
-  vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
-  vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
-  vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
-  map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
-  buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
-
-  ofbit[j] = NULL;
-  stride_fixed[j] = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
-   src_offset, &ofbit[j]);
-  buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, bsize,
-   &ofbit[j]);
-
-  if (velem->instance_divisor) {
- /* Index is equal to the start instance plus the number of current 
-  * instance divided by the divisor. In this case we compute it as:
-  * index = start_instance + (instance_id  / divisor)
-  */
- LLVMValueRef current_instance;
- current_instance = LLVMBuildUDiv(builder, system_values.instance_id,
-  lp_build_const_int32(gallivm,
-   
velem->instance_divisor),
-  "instance_divisor");
- instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance,
-current_instance, 
&ofbit[j]);
-  }
+  if (velem->src_format != PIPE_FORMAT_NONE) {
+ vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, &vb_index, 1, "");
+ vb_info = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, "");
+ vb_stride[j] = draw_jit_vbuffer_stride(gallivm, vb_info);
+ vb_buffer_offset = draw_jit_vbuffer_offset(gallivm, vb_info);
+ map_ptr[j] = draw_jit_dvbuffer_map(gallivm, vbuffer_ptr);
+ buffer_size = draw_jit_dvbuffer_size(gallivm, vbuffer_ptr);
+
+ ofbit[j] = NULL;
+ stride_fixed[j] = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
+  src_offset, &ofbit[j]);
+ buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, 
bsize,
+ &ofbit[j]);
+
+ if (velem->instance_divisor) {
+/* Index is equal to the start instance plus the number of current
+ * instance divided by the divisor. In this case we compute it as:
+ * index = start_instance + (instance_id  / divisor)
+ */
+LLVMValueRef current_instance;
+current_instance = LLVMBuildUDiv(builder, 
system_values.instance_id,
+ lp_build_const_int32(gallivm,
+  
velem->instance_divisor),
+ "instance_divisor");
+instance_index[j] = lp_build_uadd_overflow(gallivm, start_instance,
+   current_instance, 
&ofbit[j]);
+ }
 
-  if (0) {
- lp_build_printf(gallivm, "vbuf index = %u, vb_stride is %u\n",
-   

[Mesa-dev] [PATCH 2/2] draw: use vectorized calculations for fetch

2016-11-02 Thread sroland
From: Roland Scheidegger 

Instead of doing all the math with scalars, use vectors. This means the
overflow math needs to be done manually, albeit that's only really
problematic for the stride/index mul; the rest has been pretty much
moved outside the shader loop (albeit the mul could actually be optimized
away too), where things are still scalar. Because llvm is a complete fail
with the zero-extend widening mul, we roll our own...
To eliminate control flow in the main shader loop fetch, provide fake
buffers (so index 0 is always valid to fetch).
Still uses aos fetch though in the end - mostly because some more code
would be needed to handle unaligned fetches in that path, and because for
most formats it won't make a difference anyway (we generate some truly
horrendous code for things like R16G16_something for instance).

Instanced fetch however stays roughly the same as before, except that
the same element is no longer fetched multiple times (I've seen a ~3x
reduction in main shader loop size, apparently due to llvm not being able
to deduce it's really all the same with a couple of instanced elements).

Also, for elts gathering, use vectorized code as well - provide a fake
elt buffer if there's no valid one bound.

The generated shaders are smaller and faster to compile (not entirely sure
about execution speed, but generally unless there are just single vertices
to handle I would expect it to be faster - there are more opportunities
for future improvements by using soa fetch).

No piglit change.
---
 src/gallium/auxiliary/draw/draw_context.c |   5 +
 src/gallium/auxiliary/draw/draw_llvm.c| 569 ++
 2 files changed, 419 insertions(+), 155 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 56abcff..0271fe3 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -787,6 +787,7 @@ draw_set_indexes(struct draw_context *draw,
  const void *elements, unsigned elem_size,
  unsigned elem_buffer_space)
 {
+   static const unsigned fake_index_buf = 0;
assert(elem_size == 0 ||
   elem_size == 1 ||
   elem_size == 2 ||
@@ -797,6 +798,10 @@ draw_set_indexes(struct draw_context *draw,
   draw->pt.user.eltMax = elem_buffer_space / elem_size;
else
   draw->pt.user.eltMax = 0;
+
+   if (draw->pt.user.eltMax == 0) {
+  draw->pt.user.elts = &fake_index_buf;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index d5fc1c2..ad9624b 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -33,6 +33,8 @@
 
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_arit_overflow.h"
+#include "gallivm/lp_bld_bitarit.h"
+#include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_swizzle.h"
@@ -55,6 +57,7 @@
 #include "util/u_pointer.h"
 #include "util/u_string.h"
 #include "util/simple_list.h"
+#include "util/u_cpu_detect.h"
 
 
 #define DEBUG_STORE 0
@@ -656,34 +659,284 @@ generate_vs(struct draw_llvm_variant *variant,
}
 }
 
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+   LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+   LLVMValueRef dst_soa[TGSI_NUM_CHANNELS],
+   const struct lp_type soa_type)
+{
+   unsigned j, k;
+   struct lp_type aos_channel_type = soa_type;
+
+   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
+
+   debug_assert(TGSI_NUM_CHANNELS == 4);
+   debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
+
+   aos_channel_type.length >>= 1;
+
+   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+  LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+  assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+  for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
+  }
+
+  aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, 
pixels_per_channel);
+   }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
+
+static LLVMValueRef
+lp_build_umul_32_lohi(struct lp_build_context *bld,
+  LLVMValueRef a,
+  LLVMValueRef b,
+  LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+   assert(bld->type.sign == 0);
+   assert(bld->type.length == 4 || bld->type.length == 8);
+
+   /*
+* XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+* is atrocious (even if the high bits weren't required), trying to handle
+* real 64bit i

[Mesa-dev] [PATCH 1/2] gallivm: introduce 32x32->64bit lp_build_mul_32_lohi function

2016-11-03 Thread sroland
From: Roland Scheidegger 

This is used by shader umul_hi/imul_hi functions (and soon by draw).
It's actually useful to separate this out on its own; however, the real
reason for doing it is that we're using an optimized sse2 version, since
the code llvm generates is atrocious (there's no widening mul in llvm,
and it does not recognize the widening mul pattern, so it generates code
for a real 64x64->64bit mul, which the cpu can't do natively, in contrast
to the 32x32->64bit mul which it could do).
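
The pattern llvm fails on, and the workaround, in short (sketch of the
approach visible in the hunk below; pmuludq only multiplies the even
32bit lanes, producing 64bit results):

   /* what we'd like to write (and what llvm turns into a full 64x64 mul):
    *   wide = zext(a) * zext(b); lo = trunc(wide); hi = trunc(wide >> 32);
    * what the code does instead:
    *   aodd    = shuffle a's odd lanes into even slots  <1,x,3,x,...>
    *   bodd    = shuffle b likewise
    *   muleven = pmuludq(a, b)        -- 64bit products of lanes 0,2,...
    *   mulodd  = pmuludq(aodd, bodd)  -- 64bit products of lanes 1,3,...
    * then interleave the low/high 32bit halves back into lo and hi.
    */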
---
 src/gallium/auxiliary/gallivm/lp_bld_arit.c| 150 +
 src/gallium/auxiliary/gallivm/lp_bld_arit.h|   6 +
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c |  54 +++-
 3 files changed, 172 insertions(+), 38 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 3ea0734..3de4628 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1091,6 +1091,156 @@ lp_build_mul(struct lp_build_context *bld,
return res;
 }
 
+/*
+ * Widening mul, valid for 32x32 bit -> 64bit only.
+ * Result is low 32bits, high bits returned in res_hi.
+ */
+LLVMValueRef
+lp_build_mul_32_lohi(struct lp_build_context *bld,
+ LLVMValueRef a,
+ LLVMValueRef b,
+ LLVMValueRef *res_hi)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+
+   assert(bld->type.width == 32);
+   assert(bld->type.floating == 0);
+   assert(bld->type.fixed == 0);
+   assert(bld->type.norm == 0);
+
+   /*
+* XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
+* for x86 simd is atrocious (even if the high bits weren't required),
+* trying to handle real 64bit inputs (which of course can't happen due
+* to using 64bit umul with 32bit numbers zero-extended to 64bit, but
+* apparently llvm does not recognize this widening mul). This includes 6
+* (instead of 2) pmuludq plus extra adds and shifts
+* The same story applies to signed mul, albeit fixing this requires sse41.
+* https://llvm.org/bugs/show_bug.cgi?id=30845
+* So, whip up our own code, albeit only for length 4 and 8 (which
+* should be good enough)...
+*/
+   if ((bld->type.length == 4 || bld->type.length == 8) &&
+   ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
+util_cpu_caps.has_sse4_1)) {
+  const char *intrinsic = NULL;
+  LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
+  LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
+  struct lp_type type_wide = lp_wider_type(bld->type);
+  LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
+  unsigned i;
+  for (i = 0; i < bld->type.length; i += 2) {
+ shuf[i] = lp_build_const_int32(gallivm, i+1);
+ shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
+  }
+  shuf_vec = LLVMConstVector(shuf, bld->type.length);
+  aeven = a;
+  beven = b;
+  aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
+  bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
+
+  if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
+ if (bld->type.sign) {
+intrinsic = "llvm.x86.avx2.pmul.dq";
+ } else {
+intrinsic = "llvm.x86.avx2.pmulu.dq";
+ }
+ muleven = lp_build_intrinsic_binary(builder, intrinsic,
+ wider_type, aeven, beven);
+ mulodd = lp_build_intrinsic_binary(builder, intrinsic,
+wider_type, aodd, bodd);
+  }
+  else {
+ /* for consistent naming look elsewhere... */
+ if (bld->type.sign) {
+intrinsic = "llvm.x86.sse41.pmuldq";
+ } else {
+intrinsic = "llvm.x86.sse2.pmulu.dq";
+ }
+ /*
+  * XXX If we only have AVX but not AVX2 this is a pain.
+  * lp_build_intrinsic_binary_anylength() can't handle it
+  * (due to src and dst type not being identical).
+  */
+ if (bld->type.length == 8) {
+LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
+LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
+LLVMValueRef muleven2[2], mulodd2[2];
+struct lp_type type_wide_half = type_wide;
+LLVMTypeRef wtype_half;
+type_wide_half.length = 2;
+wtype_half = lp_build_vec_type(gallivm, type_wide_half);
+aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
+aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
+bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
+bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
+aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
+aoddhi = lp_build_e

[Mesa-dev] [PATCH 2/2] draw: use vectorized calculations for fetch

2016-11-03 Thread sroland
From: Roland Scheidegger 

Instead of doing all the math with scalars, use vectors. This means the
overflow math needs to be done manually, albeit that's only really
problematic for the stride/index mul; the rest has been pretty much
moved outside the shader loop (albeit the mul could actually be optimized
away too), where things are still scalar. Because llvm is a complete fail
with the zero-extend widening mul, we roll our own...
To eliminate control flow in the main shader loop fetch, provide fake
buffers (so index 0 is always valid to fetch).
Still uses aos fetch though in the end - mostly because some more code
would be needed to handle unaligned fetches in that path, and because for
most formats it won't make a difference anyway (we generate some truly
horrendous code for things like R16G16_something for instance).

Instanced fetch however stays roughly the same as before, except that
the same element is no longer fetched multiple times (I've seen a ~3x
reduction in main shader loop size, apparently due to llvm not being able
to deduce it's really all the same with a couple of instanced elements).

Also, for elts gathering, use vectorized code as well - provide a fake
elt buffer if there's no valid one bound.

The generated shaders are smaller and faster to compile (not entirely sure
about execution speed, but generally unless there are just single vertices
to handle I would expect it to be faster - there are more opportunities
for future improvements by using soa fetch).

No piglit change.
---
 src/gallium/auxiliary/draw/draw_context.c |   5 +
 src/gallium/auxiliary/draw/draw_llvm.c| 436 +++---
 2 files changed, 282 insertions(+), 159 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c 
b/src/gallium/auxiliary/draw/draw_context.c
index 56abcff..0271fe3 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -787,6 +787,7 @@ draw_set_indexes(struct draw_context *draw,
  const void *elements, unsigned elem_size,
  unsigned elem_buffer_space)
 {
+   static const unsigned fake_index_buf = 0;
assert(elem_size == 0 ||
   elem_size == 1 ||
   elem_size == 2 ||
@@ -797,6 +798,10 @@ draw_set_indexes(struct draw_context *draw,
   draw->pt.user.eltMax = elem_buffer_space / elem_size;
else
   draw->pt.user.eltMax = 0;
+
+   if (draw->pt.user.eltMax == 0) {
+  draw->pt.user.elts = &fake_index_buf;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 5b4e2a1..cc98d26 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -33,6 +33,8 @@
 
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_arit_overflow.h"
+#include "gallivm/lp_bld_bitarit.h"
+#include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_swizzle.h"
@@ -656,28 +658,140 @@ generate_vs(struct draw_llvm_variant *variant,
}
 }
 
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+   LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+   LLVMValueRef dst_soa[TGSI_NUM_CHANNELS],
+   const struct lp_type soa_type)
+{
+   unsigned j, k;
+   struct lp_type aos_channel_type = soa_type;
+
+   LLVMValueRef aos_channels[TGSI_NUM_CHANNELS];
+   unsigned pixels_per_channel = soa_type.length / TGSI_NUM_CHANNELS;
+
+   debug_assert(TGSI_NUM_CHANNELS == 4);
+   debug_assert((soa_type.length % TGSI_NUM_CHANNELS) == 0);
+
+   aos_channel_type.length >>= 1;
+
+   for (j = 0; j < TGSI_NUM_CHANNELS; ++j) {
+  LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+  assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+  for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + TGSI_NUM_CHANNELS * k];
+  }
+
+  aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, 
pixels_per_channel);
+   }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
+
+
 static void
-generate_fetch(struct gallivm_state *gallivm,
-   const struct util_format_description *format_desc,
-   LLVMValueRef vb_stride,
-   LLVMValueRef stride_fixed,
-   LLVMValueRef map_ptr,
-   LLVMValueRef buffer_size_adj,
-   LLVMValueRef ofbit,
-   LLVMValueRef *res,
-   LLVMValueRef index)
+fetch_vector(struct gallivm_state *gallivm,
+ const struct util_format_description *format_desc,
+ struct lp_type vs_type,
+ LLVMValueRef vb_stride,
+ LLVMValueRef map_ptr,
+ LLVMValueRef buffer_size_adj,
+ LLVMValueRef *inputs,
+ LLVMValueRef indices,
+ LLVMValueRef valid_mask)
 {
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderR

[Mesa-dev] [PATCH 2/3] draw: finally optimize bool clip mask generation

2016-11-12 Thread sroland
From: Roland Scheidegger 

lp_build_any_true_range is just what we need, though it will only produce
optimal code with sse41 (ptest + set) - but even without it on 64bit x86
the code is still better (1 unpack, 2 movq + or + set); on 32bit x86 it's
going to be roughly the same as before.
While here also make it a "real" 8bit boolean - this cuts one instruction
but, more importantly, matches ordinary booleans.
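
In plain C, the reduction lp_build_any_true_range performs is just the
following (sketch; the generated vector code ideally collapses to a
single ptest + set on sse41):

   #include <stdint.h>

   static int
   any_true(const int32_t mask[], int n)
   {
      int32_t acc = 0;
      for (int i = 0; i < n; ++i)
         acc |= mask[i];       /* each mask element is 0 or ~0 */
      return acc != 0;
   }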
---
 src/gallium/auxiliary/draw/draw_llvm.c | 43 --
 src/gallium/auxiliary/draw/draw_llvm.h |  4 +-
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c   |  2 +-
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 115cb79..8aa2d11 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1383,20 +1383,21 @@ generate_clipmask(struct draw_llvm *llvm,
 
 /**
  * Returns boolean if any clipping has occurred
- * Used zero/non-zero i32 value to represent boolean
+ * Used zero/one i8 value to represent boolean
  */
 static LLVMValueRef
-clipmask_booli32(struct gallivm_state *gallivm,
- const struct lp_type vs_type,
- LLVMValueRef clipmask_bool_ptr,
- boolean edgeflag_in_clipmask)
+clipmask_booli8(struct gallivm_state *gallivm,
+const struct lp_type vs_type,
+LLVMValueRef clipmask_bool_ptr,
+boolean edgeflag_in_clipmask)
 {
LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
LLVMValueRef clipmask_bool = LLVMBuildLoad(builder, clipmask_bool_ptr, "");
-   LLVMValueRef ret = LLVMConstNull(int32_type);
-   LLVMValueRef temp;
-   int i;
+   LLVMValueRef ret;
+   struct lp_build_context bldivec;
+
+   lp_build_context_init(&bldivec, gallivm, lp_int_type(vs_type));
 
/*
 * We need to invert the edgeflag bit from the clipmask here
@@ -1404,19 +1405,20 @@ clipmask_booli32(struct gallivm_state *gallivm,
 * and we (may) need it if edgeflag was 0).
 */
if (edgeflag_in_clipmask) {
-  struct lp_type i32_type = lp_int_type(vs_type);
-  LLVMValueRef edge = lp_build_const_int_vec(gallivm, i32_type,
+  LLVMValueRef edge = lp_build_const_int_vec(gallivm, bldivec.type,
  1LL << 
DRAW_TOTAL_CLIP_PLANES);
   clipmask_bool = LLVMBuildXor(builder, clipmask_bool, edge, "");
}
+
/*
-* Could do much better with just cmp/movmskps.
+* XXX: probably should mask off bits from the mask which come from
+* vertices which were beyond the count (i.e. indices_valid for
+* linear fetches, for elts ones we don't have the correct mask
+* right now). Otherwise might run the pipeline for nothing,
+* though everything should still work.
 */
-   for (i=0; i < vs_type.length; i++) {
-  temp = LLVMBuildExtractElement(builder, clipmask_bool,
- lp_build_const_int32(gallivm, i) , "");
-  ret = LLVMBuildOr(builder, ret, temp, "");
-   }
+   ret = lp_build_any_true_range(&bldivec, vs_type.length, clipmask_bool);
+   ret = LLVMBuildZExt(builder, ret, int8_type, "");
return ret;
 }
 
@@ -1641,7 +1643,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
arg_types[i++] = int32_type; /* vertex_id_offset */
arg_types[i++] = int32_type; /* start_instance */
 
-   func_type = LLVMFunctionType(int32_type, arg_types, num_arg_types, 0);
+   func_type = LLVMFunctionType(LLVMInt8TypeInContext(context),
+arg_types, num_arg_types, 0);
 
variant_func = LLVMAddFunction(gallivm->module, func_name, func_type);
 
@@ -2005,8 +2008,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
sampler->destroy(sampler);
 
/* return clipping boolean value for function */
-   ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr,
-  enable_cliptest && key->need_edgeflags);
+   ret = clipmask_booli8(gallivm, vs_type, clipmask_bool_ptr,
+ enable_cliptest && key->need_edgeflags);
 
LLVMBuildRet(builder, ret);
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h 
b/src/gallium/auxiliary/draw/draw_llvm.h
index 271433c..9ea100f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -261,7 +261,7 @@ enum {
 
 
 
-typedef int
+typedef boolean
 (*draw_jit_vert_func)(struct draw_jit_context *context,
   struct vertex_header *io,
   const struct draw_vertex_buffer 
vbuffers[PIPE_MAX_ATTRIBS],
@@ -274,7 +274,7 @@ typedef int
   unsigned start_instance);
 
 
-typedef int
+typedef boolean
 (*draw_jit_vert_func_elts)(struct

[Mesa-dev] [PATCH 1/3] draw: use vectorized calculations for fetch (v2)

2016-11-12 Thread sroland
From: Roland Scheidegger 

Instead of doing all the math with scalars, use vectors. This means the
overflow math needs to be done manually, albeit that's only really
problematic for the stride/index mul, the rest has been pretty much
moved outside the shader loop (albeit the mul could actually be optimized
away too), where things are still scalar.
To eliminate control flow in the main shader loop fetch, provide fake
buffers (so index 0 is always valid to fetch).
Still uses aos fetch though in the end - mostly because some more code
would be needed to handle unaligned fetches in that path, and because for
most formats it won't make a difference anyway (we generate some truly
horrendous code for things like R16G16_something for instance).

Instanced fetch however stays roughly the same as before, except that
the same element is no longer fetched multiple times (I've seen a ~3x
reduction in main shader loop size due to llvm not recognizing it's all
the same fetch, since some of the fetches could have been replaced with
zeros when the vector size exceeds the remaining fetch count - the values
of such fetches don't matter at all though).

Also, for elts gathering, use vectorized code as well.

The generated shaders are smaller and faster to compile (not entirely sure
about execution speed, but generally unless there's just single vertices
to handle I would expect it to be faster - there's more opportunities
for future improvements by using soa fetch).

v3: skip the fake index buffer, not needed due to the jit code never seeing
the real index buffer in the first place.
Fix a bug with mask expansion (needs SExt, not ZExt).
Also, be really really careful to keep the behavior the same, even in cases
where it looks wrong, and add comments why the code is doing the seemingly
wrong stuff... Fortunately it's not actually more complex in the end...
Also change function order slightly just to make the diff more readable.

No piglit change. Passes some internal testing with another api too...
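
Regarding the v3 SExt fix: compare results / masks are all-ones values,
so widening them must replicate the sign bit. A sketch with the LLVM-C
API (variable and type names illustrative):

   /* ZExt would turn an all-ones narrow mask into 0x...0001 per channel,
    * which is useless as a mask; SExt keeps it all-ones in the wide type */
   LLVMValueRef wide_mask =
      LLVMBuildSExt(builder, narrow_mask, wide_int_vec_type, "");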
---
 src/gallium/auxiliary/draw/draw_llvm.c | 396 ++---
 1 file changed, 265 insertions(+), 131 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index ba86b11..115cb79 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -33,6 +33,8 @@
 
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_arit_overflow.h"
+#include "gallivm/lp_bld_bitarit.h"
+#include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_swizzle.h"
@@ -656,28 +658,28 @@ generate_vs(struct draw_llvm_variant *variant,
}
 }
 
+
 static void
-generate_fetch(struct gallivm_state *gallivm,
-   const struct util_format_description *format_desc,
-   LLVMValueRef vb_stride,
-   LLVMValueRef stride_fixed,
-   LLVMValueRef map_ptr,
-   LLVMValueRef buffer_size_adj,
-   LLVMValueRef ofbit,
-   LLVMValueRef *res,
-   LLVMValueRef index)
+fetch_instanced(struct gallivm_state *gallivm,
+const struct util_format_description *format_desc,
+struct lp_type vs_type,
+LLVMValueRef vb_stride,
+LLVMValueRef map_ptr,
+LLVMValueRef buffer_size_adj,
+LLVMValueRef ofbit,
+LLVMValueRef *inputs,
+LLVMValueRef index)
 {
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef stride;
-   LLVMValueRef buffer_overflowed;
+   LLVMValueRef stride, buffer_overflowed, aos;
LLVMValueRef temp_ptr =
   lp_build_alloca(gallivm,
   lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
struct lp_build_if_state if_ctx;
+   unsigned i;
 
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
-   stride = lp_build_uadd_overflow(gallivm, stride, stride_fixed, &ofbit);
 
buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
  stride, buffer_size_adj,
@@ -685,8 +687,7 @@ generate_fetch(struct gallivm_state *gallivm,
buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
 
if (0) {
-  lp_build_printf(gallivm, "   stride = %u\n", stride);
-  lp_build_printf(gallivm, "   buffer size adj = %u\n", buffer_size_adj);
+  lp_build_print_value(gallivm, "   instance index = ", index);
   lp_build_print_value(gallivm, "   buffer overflowed = ", 
buffer_overflowed);
}
 
@@ -699,27 +700,34 @@ generate_fetch(struct gallivm_state *gallivm,
lp_build_else(&if_ctx);
{
   LLVMValueRef val;
-  map_ptr = LLVMBuildGEP(builder, map_ptr, &stride, 1, "");
 
   val = lp_build_fetch_rgba_aos(gallivm,
 format_desc

[Mesa-dev] [PATCH 3/3] draw: simplify vsplit elts code a bit

2016-11-12 Thread sroland
From: Roland Scheidegger 

vsplit_get_base_idx explicitly returned idx 0 and set the ofbit
in case of overflow. We'd then check the ofbit and use idx 0 instead of
looking it up. This was necessary because DRAW_GET_IDX used to return
DRAW_MAX_FETCH_IDX and not 0 in case of overflows.
However, this is all unnecessary, we can just let DRAW_GET_IDX return 0
in case of overflow. In fact before bbd1e60198548a12be3405fc32dd39a87e8968ab
the code already did that, not sure why this particular bit was changed
(might have been one half of an attempt to get these indices to actual draw
shader execution - in fact I think this would make things less awkward, but
it would require moving the eltBias handling to the shader as well).
Note there are other callers of DRAW_GET_IDX - those code paths, however,
explicitly do not handle index buffer overflows, therefore the overflow
value doesn't matter for them.

Also do some trivial simplification - for (unsigned) a + b, checking res < a
is sufficient for overflow detection: if the addition wrapped, res equals
a + b - 2^32, which is less than a since b < 2^32, so we don't need to check
for res < b too (similar for signed).

And an index buffer overflow check looked bogus - eltMax is the number of
elements in the index buffer, not the maximum element which can be fetched.
(Drop the start check against the idx buffer though; this is already covered
by the end check and end < start.)
---
 src/gallium/auxiliary/draw/draw_private.h   |  7 ++--
 src/gallium/auxiliary/draw/draw_pt_vsplit.c | 46 +++--
 src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h |  5 ++-
 3 files changed, 18 insertions(+), 40 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_private.h 
b/src/gallium/auxiliary/draw/draw_private.h
index a6aa610..030bb2c 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -489,11 +489,10 @@ void draw_update_viewport_flags(struct draw_context 
*draw);
 
 /** 
  * Return index i from the index buffer.
- * If the index buffer would overflow we return the
- * maximum possible index.
+ * If the index buffer would overflow we return index 0.
  */
 #define DRAW_GET_IDX(_elts, _i)   \
-   (((_i) >= draw->pt.user.eltMax) ? DRAW_MAX_FETCH_IDX : (_elts)[_i])
+   (((_i) >= draw->pt.user.eltMax) ? 0 : (_elts)[_i])
 
 /**
  * Return index of the given viewport clamping it
@@ -515,7 +514,7 @@ draw_overflow_uadd(unsigned a, unsigned b,
unsigned overflow_value)
 {
unsigned res = a + b;
-   if (res < a || res < b) {
+   if (res < a) {
   res = overflow_value;
}
return res;
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c 
b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index 8d448f9..fb131c3 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -33,7 +33,7 @@
 #define SEGMENT_SIZE 1024
 #define MAP_SIZE 256
 
-/* The largest possible index withing an index buffer */
+/* The largest possible index within an index buffer */
 #define MAX_ELT_IDX 0x
 
 struct vsplit_frontend {
@@ -108,55 +108,36 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned 
fetch, unsigned ofbias
 
 /**
  * Returns the base index to the elements array.
- * The value is checked for overflows (both integer overflows
- * and the elements array overflow).
+ * The value is checked for integer overflow.
  */
 static inline unsigned
-vsplit_get_base_idx(struct vsplit_frontend *vsplit,
-unsigned start, unsigned fetch, unsigned *ofbit)
+vsplit_get_base_idx(unsigned start, unsigned fetch)
 {
-   struct draw_context *draw = vsplit->draw;
-   unsigned elt_idx = draw_overflow_uadd(start, fetch, MAX_ELT_IDX);
-   if (ofbit)
-  *ofbit = 0;
-
-   /* Overflown indices need to wrap to the first element
-* in the index buffer */
-   if (elt_idx >= draw->pt.user.eltMax) {
-  if (ofbit)
- *ofbit = 1;
-  elt_idx = 0;
-   }
-
-   return elt_idx;
+   return draw_overflow_uadd(start, fetch, MAX_ELT_IDX);
 }
 
 /**
  * Returns the element index adjust for the element bias.
  * The final element index is created from the actual element
- * index, plus the element bias, clamped to maximum elememt
+ * index, plus the element bias, clamped to maximum element
  * index if that addition overflows.
  */
 static inline unsigned
-vsplit_get_bias_idx(struct vsplit_frontend *vsplit,
-int idx, int bias, unsigned *ofbias)
+vsplit_get_bias_idx(int idx, int bias, unsigned *ofbias)
 {
int res = idx + bias;
 
-   if (ofbias)
-  *ofbias = 0;
+   *ofbias = 0;
 
if (idx > 0 && bias > 0) {
-  if (res < idx || res < bias) {
+  if (res < idx) {
  res = DRAW_MAX_FETCH_IDX;
- if (ofbias)
-*ofbias = 1;
+ *ofbias = 1;
   }
} else if (idx < 0 && bias < 0) {
-  if (res > idx || res > bias) {
+  if (res > idx) {
  res = DRAW_MAX_FETCH_IDX;
- if (ofbias)
-*ofbias = 1;
+ *ofbias = 1;
   }
}

[Mesa-dev] [PATCH 2/5] draw: use same argument order for jit draw linear / elts functions

2016-11-13 Thread sroland
From: Roland Scheidegger 

This is a bit simpler. Mostly to make it easier to unify the paths later...
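
The common argument layout this converges on, as a sketch derived from the
hunk below - not the exact typedef in this patch, since the linear/elts
split still exists here (slot 4 holds start for the linear path and
fetch_elt_max for the elts path, so the elts pointer can simply be
appended at the end):

   typedef boolean
   (*draw_jit_vert_func)(struct draw_jit_context *context,
                         struct vertex_header *io,
                         const struct draw_vertex_buffer
                            vbuffers[PIPE_MAX_ATTRIBS],
                         unsigned count,
                         unsigned start_or_maxelt,
                         unsigned stride,
                         struct pipe_vertex_buffer *vertex_buffers,
                         unsigned instance_id,
                         unsigned vertex_id_offset,
                         unsigned start_instance,
                         const unsigned *fetch_elts);  /* elts path only */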
---
 src/gallium/auxiliary/draw/draw_llvm.c | 48 ++
 src/gallium/auxiliary/draw/draw_llvm.h |  8 ++--
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c   |  8 ++--
 3 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 8aa2d11..a02c893 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1580,7 +1580,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
LLVMBuilderRef builder;
char func_name[64];
struct lp_type vs_type;
-   LLVMValueRef count, fetch_elts, fetch_elt_max, fetch_count;
+   LLVMValueRef count, fetch_elts, fetch_elt_max;
LLVMValueRef vertex_id_offset, start_instance, start;
LLVMValueRef stride, step, io_itr;
LLVMValueRef ind_vec;
@@ -1631,17 +1631,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
arg_types[i++] = get_context_ptr_type(variant);   /* context */
arg_types[i++] = get_vertex_header_ptr_type(variant); /* vertex_header */
arg_types[i++] = get_buffer_ptr_type(variant);/* vbuffers */
+   arg_types[i++] = int32_type;  /* count */
+   arg_types[i++] = int32_type;  /* 
start/fetch_elt_max */
+   arg_types[i++] = int32_type;  /* stride */
+   arg_types[i++] = get_vb_ptr_type(variant);/* 
pipe_vertex_buffer's */
+   arg_types[i++] = int32_type;  /* instance_id */
+   arg_types[i++] = int32_type;  /* vertex_id_offset */
+   arg_types[i++] = int32_type;  /* start_instance */
if (elts) {
-  arg_types[i++] = LLVMPointerType(int32_type, 0);/* fetch_elts  */
-  arg_types[i++] = int32_type;  /* fetch_elt_max */
-   } else
-  arg_types[i++] = int32_type;  /* start */
-   arg_types[i++] = int32_type; /* fetch_count / count */
-   arg_types[i++] = int32_type; /* stride */
-   arg_types[i++] = get_vb_ptr_type(variant);   /* pipe_vertex_buffer's */
-   arg_types[i++] = int32_type; /* instance_id */
-   arg_types[i++] = int32_type; /* vertex_id_offset */
-   arg_types[i++] = int32_type; /* start_instance */
+  arg_types[i++] = LLVMPointerType(int32_type, 0);   /* fetch_elts  */
+   }
 
func_type = LLVMFunctionType(LLVMInt8TypeInContext(context),
 arg_types, num_arg_types, 0);
@@ -1661,21 +1660,23 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
context_ptr   = LLVMGetParam(variant_func, 0);
io_ptr= LLVMGetParam(variant_func, 1);
vbuffers_ptr  = LLVMGetParam(variant_func, 2);
+   count = LLVMGetParam(variant_func, 3);
/*
 * XXX: stride is actually unused. The stride we use is strictly calculated
 * from the number of outputs (including the draw_extra outputs).
 * Should probably fix some day (we need a new vs just because of extra
 * outputs which the generated vs won't touch).
 */
-   stride= LLVMGetParam(variant_func, 5 + (elts ? 1 : 0));
-   vb_ptr= LLVMGetParam(variant_func, 6 + (elts ? 1 : 0));
-   system_values.instance_id = LLVMGetParam(variant_func, 7 + (elts ? 1 : 0));
-   vertex_id_offset  = LLVMGetParam(variant_func, 8 + (elts ? 1 : 0));
-   start_instance= LLVMGetParam(variant_func, 9 + (elts ? 1 : 0));
+   stride= LLVMGetParam(variant_func, 5);
+   vb_ptr= LLVMGetParam(variant_func, 6);
+   system_values.instance_id = LLVMGetParam(variant_func, 7);
+   vertex_id_offset  = LLVMGetParam(variant_func, 8);
+   start_instance= LLVMGetParam(variant_func, 9);
 
lp_build_name(context_ptr, "context");
lp_build_name(io_ptr, "io");
lp_build_name(vbuffers_ptr, "vbuffers");
+   lp_build_name(count, "count");
lp_build_name(stride, "stride");
lp_build_name(vb_ptr, "vb");
lp_build_name(system_values.instance_id, "instance_id");
@@ -1683,20 +1684,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
lp_build_name(start_instance, "start_instance");
 
if (elts) {
-  fetch_elts= LLVMGetParam(variant_func, 3);
   fetch_elt_max = LLVMGetParam(variant_func, 4);
-  fetch_count   = LLVMGetParam(variant_func, 5);
+  fetch_elts= LLVMGetParam(variant_func, 10);
   lp_build_name(fetch_elts, "fetch_elts");
   lp_build_name(fetch_elt_max, "fetch_elt_max");
-  lp_build_name(fetch_count, "fetch_count");
-  start = count = NULL;
+  s

[Mesa-dev] draw: simplify overflow handling, unify elts and linear jit code

2016-11-13 Thread sroland
Overflow handling is simplified quite a bit both in jit code and vsplit
paths (basically just let things wrap around everywhere). This seems to
be good enough for all APIs.
Also, elts and linear jit code is unified since the differences are minimal
(even more so at the end of the series). The cost of having 2 different jit
shaders is definitely not worth it (the combined shader isn't really more
complex than the elt one was, with the difference to the linear one easily
being skipped for the most part for the linear case).
(These patches are on top of the last series.)

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/5] draw: drop unnecessary index overflow handling from vsplit code

2016-11-13 Thread sroland
From: Roland Scheidegger 

This was kind of strange, since it replaced indices which were only
overflowing due to the bias with MAX_UINT. This would cause an overflow later
in the shader, unless the stride was 0 - though in that case the vertex id
would be essentially random (-1 + eltBias). No test cared about it, though.
So, drop this and just use ordinary int arithmetic wraparound as usual.
This is much simpler to understand and the results are "more correct" or
at least more consistent (vertex id as well as actual fetch results just
correspond to wrapped around arithmetic).
There's only one catch: it is now possible to hit the cache initialization
value also with the ushort and ubyte elts paths (this wouldn't be an issue if
we'd simply handle the eltBias itself later in the shader). Hence, we need
to make sure the cache logic doesn't think this element has already been
emitted when it has not (I believe some seriously bad things could happen
otherwise). So, borrow the logic which handled this from the uint case, but
not before fixing it up...
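
To illustrate, a minimal scalar model of the new index math (plain C;
wrapped_elt_idx is a hypothetical helper name - the real code uses the
VSPLIT_CREATE_IDX macro seen in the diff below):

   /* Final element index: plain int wraparound, no clamping.
    * An out-of-range result simply overflows later in the shader,
    * consistent with ordinary wrapped arithmetic. */
   static inline unsigned
   wrapped_elt_idx(unsigned elt, int elt_bias)
   {
      return (unsigned)((int)elt + elt_bias);
   }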
---
 src/gallium/auxiliary/draw/draw_pt_vsplit.c | 69 ++---
 src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h | 15 +-
 2 files changed, 28 insertions(+), 56 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c 
b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
index fb131c3..a68d5bf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vsplit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -85,7 +85,7 @@ vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned 
flags)
  * Add a fetch element and add it to the draw elements.
  */
 static inline void
-vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch, unsigned 
ofbias)
+vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
 {
unsigned hash;
 
@@ -93,7 +93,7 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned 
fetch, unsigned ofbias
 
/* If the value isn't in the cache or it's an overflow due to the
 * element bias */
-   if (vsplit->cache.fetches[hash] != fetch || ofbias) {
+   if (vsplit->cache.fetches[hash] != fetch) {
   /* update cache */
   vsplit->cache.fetches[hash] = fetch;
   vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts;
@@ -108,7 +108,7 @@ vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned 
fetch, unsigned ofbias
 
 /**
  * Returns the base index to the elements array.
- * The value is checked for integer overflow.
+ * The value is checked for integer overflow (not sure it can happen?).
  */
 static inline unsigned
 vsplit_get_base_idx(unsigned start, unsigned fetch)
@@ -116,39 +116,14 @@ vsplit_get_base_idx(unsigned start, unsigned fetch)
return draw_overflow_uadd(start, fetch, MAX_ELT_IDX);
 }
 
-/**
- * Returns the element index adjust for the element bias.
- * The final element index is created from the actual element
- * index, plus the element bias, clamped to maximum element
- * index if that addition overflows.
+/*
+ * The final element index is just element index plus element bias.
  */
-static inline unsigned
-vsplit_get_bias_idx(int idx, int bias, unsigned *ofbias)
-{
-   int res = idx + bias;
-
-   *ofbias = 0;
-
-   if (idx > 0 && bias > 0) {
-  if (res < idx) {
- res = DRAW_MAX_FETCH_IDX;
- *ofbias = 1;
-  }
-   } else if (idx < 0 && bias < 0) {
-  if (res > idx) {
- res = DRAW_MAX_FETCH_IDX;
- *ofbias = 1;
-  }
-   }
-
-   return res;
-}
-
 #define VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias)\
unsigned elt_idx;   \
-   unsigned ofbias;\
-   elt_idx = vsplit_get_base_idx(start, fetch);\
-   elt_idx = vsplit_get_bias_idx(DRAW_GET_IDX(elts, elt_idx), elt_bias, 
&ofbias)
+   elt_idx = vsplit_get_base_idx(start, fetch);\
+   elt_idx = (unsigned)((int)(DRAW_GET_IDX(elts, elt_idx)) + (int)elt_bias);
+
 
 static inline void
 vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, const ubyte *elts,
@@ -156,7 +131,13 @@ vsplit_add_cache_ubyte(struct vsplit_frontend *vsplit, 
const ubyte *elts,
 {
struct draw_context *draw = vsplit->draw;
VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
-   vsplit_add_cache(vsplit, elt_idx, ofbias);
+   /* unlike the uint case this can only happen with elt_bias */
+   if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->cache.has_max_fetch) {
+  unsigned hash = fetch % MAP_SIZE;
+  vsplit->cache.fetches[hash] = 0;
+  vsplit->cache.has_max_fetch = TRUE;
+   }
+   vsplit_add_cache(vsplit, elt_idx);
 }
 
 static inline void
@@ -165,7 +146,13 @@ vsplit_add_cache_ushort(struct vsplit_frontend *vsplit, 
const ushort *elts,
 {
struct draw_context *draw = vsplit->draw;
VSPLIT_CREATE_IDX(elts, start, fetch, elt_bias);
-   vsplit_add_cache(vsplit, elt_idx, ofbias);
+   /* unlike the uint case this can only happen with elt_bias */
+   if (elt_bias && elt_idx == DRAW_MAX_FETCH_IDX && 
!vsplit->

[Mesa-dev] [PATCH 5/5] draw: drop some overflow computations

2016-11-13 Thread sroland
From: Roland Scheidegger 

It turns out that no one actually cares if the address computations overflow,
be it the stride mul or the offset adds.
Wraparound seems to be explicitly permitted even by some other API (which
is a _very_ surprising result, as these overflow computations were added just
for that and made some tests pass at the time - I suspect some later fixes
fixed the actual root cause...). So the requirements in that other API were
actually sane all along after all...
Still need to make sure the computed buffer size needed is valid, of course.
This ditches the shiny new widening mul from these codepaths, ah well...

And now that I really understand this, change the fishy min limiting of the
indices to what it really should have been doing. Which is simply to prevent
fetching more values than are valid for the last loop iteration. (This makes
the code path in the loop minimally more complex for the non-indexed case
as we have to skip the optimization combining two adds. I think it should
be safe to skip this actually there, but I don't care much about this
especially since skipping that optimization actually makes the code easier
to read elsewhere.)
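
As a sketch of the intended min limiting (scalar model in plain C;
clamp_fetch_idx and fetch_max are hypothetical names, with fetch_max assumed
to be the last valid element index):

   /* In the last loop iteration some lanes may be past the draw
    * count; clamp them to the last valid element rather than
    * trying to prove the address math can't overflow. */
   static inline unsigned
   clamp_fetch_idx(unsigned idx, unsigned fetch_max)
   {
      return idx > fetch_max ? fetch_max : idx;
   }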
---
 src/gallium/auxiliary/draw/draw_llvm.c | 111 ++---
 1 file changed, 46 insertions(+), 65 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 414f2dc..c548572 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -669,18 +669,17 @@ fetch_instanced(struct gallivm_state *gallivm,
LLVMValueRef zero = LLVMConstNull(i32_t);
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef stride, buffer_overflowed, aos, index_valid;
-   LLVMValueRef ofbit = NULL;
unsigned i;
 
aosf_t = lp_build_vec_type(gallivm, lp_float32_vec4_type());
aosi_t = lp_build_vec_type(gallivm, lp_int32_vec4_type());
 
-   stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
+   /* This mul can overflow. Wraparound is ok. */
+   stride = LLVMBuildMul(builder, vb_stride, index, "");
 
buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGE,
  stride, buffer_size_adj,
  "buffer_overflowed");
-   buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
 
if (0) {
   lp_build_print_value(gallivm, "   instance index = ", index);
@@ -759,7 +758,7 @@ fetch_vector(struct gallivm_state *gallivm,
LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context blduivec;
-   LLVMValueRef offset, tmp, valid_mask;
+   LLVMValueRef offset, valid_mask;
LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
unsigned i;
 
@@ -768,24 +767,11 @@ fetch_vector(struct gallivm_state *gallivm,
vb_stride = lp_build_broadcast_scalar(&blduivec, vb_stride);
buffer_size_adj = lp_build_broadcast_scalar(&blduivec, buffer_size_adj);
 
-   /*
-* Sort of interestingly, with interleaved attribs, llvm 3.7+ will
-* recognize these calculations to be constant with different attribs
-* (the different offset has been added to map_ptr).
-* llvm 3.3, however, will not (I can't get llvm 3.4-3.6 to link...)
-*
-* XXX: could actually avoid this altogether (replacing by simple
-* non-widening mul) by precalculating the max index instead outside
-* the loop (at the cost of one scalar udiv per vertex element).
-*/
-   offset = lp_build_mul_32_lohi_cpu(&blduivec, vb_stride, indices, &tmp);
+   /* This mul can overflow. Wraparound is ok. */
+   offset = lp_build_mul(&blduivec, vb_stride, indices);
 
valid_mask = lp_build_compare(gallivm, blduivec.type,
- PIPE_FUNC_EQUAL, tmp, blduivec.zero);
-
-   tmp = lp_build_compare(gallivm, blduivec.type,
-  PIPE_FUNC_LESS, offset, buffer_size_adj);
-   valid_mask = LLVMBuildAnd(builder, tmp, valid_mask, "");
+ PIPE_FUNC_LESS, offset, buffer_size_adj);
 
/* not valid elements use offset 0 */
offset = LLVMBuildAnd(builder, offset, valid_mask, "");
@@ -1566,10 +1552,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant)
LLVMBuilderRef builder;
char func_name[64];
struct lp_type vs_type;
-   LLVMValueRef count, fetch_elts, start_or_maxelt, start;
+   LLVMValueRef count, fetch_elts, start_or_maxelt;
LLVMValueRef vertex_id_offset, start_instance;
LLVMValueRef stride, step, io_itr;
-   LLVMValueRef ind_vec, ind_vec_store, have_elts, fetch_max, tmp;
+   LLVMValueRef ind_vec, start_vec, have_elts, fetch_max, tmp;
LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
LLVMValueRef vb_stride[PIPE_MAX_ATTRIBS];
LLVMValueRef map_ptr[PIPE_MAX_ATTRIBS];
@@ -1580,7 +1566,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant)
struct draw

[Mesa-dev] [PATCH 3/5] draw: unify linear and elts draw jit functions

2016-11-13 Thread sroland
From: Roland Scheidegger 

The code for elts and linear paths was nearly 100% identical by now - with
the elts path simply having some additional gather for the elements in the
main loop (with some additional small differences before the main loop).

Hence nuke the separate functions and decide this at jit shader execution
time (simply based on the presence of the elts pointer).

Some analysis shows that the generated vs jit functions seem to be just very
minimally more complex than the former elts functions, and almost none of the
additional complexity is in the main loop (basically just the branch logic
for the branch fetching the actual indices).
Compared to linear, the codesize of the function is of course a bit larger,
however the actual executed code in the main loop appears to be near 100%
identical (the additional code looking up indices is skipped as expected).

So, I would not expect a (meaningful) performance difference with the
generated code, neither with elts nor linear, this does however roughly
half the compilation time (the compiled shaders should also use only half
the memory of course).
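
Conceptually, the unified shader does something like this (hypothetical
scalar model in plain C - the real jit code works on whole SIMD vectors and
also handles the maxelt clamping):

   #include <stdint.h>

   /* One function for both paths; elts == NULL selects linear. */
   static void
   run_vs(const uint32_t *elts, uint32_t start, uint32_t count,
          void (*fetch_and_shade)(uint32_t idx))
   {
      for (uint32_t i = 0; i < count; i++) {
         /* the only per-path difference: gather the index or count up */
         uint32_t idx = elts ? elts[start + i] : start + i;
         fetch_and_shade(idx);
      }
   }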
---
 src/gallium/auxiliary/draw/draw_llvm.c | 92 +++---
 src/gallium/auxiliary/draw/draw_llvm.h | 21 +
 .../draw/draw_pt_fetch_shade_pipeline_llvm.c   | 46 +--
 3 files changed, 70 insertions(+), 89 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index a02c893..2478b11 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -63,8 +63,7 @@
 
 
 static void
-draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var,
-   boolean elts);
+draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var);
 
 
 struct draw_gs_llvm_iface {
@@ -577,17 +576,13 @@ draw_llvm_create_variant(struct draw_llvm *llvm,
 
variant->vertex_header_ptr_type = LLVMPointerType(vertex_header, 0);
 
-   draw_llvm_generate(llvm, variant, FALSE);  /* linear */
-   draw_llvm_generate(llvm, variant, TRUE);   /* elts */
+   draw_llvm_generate(llvm, variant);
 
gallivm_compile_module(variant->gallivm);
 
variant->jit_func = (draw_jit_vert_func)
  gallivm_jit_function(variant->gallivm, variant->function);
 
-   variant->jit_func_elts = (draw_jit_vert_func_elts)
- gallivm_jit_function(variant->gallivm, variant->function_elts);
-
gallivm_free_ir(variant->gallivm);
 
variant->list_item_global.base = variant;
@@ -1565,25 +1560,23 @@ draw_gs_llvm_epilogue(const struct 
lp_build_tgsi_gs_iface *gs_base,
 }
 
 static void
-draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
-   boolean elts)
+draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
struct gallivm_state *gallivm = variant->gallivm;
LLVMContextRef context = gallivm->context;
LLVMTypeRef int32_type = LLVMInt32TypeInContext(context);
LLVMTypeRef arg_types[11];
-   unsigned num_arg_types =
-  elts ? ARRAY_SIZE(arg_types) : ARRAY_SIZE(arg_types) - 1;
+   unsigned num_arg_types = ARRAY_SIZE(arg_types);
LLVMTypeRef func_type;
LLVMValueRef context_ptr;
LLVMBasicBlockRef block;
LLVMBuilderRef builder;
char func_name[64];
struct lp_type vs_type;
-   LLVMValueRef count, fetch_elts, fetch_elt_max;
-   LLVMValueRef vertex_id_offset, start_instance, start;
+   LLVMValueRef count, fetch_elts, start_or_maxelt, start;
+   LLVMValueRef vertex_id_offset, start_instance;
LLVMValueRef stride, step, io_itr;
-   LLVMValueRef ind_vec;
+   LLVMValueRef ind_vec, ind_vec_store, have_elts, fetch_max, tmp;
LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
LLVMValueRef vb_stride[PIPE_MAX_ATTRIBS];
LLVMValueRef map_ptr[PIPE_MAX_ATTRIBS];
@@ -1597,9 +1590,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
unsigned i, j;
struct lp_build_context bld, bldivec, blduivec;
struct lp_build_loop_state lp_loop;
+   struct lp_build_if_state if_ctx;
const int vector_length = lp_native_vector_width / 32;
LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
-   LLVMValueRef fetch_max;
struct lp_build_sampler_soa *sampler = 0;
LLVMValueRef ret, clipmask_bool_ptr;
struct draw_llvm_variant_key *key = &variant->key;
@@ -1624,8 +1617,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant,
 
memset(&system_values, 0, sizeof(system_values));
 
-   util_snprintf(func_name, sizeof(func_name), "draw_llvm_vs_variant%u_%s",
- variant->shader->variants_cached, elts ? "elts" : "linear");
+   util_snprintf(func_name, sizeof(func_name), "draw_llvm_vs_variant%u",
+ variant->shader->variants_cached);
 
i = 0;
arg_types[i++] = get_context_ptr_type(variant);   /* context */
@@ -1638,19 +1631,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct

[Mesa-dev] [PATCH 4/5] draw: simplify fetch some more

2016-11-13 Thread sroland
From: Roland Scheidegger 

Don't keep the ofbit. This is a minor simplification: just adjust
the buffer size so that there will always be an overflow if buffers aren't
valid to fetch from.
Also, get rid of control flow in the instanced path. Not worried about
performance, but it's simpler and keeps the code more similar to ordinary
fetch.
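
The branchless pattern replacing the if/else, as a scalar model (plain C;
fetch_or_zero is a hypothetical name mirroring what the generated code does
per lane):

   #include <stdint.h>
   #include <string.h>

   static float
   fetch_or_zero(const uint8_t *map, uint32_t offset, uint32_t buffer_size_adj)
   {
      /* all-ones if valid, all-zeros if overflowed */
      uint32_t valid = (offset < buffer_size_adj) ? ~0u : 0u;
      uint32_t bits;
      /* invalid lanes read from offset 0, which is always safe */
      memcpy(&bits, map + (offset & valid), sizeof bits);
      bits &= valid;               /* zero the result for invalid lanes */
      float val;
      memcpy(&val, &bits, sizeof val);
      return val;
   }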
---
 src/gallium/auxiliary/draw/draw_llvm.c | 118 +++--
 1 file changed, 55 insertions(+), 63 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 2478b11..414f2dc 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -661,22 +661,23 @@ fetch_instanced(struct gallivm_state *gallivm,
 LLVMValueRef vb_stride,
 LLVMValueRef map_ptr,
 LLVMValueRef buffer_size_adj,
-LLVMValueRef ofbit,
 LLVMValueRef *inputs,
 LLVMValueRef index)
 {
-   LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
+   LLVMTypeRef i32_t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef aosf_t, aosi_t;
+   LLVMValueRef zero = LLVMConstNull(i32_t);
LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef stride, buffer_overflowed, aos;
-   LLVMValueRef temp_ptr =
-  lp_build_alloca(gallivm,
-  lp_build_vec_type(gallivm, lp_float32_vec4_type()), "");
-   struct lp_build_if_state if_ctx;
+   LLVMValueRef stride, buffer_overflowed, aos, index_valid;
+   LLVMValueRef ofbit = NULL;
unsigned i;
 
+   aosf_t = lp_build_vec_type(gallivm, lp_float32_vec4_type());
+   aosi_t = lp_build_vec_type(gallivm, lp_int32_vec4_type());
+
stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
 
-   buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGT,
+   buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGE,
  stride, buffer_size_adj,
  "buffer_overflowed");
buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
@@ -686,28 +687,22 @@ fetch_instanced(struct gallivm_state *gallivm,
   lp_build_print_value(gallivm, "   buffer overflowed = ", 
buffer_overflowed);
}
 
-   lp_build_if(&if_ctx, gallivm, buffer_overflowed);
-   {
-  LLVMValueRef val =
- lp_build_const_vec(gallivm, lp_float32_vec4_type(), 0);
-  LLVMBuildStore(builder, val, temp_ptr);
-   }
-   lp_build_else(&if_ctx);
-   {
-  LLVMValueRef val;
-
-  val = lp_build_fetch_rgba_aos(gallivm,
-format_desc,
-lp_float32_vec4_type(),
-FALSE,
-map_ptr,
-stride, zero, zero,
-NULL);
-  LLVMBuildStore(builder, val, temp_ptr);
-   }
-   lp_build_endif(&if_ctx);
+   index_valid = LLVMBuildNot(builder, buffer_overflowed, "");
+   index_valid = LLVMBuildSExt(builder, index_valid, i32_t, "");
+   stride = LLVMBuildAnd(builder, stride, index_valid, "");
 
-   aos = LLVMBuildLoad(builder, temp_ptr, "aos");
+   aos = lp_build_fetch_rgba_aos(gallivm,
+ format_desc,
+ lp_float32_vec4_type(),
+ FALSE,
+ map_ptr,
+ stride, zero, zero,
+ NULL);
+
+   index_valid = lp_build_broadcast(gallivm, aosi_t, index_valid);
+   aos = LLVMBuildBitCast(builder, aos, aosi_t, "");
+   aos = LLVMBuildAnd(builder, aos, index_valid, "");
+   aos = LLVMBuildBitCast(builder, aos, aosf_t, "");
 
for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
   LLVMValueRef index = lp_build_const_int32(gallivm, i);
@@ -758,7 +753,6 @@ fetch_vector(struct gallivm_state *gallivm,
  LLVMValueRef vb_stride,
  LLVMValueRef map_ptr,
  LLVMValueRef buffer_size_adj,
- LLVMValueRef ofmask,
  LLVMValueRef *inputs,
  LLVMValueRef indices)
 {
@@ -786,12 +780,11 @@ fetch_vector(struct gallivm_state *gallivm,
 */
offset = lp_build_mul_32_lohi_cpu(&blduivec, vb_stride, indices, &tmp);
 
-   tmp = lp_build_compare(gallivm, blduivec.type,
-  PIPE_FUNC_EQUAL, tmp, blduivec.zero);
-   valid_mask = lp_build_andnot(&blduivec, tmp, ofmask);
+   valid_mask = lp_build_compare(gallivm, blduivec.type,
+ PIPE_FUNC_EQUAL, tmp, blduivec.zero);
 
tmp = lp_build_compare(gallivm, blduivec.type,
-  PIPE_FUNC_LEQUAL, offset, buffer_size_adj);
+  PIPE_FUNC_LESS, offset, buffer_size_adj);
valid_mask = LLVMBuildAnd(builder, tmp, valid_mask, "");
 
/* not valid elements use offset 0 */
@@ -1581,7 +1574,6 @@ draw_llvm_gener

[Mesa-dev] [PATCH] glsl: fix ldexp lowering if bitfield insert lowering is also requested

2016-12-03 Thread sroland
From: Roland Scheidegger 

Trivial, this just resurrects the code which was there once upon a time
(the pass can't lower instructions it generates itself in the same run,
and even if it could, the result would probably be suboptimal).
This fixes piglit mesa_shader_integer_functions fs-ldexp.shader_test and
vs-ldexp.shader_test with llvmpipe.
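
For reference, the replacement expression in scalar form (plain C model;
set_biased_exp is a hypothetical name, and the exponent is assumed already
range-checked as in the pass):

   #include <stdint.h>
   #include <string.h>

   static float
   set_biased_exp(float x, uint32_t resulting_biased_exp)
   {
      uint32_t u;
      memcpy(&u, &x, sizeof u);
      /* keep sign + mantissa, OR in the new biased exponent */
      u = (u & 0x807fffffu) | (resulting_biased_exp << 23);
      memcpy(&x, &u, sizeof x);
      return x;
   }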
---
 src/compiler/glsl/lower_instructions.cpp | 21 -
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/compiler/glsl/lower_instructions.cpp 
b/src/compiler/glsl/lower_instructions.cpp
index 372ded1..3e25e2b 100644
--- a/src/compiler/glsl/lower_instructions.cpp
+++ b/src/compiler/glsl/lower_instructions.cpp
@@ -392,7 +392,6 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression 
*ir)
   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u, vec_elem);
 
ir_constant *exp_shift = new(ir) ir_constant(23, vec_elem);
-   ir_constant *exp_width = new(ir) ir_constant(8, vec_elem);
 
/* Temporary variables */
ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
@@ -455,10 +454,22 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression 
*ir)
 */
 
ir_constant *exp_shift_clone = exp_shift->clone(ir, NULL);
-   ir->operation = ir_unop_bitcast_i2f;
-   ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
- exp_shift_clone, exp_width);
-   ir->operands[1] = NULL;
+
+   /* Don't generate new IR that would need to be lowered in an additional
+* pass.
+*/
+   if (!lowering(INSERT_TO_SHIFTS)) {
+  ir_constant *exp_width = new(ir) ir_constant(8u, vec_elem);
+  ir->operation = ir_unop_bitcast_i2f;
+  ir->operands[0] = bitfield_insert(bitcast_f2i(x), resulting_biased_exp,
+exp_shift_clone, exp_width);
+  ir->operands[1] = NULL;
+   } else {
+  ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x807fffffu, vec_elem);
+  ir->operation = ir_unop_bitcast_u2f;
+  ir->operands[0] = bit_or(bit_and(bitcast_f2u(x), sign_mantissa_mask),
+   lshift(i2u(resulting_biased_exp), 
exp_shift_clone));
+   }
 
this->progress = true;
 }
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] gallivm: optimize gather a bit, by using supplied destination type

2016-12-03 Thread sroland
From: Roland Scheidegger 

By using a dst_type in the gather interface, gather has some more
knowledge about how values should be fetched.
E.g. if this is a 3x32bit fetch and dst_type is 4x32bit vector gather
will no longer do a ZExt with a 96bit scalar value to 128bit, but
just fetch the 96bit as 3x32bit vector (this is still going to be
2 loads of course, but the loads can be done directly to simd vector
that way).
Also, we can now make some attempt to use the right int/float type. This should
make no difference really since there are typically no domain transition
penalties for such simd loads, however it actually makes a difference
since llvm will use different shuffle lowering afterwards so the caller
can use this to trick llvm into using sane shuffle afterwards (and yes
llvm is really stupid there - nothing against using the shuffle
instruction from the correct domain, but not at the cost of doing 3 times
more shuffles, the case which actually matters is refusal to use shufps
for integer values).
Also make some attempt to avoid things which look great on paper but llvm
doesn't really handle (e.g. fetching 3-element 8 bit and 16 bit vectors
which is simply disastrous - I suspect type legalizer is to blame trying
to extend these vectors to 128bit types somehow, so fetching these with
scalars like before which is suboptimal due to the ZExt).

Remove the ability for truncation (no point, this is gather, not conversion)
as it is complex enough already.

While here also implement not just the float, but also the 64bit avx2
gathers (disabled though since based on the theoretical numbers the benefit
just isn't there at all until Skylake at least).
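
At call sites the change looks roughly like this (a sketch, not a standalone
program; shapes taken from the corresponding draw_llvm.c hunk in the later
resend of this patch):

   /* before: only a raw destination bit width */
   tmp = lp_build_gather(gallivm, length, 32, 32,
                         TRUE, base_ptr, offsets, FALSE);

   /* after: a full lp_type, so gather knows int vs float and the
    * vector shape, and can pick the fetch/shuffle strategy */
   struct lp_type fetch_type = lp_type_uint(32);
   tmp = lp_build_gather(gallivm, length, 32, fetch_type,
                         TRUE, base_ptr, offsets, FALSE);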
---
 src/gallium/auxiliary/gallivm/lp_bld_gather.c | 42 +--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c 
b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
index 439bbb6..1f7ba92 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_gather.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -33,6 +33,7 @@
 #include "lp_bld_format.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_swizzle.h"
+#include "lp_bld_type.h"
 #include "lp_bld_init.h"
 #include "lp_bld_intr.h"
 
@@ -270,17 +271,52 @@ lp_build_gather(struct gallivm_state *gallivm,
 
   LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, 
dst_width);
   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+  LLVMTypeRef gather_vec_type = dst_vec_type;
   unsigned i;
-
-  res = LLVMGetUndef(dst_vec_type);
+  boolean vec_zext = FALSE;
+  unsigned gather_width = dst_width;
+
+
+  if (src_width == 16 && dst_width == 32) {
+ LLVMTypeRef g_elem_type = LLVMIntTypeInContext(gallivm->context, 
dst_width / 2);
+ gather_vec_type = LLVMVectorType(g_elem_type, length);
+ /*
+  * Note that llvm is never able to optimize zext/insert combos
+  * directly (i.e. zero the simd reg, then place the elements into
+  * the appropriate place directly). And 16->32bit zext simd loads
+  * aren't possible (instead loading to scalar reg first).
+  * (I think this has to do with scalar/vector transition.)
+  * No idea about other archs...
+  * We could do this manually, but instead we just use a vector
+  * zext, which is simple enough (and, in fact, llvm might optimize
+  * this away).
+  * (We're not trying that with other bit widths as that might not be
+  * easier, in particular with 8 bit values at least with only sse2.)
+  */
+ vec_zext = TRUE;
+ gather_width = 16;
+  }
+  res = LLVMGetUndef(gather_vec_type);
   for (i = 0; i < length; ++i) {
  LLVMValueRef index = lp_build_const_int32(gallivm, i);
  LLVMValueRef elem;
  elem = lp_build_gather_elem(gallivm, length,
- src_width, dst_width, aligned,
+ src_width, gather_width, aligned,
  base_ptr, offsets, i, vector_justify);
  res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
   }
+  if (vec_zext) {
+ res = LLVMBuildZExt(gallivm->builder, res, dst_vec_type, "");
+ if (vector_justify) {
+#if PIPE_ARCH_BIG_ENDIAN
+struct lp_type dst_type;
+unsigned sv = dst_width - src_width;
+dst_type = lp_type_uint_vec(dst_width, dst_width * length);
+res = LLVMBuildShl(gallivm->builder, res,
+   lp_build_const_int_vec(gallivm, dst_type, sv), 
"");
+#endif
+ }
+  }
}
 
return res;
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] util: (trivial) ETC1 meets the criteria for fitting into unorm8

2016-12-03 Thread sroland
From: Roland Scheidegger 

Just like other similar compressed formats.
---
 src/gallium/auxiliary/util/u_format.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/auxiliary/util/u_format.c 
b/src/gallium/auxiliary/util/u_format.c
index 72dd60f..3d28190 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -561,6 +561,11 @@ util_format_fits_8unorm(const struct 
util_format_description *format_desc)
  return TRUE;
   return FALSE;
 
+   case UTIL_FORMAT_LAYOUT_ETC:
+  if (format_desc->format == PIPE_FORMAT_ETC1_RGB8)
+ return TRUE;
+  return FALSE;
+
case UTIL_FORMAT_LAYOUT_PLAIN:
   /*
* For these we can find a generic rule.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] gallivm: handle 16bit float fetches in lp_build_fetch_rgba_soa

2016-12-03 Thread sroland
From: Roland Scheidegger 

Note that we really want to _never_ reach the bottom of the function, which
resorts to AoS fetch.
Half floats can be handled just like other formats which fit into 32bit
vectors (so, only 1x16 and 2x16 formats, albeit with more channels things
are not THAT bad), with minimal plumbing. I've seen code size go down nearly
by a factor of 3 for a complete texture sampling function (including bilinear
filtering) using R16F.
(What we should do for everything not special cased is to do AoS gather,
shuffle/shift things into SoA vectors, and then do the conversion there.
Otherwise it's particularly bad with 1 or 2 channel formats - that r16f
format with either 4 or 8-wide vectors was still doing one element at a
time, essentially doing exactly the same work as for rgba16f. Also replacing
the channels with SWIZZLE0/1 (particularly the latter) adds even more
work, as it has to be done per aos vector, and not just straightforward
at the end with the SoA vector.)
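
Per channel, the new unpack amounts to this scalar model (plain C;
half_to_float stands in for lp_build_half_to_float and is passed in here just
to keep the sketch self-contained):

   #include <stdint.h>

   static float
   unpack_half_chan(uint32_t packed, unsigned start,
                    float (*half_to_float)(uint16_t))
   {
      /* shift the channel down, truncate to 16 bits, then convert */
      uint16_t h = (uint16_t)(packed >> start);
      return half_to_float(h);
   }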
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 7fc4e8d..7444c51 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -239,9 +239,22 @@ lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
 
   case UTIL_FORMAT_TYPE_FLOAT:
  if (type.floating) {
-assert(start == 0);
-assert(stop == 32);
-assert(type.width == 32);
+if (format_desc->channel[chan].size == 16) {
+   struct lp_type f16i_type = type;
+   f16i_type.width /= 2;
+   f16i_type.floating = 0;
+   if (start) {
+  input = LLVMBuildLShr(builder, input,
+ lp_build_const_int_vec(gallivm, type, start), "");
+   }
+   input = LLVMBuildTrunc(builder, input,
+  lp_build_vec_type(gallivm, f16i_type), 
"");
+   input = lp_build_half_to_float(gallivm, input);
+} else {
+   assert(start == 0);
+   assert(stop == 32);
+   assert(type.width == 32);
+}
 input = LLVMBuildBitCast(builder, input, 
lp_build_vec_type(gallivm, type), "");
  }
  else {
@@ -369,7 +382,8 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
format_desc->block.height == 1 &&
format_desc->block.bits <= type.width &&
(format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
-format_desc->channel[0].size == 32))
+format_desc->channel[0].size == 32 ||
+format_desc->channel[0].size == 16))
{
   /*
* The packed pixel fits into an element of the destination format. Put
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] main: allow NEAREST_MIPMAP_NEAREST for stencil texturing

2016-12-05 Thread sroland
From: Roland Scheidegger 

As per GL 4.5 rules, which fixed a spec mistake in GL_ARB_stencil_texturing.
The extension spec wasn't updated, but just allow it with older GL versions
as well, hoping there aren't any crazy tests which want to see an error
there... (Compile tested only.)

Reported by Józef Kucia 
---
 src/mesa/main/texobj.h | 23 ---
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h
index 378d87a..8776763 100644
--- a/src/mesa/main/texobj.h
+++ b/src/mesa/main/texobj.h
@@ -125,7 +125,14 @@ static inline GLboolean
 _mesa_is_texture_complete(const struct gl_texture_object *texObj,
   const struct gl_sampler_object *sampler)
 {
-   if (texObj->_IsIntegerFormat &&
+   /*
+* According to ARB_stencil_texturing, NEAREST_MIPMAP_NEAREST would
+* be forbidden, however it is allowed per GL 4.5 rules, allow it
+* even without GL 4.5 since it was a spec mistake.
+*/
+   if ((texObj->_IsIntegerFormat ||
+(texObj->StencilSampling &&
+ texObj->Image[0][texObj->BaseLevel]->_BaseFormat == 
GL_DEPTH_STENCIL)) &&
(sampler->MagFilter != GL_NEAREST ||
 (sampler->MinFilter != GL_NEAREST &&
  sampler->MinFilter != GL_NEAREST_MIPMAP_NEAREST))) {
@@ -133,20 +140,6 @@ _mesa_is_texture_complete(const struct gl_texture_object 
*texObj,
   return GL_FALSE;
}
 
-   /* From the ARB_stencil_texturing specification:
-* "Add a new bullet point for the conditions that cause the texture
-*  to not be complete:
-*
-*  * The internal format of the texture is DEPTH_STENCIL, the
-*DEPTH_STENCIL_TEXTURE_MODE for the texture is STENCIL_INDEX and either
-*the magnification filter or the minification filter is not NEAREST."
-*/
-   if (texObj->StencilSampling &&
-   texObj->Image[0][texObj->BaseLevel]->_BaseFormat == GL_DEPTH_STENCIL &&
-   (sampler->MagFilter != GL_NEAREST || sampler->MinFilter != GL_NEAREST)) 
{
-  return GL_FALSE;
-   }
-
if (_mesa_is_mipmap_filter(sampler))
   return texObj->_MipmapComplete;
else
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] gallivm: (trivial) handle non-aligned fetch for lp_build_fetch_rgba_soa

2016-12-11 Thread sroland
From: Roland Scheidegger 

soa fetch so far always assumed that data was aligned. However, we want to
use this for vertex fetch, and data might not be aligned there, so handle
it in this path too (basically just pass the alignment through to the other
functions). (It looks like it wouldn't work for cached s3tc but this is
no different than with AoS fetch.)
---
 src/gallium/auxiliary/gallivm/lp_bld_format.h |  1 +
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 15 +--
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  4 ++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h 
b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 5c866f4..6540caa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -143,6 +143,7 @@ void
 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 const struct util_format_description *format_desc,
 struct lp_type type,
+boolean aligned,
 LLVMValueRef base_ptr,
 LLVMValueRef offsets,
 LLVMValueRef i,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 7444c51..389bfa0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -349,6 +349,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
  *
  * \param type  the desired return type for 'rgba'.  The vector length
  *  is the number of texels to fetch
+ * \param aligned if the offset is guaranteed to be aligned to element width
  *
  * \param base_ptr  points to the base of the texture mip tree.
  * \param offsetoffset to start of the texture image block.  For non-
@@ -365,6 +366,7 @@ void
 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 const struct util_format_description *format_desc,
 struct lp_type type,
+boolean aligned,
 LLVMValueRef base_ptr,
 LLVMValueRef offset,
 LLVMValueRef i,
@@ -402,7 +404,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
type.length,
format_desc->block.bits,
type.width,
-   TRUE,
+   aligned,
base_ptr, offset, FALSE);
 
   /*
@@ -428,7 +430,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 
   packed = lp_build_gather(gallivm, type.length,
format_desc->block.bits,
-   type.width, TRUE,
+   type.width, aligned,
base_ptr, offset, FALSE);
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
  lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
@@ -456,14 +458,14 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
  LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
  offset = LLVMBuildAdd(builder, offset, s_offset, "");
  packed = lp_build_gather(gallivm, type.length, 32, type.width,
-  TRUE, base_ptr, offset, FALSE);
+  aligned, base_ptr, offset, FALSE);
  packed = LLVMBuildAnd(builder, packed,
lp_build_const_int_vec(gallivm, type, mask), 
"");
   }
   else {
  assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
  packed = lp_build_gather(gallivm, type.length, 32, type.width,
-  TRUE, base_ptr, offset, TRUE);
+  aligned, base_ptr, offset, TRUE);
  packed = LLVMBuildBitCast(builder, packed,
lp_build_vec_type(gallivm, type), "");
   }
@@ -489,7 +491,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
   tmp_type.norm = TRUE;
 
   tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-TRUE, base_ptr, offset, i, j, cache);
+aligned, base_ptr, offset, i, j, cache);
 
   lp_build_rgba8_to_fi32_soa(gallivm,
 type,
@@ -509,6 +511,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
   const struct util_format_description *flinear_desc;
   LLVMValueRef packed;
   flinear_desc = 
util_format_description(util_format_linear(format_desc->format));
+  /* This probably only works with aligned data */
   packed = lp_build_fetch_cached_texels(gallivm,
 flinear_desc,
 

[Mesa-dev] [PATCH 6/6] draw: use SoA fetch, not AoS one

2016-12-11 Thread sroland
From: Roland Scheidegger 

Now that there's some SoA fetch which never falls back, we should usually get
results which are better or at least not worse (something like rgba32f will
stay the same). I suppose though it might be worse in some cases where the
format doesn't require conversion (e.g. rg32f) and goes straight to output -
if llvm was able to see through all the shuffles then it might have been able
to do away with the aos->soa->aos transpose entirely, which is now no longer
possible except for 4-channel formats (due to replacing the undef channels
with 0/1 before the second transpose and not the first - llvm will
definitely not be able to figure that out). That might actually be quite
common, but I'm not sure llvm really could optimize it in the first place,
and if it's a problem we should just special case such inputs (though note
that if conversion is needed, it isn't obvious if it's better to skip
the transpose or do the conversion AoS-style).

For cases which get way better, think something like R16_UNORM with 8-wide
vectors: this was 8 sign-extend fetches, 8 cvt, 8 muls, followed by
a couple of shuffles to stitch things together (if it is smart enough,
6 unpacks) and then a (8-wide) transpose (not sure if llvm could even
optimize the shuffles + transpose, since the 16bit values were actually
sign-extended to 128bit before being cast to a float vec, so that would be
another 8 unpacks). Now that is just 8 fetches (directly inserted into
vector, albeit there's one 128bit insert needed), 1 cvt, 1 mul.
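
For the R16_UNORM example, the conversion the SoA path boils down to is just
this, applied once to a whole SIMD vector instead of once per element
(scalar model in plain C):

   #include <stdint.h>

   static float
   unorm16_to_float(uint16_t u)
   {
      /* one cvt, one mul - vectorized, this runs once per SIMD vector */
      return (float)u * (1.0f / 65535.0f);
   }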
---
 src/gallium/auxiliary/draw/draw_llvm.c | 54 +-
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index 19b75a5..f895b76 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -755,11 +755,9 @@ fetch_vector(struct gallivm_state *gallivm,
  LLVMValueRef *inputs,
  LLVMValueRef indices)
 {
-   LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context blduivec;
LLVMValueRef offset, valid_mask;
-   LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
unsigned i;
 
lp_build_context_init(&blduivec, gallivm, lp_uint_type(vs_type));
@@ -783,21 +781,49 @@ fetch_vector(struct gallivm_state *gallivm,
}
 
/*
-* Note: we probably really want to use SoA fetch, not AoS one (albeit
-* for most formats it will amount to the same as this isn't very
-* optimized). But looks dangerous since it assumes alignment.
+* Use SoA fetch. This should produce better code usually.
+* Albeit it's possible there's exceptions (in particular if the fetched
+* value is going directly to output if it's something like RG32F).
 */
-   for (i = 0; i < vs_type.length; i++) {
-  LLVMValueRef offset1, elem;
-  elem = lp_build_const_int32(gallivm, i);
-  offset1 = LLVMBuildExtractElement(builder, offset, elem, "");
+   if (1) {
+  struct lp_type res_type = vs_type;
+  /* The type handling is annoying here... */
+  if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+  format_desc->channel[0].pure_integer) {
+ if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+res_type = lp_type_int_vec(vs_type.width, vs_type.width * 
vs_type.length);
+ }
+ else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+res_type = lp_type_uint_vec(vs_type.width, vs_type.width * 
vs_type.length);
+ }
+  }
 
-  aos_fetch[i] = lp_build_fetch_rgba_aos(gallivm, format_desc,
- lp_float32_vec4_type(),
- FALSE, map_ptr, offset1,
- zero, zero, NULL);
+  lp_build_fetch_rgba_soa(gallivm, format_desc,
+  res_type, FALSE, map_ptr, offset,
+  blduivec.zero, blduivec.zero,
+  NULL, inputs);
+
+  for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
+ inputs[i] = LLVMBuildBitCast(builder, inputs[i],
+  lp_build_vec_type(gallivm, vs_type), "");
+  }
+
+   }
+   else {
+  LLVMValueRef zero = 
LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
+  LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
+  for (i = 0; i < vs_type.length; i++) {
+ LLVMValueRef offset1, elem;
+ elem = lp_build_const_int32(gallivm, i);
+ offset1 = LLVMBuildExtractElement(builder, offset, elem, "");
+
+ aos_fetch[i] = lp_build_fetch_rgba_aos(gallivm, format_desc,
+lp_float32_vec4_type(),
+FALSE, map_ptr, offset1,
+  

[Mesa-dev] [PATCH 5/6] gallivm: generalize the compressed format soa fetch a bit

2016-12-11 Thread sroland
From: Roland Scheidegger 

This can now handle rgtc (unorm) too - this path no longer handles plain
formats, but that's unnecessary as they all have their proper SoA unpack now
(this will still be dog-slow though due to the actual fetch being per-pixel
util fallbacks).
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 86 +--
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 9550f26..68cbb10 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -733,64 +733,69 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
 
/*
 * Try calling lp_build_fetch_rgba_aos for all pixels.
+* Should only really hit subsampled, compressed
+* (for s3tc srgb too, for rgtc the unorm ones only) by now.
+* (This is invalid for plain 8unorm formats because we're lazy with
+* the swizzle since some results would arrive swizzled, some not.)
 */
 
-   if (util_format_fits_8unorm(format_desc) &&
+   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
+   (util_format_fits_8unorm(format_desc) ||
+format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
type.floating && type.width == 32 &&
(type.length == 1 || (type.length % 4 == 0))) {
   struct lp_type tmp_type;
-  LLVMValueRef tmp;
+  struct lp_build_context bld;
+  LLVMValueRef packed, rgba[4];
+  const struct util_format_description *flinear_desc;
+  const struct util_format_description *frgba8_desc;
+  unsigned chan;
 
+  lp_build_context_init(&bld, gallivm, type);
+
+  /*
+   * Make sure the conversion in aos really only does convert to rgba8
+   * and not anything more (so use linear format, adjust type).
+   */
+  flinear_desc = util_format_description(util_format_linear(format));
   memset(&tmp_type, 0, sizeof tmp_type);
   tmp_type.width = 8;
   tmp_type.length = type.length * 4;
   tmp_type.norm = TRUE;
 
-  tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-aligned, base_ptr, offset, i, j, cache);
+  packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
+   aligned, base_ptr, offset, i, j, cache);
+  packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
 
-  lp_build_rgba8_to_fi32_soa(gallivm,
-type,
-tmp,
-rgba_out);
-
-  return;
-   }
-
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
-   /* non-srgb case is already handled above */
-   format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
-   type.floating && type.width == 32 &&
-   (type.length == 1 || (type.length % 4 == 0)) &&
-   cache) {
-  const struct util_format_description *format_decompressed;
-  const struct util_format_description *flinear_desc;
-  LLVMValueRef packed;
-  flinear_desc = 
util_format_description(util_format_linear(format_desc->format));
-  /* This probably only works with aligned data */
-  packed = lp_build_fetch_cached_texels(gallivm,
-flinear_desc,
-type.length,
-base_ptr,
-offset,
-i, j,
-cache);
-  packed = LLVMBuildBitCast(builder, packed,
-lp_build_int_vec_type(gallivm, type), "");
   /*
-   * The values are now packed so they match ordinary srgb RGBA8 format,
+   * The values are now packed so they match ordinary (srgb) RGBA8 format,
* hence need to use matching format for unpack.
*/
-  format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
-
+  frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
+  if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+ assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
+ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+  }
   lp_build_unpack_rgba_soa(gallivm,
-   format_decompressed,
+   frgba8_desc,
type,
-   packed, rgba_out);
+   packed, rgba);
 
+  /*
+   * We converted 4 channels. Make sure llvm can drop unneeded ones
+   * (luckily the rgba order is fixed, only la needs special case).
+   */
+  for (chan = 0; chan < 4; chan++) {
+ enum pipe_swizzle swizzle = format_desc->swizzle[chan];
+ if (chan == 3 && util_format_is_luminance_alpha(f

[Mesa-dev] [PATCH 4/6] gallivm: provide soa fetch path handling formats with more than 32bit

2016-12-11 Thread sroland
From: Roland Scheidegger 

This previously always fell back to AoS conversion. Even for 4-float formats
(which is by far the optimal case for that fallback) this was suboptimal,
since it meant the conversion couldn't be done with 256bit vectors. While this
may still only be partly possible for some formats (unless there's AVX2
support), at least the transpose can be done with half the unpacks
(and before the transpose was used for AoS fallbacks, it was worse still).
With fewer than 4 channels, things quickly got way worse with the AoS fallback,
even with 128bit vectors.
The strategy is pretty much the same as the existing one for formats
which fit into 32 bits, except there are now multiple vectors to be
fetched (2 or 4 to be exact), which need to be shuffled first (if it's 4
vectors, this amounts to a transpose, for 2 it's a bit different),
then the unpack is done the same (with the exception that the shift
of the channels is now modulo 32, and we need to select the right
vector).
In fact the most complex part about it is to get the shuffles right
for separating into lo/hi parts for AVX/AVX2...
This also makes use of the new ability of gather to use provided type
information, which we abuse to outsmart llvm so we get decent shuffles,
and to fetch 3x32bit vectors without having to ZExt the scalar.
And just because we can, we handle double formats too, albeit they are
a bit different (draw sometimes needs to handle that).
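
The modulo-32 shift and vector selection can be modeled per channel like this
(scalar sketch in plain C; extract_chan is a hypothetical name, with words[]
standing in for the 2 or 4 fetched 32bit SoA vectors):

   #include <stdint.h>

   static uint32_t
   extract_chan(const uint32_t *words, unsigned shift, unsigned width)
   {
      uint32_t w = words[shift / 32];   /* select the right vector */
      uint32_t v = w >> (shift % 32);   /* shift is now modulo 32 */
      if (width < 32)
         v &= (1u << width) - 1;        /* mask off the MSBs */
      return v;
   }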
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 529 +++---
 1 file changed, 375 insertions(+), 154 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index b3ea709..9550f26 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -31,6 +31,7 @@
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
+#include "util/u_math.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -113,6 +114,166 @@ lp_build_format_swizzle_soa(const struct 
util_format_description *format_desc,
 }
 
 
+
+static LLVMValueRef
+lp_build_extract_soa_chan(struct lp_build_context *bld,
+  unsigned blockbits,
+  boolean srgb_chan,
+  struct util_format_channel_description chan_desc,
+  LLVMValueRef packed)
+{
+   struct gallivm_state *gallivm = bld->gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   struct lp_type type = bld->type;
+   LLVMValueRef input = packed;
+   const unsigned width = chan_desc.size;
+   const unsigned start = chan_desc.shift;
+   const unsigned stop = start + width;
+
+   /* Decode the input vector component */
+
+   switch(chan_desc.type) {
+   case UTIL_FORMAT_TYPE_VOID:
+  input = bld->undef;
+  break;
+
+   case UTIL_FORMAT_TYPE_UNSIGNED:
+  /*
+   * Align the LSB
+   */
+  if (start) {
+ input = LLVMBuildLShr(builder, input,
+   lp_build_const_int_vec(gallivm, type, start), 
"");
+  }
+
+  /*
+   * Zero the MSBs
+   */
+  if (stop < blockbits) {
+ unsigned mask = ((unsigned long long)1 << width) - 1;
+ input = LLVMBuildAnd(builder, input,
+  lp_build_const_int_vec(gallivm, type, mask), "");
+  }
+
+  /*
+   * Type conversion
+   */
+  if (type.floating) {
+ if (srgb_chan) {
+struct lp_type conv_type = lp_uint_type(type);
+input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
+ }
+ else {
+if(chan_desc.normalized)
+   input = lp_build_unsigned_norm_to_float(gallivm, width, type, 
input);
+else
+   input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ }
+  }
+  else if (chan_desc.pure_integer) {
+ /* Nothing to do */
+  } else {
+  /* FIXME */
+  assert(0);
+  }
+  break;
+
+   case UTIL_FORMAT_TYPE_SIGNED:
+  /*
+   * Align the sign bit first.
+   */
+  if (stop < type.width) {
+ unsigned bits = type.width - stop;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildShl(builder, input, bits_val, "");
+  }
+
+  /*
+   * Align the LSB (with an arithmetic shift to preserve the sign)
+   */
+  if (chan_desc.size < type.width) {
+ unsigned bits = type.width - chan_desc.size;
+ LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
+ input = LLVMBuildAShr(builder, input, bits_val, "");
+  }
+
+  /*
+   * Type conversion
+   */
+  if (type.floating) {
+ input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
+ if (chan_desc.normalized) {
+double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
+

[Mesa-dev] [PATCH 2/6] gallivm: optimize SoA AoS fallback fetch path a little

2016-12-11 Thread sroland
From: Roland Scheidegger 

We should do transpose, not extract/insert, at least with a "sufficient" number
of channels (for 4 channels, the extract/insert shuffles generated otherwise
look truly terrifying). Albeit we shouldn't fall back to that so often in any
case.
---
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c | 83 +++
 1 file changed, 70 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 389bfa0..902c763 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -40,6 +40,39 @@
 #include "lp_bld_debug.h"
 #include "lp_bld_format.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+static void
+convert_to_soa(struct gallivm_state *gallivm,
+   LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
+   LLVMValueRef dst_soa[4],
+   const struct lp_type soa_type)
+{
+   unsigned j, k;
+   struct lp_type aos_channel_type = soa_type;
+
+   LLVMValueRef aos_channels[4];
+   unsigned pixels_per_channel = soa_type.length / 4;
+
+   debug_assert((soa_type.length % 4) == 0);
+
+   aos_channel_type.length >>= 1;
+
+   for (j = 0; j < 4; ++j) {
+  LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
+
+  assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
+
+  for (k = 0; k < pixels_per_channel; ++k) {
+ channel[k] = src_aos[j + 4 * k];
+  }
+
+  aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, 
pixels_per_channel);
+   }
+
+   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
+}
 
 
 void
@@ -48,9 +81,6 @@ lp_build_format_swizzle_soa(const struct 
util_format_description *format_desc,
 const LLVMValueRef *unswizzled,
 LLVMValueRef swizzled_out[4])
 {
-   assert(PIPE_SWIZZLE_0 == (int)PIPE_SWIZZLE_0);
-   assert(PIPE_SWIZZLE_1 == (int)PIPE_SWIZZLE_1);
-
if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
   enum pipe_swizzle swizzle;
   LLVMValueRef depth_or_stencil;
@@ -547,9 +577,11 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
{
   unsigned k, chan;
   struct lp_type tmp_type;
+  LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
+  boolean vec_transpose = FALSE;
 
   if (gallivm_debug & GALLIVM_DEBUG_PERF) {
- debug_printf("%s: scalar unpacking of %s\n",
+ debug_printf("%s: AoS fetch fallback for %s\n",
   __FUNCTION__, format_desc->short_name);
   }
 
@@ -560,12 +592,31 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
  rgba_out[chan] = lp_build_undef(gallivm, type);
   }
 
+  if (format_desc->nr_channels > 2 ||
+  format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+ /*
+  * Note that vector transpose can be worse. This is because
+  * llvm will ensure the missing channels have the correct
+  * values, in particular typically 1.0 for the last channel
+  * (if they are used or not doesn't matter, usually llvm can't
+  * figure this out here probably due to the transpose).
+  * But with the extract/insert path, since those missing elements
+  * were just directly inserted/extracted llvm can optimize this
+  * somewhat (though it still doesn't look great - and not for
+  * the compressed formats due to their external fetch funcs).
+  * So restrict to cases where we are sure it helps (albeit
+  * with 2 channels it MIGHT be worth it at least with AVX).
+  * In any case, this is just a bandaid, it does NOT replace proper
+  * SoA format unpack.
+  */
+ vec_transpose = TRUE;
+  }
+
   /* loop over number of pixels */
   for(k = 0; k < type.length; ++k) {
  LLVMValueRef index = lp_build_const_int32(gallivm, k);
  LLVMValueRef offset_elem;
  LLVMValueRef i_elem, j_elem;
- LLVMValueRef tmp;
 
  offset_elem = LLVMBuildExtractElement(builder, offset,
index, "");
@@ -574,20 +625,26 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
  j_elem = LLVMBuildExtractElement(builder, j, index, "");
 
  /* Get a single float[4]={R,G,B,A} pixel */
- tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-   aligned, base_ptr, offset_elem,
-   i_elem, j_elem, cache);
+ aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
+aligned, base_ptr, offset_elem,
+i_elem, j_elem, cache);
 
  /*
   * Insert the AoS tmp value channels into the SoA result vectors at
   * position = 'index'.
   */
-

[Mesa-dev] [PATCH 3/6] gallivm: optimize gather a bit, by using supplied destination type

2016-12-11 Thread sroland
From: Roland Scheidegger 

By using a dst_type in the gather interface, gather has some more
knowledge about how values should be fetched.
E.g. if this is a 3x32bit fetch and dst_type is 4x32bit vector gather
will no longer do a ZExt with a 96bit scalar value to 128bit, but
just fetch the 96bit as 3x32bit vector (this is still going to be
2 loads of course, but the loads can be done directly to simd vector
that way).
Also, we can now make some attempt to use the right int/float type. This should
make no difference really since there are typically no domain transition
penalties for such simd loads, however it actually makes a difference
since llvm will use different shuffle lowering afterwards so the caller
can use this to trick llvm into using sane shuffle afterwards (and yes
llvm is really stupid there - nothing against using the shuffle
instruction from the correct domain, but not at the cost of doing 3 times
more shuffles, the case which actually matters is refusal to use shufps
for integer values).
Also make some attempt to avoid things which look great on paper but llvm
doesn't really handle (e.g. fetching 3-element 8 bit and 16 bit vectors
which is simply disastrous - I suspect type legalizer is to blame trying
to extend these vectors to 128bit types somehow, so fetching these with
scalars like before which is suboptimal due to the ZExt).

Remove the ability for truncation (no point, this is gather, not conversion)
as it is complex enough already.

While here also implement not just the float, but also the 64bit avx2
gathers (disabled though since based on the theoretical numbers the benefit
just isn't there at all until Skylake at least).
---
 src/gallium/auxiliary/draw/draw_llvm.c |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_aos.c  |   5 +-
 .../auxiliary/gallivm/lp_bld_format_aos_array.c|   9 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_soa.c  |  23 +-
 src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c  |   4 +-
 src/gallium/auxiliary/gallivm/lp_bld_gather.c  | 359 +
 src/gallium/auxiliary/gallivm/lp_bld_gather.h  |   2 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c  |   8 +-
 8 files changed, 333 insertions(+), 79 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c 
b/src/gallium/auxiliary/draw/draw_llvm.c
index c548572..19b75a5 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1864,7 +1864,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct 
draw_llvm_variant *variant)

LLVMPointerType(LLVMInt8TypeInContext(context),
0), "");
  tmp = lp_build_gather(gallivm, vs_type.length,
-   32, 32, TRUE,
+   32, bld.type, TRUE,
fetch_elts, tmp, FALSE);
  LLVMBuildStore(builder, tmp, index_store);
   }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 9f6b9e9..322e7b8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -224,6 +224,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state 
*gallivm,
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
 * into masked = {X, Y, Z, W}
 */
+   /* Note: we cannot do this shift on x86 natively until AVX2. */
shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
 
@@ -394,6 +395,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
util_is_power_of_two(format_desc->block.bits)) {
   LLVMValueRef packed;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, type);
+  struct lp_type fetch_type;
   unsigned vec_len = type.width * type.length;
 
   /*
@@ -401,8 +403,9 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
* scaling or converting.
*/
 
+  fetch_type = lp_type_uint(type.width*4);
   packed = lp_build_gather(gallivm, type.length/4,
-   format_desc->block.bits, type.width*4,
+   format_desc->block.bits, fetch_type,
aligned, base_ptr, offset, TRUE);
 
   assert(format_desc->block.bits <= vec_len);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
index 8cad3a6..636a4a6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos_array.c
@@ -70,7 +70,14 @@ lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
 
src_vec_type  = lp_build_vec_type(gallivm,  src_type);
 
-   /* Read whole vector from memory, unaligned */
+   /*
+* Read whole vector from memory, unaligned.
+* XXX: Note it's

[Mesa-dev] [PATCH 1/4] llvmpipe: (trivial) minimally simplify mask construction

2016-12-20 Thread sroland
From: Roland Scheidegger 

simd instruction sets usually have comparisons for equal, not unequal.
So use a different comparison against the mask itself - which also means
we don't need an all-zero as well as an all-one (for the pxor) reg.

Also add code to avoid scalar expansion of i1 values, which we definitely
shouldn't do. There are problems with this due to llvm select interaction,
though, so it's disabled (basically, using llvm select instead of
intrinsics may still produce atrocious code, even in cases where we
figured it should not; I think this could probably be fixed with a
better selection of optimization passes, but I have zero idea there
really).
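
For reference, the cost of "unequal" on SSE2, sketched with intrinsics
(illustrative only, not the code generated here):

   #include <emmintrin.h>

   /* There is pcmpeqd but no compare-for-unequal, so a != b needs an
    * all-ones constant and an extra pxor to invert the result: */
   static __m128i
   cmp_ne_epi32(__m128i a, __m128i b)
   {
      __m128i eq = _mm_cmpeq_epi32(a, b);           /* a == b ? ~0 : 0 */
      return _mm_xor_si128(eq, _mm_set1_epi32(-1)); /* invert via pxor */
   }

Since the mask lanes are already all-ones or all-zero, phrasing the test in
terms of equality avoids both the inversion and one of the constant regs.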
---
 src/gallium/auxiliary/gallivm/lp_bld_logic.c |  2 ++
 src/gallium/drivers/llvmpipe/lp_bld_depth.c  | 52 ++--
 src/gallium/drivers/llvmpipe/lp_state_fs.c   | 16 +
 3 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c 
b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 1a50e82..524917a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -327,6 +327,8 @@ lp_build_select(struct lp_build_context *bld,
* supported yet for a long time, and LLVM will generate poor code when
* the mask is not the result of a comparison.
* Also, llvm 3.7 may miscompile them (bug 94972).
+   * XXX: Even if the instruction was an SExt, this may still produce
+   * terrible code. Try piglit stencil-twoside.
*/
 
   /* Convert the mask to a vector of booleans.
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c 
b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 0c27c2f..d5d5c5a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -963,16 +963,48 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
if (stencil[0].enabled) {
 
   if (face) {
- LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
-
- /* front_facing = face != 0 ? ~0 : 0 */
- front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
- front_facing = LLVMBuildSExt(builder, front_facing,
-  LLVMIntTypeInContext(gallivm->context,
- 
s_bld.type.length*s_bld.type.width),
-  "");
- front_facing = LLVMBuildBitCast(builder, front_facing,
- s_bld.int_vec_type, "");
+ if (0) {
+/*
+ * XXX: the scalar expansion below produces atrocious code
+ * (basically producing a 64bit scalar value, then moving the 2
+ * 32bit pieces separately to simd, plus 4 shuffles, which is
+ * seriously lame). But the scalar-simd transitions are always
+ * tricky, so no big surprise there.
+ * This here would be way better, however llvm has some serious
+ * trouble later using it in the select, probably because it will
+ * recognize the expression as constant and move the simd value
+ * away (out of the loop) - and then it will suddenly try
+ * constructing i1 high-bit masks out of it later...
+ * (Try piglit stencil-twoside.)
+ * Note this is NOT due to using SExt/Trunc, it fails exactly the
+ * same even when using native compare/select.
+ * I cannot reproduce this problem when using stand-alone compiler
+ * though, suggesting some problem with optimization passes...
+ * (With stand-alone compilation, the construction of this mask
+ * value, no matter if the easy 3 instruction here or the complex
+ * 16+ one below, never gets separated from where it's used.)
+ * The scalar code still has the same problem, but the generated
+ * code looks a bit better at least for some reason, even if
+ * mostly by luck (the fundamental issue clearly is the same).
+ */
+front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
+/* front_facing = face != 0 ? ~0 : 0 */
+front_facing = lp_build_compare(gallivm, s_bld.type,
+PIPE_FUNC_NOTEQUAL,
+front_facing, s_bld.zero);
+ } else {
+LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
+
+/* front_facing = face != 0 ? ~0 : 0 */
+front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+front_facing = LLVMBuildSExt(builder, front_facing,
+ LLVMIntTypeInContext(gallivm->context,
+
s_bld.type.length*s_bld.type.width),
+ "");
+front_facing 

[Mesa-dev] [PATCH 2/4] gallivm: use 2 srcs for 32->16bit conversions in lp_bld_conv_auto

2016-12-20 Thread sroland
From: Roland Scheidegger 

If we only feed one source vector at a time, we cannot use pack intrinsics
(as we only have a 64bit destination vector). lp_bld_conv_auto is
specifically designed to alter the length and number of destination vectors,
so this works just fine (if we used single source vectors at a time, we'd
immediately reassemble the vectors afterwards).
For AVX though this isn't really possible, since we already expect 128bit
output for a single 256bit input. (One day we should handle AVX2, which again
would need multiple inputs; however, there's the problem that we get
differently ordered output there and we don't want to reorder, so we'd need
to be able to tell build_conv to handle upper and lower halves
independently.)
A similar strategy would probably work for 32->8bit too (if it doesn't hit
the special case) but I'm going to try something different for that...
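
The underlying constraint, sketched with SSE2 intrinsics (illustrative
only): the 32->16 pack instructions consume two full source vectors to
produce one full destination vector, so feeding a single source vector at
a time leaves no way to use them:

   #include <emmintrin.h>

   /* packssdw: two 4x32 sources -> one 8x16 destination (signed saturate) */
   static __m128i
   pack_two_4x32(__m128i lo, __m128i hi)
   {
      return _mm_packs_epi32(lo, hi);
   }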
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 69d24a5..c8f9c28 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -497,8 +497,25 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
if (src_type.width == dst_type->width) {
   lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, 
num_dsts);
} else {
-  for (i = 0; i < num_srcs; ++i) {
- lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
+  /*
+   * If dst_width is 16 bits and src_width 32 and the dst vector size
+   * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
+   * (For AVX, this isn't needed, since we usually get 256bit src and
+   * 128bit dst vectors which works ok. If we do AVX2 pack this should
+   * be extended but need to be able to tell conversion code about pack
+   * ordering first.)
+   */
+  unsigned ratio = 1;
+  if (src_type.width == 2 * dst_type->width &&
+  src_type.length == dst_type->length &&
+  dst_type->floating == 0 && (num_srcs % 2 == 0) &&
+  dst_type->width * dst_type->length == 64) {
+ ratio = 2;
+ num_dsts /= 2;
+ dst_type->length *= 2;
+  }
+  for (i = 0; i < num_dsts; i++) {
+ lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, 
&dst[i], 1);
   }
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] gallivm: implement aos unpack (to unorm8) for small unorm formats

2016-12-20 Thread sroland
From: Roland Scheidegger 

Using bit replication. This path now resembles something which might make
sense. (The logic was mostly copied from the llvmpipe fs backend.)
I am not convinced though that it is actually faster than SoA sampling
(actually I'm quite certain it's always a loss with AVX).
With SoA it's just shift/mask/cvt/mul for getting the colors, whereas
there are still roughly 3 shifts and 3 or/and per channel for AoS
(i.e. for SoA it's exactly the same as it would be for a rgba8 format,
whereas the extra effort for AoS is significant). The filtering
might still be faster (albeit with FMA the instruction count goes down
quite a bit on the SoA float filtering path on new cpus). And those
small unorm formats often don't have an alpha channel (which makes things
relatively worse for the AoS path).
(This also fixes a trivial bug in the llvmpipe fs code this was derived
from, albeit it was only relevant for 4-bit channels.)
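
The scalar equivalent of the bit replication used here (a sketch; the
scale_bits_up helper below does the same with vector shifts):

   /* Widen an n-bit unorm to m bits by shifting left and filling the
    * low bits with copies of the value's own MSBs. E.g. 5 -> 8 bits:
    * y = (x << 3) | (x >> 2), so 31 -> 255 and 0 -> 0. */
   static inline unsigned
   replicate_bits(unsigned x, unsigned src_bits, unsigned dst_bits)
   {
      unsigned y = x << (dst_bits - src_bits);
      unsigned n;
      /* repeatedly copy the already-filled bits until all are filled */
      for (n = src_bits; n < dst_bits; n *= 2)
         y |= y >> n;
      return y & ((1u << dst_bits) - 1);
   }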
---
 src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 164 --
 src/gallium/drivers/llvmpipe/lp_state_fs.c|   8 +-
 2 files changed, 155 insertions(+), 17 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 574bb64..11d1118 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -52,6 +52,8 @@
 #include "lp_bld_format.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_bitarit.h"
 
 
 /**
@@ -139,6 +141,73 @@ format_matches_type(const struct util_format_description 
*desc,
return TRUE;
 }
 
+/*
+ * Do rounding when converting small unorm values to larger ones.
+ * Not quite 100% accurate, as it's done by appending MSBs, but
+ * should be good enough.
+ */
+
+static inline LLVMValueRef
+scale_bits_up(struct gallivm_state *gallivm,
+  int src_bits,
+  int dst_bits,
+  LLVMValueRef src,
+  struct lp_type src_type)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef result = src;
+
+   if (src_bits == 1 && dst_bits > 1) {
+  /*
+   * Useful for a1 - we'd need quite some repeated copies otherwise.
+   */
+  struct lp_build_context bld;
+  LLVMValueRef dst_mask;
+  lp_build_context_init(&bld, gallivm, src_type);
+  dst_mask = lp_build_const_int_vec(gallivm, src_type,
+(1 << dst_bits) - 1);
+  result = lp_build_cmp(&bld, PIPE_FUNC_EQUAL, src,
+lp_build_const_int_vec(gallivm, src_type, 0));
+  result = lp_build_andnot(&bld, dst_mask, result);
+   }
+   else if (dst_bits > src_bits) {
+  /* Scale up bits */
+  int db = dst_bits - src_bits;
+
+  /* Shift left by difference in bits */
+  result = LLVMBuildShl(builder,
+src,
+lp_build_const_int_vec(gallivm, src_type, db),
+"");
+
+  if (db <= src_bits) {
+ /* Enough bits in src to fill the remainder */
+ LLVMValueRef lower = LLVMBuildLShr(builder,
+src,
+lp_build_const_int_vec(gallivm, 
src_type,
+   src_bits - 
db),
+"");
+
+ result = LLVMBuildOr(builder, result, lower, "");
+  } else if (db > src_bits) {
+ /* Need to repeatedly copy src bits to fill remainder in dst */
+ unsigned n;
+
+ for (n = src_bits; n < dst_bits; n *= 2) {
+LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
+
+result = LLVMBuildOr(builder,
+ result,
+ LLVMBuildLShr(builder, result, shuv, ""),
+ "");
+ }
+  }
+   } else {
+  assert (dst_bits == src_bits);
+   }
+
+   return result;
+}
 
 /**
  * Unpack a single pixel into its XYZW components.
@@ -451,6 +520,86 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
}
 
/*
+* Bit arithmetic for converting small_unorm to unorm8.
+*
+* This misses some opportunities for optimizations (like skipping mask
+* for the highest channel for instance, or doing bit scaling in parallel
+* for channels with the same bit width) but it should be passable for
+* all arithmetic formats.
+*/
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
+   format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
+   util_format_fits_8unorm(format_desc) &&
+   type.width == 8 && type.norm == 1 && type.sign == 0 &&
+   type.fixed == 0 && type.floating == 0) {
+  LLVMValueRef packed, res, chans[4], rgba[4];
+  LLVMTypeRef dst_vec_type, conv_vec_type;
+  struct lp_type fetch_type, conv_type;
+  struct lp

[Mesa-dev] [PATCH 3/4] gallivm: optimize lp_build_unpack_arith_rgba_aos slightly

2016-12-20 Thread sroland
From: Roland Scheidegger 

This code uses a vector shift which has to be emulated on x86 unless
there's AVX2. Luckily in some cases we can actually avoid the shift
altogether, so do that.
Also make sure we hit the fast lp_build_conv() path when applicable,
albeit that's quite the hack...
That said, this path is taken for AoS sampling for small unorm (smaller
than rgba8) formats, and it is completely hopeless even with those
changes, with or without AVX.
(Probably should have some code similar to the one in the llvmpipe fs
backend code, using bit replication to extend to rgba - rounding
is not quite 100% accurate but if it's good enough there it should be
here as well.)
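
The trick in scalar form (a sketch of what the new code does per channel):
since the channel gets multiplied by a float scale anyway, mask it in place
and fold the shift into the scale instead of shifting first:

   /* (packed & mask) == channel << shift, and mask == maxval << shift,
    * so dividing by mask cancels the shift: */
   static inline float
   unpack_norm_channel(unsigned packed, unsigned shift, unsigned bits)
   {
      unsigned mask = ((1u << bits) - 1) << shift;
      return (float)(packed & mask) * (1.0f / (float)mask);
   }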
---
 src/gallium/auxiliary/gallivm/lp_bld_format_aos.c | 116 ++
 1 file changed, 97 insertions(+), 19 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c 
b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 322e7b8..574bb64 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_pointer.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_arit.h"
 #include "lp_bld_init.h"
@@ -49,6 +50,7 @@
 #include "lp_bld_gather.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_format.h"
+#include "lp_bld_pack.h"
 #include "lp_bld_intr.h"
 
 
@@ -156,6 +158,7 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state 
*gallivm,
LLVMValueRef shifts[4];
LLVMValueRef masks[4];
LLVMValueRef scales[4];
+   LLVMTypeRef vec32_type;
 
boolean normalized;
boolean needs_uitofp;
@@ -171,19 +174,17 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state 
*gallivm,
 * matches floating point size */
assert (LLVMTypeOf(packed) == LLVMInt32TypeInContext(gallivm->context));
 
+   vec32_type = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+
/* Broadcast the packed value to all four channels
 * before: packed = BGRA
 * after: packed = {BGRA, BGRA, BGRA, BGRA}
 */
-   packed = LLVMBuildInsertElement(builder,
-   
LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-   packed,
+   packed = LLVMBuildInsertElement(builder, LLVMGetUndef(vec32_type), packed,

LLVMConstNull(LLVMInt32TypeInContext(gallivm->context)),
"");
-   packed = LLVMBuildShuffleVector(builder,
-   packed,
-   
LLVMGetUndef(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
-   
LLVMConstNull(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4)),
+   packed = LLVMBuildShuffleVector(builder, packed, LLVMGetUndef(vec32_type),
+   LLVMConstNull(vec32_type),
"");
 
/* Initialize vector constants */
@@ -224,9 +225,40 @@ lp_build_unpack_arith_rgba_aos(struct gallivm_state 
*gallivm,
/* Ex: convert packed = {XYZW, XYZW, XYZW, XYZW}
 * into masked = {X, Y, Z, W}
 */
-   /* Note: we cannot do this shift on x86 natively until AVX2. */
-   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
-   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   if (desc->block.bits < 32 && normalized) {
+  /*
+   * Note: we cannot do the shift below on x86 natively until AVX2.
+   *
+   * Old llvm versions will resort to scalar extract/shift insert,
+   * which is definitely terrible, new versions will just do
+   * several vector shifts and shuffle/blend results together.
+   * We could turn this into a variable left shift plus a constant
+   * right shift, and llvm would then turn the variable left shift
+   * into a mul for us (albeit without sse41 the mul needs emulation
+   * too...). However, since we're going to do a float mul
+   * anyway, we just adjust that mul instead (plus the mask), skipping
+   * the shift completely.
+   * We could also use an extra mul when the format isn't normalized and
+   * we don't have AVX2 support, but don't bother for now. Unfortunately,
+   * this strategy doesn't work for 32bit formats (such as rgb10a2 or even
+   * rgba8 if it ends up here), as that would require UIToFP, albeit that
+   * would be fixable with an easy 16bit shuffle (unless there are channels
+   * crossing 16bit boundaries).
+   */
+  for (i = 0; i < 4; ++i) {
+ if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+unsigned bits = desc->channel[i].size;
+unsigned shift = desc->channel[i].shift;
+unsigned long long mask = ((1ULL << bits) - 1) << shift;
+scales[i] = lp_build_const_float(gallivm, 1.0 / mask);
+masks[i] = lp_build_const_in

[Mesa-dev] [PATCH 1/4] llvmpipe: use scalar load instead of vectors for small vectors in fs backend

2016-12-21 Thread sroland
From: Roland Scheidegger 

llvm has _huge_ problems trying to load things like <4 x i8> vectors and
stitching such loads together to form 128bit vectors. My understanding
of the problem is that the type legalizer tries to extend that to
really a <4 x i32> vector, and not a <16 x i8> vector with the 4 elements
first followed by padding, so the shuffles for then combining things
together are more or less impossible - you can in fact see the pmovzxd
llvm generates. Pre-4.0 llvm just gives up on it completely and does a 30+
pextrb/pinsrb sequence instead.
It looks like current llvm has fixed this behavior (my guess would be
due to better shuffle combination and load/shuffle folds), but we can
avoid this by just loading as <1 x i32> values, combining those and only
casting at the end. (I suspect it might also work if we padded the loaded
vectors immediately before shuffling them together, instead of directly
stitching 2 such vectors together pairwise before combining the pair.
But this _might_ lose the ability to load the values directly into
their right place in the vector with pinsrd.) Using 32bit values
is probably easier for llvm anyway, as it will never give it funny ideas
about how the vector should look.
(This is possibly only a problem for 1x8bit formats, since 2x8bit will
end up fetching 64bit, hence only two vectors are stitched together,
not 4, but we use the same strategy anyway.)
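
The idea sketched with intrinsics (illustrative only - names hypothetical,
and the real code builds LLVM IR instead): combine scalar 32bit loads via
movd/pinsrd and view the result as 16 x i8 only at the end:

   #include <smmintrin.h>
   #include <stdint.h>

   static __m128i
   load_4x32(const uint32_t *p0, const uint32_t *p1,
             const uint32_t *p2, const uint32_t *p3)
   {
      __m128i v = _mm_cvtsi32_si128((int)*p0); /* movd into lane 0 */
      v = _mm_insert_epi32(v, (int)*p1, 1);    /* pinsrd (SSE4.1)  */
      v = _mm_insert_epi32(v, (int)*p2, 2);
      v = _mm_insert_epi32(v, (int)*p3, 3);
      return v;  /* only now treated as <16 x i8> */
   }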
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 56 ++
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index e56ce1d..b6f4c2a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -799,7 +799,8 @@ load_unswizzled_block(struct gallivm_state *gallivm,
   gep[1] = LLVMBuildAdd(builder, bx, by, "");
 
   dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
-  dst_ptr = LLVMBuildBitCast(builder, dst_ptr, 
LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+  dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, 
dst_type), 0), "");
 
   dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
 
@@ -843,7 +844,8 @@ store_unswizzled_block(struct gallivm_state *gallivm,
   gep[1] = LLVMBuildAdd(builder, bx, by, "");
 
   src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
-  src_ptr = LLVMBuildBitCast(builder, src_ptr, 
LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+  src_ptr = LLVMBuildBitCast(builder, src_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, 
src_type), 0), "");
 
   src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
 
@@ -1632,6 +1634,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
struct lp_type blend_type;
struct lp_type row_type;
struct lp_type dst_type;
+   struct lp_type ls_type;
 
unsigned char swizzle[TGSI_NUM_CHANNELS];
unsigned vector_width;
@@ -2057,17 +2060,41 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
 */
dst_alignment = MIN2(16, dst_alignment);
 
+   ls_type = dst_type;
+
+   if (dst_count > src_count) {
+  if ((dst_type.width == 8 || dst_type.width == 16) &&
+  util_is_power_of_two(dst_type.length) &&
+  dst_type.length * dst_type.width < 128) {
+ /*
+  * Never try to load values as 4xi8 which we will then
+  * concatenate to larger vectors. This gives llvm a real
+  * headache (the problem is the type legalizer (?) will
+  * try to load that as 4xi8 zext to 4xi32 to fill the vector,
+  * then the shuffles to concatenate are more or less impossible
+  * - llvm is easily capable of generating a sequence of 32
+  * pextrb/pinsrb instructions for that. Albeit it appears to
+  * be fixed in llvm 4.0. So, load and concatenate with 32bit
+  * width to avoid the trouble (16bit seems not as bad, llvm
+  * probably recognizes the load+shuffle as only one shuffle
+  * is necessary, but we can do just the same anyway).
+  */
+ ls_type.length = dst_type.length * dst_type.width / 32;
+ ls_type.width = 32;
+  }
+   }
+
if (is_1d) {
   load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
-dst, dst_type, dst_count / 4, dst_alignment);
+dst, ls_type, dst_count / 4, dst_alignment);
   for (i = dst_count / 4; i < dst_count; i++) {
- dst[i] = lp_build_undef(gallivm, dst_type);
+ dst[i] = lp_build_undef(gallivm, ls_type);
   }
 
}
else {
   load_unswizzled_block(gallivm, color_ptr, stride, block_width, 
block_height,
-dst, dst_type, dst_count, dst_alignment);
+dst, ls_type, dst_count, dst_alignment);
}
 
 
@@ -2082,

[Mesa-dev] [PATCH 2/4] llvmpipe: use alpha from already converted color if possible

2016-12-21 Thread sroland
From: Roland Scheidegger 

For rgbx formats, there is no point in doing alpha conversion again (and
with a different transpose even, so llvm can't eliminate it).
Albeit it looks like there are some minimal changes needed in the blend
code (found by code inspection, no test seemed to complain) if we do
this - the blend factors are already sanitized if we have no destination
alpha, however for src_alpha_saturate it looks like it still might make a
difference (note that we forced has_alpha to true before for some formats
and nothing complained, but this seems safer).
---
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c | 29 +
 src/gallium/drivers/llvmpipe/lp_state_fs.c  | 43 ++---
 2 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c 
b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index a57670d..45c5c2b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -74,6 +74,7 @@ struct lp_build_blend_aos_context
LLVMValueRef dst;
LLVMValueRef const_;
LLVMValueRef const_alpha;
+   boolean has_dst_alpha;
 
LLVMValueRef inv_src;
LLVMValueRef inv_src_alpha;
@@ -115,10 +116,10 @@ lp_build_blend_factor_unswizzled(struct 
lp_build_blend_aos_context *bld,
  return bld->base.one;
   else {
  /*
-  * if there's separate src_alpha there's no dst alpha hence the 
complement
-  * is zero but for unclamped float inputs min can be non-zero 
(negative).
+  * If there's no dst alpha the complement is zero but for unclamped
+  * float inputs min can be non-zero (negative).
   */
- if (bld->src_alpha) {
+ if (!bld->has_dst_alpha) {
 if (!bld->saturate)
bld->saturate = lp_build_min(&bld->base, src_alpha, 
bld->base.zero);
  }
@@ -264,7 +265,8 @@ lp_build_blend_factor(struct lp_build_blend_aos_context 
*bld,
if (alpha_swizzle != PIPE_SWIZZLE_NONE) {
   rgb_swizzle   = lp_build_blend_factor_swizzle(rgb_factor);
   alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, 
TRUE);
-  return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, 
rgb_swizzle, alpha_swizzle, num_channels);
+  return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, 
rgb_swizzle,
+alpha_swizzle, num_channels);
} else {
   return rgb_factor_;
}
@@ -327,6 +329,7 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
bld.src_alpha = src_alpha;
bld.src1_alpha = src1_alpha;
bld.const_alpha = const_alpha;
+   bld.has_dst_alpha = FALSE;
 
/* Find the alpha channel if not provided seperately */
if (!src_alpha) {
@@ -335,6 +338,14 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
 alpha_swizzle = i;
  }
   }
+  /*
+   * Note that we may get src_alpha included from source (and 4 channels)
+   * even if the destination doesn't have an alpha channel (for rgbx
+   * formats). Generally this shouldn't make much of a difference (we're
+   * relying on blend factors being sanitized already if there's no
+   * dst alpha).
+   */
+  bld.has_dst_alpha = desc->swizzle[3] <= PIPE_SWIZZLE_W;
}
 
if (blend->logicop_enable) {
@@ -347,7 +358,9 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
} else if (!state->blend_enable) {
   result = src;
} else {
-  boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor 
&& state->alpha_src_factor == state->alpha_dst_factor) || nr_channels == 1;
+  boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor 
&&
+state->alpha_src_factor == 
state->alpha_dst_factor) ||
+   nr_channels == 1;
 
   src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor,
  state->alpha_src_factor,
@@ -370,7 +383,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
   rgb_alpha_same,
   false);
 
-  if(state->rgb_func != state->alpha_func && nr_channels > 1 && 
alpha_swizzle != PIPE_SWIZZLE_NONE) {
+  if(state->rgb_func != state->alpha_func && nr_channels > 1 &&
+alpha_swizzle != PIPE_SWIZZLE_NONE) {
  LLVMValueRef alpha;
 
  alpha = lp_build_blend(&bld.base,
@@ -397,7 +411,8 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
if (!util_format_colormask_full(desc, state->colormask)) {
   LLVMValueRef color_mask;
 
-  color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, 
state->colormask, nr_channels, swizzle);
+  color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type,
+state->colormask, 
nr_channels, swizzle);
   lp_build_name(color_mask, "c

[Mesa-dev] [PATCH 3/4] gallivm: generalize 4x4f->1x16ub special case conversion

2016-12-21 Thread sroland
From: Roland Scheidegger 

This special packing path can be easily extended to handle not just
float->unorm8 but also float->snorm8 and uint32->uint8 and int32->int8
(i.e. all interesting cases for llvmpipe fs backend code).
The packing parts all stay the same (only the last step packing will
be signed->signed instead of signed->unsigned but luckily even sse2 can do
both).
While here also note some bugs with that (we keep the bugs identical to
what we did before on x86, albeit other archs may differ). In particular
float->unorm8 too large values will still get clamped to 0, not 255, and for
float->snorm8 NaNs will end up as -1, not 0 (but we do the clamp against 1.0
there to prevent too large values ending up as -1.0 - this is inconsistent
to unorm8 handling but is what we ended up before, I'm not sure we can get
away without it). This is quite fishy in any case as we depend on
arch-dependent behavior of the iround (my understanding is in fact with
altivec the conversion would actually saturate although I've no idea about
NaNs, so probably wouldn't need to do anything for snorm).
(There are only minimal piglit tests for unorm clamping behavior AFAICT, in
particular nothing seems to test values which are too large to be handled by
the float->int conversion.)
For uint32->uint8 we also do a min against MAX_INT, since the source for
the packs is always signed (again, on x86 - should probably be able to
express these arch-dependent bits better some day).
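
The shared core of the path, as SSE2 intrinsics (a sketch; the scale,
round and clamp before the conversion are omitted). Only the final pack
differs between the cases (packuswb for unorm8/uint8, packsswb for
snorm8/int8), while the 32->16 step is signed->signed either way:

   #include <emmintrin.h>

   static __m128i
   pack_4x4i32_to_16u8(__m128i a, __m128i b, __m128i c, __m128i d)
   {
      __m128i lo = _mm_packs_epi32(a, b); /* 4xi32 pairs -> 8xi16, signed sat */
      __m128i hi = _mm_packs_epi32(c, d);
      return _mm_packus_epi16(lo, hi);    /* 16xi16 -> 16xu8, unsigned sat */
   }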
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 174 +++-
 1 file changed, 118 insertions(+), 56 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index c8f9c28..c688965 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -456,21 +456,21 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
src_type.sign == dst_type->sign)
   return num_dsts;
 
-   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
+   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
 */
-   if (src_type.floating == 1 &&
-   src_type.fixed== 0 &&
-   src_type.sign == 1 &&
-   src_type.norm == 0 &&
+   if (src_type.norm == 0 &&
src_type.width== 32 &&
+   src_type.fixed== 0 &&
 
dst_type->floating == 0 &&
dst_type->fixed== 0 &&
-   dst_type->sign == 0 &&
-   dst_type->norm == 1 &&
-   dst_type->width== 8)
-   {
-  /* Special case 4x4f --> 1x16ub */
+   dst_type->width== 8 &&
+
+   ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) 
||
+(src_type.floating == 0 && dst_type->floating == 0 &&
+ src_type.sign == dst_type->sign && dst_type->norm == 0))) {
+
+  /* Special case 4x4x32 --> 1x16x8 */
   if (src_type.length == 4 &&
 (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
   {
@@ -481,7 +481,7 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
  return num_dsts;
   }
 
-  /* Special case 2x8f --> 1x16ub */
+  /* Special case 2x8x32 --> 1x16x8 */
   if (src_type.length == 8 &&
   util_cpu_caps.has_avx)
   {
@@ -558,21 +558,25 @@ lp_build_conv(struct gallivm_state *gallivm,
num_tmps = num_srcs;
 
 
-   /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
+   /*
+* Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
+* Only float -> s/unorm8 and (u)int32->(u)int8.
+* XXX: This should cover all interesting backend cases for 8 bit,
+* but should use same strategy if dst is 16 bit.
 */
-   if (src_type.floating == 1 &&
-   src_type.fixed== 0 &&
-   src_type.sign == 1 &&
-   src_type.norm == 0 &&
+   if (src_type.norm == 0 &&
src_type.width== 32 &&
src_type.length   == 4 &&
+   src_type.fixed== 0 &&
 
dst_type.floating == 0 &&
dst_type.fixed== 0 &&
-   dst_type.sign == 0 &&
-   dst_type.norm == 1 &&
dst_type.width== 8 &&
 
+   ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
+(src_type.floating == 0 && dst_type.floating == 0 &&
+ src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
+
((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
 (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) 
&&
 
@@ -581,7 +585,7 @@ lp_build_conv(struct gallivm_state *gallivm,
   struct lp_build_context bld;
   struct lp_type int16_type, int32_type;
   struct lp_type dst_type_ext = dst_type;
-  LLVMValueRef const_255f;
+  LLVMValueRef const_scale;
   unsigned i, j;
 
   lp_build_context_init(&bld, gallivm, src_type);
@@ -597,14 +601,54 @@ lp_build_conv(struct gallivm_state *gallivm,
   int32_type.length /= 4;
   int32_type.sign = 1;
 
-  const_255f = lp_build_const_vec(gal

[Mesa-dev] [PATCH 4/4] llvmpipe: do transpose/untwiddle after conversion for 8bit formats

2016-12-21 Thread sroland
From: Roland Scheidegger 

Generally we should do the transpose after conversion if the format has
less than 32 bits per channel (if it has 32 bits, conversion is going to
be a no-op anyway...). This is obviously because there are fewer vectors
to deal with.
Though the advantage for 16 bit formats isn't that big, and in fact with AVX
there isn't really any (as the 32bit unpacks can be done with 256bit, but
the smaller ones cannot, although that would change again with proper AVX2
support).
Only makes sense for 2d and not 1d cases. And to keep things easy, only handle
1,2 and 4 channels (rgbx is just fine).
For rgba unorm8 format the backend conversion sums up to these instruction
totals (not counting the movs for SSE2 due to 2-op syntax - generally every 2
unpacks need an additional mov).
                      SSE2               AVX
 transpose:           32 unpack          16 unpack
 untwiddle:           0                  8 (128bit low/high permutes)
 convert:             16 mul + 16 cvt    8 mul + 8 cvt
 32->8bit:            12 pack            8 (128bit extract) + 12 pack

When doing transpose/untwiddle afterwards we get:

                      SSE2               AVX
 convert:             16 mul + 16 cvt    8 mul + 8 cvt
 32->8bit:            12 pack            8 (128bit extract) + 12 pack
 transpose/untwiddle: 12 unpack          12 unpack

So for SSE2, this drops 20 unpacks (total instruction count 76->56),
whereas for AVX it replaces the 16 256bit unpacks with 8 128bit ones
and drops the 8 lo/hi permutes (in total 60->48). (Albeit to be fair,
the permutes could be dropped even when doing the transpose first;
they are extremely pointless, but we'd need to be able to tell
lp_build_conv to reorder the vectors - for AVX2 we're going to need to
be able to tell lp_build_conv about ordering in any case.)

(With different ordering going into conversion, it would be possible
to do 4 unpacks + 4 pshufbs instead of 12 unpacks, but that might not
be better, and not all cpus can do it. Proper AVX2 support should eliminate
the 8 128bit extracts, reduce these 12 packs to 6 and the 12 unpacks to 2
pshufb + 2 permq ideally (+ 2 final 128bit extracts).)
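
For reference, the order the untwiddle has to undo, for one 4x2 pixel
block (a sketch; this is the r0r1r4r5r2r3r6r7 layout described in
fs_twiddle_transpose below):

   /* twiddled (two 2x2 quads):  {p00,p10,p01,p11, p20,p30,p21,p31}
    * raster (two rows of 4):    {p00,p10,p20,p30, p01,p11,p21,p31} */
   static const unsigned char untwiddle[8] = { 0, 1, 4, 5, 2, 3, 6, 7 };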
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 150 +++--
 1 file changed, 143 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c 
b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 2c0339c..af47b52 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -734,6 +734,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
   }
} else if (twiddle) {
   /* Twiddle pixels across elements of array */
+  /*
+   * XXX: we should avoid this in some cases, but would need to tell
+   * lp_build_conv to reorder (or deal with it ourselves).
+   */
   lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
} else {
   /* Do nothing */
@@ -764,6 +768,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
 }
 
 
+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
+ * be swizzled here). Extending to 16bit should be trivial.
+ * Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef *src,
+ unsigned src_count,
+ LLVMValueRef *dst)
+{
+   unsigned i, j;
+   struct lp_type type64, type16, type32;
+   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef tmp[4], shuf[8];
+   for (j = 0; j < 2; j++) {
+  shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+  shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+  shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+  shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+   }
+
+   assert(src_count == 4 || src_count == 2 || src_count == 1);
+   assert(type.width == 8);
+   assert(type.length == 16);
+
+   type8_t = lp_build_vec_type(gallivm, type);
+
+   type64 = type;
+   type64.length /= 8;
+   type64.width *= 8;
+   type64_t = lp_build_vec_type(gallivm, type64);
+
+   type16 = type;
+   type16.length /= 2;
+   type16.width *= 2;
+   type16_t = lp_build_vec_type(gallivm, type16);
+
+   type32 = type;
+   type32.length /= 4;
+   type32.width *= 4;
+   type32_t = lp_build_vec_type(gallivm, type32);
+
+   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+   if (src_count == 1) {
+  /* transpose was no-op, just untwiddle */
+  LLVMValueRef shuf_vec;
+  shuf_vec = LLVMConstVector(shuf, 8);
+  tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+  tmp[0] = LLVMBuildS

[Mesa-dev] [PATCH] gallivm: ignore rho approximation for cube maps

2013-09-30 Thread sroland
From: Roland Scheidegger 

There are two reasons for this:
1) even when ignoring the rho approximation for cube maps, the result is
still not correct, but it's better, as the max error at edges is now sqrt(2)
instead of 2 (which was a full mip level), same as it is for ordinary 2d maps
when doing rho approximations (so the error actually goes from a factor of 2
at edges and sqrt(2) completely inside a face, to sqrt(2) at edges and 0
inside a face).
2) I want to repurpose rho_no_approx for cubemaps for fully correct cubemap
derivatives (so we don't need yet another debug var).
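
Where the sqrt(2) factor comes from (a sketch in standard notation, using
the usual 2d rho definition; not code from the patch):

   \[
   \rho_{\mathrm{exact}} = \max\left(
     \sqrt{\left(\tfrac{\partial s}{\partial x}\right)^2 +
           \left(\tfrac{\partial t}{\partial x}\right)^2},\;
     \sqrt{\left(\tfrac{\partial s}{\partial y}\right)^2 +
           \left(\tfrac{\partial t}{\partial y}\right)^2}\right)
   \]

Since max(|a|,|b|) <= sqrt(a^2+b^2) <= sqrt(2)*max(|a|,|b|), approximating
each per-direction length by the largest absolute derivative is off by at
most a factor of sqrt(2), i.e. half a mip level (log2(sqrt(2)) = 1/2),
whereas a factor of 2 is a full mip level.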
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   34 +
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index c775382..ea6bec7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -269,10 +269,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
 rho_bld->type, float_size, index0);
-  if (no_rho_opt) {
- /* skipping sqrt hence returning rho squared */
- cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
-  }
+  /* skipping sqrt hence returning rho squared */
+  cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
   rho = lp_build_mul(rho_bld, cubesize, rho);
}
else if (derivs && !(bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE)) {
@@ -757,8 +755,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
   }
   else {
  LLVMValueRef rho;
- boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
-   (bld->dims > 1);
+ boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) 
&&
+(bld->dims > 1)) || cube_rho;
 
  rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -1602,31 +1600,23 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   * know the texture is square which simplifies things (we can omit the
   * size mul which happens very early completely here and do it at the
   * very end).
+  * Also always do calculations according to 
GALLIVM_DEBUG_NO_RHO_APPROX
+  * since the error can get quite big otherwise at edges.
+  * (With no_rho_approx max error is sqrt(2) at edges, same as it is
+  * without no_rho_approx for 2d textures, otherwise it would be 
factor 2.)
   */
  ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
-ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
- }
- else {
-ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
- }
+ ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+ ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
- }
- else {
-rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
- }
+ rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+ rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] gallivm: ignore rho approximation for cube maps

2013-10-03 Thread sroland
From: Roland Scheidegger 

There are two reasons for this:
1) even when ignoring the rho approximation for cube maps, the result is
still not correct, but it's better, as the max error at edges is now sqrt(2)
instead of 2 (which was a full mip level), same as it is for ordinary 2d maps
when doing rho approximations (so the error actually goes from a factor of 2
at edges and sqrt(2) completely inside a face, to sqrt(2) at edges and 0
inside a face).
2) I want to repurpose rho_no_approx for cubemaps for fully correct cubemap
derivatives (so we don't need yet another debug var).
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   34 +
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index c775382..ea6bec7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -269,10 +269,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
 rho_bld->type, float_size, index0);
-  if (no_rho_opt) {
- /* skipping sqrt hence returning rho squared */
- cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
-  }
+  /* skipping sqrt hence returning rho squared */
+  cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
   rho = lp_build_mul(rho_bld, cubesize, rho);
}
else if (derivs && !(bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE)) {
@@ -757,8 +755,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
   }
   else {
  LLVMValueRef rho;
- boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
-   (bld->dims > 1);
+ boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) 
&&
+(bld->dims > 1)) || cube_rho;
 
  rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -1602,31 +1600,23 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   * know the texture is square which simplifies things (we can omit the
   * size mul which happens very early completely here and do it at the
   * very end).
+  * Also always do calculations according to 
GALLIVM_DEBUG_NO_RHO_APPROX
+  * since the error can get quite big otherwise at edges.
+  * (With no_rho_approx max error is sqrt(2) at edges, same as it is
+  * without no_rho_approx for 2d textures, otherwise it would be 
factor 2.)
   */
  ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
-ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
- }
- else {
-ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
- }
+ ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+ ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
- }
- else {
-rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
- }
+ rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+ rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] gallivm: handle explicit derivatives for cubemaps

2013-10-03 Thread sroland
From: Roland Scheidegger 

They need some special handling. Quite complicated.
Additionally, use the same code for implicit derivatives too if no_rho_approx
and no_quad_lod are set, because it seems that while it should generally be ok
to use per-quad lod for implicit derivatives, there's at least one test which
insists that in the case of cubemaps the shared lod value MUST come from a
pixel inside the primitive (due to the derivatives becoming different if a
different larger major axis is chosen).
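
The underlying transform (a sketch in standard notation; it matches the
dx.s/dx.t formulas in the new code below): with the face coordinate
defined as s = (sc/ma + 1)/2, the quotient rule gives

   \[
   \frac{\partial s}{\partial x}
     = \frac{1}{2}\cdot
       \frac{\frac{\partial s_c}{\partial x}\,m_a
             - s_c\,\frac{\partial m_a}{\partial x}}{m_a^2}
     = \frac{1}{2\,m_a}\left(\frac{\partial s_c}{\partial x}
             - \frac{s_c}{m_a}\,\frac{\partial m_a}{\partial x}\right)
   \]

and analogously for t and for the y direction, which is exactly the
0.5 * (dx.sc - sc * dx.ma / ma) / ma form used in the code.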
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  221 +++--
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |   35 +++-
 3 files changed, 231 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index ea6bec7..ce05522 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -273,7 +273,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
   cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
   rho = lp_build_mul(rho_bld, cubesize, rho);
}
-   else if (derivs && !(bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE)) {
+   else if (derivs) {
   LLVMValueRef ddmax[3], ddx[3], ddy[3];
   for (i = 0; i < dims; i++) {
  LLVMValueRef floatdim;
@@ -1488,8 +1488,9 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
  LLVMValueRef *coords,
- const struct lp_derivatives *derivs, /* optional */
+ const struct lp_derivatives *derivs_in, /* optional */
  LLVMValueRef *rho,
+ struct lp_derivatives *derivs_out, /* optional */
  boolean need_derivs)
 {
struct lp_build_context *coord_bld = &bld->coord_bld;
@@ -1512,8 +1513,6 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
* the edge). Still this is possibly a win over just selecting the same 
face
* for all pixels. Unfortunately, something like that doesn't work for
* explicit derivatives.
-   * TODO: handle explicit derivatives by transforming them alongside 
coords
-   * somehow.
*/
   struct lp_build_context *cint_bld = &bld->int_coord_bld;
   struct lp_type intctype = cint_bld->type;
@@ -1522,7 +1521,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
   LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
   LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
   LLVMValueRef tnegi, rnegi;
-  LLVMValueRef ma, mai, ima;
+  LLVMValueRef ma, mai, imahalfpos;
   LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
   LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
  1 << (intctype.width - 
1));
@@ -1561,7 +1560,195 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   maxasat = lp_build_max(coord_bld, as, at);
   ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
 
-  if (need_derivs) {
+  if (need_derivs && (derivs_in ||
+  ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+   (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
+ /*
+  * XXX: This is really really complex.
+  * It is a bit overkill to use this for implicit derivatives as well,
+  * no way this is worth the cost in practice, but seems to be the
+  * only way for getting accurate and per-pixel lod values.
+  */
+ LLVMValueRef imapos, tmp, ddx[3], ddy[3];
+ LLVMValueRef madx, mady, madxdivma, madydivma;
+ LLVMValueRef sdxi, tdxi, rdxi, signsdx, signtdx, signrdx;
+ LLVMValueRef sdyi, tdyi, rdyi, signsdy, signtdy, signrdy;
+ LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
+ LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
+ LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
+ LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
+ LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm,
+   coord_bld->type, 0.5);
+ /*
+  * s = 1/2 * ( sc / ma + 1)
+  * t = 1/2 * ( tc / ma + 1)
+  *
+  * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
+  * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
+  *
+  * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
+  * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
+  * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
+  * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
+  */
+
+ /* select ma, calculate ima */
+ ma = lp_build_select(coord_bld, as_ge_at, s, t);
+ ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma

[Mesa-dev] [PATCH 3/3] gallivm: kill old per-quad face selection code

2013-10-03 Thread sroland
From: Roland Scheidegger 

Not used for ages, and it wouldn't work at all with explicit derivatives now
(not that it did before, as it ignored them, but now the code would just use
the pre-projected derivs, which would be quite random numbers).
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  751 +++--
 1 file changed, 313 insertions(+), 438 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index ce05522..3fac981 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1493,323 +1493,135 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
  struct lp_derivatives *derivs_out, /* optional */
  boolean need_derivs)
 {
+   /*
+* Do per-pixel face selection. We cannot however (as we used to do)
+* simply calculate the derivs afterwards (which is very bogus for
+* explicit derivs btw) because the values would be "random" when
+* not all pixels lie on the same face. So what we do here is just
+* calculate the derivatives after scaling the coords by the absolute
+* value of the inverse major axis, and essentially do rho calculation
+* steps as if it were a 3d texture. This is perfect if all pixels hit
+* the same face, but not so great at edges, I believe the max error
+* should be sqrt(2) with no_rho_approx or 2 otherwise (essentially 
measuring
+* the 3d distance between 2 points on the cube instead of measuring up/down
+* the edge). Still this is possibly a win over just selecting the same face
+* for all pixels. Unfortunately, something like that doesn't work for
+* explicit derivatives.
+*/
struct lp_build_context *coord_bld = &bld->coord_bld;
LLVMBuilderRef builder = bld->gallivm->builder;
struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef si, ti, ri;
+   struct lp_build_context *cint_bld = &bld->int_coord_bld;
+   struct lp_type intctype = cint_bld->type;
+   LLVMValueRef signs, signt, signr, signma;
+   LLVMValueRef as, at, ar, face, face_s, face_t;
+   LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
+   LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+   LLVMValueRef tnegi, rnegi;
+   LLVMValueRef ma, mai, imahalfpos;
+   LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+   LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+  1 << (intctype.width - 1));
+   LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+   intctype.width -1);
+   LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, 
PIPE_TEX_FACE_POS_X);
+   LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, 
PIPE_TEX_FACE_POS_Y);
+   LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, 
PIPE_TEX_FACE_POS_Z);
+   LLVMValueRef s = coords[0];
+   LLVMValueRef t = coords[1];
+   LLVMValueRef r = coords[2];
+
+   assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+   assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+   assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
 
-   if (1 || coord_bld->type.length > 4) {
-  /*
-   * Do per-pixel face selection. We cannot however (as we used to do)
-   * simply calculate the derivs afterwards (which is very bogus for
-   * explicit derivs btw) because the values would be "random" when
-   * not all pixels lie on the same face. So what we do here is just
-   * calculate the derivatives after scaling the coords by the absolute
-   * value of the inverse major axis, and essentially do rho calculation
-   * steps as if it were a 3d texture. This is perfect if all pixels hit
-   * the same face, but not so great at edges, I believe the max error
-   * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially 
measuring
-   * the 3d distance between 2 points on the cube instead of measuring 
up/down
-   * the edge). Still this is possibly a win over just selecting the same 
face
-   * for all pixels. Unfortunately, something like that doesn't work for
-   * explicit derivatives.
-   */
-  struct lp_build_context *cint_bld = &bld->int_coord_bld;
-  struct lp_type intctype = cint_bld->type;
-  LLVMValueRef signs, signt, signr, signma;
-  LLVMValueRef as, at, ar, face, face_s, face_t;
-  LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
-  LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
-  LLVMValueRef tnegi, rnegi;
-  LLVMValueRef ma, mai, imahalfpos;
-  LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
-  LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
- 1 << (intctype.width - 
1));
-  LLVMValueRef signshift = lp_build_const_int_vec(gal

[Mesa-dev] [PATCH 2/3] gallivm: handle explicit derivatives for cubemaps

2013-10-04 Thread sroland
From: Roland Scheidegger 

They need some special handling. Quite complicated.
Additionally, use the same code for implicit derivatives too if no_rho_approx
and no_quad_lod are set, because it seems that while it should generally be ok
to use per-quad lod for implicit derivatives, there's at least one test which
insists that in the case of cubemaps the shared lod value MUST come from a
pixel inside the primitive (due to the derivatives becoming different if a
different larger major axis is chosen).

v2: based on Brian's feedback, clean up the code a bit.
Also use the sign bit of the major axis instead of the pre-selected s/t/r
sign for coord mirroring (which should be the same in the end, and saves 2
ands).
Also fix two bugs with the select/mirror of derivatives: the minor axes need
to use the major axis sign as well (instead of the major derivative axis
sign), and don't mistakenly use absolute values of the major derivative and
inverse major values.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  253 +
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |   35 ++-
 3 files changed, 235 insertions(+), 56 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index dc593aa..39c3a2f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -273,7 +273,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
   cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
   rho = lp_build_mul(rho_bld, cubesize, rho);
}
-   else if (derivs && !(bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE)) {
+   else if (derivs) {
   LLVMValueRef ddmax[3], ddx[3], ddy[3];
   for (i = 0; i < dims; i++) {
  LLVMValueRef floatdim;
@@ -1481,6 +1481,21 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 }
 
 
+/** Helper for doing 3-wise selection.
+ * Returns sel1 ? val2 : (sel0 ? val0 : val1).
+ */
+static LLVMValueRef
+lp_build_select3(struct lp_build_context *sel_bld,
+ LLVMValueRef sel0,
+ LLVMValueRef sel1,
+ LLVMValueRef val0,
+ LLVMValueRef val1,
+ LLVMValueRef val2)
+{
+   LLVMValueRef tmp;
+   tmp = lp_build_select(sel_bld, sel0, val0, val1);
+   return lp_build_select(sel_bld, sel1, val2, tmp);
+}
 
 /**
  * Generate code to do cube face selection and compute per-face texcoords.
@@ -1488,8 +1503,9 @@ lp_build_cube_face(struct lp_build_sample_context *bld,
 void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
  LLVMValueRef *coords,
- const struct lp_derivatives *derivs, /* optional */
+ const struct lp_derivatives *derivs_in, /* optional */
  LLVMValueRef *rho,
+ struct lp_derivatives *derivs_out, /* optional */
  boolean need_derivs)
 {
struct lp_build_context *coord_bld = &bld->coord_bld;
@@ -1512,19 +1528,16 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
* the edge). Still this is possibly a win over just selecting the same 
face
* for all pixels. Unfortunately, something like that doesn't work for
* explicit derivatives.
-   * TODO: handle explicit derivatives by transforming them alongside 
coords
-   * somehow.
*/
   struct lp_build_context *cint_bld = &bld->int_coord_bld;
   struct lp_type intctype = cint_bld->type;
   LLVMTypeRef coord_vec_type = coord_bld->vec_type;
   LLVMTypeRef cint_vec_type = cint_bld->vec_type;
-  LLVMValueRef signs, signt, signr, signma;
   LLVMValueRef as, at, ar, face, face_s, face_t;
   LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
   LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
   LLVMValueRef tnegi, rnegi;
-  LLVMValueRef ma, mai, ima;
+  LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
   LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
   LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
  1 << (intctype.width - 
1));
@@ -1563,7 +1576,166 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   maxasat = lp_build_max(coord_bld, as, at);
   ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
 
-  if (need_derivs) {
+  if (need_derivs && (derivs_in ||
+  ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+   (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
+ /*
+  * XXX: This is really really complex.
+  * It is a bit overkill to use this for implicit derivatives as well,
+  * no way this is worth the cost in practice, but seems to be the
+  * only way for getting accurate and per-pixel lod values.
+  */
+ LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
+

[Mesa-dev] [PATCH 1/3] gallivm: ignore rho approximation for cube maps

2013-10-04 Thread sroland
From: Roland Scheidegger 

There are two reasons for this:
1) even when ignoring rho approximation for cube maps, the result is still
not correct, but it's better: the max error at edges is now sqrt(2) instead
of 2 (which was a full mip level), same as for ordinary 2d maps when doing
rho approximations (so the error goes from factor 2 at edges and sqrt(2)
completely inside a face, to sqrt(2) at edges and 0 inside a face).
2) I want to repurpose rho_no_approx for cubemaps for fully correct cubemap
derivatives (so we don't need yet another debug var).
---
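For reference, where the sqrt(2) bound comes from: per direction the exact
rho is sqrt(dudx*dudx + dvdx*dvdx), whereas the approximation takes
max(|dudx|, |dvdx|). The ratio sqrt(a*a + b*b) / max(|a|, |b|) is 1 when
one component is zero and peaks at sqrt(2) when |a| == |b|, i.e. at most
half a mip level of error instead of a full one.
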
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   50 ++---
 1 file changed, 20 insertions(+), 30 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index c775382..dc593aa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -269,10 +269,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
   /* Could optimize this for single quad just skip the broadcast */
   cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
 rho_bld->type, float_size, index0);
-  if (no_rho_opt) {
- /* skipping sqrt hence returning rho squared */
- cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
-  }
+  /* skipping sqrt hence returning rho squared */
+  cubesize = lp_build_mul(rho_bld, cubesize, cubesize);
   rho = lp_build_mul(rho_bld, cubesize, rho);
}
else if (derivs && !(bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE)) {
@@ -757,8 +755,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
   }
   else {
  LLVMValueRef rho;
- boolean rho_squared = (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
-   (bld->dims > 1);
+ boolean rho_squared = ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) 
&&
+(bld->dims > 1)) || cube_rho;
 
  rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
 
@@ -1519,6 +1517,8 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
*/
   struct lp_build_context *cint_bld = &bld->int_coord_bld;
   struct lp_type intctype = cint_bld->type;
+  LLVMTypeRef coord_vec_type = coord_bld->vec_type;
+  LLVMTypeRef cint_vec_type = cint_bld->vec_type;
   LLVMValueRef signs, signt, signr, signma;
   LLVMValueRef as, at, ar, face, face_s, face_t;
   LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
@@ -1602,40 +1602,32 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
   * know the texture is square which simplifies things (we can omit the
   * size mul which happens very early completely here and do it at the
   * very end).
+  * Also always do calculations according to 
GALLIVM_DEBUG_NO_RHO_APPROX
+  * since the error can get quite big otherwise at edges.
+  * (With no_rho_approx max error is sqrt(2) at edges, same as it is
+  * without no_rho_approx for 2d textures, otherwise it would be 
factor 2.)
   */
  ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
  ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
-ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
- }
- else {
-ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
- }
+ ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+ ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
  tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
  tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
 
- if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
-rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
- }
- else {
-rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
-rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
- }
+ rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+ rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
 
  tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
  tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
  *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
   }
 
-  si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), 
"");
-  ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), 
"");
-  ri = LLVMBui

[Mesa-dev] [PATCH 3/3] gallivm: kill old per-quad face selection code

2013-10-04 Thread sroland
From: Roland Scheidegger 

Not used in ages, and it wouldn't work at all with explicit derivatives now
(not that it did before, as it ignored them, but now the code would just use
the derivs pre-projected, which would be quite random numbers).

v2: also get rid of 3 helper functions no longer used.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  761 ++---
 1 file changed, 286 insertions(+), 475 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 39c3a2f..1c35200 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1414,72 +1414,6 @@ lp_build_cube_imapos(struct lp_build_context *coord_bld, 
LLVMValueRef coord)
return ima;
 }
 
-/** Helper used by lp_build_cube_lookup() */
-static LLVMValueRef
-lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
-{
-   /* ima = -0.5 / abs(coord); */
-   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, 
coord_bld->type, -0.5);
-   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
-   return ima;
-}
-
-/**
- * Helper used by lp_build_cube_lookup()
- * FIXME: the sign here can also be 0.
- * Arithmetically this could definitely make a difference. Either
- * fix the comment or use other (simpler) sign function, not sure
- * which one it should be.
- * \param sign  scalar +1 or -1
- * \param coord  float vector
- * \param ima  float vector
- */
-static LLVMValueRef
-lp_build_cube_coord(struct lp_build_context *coord_bld,
-LLVMValueRef sign, int negate_coord,
-LLVMValueRef coord, LLVMValueRef ima)
-{
-   /* return negate(coord) * ima * sign + 0.5; */
-   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 
0.5);
-   LLVMValueRef res;
-
-   assert(negate_coord == +1 || negate_coord == -1);
-
-   if (negate_coord == -1) {
-  coord = lp_build_negate(coord_bld, coord);
-   }
-
-   res = lp_build_mul(coord_bld, coord, ima);
-   if (sign) {
-  sign = lp_build_broadcast_scalar(coord_bld, sign);
-  res = lp_build_mul(coord_bld, res, sign);
-   }
-   res = lp_build_add(coord_bld, res, half);
-
-   return res;
-}
-
-
-/** Helper used by lp_build_cube_lookup()
- * Return (major_coord >= 0) ? pos_face : neg_face;
- */
-static LLVMValueRef
-lp_build_cube_face(struct lp_build_sample_context *bld,
-   LLVMValueRef major_coord,
-   unsigned pos_face, unsigned neg_face)
-{
-   struct gallivm_state *gallivm = bld->gallivm;
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
-major_coord,
-bld->float_bld.zero, "");
-   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
-   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
-   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
-   return res;
-}
-
 
 /** Helper for doing 3-wise selection.
  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
@@ -1497,6 +1431,7 @@ lp_build_select3(struct lp_build_context *sel_bld,
return lp_build_select(sel_bld, sel1, val2, tmp);
 }
 
+
 /**
  * Generate code to do cube face selection and compute per-face texcoords.
  */
@@ -1513,301 +1448,141 @@ lp_build_cube_lookup(struct lp_build_sample_context 
*bld,
struct gallivm_state *gallivm = bld->gallivm;
LLVMValueRef si, ti, ri;
 
-   if (1 || coord_bld->type.length > 4) {
-  /*
-   * Do per-pixel face selection. We cannot however (as we used to do)
-   * simply calculate the derivs afterwards (which is very bogus for
-   * explicit derivs btw) because the values would be "random" when
-   * not all pixels lie on the same face. So what we do here is just
-   * calculate the derivatives after scaling the coords by the absolute
-   * value of the inverse major axis, and essentially do rho calculation
-   * steps as if it were a 3d texture. This is perfect if all pixels hit
-   * the same face, but not so great at edges, I believe the max error
-   * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially 
measuring
-   * the 3d distance between 2 points on the cube instead of measuring 
up/down
-   * the edge). Still this is possibly a win over just selecting the same 
face
-   * for all pixels. Unfortunately, something like that doesn't work for
-   * explicit derivatives.
-   */
-  struct lp_build_context *cint_bld = &bld->int_coord_bld;
-  struct lp_type intctype = cint_bld->type;
-  LLVMTypeRef coord_vec_type = coord_bld->vec_type;
-  LLVMTypeRef cint_vec_type = cint_bld->vec_type;
-  LLVMValueRef as, at, ar, face, face_s, face_t;
-  LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
-  LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;

[Mesa-dev] [PATCH] softpipe: fix seamless cube filtering

2013-10-10 Thread sroland
From: Roland Scheidegger 

Fix coord wrapping (and face selection too) in case of edges.
Unfortunately, the coord wrapping is way more complicated than what
the code did, as it depends on the face and the direction in which the
texel falls off the face (the logic needed to get this right in fact
seems utterly ridiculous).
Also fix a bug in (y direction under/overflow) face selection.
And get rid of the complicated cube corner handling. Just like in the edge
case, the coord wrapping was wrong, and it seems very difficult to fix.
I'm nearly certain it can't always work anyway (though ordinary seamless
filtering on an edge actually has a similar problem, just not as severe),
because we don't have a per-pixel face, hence could have multiple corner
texels, which would make it very difficult to average the remaining texels
correctly. Hence instead simply pick a texel which would only have fallen
off one edge but not both, which is not quite accurate but I think should
actually be enough to meet OpenGL (but not d3d10) requirements.
---
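To make the wrapping table in the new comment concrete: a texel one step
off the left edge of the +x face (s == -1) reappears on the +z face in the
rightmost column with t unchanged - that's the "+x s- -> +z: s = max, t = t"
row - so get_next_face(PIPE_TEX_FACE_POS_X, 0) yields PIPE_TEX_FACE_POS_Z
and get_next_xcoord() returns max for that case.
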
 src/gallium/drivers/softpipe/sp_tex_sample.c |  200 +++---
 1 file changed, 150 insertions(+), 50 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 8dcc297..b905790 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -608,6 +608,48 @@ get_texel_2d(const struct sp_sampler_view *sp_sview,
}
 }
 
+
+/*
+ * Here's the complete logic (HOLY CRAP) for finding next face and doing the
+ * corresponding coord wrapping, implemented by get_next_face,
+ * get_next_xcoord, get_next_ycoord.
+ * Read like that (first line):
+ * If face is +x and s coord is below zero, then
+ * new face is +z, new s is max , new t is old t
+ * (max is always cube size - 1).
+ *
+ * +x s- -> +z: s = max,   t = t
+ * +x s+ -> -z: s = 0, t = t
+ * +x t- -> +y: s = max,   t = max-s
+ * +x t+ -> -y: s = max,   t = s
+ *
+ * -x s- -> -z: s = max,   t = t
+ * -x s+ -> +z: s = 0, t = t
+ * -x t- -> +y: s = 0, t = s
+ * -x t+ -> -y: s = 0, t = max-s
+ *
+ * +y s- -> -x: s = t, t = 0
+ * +y s+ -> +x: s = max-t, t = 0
+ * +y t- -> -z: s = max-s, t = 0
+ * +y t+ -> +z: s = s, t = 0
+ *
+ * -y s- -> -x: s = max-t, t = max
+ * -y s+ -> +x: s = t, t = max
+ * -y t- -> +z: s = s, t = max
+ * -y t+ -> -z: s = max-s, t = max
+ *
+ * +z s- -> -x: s = max,   t = t
+ * +z s+ -> +x: s = 0, t = t
+ * +z t- -> +y: s = s, t = max
+ * +z t+ -> -y: s = s, t = 0
+ *
+ * -z s- -> +x: s = max,   t = t
+ * -z s+ -> -x: s = 0, t = t
+ * -z t- -> +y: s = max-s, t = 0
+ * -z t+ -> -y: s = max-s, t = max
+ */
+
+
 /*
  * seamless cubemap neighbour array.
  * this array is used to find the adjacent face in each of 4 directions,
@@ -617,49 +659,101 @@ static const unsigned face_array[PIPE_TEX_FACE_MAX][4] = 
{
/* pos X first then neg X is Z different, Y the same */
/* PIPE_TEX_FACE_POS_X,*/
{ PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z,
- PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+ PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y },
/* PIPE_TEX_FACE_NEG_X */
{ PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z,
- PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+ PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y },
 
/* pos Y first then neg Y is X different, X the same */
/* PIPE_TEX_FACE_POS_Y */
{ PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
- PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z },
+ PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z },
 
/* PIPE_TEX_FACE_NEG_Y */
{ PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
- PIPE_TEX_FACE_NEG_Z, PIPE_TEX_FACE_POS_Z },
+ PIPE_TEX_FACE_POS_Z, PIPE_TEX_FACE_NEG_Z },
 
/* pos Z first then neg Y is X different, X the same */
/* PIPE_TEX_FACE_POS_Z */
{ PIPE_TEX_FACE_NEG_X, PIPE_TEX_FACE_POS_X,
- PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y },
+ PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y },
 
/* PIPE_TEX_FACE_NEG_Z */
{ PIPE_TEX_FACE_POS_X, PIPE_TEX_FACE_NEG_X,
- PIPE_TEX_FACE_NEG_Y, PIPE_TEX_FACE_POS_Y }
+ PIPE_TEX_FACE_POS_Y, PIPE_TEX_FACE_NEG_Y }
 };
 
 static INLINE unsigned
-get_next_face(unsigned face, int x, int y)
+get_next_face(unsigned face, int idx)
 {
-   int idx = 0;
+   return face_array[face][idx];
+}
 
-   if (x == 0 && y == 0)
-  return face;
-   if (x == -1)
-  idx = 0;
-   else if (x == 1)
-  idx = 1;
-   else if (y == -1)
-  idx = 2;
-   else if (y == 1)
-  idx = 3;
+static INLINE unsigned
+get_next_xcoord(unsigned face, unsigned fall_off_index, int max, int xc, int 
yc)
+{
+   if ((face == 0 && fall_off_index != 1) ||
+   (face == 1 && fall_off_index == 0) ||
+   (face == 4 && fall_off_index == 0) ||
+   (face == 5 && fall_off_index == 0)){
+  return max;
+   }
+   if ((face == 1 && fall_off_index != 0) ||
+   (face == 0 && fall_off_index == 1) ||
+   (face == 4 && fall_off_index == 1) ||
+   (face == 5 && fall_off_in

[Mesa-dev] [PATCH] llvmpipe: increase fs shader variant instruction cache limit by factor 4

2013-10-11 Thread sroland
From: Roland Scheidegger 

The previous limit of 128*1024 was reported on IRC to cause frequent
recompiles in some apps due to shader variant thrashing, leading to
noticeable lags.
Note that the LP_MAX_SHADER_VARIANTS limit (1024) was more or less impossible
to reach, since even simple fragment shaders without texturing (glxgears) used
more than twice 128 instructions, hence the instruction limit would have
always been reached first (excluding things like trivial shaders not writing
color). Even with the new limit it is VERY likely the instruction limit is hit
first.
Should help with such lags due to recompiles (though other shader types have
their own limits, LP_MAX_SETUP_VARIANTS and DRAW_MAX_SHADER_VARIANTS; in
particular the latter seems a bit small (128)).
---
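Quick arithmetic on the new limit: 512 * LP_MAX_SHADER_VARIANTS =
512 * 1024 = 524288 instructions in total, or 512 instructions per variant
on average before the variant count limit could matter, so with typical
shaders at several hundred instructions the instruction limit still trips
first, as noted above.
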
 src/gallium/drivers/llvmpipe/lp_limits.h |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h 
b/src/gallium/drivers/llvmpipe/lp_limits.h
index af31b35..8cfab5a 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -79,7 +79,7 @@
  * Max number of instructions (for all fragment shaders combined per context)
  * that will be kept around.
  */
-#define LP_MAX_SHADER_INSTRUCTIONS (128*1024)
+#define LP_MAX_SHADER_INSTRUCTIONS (512*LP_MAX_SHADER_VARIANTS)
 
 /**
  * Max number of setup variants that will be kept around.
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] gallivm: implement seamless cube filtering

2013-10-18 Thread sroland
From: Roland Scheidegger 

For seamless cube filtering it is necessary to determine new faces and new
coords per sample. The logic for this is _seriously_ complex (what needs
to happen is very "asymmetric" wrt face, x/y under/overflow), further
complicated by the fact that if the 4 samples are in a corner (meaning we
only have actually 3 samples, and all 3 are on different faces) then
falling off the edge happens _both_ on the x and y axis simultaneously.
There was a noticeable performance hit in mesa's cubemap demo when seamless
filtering was forced on (just below 10 percent or so in a debug build, when
disabling all filtering hacks, otherwise it would probably be a bit more),
hence use a branch so the new logic is only done if any of the pixels in a
quad (or in two quads) actually hit this. With that there was no measurable
performance hit in the cubemap demo (neither in a debug nor a release
build), but this will vary (the cubemap demo very rarely hits edges).
Might also be different on other cpus, as this forces the SoA sampling path
which potentially can be quite a bit slower.
Note that as for corners, this code gets all the 3 samples which actually
exist right, and the 4th texel will simply be the same as one of the others,
meaning that filter weights will be a bit wrong. This however should be
enough for full OpenGL (but not d3d10) compliance.
---
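A conceptual sketch of the branch mentioned above, in plain scalar C
(helper names here are invented; the real code works on fall_off masks in
LLVM IR):

   bool have_edge = false;
   for (i = 0; i < num_samples; i++)
      have_edge |= sample_falls_off_face(x0[i], x1[i], y0[i], y1[i], max);
   if (have_edge) {
      /* slow path: compute new face / new coords per sample */
   } else {
      /* fast path: ordinary wrap, all samples stay on their face */
   }
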
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |  138 +++
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |   13 ++
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  257 +
 3 files changed, 368 insertions(+), 40 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 1c35200..a032d9d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1402,6 +1402,144 @@ lp_build_unnormalized_coords(struct 
lp_build_sample_context *bld,
}
 }
 
+/**
+ * Generate new coords and faces for cubemap texels falling off the face.
+ *
+ * @param face   face (center) of the pixel
+ * @param x0 lower x coord
+ * @param x1 higher x coord (must be x0 + 1)
+ * @param y0 lower y coord
+ * @param y1 higher y coord (must be x0 + 1)
+ * @param max_coord texture cube (level) size - 1
+ * @param next_facesnew face values when falling off
+ * @param next_xcoords  new x coord values when falling off
+ * @param next_ycoords  new y coord values when falling off
+ *
+ * The arrays hold the new values when under/overflow of
+ * lower x, higher x, lower y, higher y coord would occur (in this order).
+ * next_xcoords/next_ycoords have two entries each (for both new lower and
+ * higher coord).
+ */
+void
+lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
+LLVMValueRef face,
+LLVMValueRef x0,
+LLVMValueRef x1,
+LLVMValueRef y0,
+LLVMValueRef y1,
+LLVMValueRef max_coord,
+LLVMValueRef next_faces[4],
+LLVMValueRef next_xcoords[4][2],
+LLVMValueRef next_ycoords[4][2])
+{
+   /*
+* Lookup tables aren't nice for simd code hence try some logic here.
+* (Note that while it would not be necessary to do per-sample (4) lookups
+* when using a LUT as it's impossible that texels fall off of positive
+* and negative edges simultaneously, it would however be necessary to
+* do 2 lookups for corner handling as in this case texels both fall off
+* of x and y axes.)
+*/
+   /*
+* Next faces (for face 012345):
+* x < 0.0  : 451110
+* x >= 1.0 : 540001
+* y < 0.0  : 225422
+* y >= 1.0 : 334533
+* Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
+* nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + (face & 1))
+* nfy+: (face & ~4) > 1 ? face + 2 : 3;
+* This could also use pshufb instead, but would need (manually coded)
+* ssse3 intrinsic (llvm won't do non-constant shuffles).
+*/
+   struct gallivm_state *gallivm = ivec_bld->gallivm;
+   LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
+   LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
+   LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
+   LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
+   LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
+   LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
+
+   sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
+   tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
+   sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
+   faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
+   tmp = lp_build_add(ivec_bld, faceand1, c4);
+   next_faces[0] = lp_build_select(ivec_bld, sel_f2345, 

[Mesa-dev] [PATCH 2/2] llvmpipe: enable seamless cube filtering

2013-10-18 Thread sroland
From: Roland Scheidegger 

---
 src/gallium/drivers/llvmpipe/lp_screen.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index 723e40e..4c81022 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -175,7 +175,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
   return 1;
case PIPE_CAP_SEAMLESS_CUBE_MAP:
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
-  return 0;
+  return 1;
case PIPE_CAP_SCALED_RESOLVE:
   return 0;
/* this is a lie could support arbitrary large offsets */
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: implement fully accurate corner filtering for seamless cube maps

2013-10-21 Thread sroland
From: Roland Scheidegger 

d3d10 requires that cube corners are filtered with accurate weights (that
is, the weight of the non-existing corner texel should be evenly distributed
to the other 3 texels). OpenGL does not require this (but recommends it).
This requires us to use different filtering code, since we need per-texel
weights which our 2d lerp doesn't (and can't) do. And of course the (now
per element) weights need to be adjusted too for it to work.
Invoke the new filtering code whenever there's an edge to keep things simpler,
as it will work for edges too not just corners but of course it's only needed
with corners.
More ugly code for not much gain but at least a hacked up cubemap demo
shows very nice corners now... Not sure yet if and how this should be
configurable...
---
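Worked example of the corner weight fixup d3d10 wants: with bilinear
weights w00 = (1-s)*(1-t), w01 = s*(1-t), w10 = (1-s)*t, w11 = s*t and the
11 texel being the non-existing corner one, each remaining texel gets a
third of its weight, so the filtered result becomes

   w00*c00 + w01*c01 + w10*c10 + (w11/3)*(c00 + c01 + c10)

instead of the usual two-level lerp.
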
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  138 +++--
 1 file changed, 130 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 8e2d0d9..5d3511d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -840,7 +840,11 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
  const LLVMValueRef *offsets,
  LLVMValueRef colors_out[4])
 {
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   struct lp_build_context *ivec_bld = &bld->int_coord_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
const unsigned dims = bld->dims;
+   struct lp_build_if_state edge_if;
LLVMValueRef width_vec;
LLVMValueRef height_vec;
LLVMValueRef depth_vec;
@@ -848,6 +852,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context 
*bld,
LLVMValueRef flt_width_vec;
LLVMValueRef flt_height_vec;
LLVMValueRef flt_depth_vec;
+   LLVMValueRef fall_off[4], have_edge;
LLVMValueRef z1 = NULL;
LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
@@ -856,6 +861,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context 
*bld,
LLVMValueRef xs[4], ys[4], zs[4];
LLVMValueRef neighbors[2][2][4];
int chan, texel_index;
+   boolean silly_but_accurate_cube_corner_filtering = TRUE;
 
lp_build_extract_image_sizes(bld,
 &bld->int_size_bld,
@@ -918,12 +924,7 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
   }
}
else {
-  LLVMBuilderRef builder = bld->gallivm->builder;
-  struct lp_build_context *ivec_bld = &bld->int_coord_bld;
-  struct lp_build_context *coord_bld = &bld->coord_bld;
-  struct lp_build_if_state edge_if;
-  LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
-  LLVMValueRef fall_off[4], coord, have_edge;
+  LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2], coord;
   LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp;
   LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
   LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
@@ -1074,7 +1075,7 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
 
if (linear_mask) {
   /*
-   * Whack filter weights into place. Whatever pixel had more weight is
+   * Whack filter weights into place. Whatever texel had more weight is
* the one which should have been selected by nearest filtering hence
* just use 100% weight for it.
*/
@@ -1135,7 +1136,7 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
}
else {
   /* 2D/3D texture */
-  LLVMValueRef colors0[4];
+  LLVMValueRef colors0[4], colorss[4];
 
   /* get x0/x1 texels at y1 */
   lp_build_sample_texel_soa(bld,
@@ -1149,6 +1150,111 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
 row_stride_vec, img_stride_vec,
 data_ptr, mipoffsets, neighbors[1][1]);
 
+  /*
+   * To avoid having to duplicate linear_mask / fetch code use
+   * another branch (with same edge condition) here (note that
+   * since we're using another branch anyway we COULD restrict this
+   * rather easily to just corners).
+   */
+  if (silly_but_accurate_cube_corner_filtering &&
+  bld->static_texture_state->target == PIPE_TEXTURE_CUBE &&
+  bld->static_sampler_state->seamless_cube_map) {
+ LLVMValueRef w00, w01, w10, w11, wx0, wy0;
+ LLVMValueRef c_weight, c00, c01, c10, c11;
+ LLVMValueRef one_third, tmp;
+
+ colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+ colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+ colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+ colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type,

[Mesa-dev] [PATCH] gallivm: implement fully accurate corner filtering for seamless cube maps

2013-10-23 Thread sroland
From: Roland Scheidegger 

d3d10 requires that cube corners are filtered with accurate weights (that
is, the weight of the non-existing corner texel should be evenly distributed
to the other 3 texels). OpenGL does not require this (but recommends it).
This requires us to use different filtering code, since we need per-texel
weights which our 2d lerp doesn't (and can't) do. And of course the (now
per element) weights need to be adjusted too for it to work.
Invoke the new filtering code whenever there's an edge to keep things simpler,
as it will work for edges too not just corners but of course it's only needed
with corners.
More ugly code for not much gain but at least a hacked up cubemap demo
shows very nice corners now... Not sure yet if and how this should be
configurable...

v2: incorporate feedback from Jose; only use the special corner filtering
code when there's a corner, not when there's only an edge (as the corner
filtering code is slower, though a perf difference was only measurable when
always forcing the edge code). Plus some minor style fixes.
---
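The corner-vs-edge distinction v2 gates on: a corner means some sample
falls off the face in x and y simultaneously, roughly

   fall_off_x  = fall_off[0] | fall_off[1];
   fall_off_y  = fall_off[2] | fall_off[3];
   have_corner = any(fall_off_x & fall_off_y);

which is what the fall_off_x/fall_off_y values in the diff below are for.
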
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  164 +++--
 1 file changed, 151 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 8e2d0d9..2d83331 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -822,6 +822,12 @@ lp_build_masklerp2d(struct lp_build_context *bld,
return lp_build_lerp(bld, weight1, val0, val1, 0);
 }
 
+/*
+ * this is a bit excessive code for something OpenGL just recommends
+ * but does not require.
+ */
+#define ACCURATE_CUBE_CORNERS 1
+
 /**
  * Generate code to sample a mipmap level with linear filtering.
  * If sampling a cube texture, r = cube face in [0,5].
@@ -840,6 +846,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context 
*bld,
  const LLVMValueRef *offsets,
  LLVMValueRef colors_out[4])
 {
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   struct lp_build_context *ivec_bld = &bld->int_coord_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
const unsigned dims = bld->dims;
LLVMValueRef width_vec;
LLVMValueRef height_vec;
@@ -848,6 +857,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context 
*bld,
LLVMValueRef flt_width_vec;
LLVMValueRef flt_height_vec;
LLVMValueRef flt_depth_vec;
+   LLVMValueRef fall_off[4], have_corners;
LLVMValueRef z1 = NULL;
LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
@@ -856,6 +866,11 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
LLVMValueRef xs[4], ys[4], zs[4];
LLVMValueRef neighbors[2][2][4];
int chan, texel_index;
+   boolean seamless_cube_filter, accurate_cube_corners;
+
+   seamless_cube_filter = bld->static_texture_state->target == 
PIPE_TEXTURE_CUBE &&
+  bld->static_sampler_state->seamless_cube_map;
+   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
 
lp_build_extract_image_sizes(bld,
 &bld->int_size_bld,
@@ -875,8 +890,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context 
*bld,
 * Compute integer texcoords.
 */
 
-   if (bld->static_texture_state->target != PIPE_TEXTURE_CUBE ||
-   !bld->static_sampler_state->seamless_cube_map) {
+   if (!seamless_cube_filter) {
   lp_build_sample_wrap_linear(bld, coords[0], width_vec,
   flt_width_vec, offsets[0],
   bld->static_texture_state->pot_width,
@@ -918,13 +932,11 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
   }
}
else {
-  LLVMBuilderRef builder = bld->gallivm->builder;
-  struct lp_build_context *ivec_bld = &bld->int_coord_bld;
-  struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_if_state edge_if;
+  LLVMTypeRef int1t;
   LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
-  LLVMValueRef fall_off[4], coord, have_edge;
-  LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp;
+  LLVMValueRef coord, have_edge, have_corner;
+  LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, 
fall_off_y;
   LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
   LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
   LLVMValueRef face = coords[2];
@@ -957,12 +969,15 @@ lp_build_sample_image_linear(struct 
lp_build_sample_context *bld,
   fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
   fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, 
length_minus_one);
 
-  have_edge = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
-  have_edge = lp_build_or(ivec_bld, have_edge, fall_off[2]);
- 

[Mesa-dev] [PATCH] gallium: kill off PIPE_FORMAT_Z32_UNORM with extreme prejudice

2013-10-24 Thread sroland
From: Roland Scheidegger 

This format, while still supported in OpenGL (but optional) and glx, is just
causing a major nuisance everywhere and needs special code in some places,
because things like 1 << depth_bits don't work.
It is also the reason why we chose (just like in GL) depth clear values as
doubles and not floats.
The format however is just a disaster: no hw (as far as I know) supports it
(because hw has all-float pipelines, and floats don't have enough mantissa
bits), and while we CLAIM to support it in software rasterizers, fact is it
cannot (and never will) really work right unless we stopped using floats for
depth calculations. E.g. depth offsets can't work right (a small depth bias
will disappear with primitive depths > 1/256 due to the float calcs not
having enough mantissa bits), and it is generally useless (since the
calculations are all float, use a float buffer if you need something better
than z24 unorm).
---
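To put numbers on the mantissa argument: a single precision float has 24
mantissa bits, so its ulp at depth z is roughly z * 2^-24, while Z32_UNORM
steps are 2^-32. Once z > 2^-8 (that's the 1/256 above) adjacent z32 values
are no longer representable in a float pipeline, and a bias of a few z32
steps simply rounds away. The 1 << depth_bits problem is similar in spirit:
with depth_bits == 32 that shift is undefined for 32-bit ints.
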
 src/gallium/auxiliary/util/u_format.csv|1 -
 src/gallium/auxiliary/util/u_format.h  |4 +-
 src/gallium/auxiliary/util/u_format_tests.c|3 -
 src/gallium/auxiliary/util/u_format_zs.c   |   69 
 src/gallium/auxiliary/util/u_pack_color.h  |6 --
 src/gallium/auxiliary/util/u_tile.c|   56 
 src/gallium/drivers/ilo/ilo_format.c   |2 +-
 src/gallium/drivers/softpipe/sp_quad_depth_test.c  |   17 -
 src/gallium/drivers/svga/svga_format.c |3 -
 src/gallium/drivers/svga/svga_pipe_misc.c  |4 --
 src/gallium/include/pipe/p_format.h|2 +-
 src/gallium/state_trackers/dri/common/dri_screen.c |   18 ++---
 src/gallium/state_trackers/egl/common/egl_g3d.c|1 -
 src/gallium/state_trackers/glx/xlib/xm_api.c   |3 -
 src/gallium/state_trackers/wgl/stw_pixelformat.c   |1 -
 src/gallium/state_trackers/xa/xa_tracker.c |3 -
 src/gallium/state_trackers/xorg/xorg_dri2.c|3 -
 src/mesa/state_tracker/st_cb_fbo.c |3 -
 src/mesa/state_tracker/st_format.c |6 +-
 19 files changed, 12 insertions(+), 193 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format.csv 
b/src/gallium/auxiliary/util/u_format.csv
index 8d04b00..662cc59 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -126,7 +126,6 @@ PIPE_FORMAT_R5SG5SB6U_NORM, plain, 1, 1, sn5 , sn5 
, un6 , , xyz1, r
 # Depth-stencil formats
 PIPE_FORMAT_S8_UINT , plain, 1, 1, up8 , , , , 
_x__, zs
 PIPE_FORMAT_Z16_UNORM   , plain, 1, 1, un16, , , , 
x___, zs
-PIPE_FORMAT_Z32_UNORM   , plain, 1, 1, un32, , , , 
x___, zs
 PIPE_FORMAT_Z32_FLOAT   , plain, 1, 1, f32 , , , , 
x___, zs
 PIPE_FORMAT_Z24_UNORM_S8_UINT   , plain, 1, 1, un24, up8 , , , 
xy__, zs
 PIPE_FORMAT_S8_UINT_Z24_UNORM   , plain, 1, 1, up8 , un24, , , 
yx__, zs
diff --git a/src/gallium/auxiliary/util/u_format.h 
b/src/gallium/auxiliary/util/u_format.h
index 84f16d5..2101293 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -289,7 +289,7 @@ struct util_format_description
unsigned i, unsigned j);
 
/**
-* Unpack pixels to Z32_UNORM.
+* Unpack pixels to 32bit unorm.
 * Note: strides are in bytes.
 *
 * Only defined for depth formats.
@@ -300,7 +300,7 @@ struct util_format_description
unsigned width, unsigned height);
 
/**
-* Pack pixels from Z32_FLOAT.
+* Pack pixels from 32bit unorm.
 * Note: strides are in bytes.
 *
 * Only defined for depth formats.
diff --git a/src/gallium/auxiliary/util/u_format_tests.c 
b/src/gallium/auxiliary/util/u_format_tests.c
index 64224cd..0e07dd1 100644
--- a/src/gallium/auxiliary/util/u_format_tests.c
+++ b/src/gallium/auxiliary/util/u_format_tests.c
@@ -347,9 +347,6 @@ util_format_test_cases[] =
   {PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0x), PACKED_1x16(0x), 
UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
{PIPE_FORMAT_Z16_UNORM, PACKED_1x16(0x), PACKED_1x16(0x), 
UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
 
-   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0x), PACKED_1x32(0x), 
UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
-   {PIPE_FORMAT_Z32_UNORM, PACKED_1x32(0x), PACKED_1x32(0x), 
UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
-
{PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0x), PACKED_1x32(0x), 
UNPACKED_1x1(0.0, 0.0, 0.0, 0.0)},
{PIPE_FORMAT_Z32_FLOAT, PACKED_1x32(0x), PACKED_1x32(0x3f80), 
UNPACKED_1x1(1.0, 0.0, 0.0, 0.0)},
 
diff --git a/src/gallium/auxiliary/util/u_format_zs.c 
b/src/gallium/auxiliary/util/u_format_zs.c
index ed45c52..f1a7b4c 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -224,75 +224,6

[Mesa-dev] [PATCH] llvmpipe: fix bogus layer clamping in setup

2013-10-25 Thread sroland
From: Roland Scheidegger 

The layer coming from the GS needs to be clamped (not sure if that's actually
the correct error behavior but we need something) as the number can be higher
than the number of layers in the fb. However, this code was using the layer
calculation from the scene, and this was actually calculated in
lp_scene_begin_rasterization(), hence too late (so setup was using the value
from the _previous_ scene, or just zero if it was the first scene).
Since the value is used in both rasterization and setup, move the calculation
up to lp_scene_begin_binning(), though it's a bit more inconvenient to
calculate there. (Theoretically we could move _all_ code which was in
lp_scene_begin_rasterization() to there, because ever since we got rid of
swizzled render/depth buffers our "map" functions preparing the fb data for
render don't actually change the data in there at all, but it feels like
it would be a hack.)
---
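Ordering recap for why this was broken: binning happens at draw time and
rasterization only later when the scene is executed, but setup consumes
fb_max_layer already during binning, so computing it in
lp_scene_begin_rasterization() meant setup clamped against whatever the
_previous_ scene had set.
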
 src/gallium/drivers/llvmpipe/lp_scene.c |   25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c 
b/src/gallium/drivers/llvmpipe/lp_scene.c
index 2abbd25..483bfa5 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -151,7 +151,6 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
 {
const struct pipe_framebuffer_state *fb = &scene->fb;
int i;
-   unsigned max_layer = ~0;
 
//LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);
 
@@ -162,7 +161,6 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
cbuf->u.tex.level);
  scene->cbufs[i].layer_stride = llvmpipe_layer_stride(cbuf->texture,
   
cbuf->u.tex.level);
- max_layer = MIN2(max_layer, cbuf->u.tex.last_layer - 
cbuf->u.tex.first_layer);
 
  scene->cbufs[i].map = llvmpipe_resource_map(cbuf->texture,
  cbuf->u.tex.level,
@@ -173,7 +171,6 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
  struct llvmpipe_resource *lpr = llvmpipe_resource(cbuf->texture);
  unsigned pixstride = util_format_get_blocksize(cbuf->format);
  scene->cbufs[i].stride = cbuf->texture->width0;
- max_layer = 0;
 
  scene->cbufs[i].map = lpr->data;
  scene->cbufs[i].map += cbuf->u.buf.first_element * pixstride;
@@ -184,15 +181,12 @@ lp_scene_begin_rasterization(struct lp_scene *scene)
   struct pipe_surface *zsbuf = scene->fb.zsbuf;
   scene->zsbuf.stride = llvmpipe_resource_stride(zsbuf->texture, 
zsbuf->u.tex.level);
   scene->zsbuf.layer_stride = llvmpipe_layer_stride(zsbuf->texture, 
zsbuf->u.tex.level);
-  max_layer = MIN2(max_layer, zsbuf->u.tex.last_layer - 
zsbuf->u.tex.first_layer);
 
   scene->zsbuf.map = llvmpipe_resource_map(zsbuf->texture,
zsbuf->u.tex.level,
zsbuf->u.tex.first_layer,
LP_TEX_USAGE_READ_WRITE);
}
-
-   scene->fb_max_layer = max_layer;
 }
 
 
@@ -506,6 +500,9 @@ end:
 void lp_scene_begin_binning( struct lp_scene *scene,
  struct pipe_framebuffer_state *fb, boolean 
discard )
 {
+   int i;
+   unsigned max_layer = ~0;
+
assert(lp_scene_is_empty(scene));
 
scene->discard = discard;
@@ -513,9 +510,23 @@ void lp_scene_begin_binning( struct lp_scene *scene,
 
scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
-
assert(scene->tiles_x <= TILES_X);
assert(scene->tiles_y <= TILES_Y);
+
+   for (i = 0; i < scene->fb.nr_cbufs; i++) {
+  struct pipe_surface *cbuf = scene->fb.cbufs[i];
+  if (llvmpipe_resource_is_texture(cbuf->texture)) {
+ max_layer = MIN2(max_layer, cbuf->u.tex.last_layer - 
cbuf->u.tex.first_layer);
+  }
+  else {
+ max_layer = 0;
+  }
+   }
+   if (fb->zsbuf) {
+  struct pipe_surface *zsbuf = scene->fb.zsbuf;
+  max_layer = MIN2(max_layer, zsbuf->u.tex.last_layer - 
zsbuf->u.tex.first_layer);
+   }
+   scene->fb_max_layer = max_layer;
 }
 
 
-- 
1.7.9.5
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: optimize lp_build_minify for sse

2013-11-05 Thread sroland
From: Roland Scheidegger 

SSE can't handle true vector shifts (with per-element variable shift count),
so llvm turns them into a mess of extracts, scalar shifts and inserts.
It is however possible to emulate them in lp_build_minify with float muls,
which should be way faster (saves over 20 instructions per 8-wide
lp_build_minify). This wouldn't work for "generic" 32bit shifts though,
since we've only got 24 bits of mantissa (actually for left shifts it would
work by using the sse41 int mul instead of a float mul, but not for right
shifts).
Note that this has very limited scope for now, since it is only used with
per-pixel lod (otherwise we're avoiding the non-constant shift count by
doing per-quad shifts manually), and only for 1d textures even then (though
the latter should change).
---
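Worked example of the float-mul shift: for level == 2 the code builds a
float with biased exponent 127 - 2 = 125, i.e. the bit pattern 125 << 23,
which is 0.25f. Texture sizes fit easily in the 24 mantissa bits, so
int_to_float, multiply by 0.25 and truncate gives exactly size >> 2.
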
 src/gallium/auxiliary/gallivm/lp_bld_sample.c |   62 +
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |3 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |2 +-
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index a032d9d..e60a035 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -36,6 +36,7 @@
 #include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "lp_bld_arit.h"
 #include "lp_bld_const.h"
 #include "lp_bld_debug.h"
@@ -248,7 +249,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
  bld->gallivm, texture_unit);
first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
-   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
+   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec, 
TRUE);
float_size = lp_build_int_to_float(float_size_bld, int_size);
 
if (cube_rho) {
@@ -1089,12 +1090,14 @@ lp_build_get_mip_offsets(struct lp_build_sample_context 
*bld,
 
 /**
  * Codegen equivalent for u_minify().
+ * @param lod_scalar  if lod is a (broadcasted) scalar
  * Return max(1, base_size >> level);
  */
 LLVMValueRef
 lp_build_minify(struct lp_build_context *bld,
 LLVMValueRef base_size,
-LLVMValueRef level)
+LLVMValueRef level,
+boolean lod_scalar)
 {
LLVMBuilderRef builder = bld->gallivm->builder;
assert(lp_check_value(bld->type, base_size));
@@ -1105,10 +1108,49 @@ lp_build_minify(struct lp_build_context *bld,
   return base_size;
}
else {
-  LLVMValueRef size =
- LLVMBuildLShr(builder, base_size, level, "minify");
+  LLVMValueRef size;
   assert(bld->type.sign);
-  size = lp_build_max(bld, size, bld->one);
+  if (lod_scalar ||
+ (util_cpu_caps.has_avx2 || !util_cpu_caps.has_sse)) {
+ size = LLVMBuildLShr(builder, base_size, level, "minify");
+ size = lp_build_max(bld, size, bld->one);
+  }
+  else {
+ /*
+  * emulate shift with float mul, since intel "forgot" shifts with
+  * per-element shift count until avx2, which results in terrible
+  * scalar extraction (both count and value), scalar shift,
+  * vector reinsertion. Should not be an issue on any non-x86 cpu
+  * with a vector instruction set.
+  * On cpus with AMD's XOP this should also be unnecessary but I'm
+  * not sure if llvm would emit this with current flags.
+  */
+ LLVMValueRef const127, const23, lf;
+ struct lp_type ftype;
+ struct lp_build_context fbld;
+ ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
+ lp_build_context_init(&fbld, bld->gallivm, ftype);
+ const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
+ const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);
+
+ /* calculate 2^(-level) float */
+ lf = lp_build_sub(bld, const127, level);
+ lf = lp_build_shl(bld, lf, const23);
+ lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
+
+ /* finish shift operation by doing float mul */
+ base_size = lp_build_int_to_float(&fbld, base_size);
+ size = lp_build_mul(&fbld, base_size, lf);
+ /*
+  * do the max also with floats because
+  * a) non-emulated int max requires sse41
+  *(this is actually a lie as we could cast to 16bit values
+  *as 16bit is sufficient and 16bit int max is sse2)
+  * b) with avx we can do int max 4-wide but float max 8-wide
+  */
+ size = lp_build_max(&fbld, size, fbld.one);
+ size = lp_build_itrunc(&fbld, size);
+  }
   return size;
}
 }
@@ -1185,7 +1227,7 @@ lp_build_mipmap_level_sizes(struct 
lp_build_sample_context *bld,
 */
   

[Mesa-dev] [PATCH] gallivm: fix indirect addressing of inputs

2013-11-06 Thread sroland
From: Roland Scheidegger 

We weren't adding the soa offsets when constructing the indices
for the gather functions. That meant that we were always returning
the data from the first element.
(Copied straight from the same fix for temps.)
While here, fix up a couple of broken comments in the fetch functions,
plus don't name a straight float type "float4", which is just confusing.
---
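Concretely, the gather index for indirect register i, channel c and pixel p
is (i * 4 + c) * length + p; the missing "+ p" term is what made every lane
fetch pixel 0's value.
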
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |   45 ++-
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index e50f1d1..75f6def 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -993,11 +993,11 @@ emit_fetch_immediate(
   LLVMValueRef length_vec =
  lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type,
 bld->bld_base.base.type.length);
-  LLVMValueRef index_vec;  /* index into the const buffer */
+  LLVMValueRef index_vec;  /* index into the immediate register array */
   LLVMValueRef imms_array;
   LLVMValueRef pixel_offsets;
   LLVMValueRef offsets[LP_MAX_VECTOR_LENGTH];
-  LLVMTypeRef float4_ptr_type;
+  LLVMTypeRef float_ptr_type;
   int i;
 
   /* build pixel offset vector: {0, 1, 2, 3, ...} */
@@ -1013,12 +1013,12 @@ emit_fetch_immediate(
   index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
 
   /* cast imms_array pointer to float* */
-  float4_ptr_type = LLVMPointerType(
+  float_ptr_type = LLVMPointerType(
  LLVMFloatTypeInContext(bld->bld_base.base.gallivm->context), 0);
   imms_array = LLVMBuildBitCast(builder, bld->imms_array,
-float4_ptr_type, "");
+float_ptr_type, "");
 
-  /* Gather values from the temporary register array */
+  /* Gather values from the immediate register array */
   res = build_gather(&bld_base->base, imms_array, index_vec);
}
else {
@@ -1044,6 +1044,7 @@ emit_fetch_input(
struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   struct lp_build_context *float_bld = &bld_base->base;
LLVMValueRef indirect_index = NULL;
LLVMValueRef res;
 
@@ -1059,21 +1060,31 @@ emit_fetch_input(
  lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
   LLVMValueRef length_vec =
  lp_build_const_int_vec(gallivm, uint_bld->type, 
bld->bld_base.base.type.length);
-  LLVMValueRef index_vec;  /* index into the const buffer */
+  LLVMValueRef index_vec;  /* index into the input reg array */
   LLVMValueRef inputs_array;
-  LLVMTypeRef float4_ptr_type;
+  LLVMValueRef pixel_offsets;
+  LLVMValueRef offsets[LP_MAX_VECTOR_LENGTH];
+  LLVMTypeRef float_ptr_type;
+  int i;
+
+  /* build pixel offset vector: {0, 1, 2, 3, ...} */
+  for (i = 0; i < float_bld->type.length; i++) {
+ offsets[i] = lp_build_const_int32(gallivm, i);
+  }
+  pixel_offsets = LLVMConstVector(offsets, float_bld->type.length);
 
   /* index_vec = (indirect_index * 4 + swizzle) * length */
   index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
   index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
   index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+  index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
 
   /* cast inputs_array pointer to float* */
-  float4_ptr_type = 
LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
+  float_ptr_type = 
LLVMPointerType(LLVMFloatTypeInContext(gallivm->context), 0);
   inputs_array = LLVMBuildBitCast(builder, bld->inputs_array,
- float4_ptr_type, "");
+  float_ptr_type, "");
 
-  /* Gather values from the temporary register array */
+  /* Gather values from the input register array */
   res = build_gather(&bld_base->base, inputs_array, index_vec);
} else {
   if (bld->indirect_files & (1 << TGSI_FILE_INPUT)) {
@@ -1117,9 +1128,9 @@ emit_fetch_gs_input(
 
if (reg->Register.Indirect) {
   attrib_index = get_indirect_index(bld,
-  reg->Register.File,
-  reg->Register.Index,
-  &reg->Indirect);
+reg->Register.File,
+reg->Register.Index,
+&reg->Indirect);
} else {
   attrib_index = lp_build_const_int32(gallivm, reg->Register.Index);
}
@@ -1179,11 +1190,11 @@ emit_fetch_temporary(
   LLVMValueRef length_vec =
  lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_

[Mesa-dev] [PATCH] gallivm: deduplicate some indirect register address code

2013-11-06 Thread sroland
From: Roland Scheidegger 

There's only one minor functional change: for immediates the pixel offsets
are no longer added, since the values are all the same for all elements in
any case (it might be better if those weren't stored as soa vectors in the
first place).
---
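With the new helper, the input/temp/immediate fetch paths reduce to
roughly (sketch, not the literal code):

   index_vec = get_soa_array_offsets(uint_bld, indirect_index,
                                     swizzle, TRUE);
   res = build_gather(&bld_base->base, inputs_array, index_vec);

with need_perelement_offset passed as FALSE for immediates, since all
elements of an immediate hold the same value anyway.
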
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c |  253 +--
 1 file changed, 96 insertions(+), 157 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 75f6def..5f81066 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -898,6 +898,39 @@ stype_to_fetch(struct lp_build_tgsi_context * bld_base,
 }
 
 static LLVMValueRef
+get_soa_array_offsets(struct lp_build_context *uint_bld,
+  LLVMValueRef indirect_index,
+  unsigned chan_index,
+  boolean need_perelement_offset)
+{
+   struct gallivm_state *gallivm = uint_bld->gallivm;
+   LLVMValueRef chan_vec =
+  lp_build_const_int_vec(uint_bld->gallivm, uint_bld->type, chan_index);
+   LLVMValueRef length_vec =
+  lp_build_const_int_vec(gallivm, uint_bld->type, uint_bld->type.length);
+   LLVMValueRef index_vec;
+
+   /* index_vec = (indirect_index * 4 + chan_index) * length + offsets */
+   index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+   index_vec = lp_build_add(uint_bld, index_vec, chan_vec);
+   index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+
+   if (need_perelement_offset) {
+  LLVMValueRef pixel_offsets;
+  int i;
+ /* build pixel offset vector: {0, 1, 2, 3, ...} */
+  pixel_offsets = uint_bld->undef;
+  for (i = 0; i < uint_bld->type.length; i++) {
+ LLVMValueRef ii = lp_build_const_int32(gallivm, i);
+ pixel_offsets = LLVMBuildInsertElement(gallivm->builder, 
pixel_offsets,
+ii, ii, "");
+  }
+  index_vec = lp_build_add(uint_bld, index_vec, pixel_offsets);
+   }
+   return index_vec;
+}
+
+static LLVMValueRef
 emit_fetch_constant(
struct lp_build_tgsi_context * bld_base,
const struct tgsi_full_src_register * reg,
@@ -908,7 +941,6 @@ emit_fetch_constant(
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   LLVMValueRef indirect_index = NULL;
unsigned dimension = 0;
LLVMValueRef dimension_index;
LLVMValueRef consts_ptr;
@@ -927,16 +959,15 @@ emit_fetch_constant(
consts_ptr = lp_build_array_get(gallivm, bld->consts_ptr, dimension_index);
 
if (reg->Register.Indirect) {
+  LLVMValueRef indirect_index;
+  LLVMValueRef swizzle_vec =
+ lp_build_const_int_vec(gallivm, uint_bld->type, swizzle);
+  LLVMValueRef index_vec;  /* index into the const buffer */
+
   indirect_index = get_indirect_index(bld,
   reg->Register.File,
   reg->Register.Index,
    &reg->Indirect);
-   }
-
-   if (reg->Register.Indirect) {
-  LLVMValueRef swizzle_vec =
- lp_build_const_int_vec(bld->bld_base.base.gallivm, uint_bld->type, 
swizzle);
-  LLVMValueRef index_vec;  /* index into the const buffer */
 
   /* index_vec = indirect_index * 4 + swizzle */
   index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
@@ -949,7 +980,7 @@ emit_fetch_constant(
   LLVMValueRef index;  /* index into the const buffer */
   LLVMValueRef scalar, scalar_ptr;
 
-  index = lp_build_const_int32(gallivm, reg->Register.Index*4 + swizzle);
+  index = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
 
   scalar_ptr = LLVMBuildGEP(builder, consts_ptr,
 &index, 1, "");
@@ -974,49 +1005,32 @@ emit_fetch_immediate(
struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
-   struct lp_build_context *uint_bld = &bld_base->uint_bld;
-   struct lp_build_context *float_bld = &bld_base->base;
LLVMValueRef res = NULL;
-   LLVMValueRef indirect_index = NULL;
 
if (reg->Register.Indirect) {
+  LLVMValueRef indirect_index;
+  LLVMValueRef index_vec;  /* index into the immediate register array */
+  LLVMValueRef imms_array;
+  LLVMTypeRef fptr_type;
+
   indirect_index = get_indirect_index(bld,
   reg->Register.File,
   reg->Register.Index,
    &reg->Indirect);
-   }
-
-   if (reg->Register.Indirect) {
-  LLVMValueRef swizzle_vec =
- lp_build_const_int_vec(bld->bld_base.base.gallivm,
-uint_bld->type, swizzle);
-  LLVMValueRef 

[Mesa-dev] [PATCH] gallivm, llvmpipe: fix float->srgb conversion to handle NaNs

2013-11-11 Thread sroland
From: Roland Scheidegger 

d3d10 requires us to convert NaNs to zero for any float->int conversion.
We don't really do that, but it mostly seems to work. In particular I suspect
the very common float->unorm8 path only really passes because it relies on
sse2 pack intrinsics which just happen to work by luck for NaNs (float->int
conversion in hw gives an integer indeterminate value, which just happens to
be -0x80000000, hence gets converted to zero in the end after the pack
intrinsics).
However, float->srgb didn't get so lucky, because we need to clamp before
blending, and clamping resulted in NaN behavior being undefined (and NaNs
actually got converted to 1.0 by clamping with sse2). Fix this by using a
zero/one clamp with defined nan behavior, as we can handle the NaN for free
this way.
I suspect there are more bugs lurking in this area (e.g. converting floats to
snorm) as we don't really use defined NaN behavior everywhere, but this seems
to be good enough.
While here, respecify the nan behavior modes a bit; in particular the
return_second mode didn't really do what we wanted. From the caller's
perspective, we really wanted to say we need the non-nan result, but we
already know the second arg isn't a NaN. So we use this now instead, which
means that cpu architectures which actually implement min/max by always
returning the non-nan operand (that is, adhering to ieee754-2008 rules)
don't need to bend over backwards for nothing.
---
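Why the clamp handles NaN for free: sse minps/maxps return the second
source operand whenever an input is NaN, so with the incoming value as the
first argument, max(x, 0.0) followed by min(..., 1.0) turns x == NaN into
0.0, which is exactly the conversion result d3d10 wants.
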
 src/gallium/auxiliary/gallivm/lp_bld_arit.c|   44 +---
 src/gallium/auxiliary/gallivm/lp_bld_arit.h|   12 --
 src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c |2 +-
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c|   11 ++---
 src/gallium/drivers/llvmpipe/lp_state_fs.c |4 +-
 5 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c 
b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 00052ed..70929e7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -123,8 +123,10 @@ lp_build_min_simple(struct lp_build_context *bld,
   }
}
else if (type.floating && util_cpu_caps.has_altivec) {
-  debug_printf("%s: altivec doesn't support nan behavior modes\n",
-   __FUNCTION__);
+  if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
+ debug_printf("%s: altivec doesn't support nan return nan behavior\n",
+  __FUNCTION__);
+  }
   if (type.width == 32 && type.length == 4) {
  intrinsic = "llvm.ppc.altivec.vminfp";
  intr_size = 128;
@@ -159,8 +161,6 @@ lp_build_min_simple(struct lp_build_context *bld,
   }
} else if (util_cpu_caps.has_altivec) {
   intr_size = 128;
-  debug_printf("%s: altivec doesn't support nan behavior modes\n",
-   __FUNCTION__);
   if (type.width == 8) {
  if (!type.sign) {
 intrinsic = "llvm.ppc.altivec.vminub";
@@ -191,7 +191,7 @@ lp_build_min_simple(struct lp_build_context *bld,
*/
   if (util_cpu_caps.has_sse && type.floating &&
   nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
-  nan_behavior != GALLIVM_NAN_RETURN_SECOND) {
+  nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN) {
  LLVMValueRef isnan, max;
  max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
type,
@@ -227,7 +227,7 @@ lp_build_min_simple(struct lp_build_context *bld,
  return lp_build_select(bld, cond, a, b);
   }
  break;
-  case GALLIVM_NAN_RETURN_SECOND:
+  case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
  cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
  return lp_build_select(bld, cond, a, b);
   case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
@@ -299,8 +299,10 @@ lp_build_max_simple(struct lp_build_context *bld,
   }
}
else if (type.floating && util_cpu_caps.has_altivec) {
-  debug_printf("%s: altivec doesn't support nan behavior modes\n",
-   __FUNCTION__);
+  if (nan_behavior == GALLIVM_NAN_RETURN_NAN) {
+ debug_printf("%s: altivec doesn't support nan return nan behavior\n",
+  __FUNCTION__);
+  }
   if (type.width == 32 || type.length == 4) {
  intrinsic = "llvm.ppc.altivec.vmaxfp";
  intr_size = 128;
@@ -336,8 +338,6 @@ lp_build_max_simple(struct lp_build_context *bld,
   }
} else if (util_cpu_caps.has_altivec) {
  intr_size = 128;
- debug_printf("%s: altivec doesn't support nan behavior modes\n",
-  __FUNCTION__);
  if (type.width == 8) {
if (!type.sign) {
  intrinsic = "llvm.ppc.altivec.vmaxub";
@@ -362,7 +362,7 @@ lp_build_max_simple(struct lp_build_context *bld,
if(intrinsic) {
   if (util_cpu_caps.has_sse && type.floating &&
   nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
-  nan_behavior != GALLI

[Mesa-dev] [PATCH] llvmpipe: clean up state setup code a bit

2013-11-12 Thread sroland
From: Roland Scheidegger 

In particular get rid of home-grown vector helpers which didn't add much.
And while here fix formatting a bit. No functional change.
---
 src/gallium/drivers/llvmpipe/lp_state_setup.c |  183 +
 1 file changed, 66 insertions(+), 117 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c 
b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index ccc9fb9..fffc0ed 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -76,59 +76,10 @@ struct lp_setup_args
LLVMValueRef dy01_ooa;
LLVMValueRef dx20_ooa;
LLVMValueRef dx01_ooa;
+   struct lp_build_context bld;
 };
 
 
-
-static LLVMTypeRef
-type4f(struct gallivm_state *gallivm)
-{
-   return LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4);
-}
-
-
-/* Equivalent of _mm_setr_ps(a,b,c,d)
- */
-static LLVMValueRef
-vec4f(struct gallivm_state *gallivm,
-  LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d,
-  const char *name)
-{
-   LLVMBuilderRef bld = gallivm->builder;
-   LLVMValueRef i0 = lp_build_const_int32(gallivm, 0);
-   LLVMValueRef i1 = lp_build_const_int32(gallivm, 1);
-   LLVMValueRef i2 = lp_build_const_int32(gallivm, 2);
-   LLVMValueRef i3 = lp_build_const_int32(gallivm, 3);
-
-   LLVMValueRef res = LLVMGetUndef(type4f(gallivm));
-
-   res = LLVMBuildInsertElement(bld, res, a, i0, "");
-   res = LLVMBuildInsertElement(bld, res, b, i1, "");
-   res = LLVMBuildInsertElement(bld, res, c, i2, "");
-   res = LLVMBuildInsertElement(bld, res, d, i3, name);
-
-   return res;
-}
-
-/* Equivalent of _mm_set1_ps(a)
- */
-static LLVMValueRef
-vec4f_from_scalar(struct gallivm_state *gallivm,
-  LLVMValueRef a,
-  const char *name)
-{
-   LLVMBuilderRef bld = gallivm->builder;
-   LLVMValueRef res = LLVMGetUndef(type4f(gallivm));
-   int i;
-
-   for(i = 0; i < 4; ++i) {
-  LLVMValueRef index = lp_build_const_int32(gallivm, i);
-  res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : "");
-   }
-
-   return res;
-}
-
 static void
 store_coef(struct gallivm_state *gallivm,
   struct lp_setup_args *args,
@@ -157,13 +108,11 @@ store_coef(struct gallivm_state *gallivm,
 
 static void 
 emit_constant_coef4(struct gallivm_state *gallivm,
-struct lp_setup_args *args,
-unsigned slot,
-LLVMValueRef vert)
+struct lp_setup_args *args,
+unsigned slot,
+LLVMValueRef vert)
 {
-   LLVMValueRef zero  = lp_build_const_float(gallivm, 0.0);
-   LLVMValueRef zerovec   = vec4f_from_scalar(gallivm, zero, "zero");
-   store_coef(gallivm, args, slot, vert, zerovec, zerovec);
+   store_coef(gallivm, args, slot, vert, args->bld.zero, args->bld.zero);
 }
 
 
@@ -174,14 +123,15 @@ emit_constant_coef4(struct gallivm_state *gallivm,
  */
 static void 
 emit_facing_coef(struct gallivm_state *gallivm,
- struct lp_setup_args *args,
- unsigned slot )
+ struct lp_setup_args *args,
+ unsigned slot )
 {
LLVMBuilderRef builder = gallivm->builder;
LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context);
LLVMValueRef a0_0 = args->facing;
LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, float_type, "");
-   LLVMValueRef zero = lp_build_const_float(gallivm, 0.0);
+   const unsigned char swizzles[4] = { PIPE_SWIZZLE_RED, PIPE_SWIZZLE_ZERO,
+   PIPE_SWIZZLE_ZERO, PIPE_SWIZZLE_ZERO };
/* Our face val is either 1 or 0 so we do
 * face = (val * 2) - 1
 * to make it 1 or -1
@@ -192,20 +142,19 @@ emit_facing_coef(struct gallivm_state *gallivm,
   lp_build_const_float(gallivm, 2.0),
   ""),
 lp_build_const_float(gallivm, -1.0),
-"");
-   LLVMValueRef a0 = vec4f(gallivm, face_val, zero, zero, zero, "facing");
-   LLVMValueRef zerovec = vec4f_from_scalar(gallivm, zero, "zero");
+"facing");
+   LLVMValueRef a0 = lp_build_swizzle_aos(&args->bld, face_val, swizzles);
 
-   store_coef(gallivm, args, slot, a0, zerovec, zerovec);
+   store_coef(gallivm, args, slot, a0, args->bld.zero, args->bld.zero);
 }
 
 
 static LLVMValueRef
 vert_attrib(struct gallivm_state *gallivm,
-   LLVMValueRef vert,
-   int attr,
-   int elem,
-   const char *name)
+LLVMValueRef vert,
+int attr,
+int elem,
+const char *name)
 {
LLVMBuilderRef b = gallivm->builder;
LLVMValueRef idx[2];
@@ -227,8 +176,9 @@ lp_twoside(struct gallivm_state *gallivm,
LLVMValueRef idx2 = lp_build_const_int32(gallivm, bcolor_slot);
 
LLVMValueRef facing = args->facing;
-   LLVMValueRef front_facing = LLVMBuildICmp(b, LLVMIntEQ, facing, 
lp_build_const_int32(gallivm, 0), 

[Mesa-dev] [PATCH] llvmpipe: calculate more accurate interpolation value at origin

2013-11-20 Thread sroland
From: Roland Scheidegger 

Some rounding errors could crop up when calculating a0. Use a more accurate
method (barycentric interpolation essentially) to fix this, though to fix
the REAL problem (which is that our interpolation will give very bad results
with small triangles far away from the origin when they have steep gradients)
this does absolutely nothing (actually makes it worse). (To fix the real
problem, either would need to use a vertex corner (or some other point inside
the tri) as starting point value instead of fb origin and pass that down to
interpolation, or mimic what hw does, use barycentric interpolation (using
the coordinates extracted from the rasterizer edge functions) - maybe another
time.)
Some (silly) tests though really want a high accuracy at fb origin and don't
care much about anything else (Just. Don't. Ask.).
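
(Illustration only, with hypothetical scalar names - the real code below is
vectorized: the two ways to compute a0.)

/* plane-equation variant: subtracting a potentially huge dadx*x0 term
 * loses precision when the tri is far away from the origin */
static float
a0_plane(float a_v0, float dadx, float dady, float x0, float y0)
{
   return a_v0 - (dadx * x0 + dady * y0);
}

/* barycentric variant (ACCURATE_A0): weight the three vertex values by
 * the opposing edge functions evaluated at the origin, assumed already
 * normalized by one-over-area */
static float
a0_barycentric(float a_v0, float a_v1, float a_v2,
               float e12o, float e20o, float e01o)
{
   return e12o * a_v0 + e20o * a_v1 + e01o * a_v2;
}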
---
 src/gallium/drivers/llvmpipe/lp_state_setup.c |   88 +++--
 1 file changed, 82 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c 
b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index 59ab467..ef000fb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -49,6 +49,15 @@
 #include "lp_state_fs.h"
 #include "lp_state_setup.h"
 
+/*
+ * Set if the start point for interpolation should be calculated with a
+ * more accurate method (barycentric interpolation).
+ * Unfortunately, actual interpolation results of small tris with steep
+ * gradients far away from the origin are still very busted, this does
+ * nothing to change that (in fact it may make it worse), but some tests
+ * (don't ask) really want accurate values at origin (and ONLY origin).
+ */
+#define ACCURATE_A0 1
 
 
 /* currently organized to interpolate full float[4] attributes even
@@ -77,6 +86,9 @@ struct lp_setup_args
LLVMValueRef dy01_ooa;
LLVMValueRef dx20_ooa;
LLVMValueRef dx01_ooa;
+   LLVMValueRef e01o;
+   LLVMValueRef e20o;
+   LLVMValueRef e12o;
struct lp_build_context bld;
 };
 
@@ -376,6 +388,19 @@ load_attribute(struct gallivm_state *gallivm,
}
 }
 
+/*
+ * FIXME: interpolation is always done wrt fb origin (0/0).
+ * However, if some (small) tri is far away from the origin and gradients
+ * are large, this can lead to HUGE errors, since the a0 value calculated
+ * here can get very large (with the actual values inside the triangle way
+ * smaller), leading to complete loss of accuracy. This could be prevented
+ * by using some point inside (or at corner) of the tri as interpolation
+ * origin, or just use barycentric interpolation (which GL suggests and is
+ * what real hw does - you can get the barycentric coordinates from the
+ * edge functions in rasterization in principle (though we skip these
+ * sometimes completely in case of tris covering a block fully,
+ * which obviously wouldn't work)).
+ */
 static void 
 emit_coef4( struct gallivm_state *gallivm,
 struct lp_setup_args *args,
@@ -385,6 +410,8 @@ emit_coef4( struct gallivm_state *gallivm,
 LLVMValueRef a2)
 {
LLVMBuilderRef b = gallivm->builder;
+   bool accurate_a0 = ACCURATE_A0;
+   LLVMValueRef attr_0;
LLVMValueRef dy20_ooa = args->dy20_ooa;
LLVMValueRef dy01_ooa = args->dy01_ooa;
LLVMValueRef dx20_ooa = args->dx20_ooa;
@@ -408,10 +435,19 @@ emit_coef4( struct gallivm_state *gallivm,
 
/* Calculate a0 - the attribute value at the origin
 */
-   LLVMValueRef dadx_x0   = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0");
-   LLVMValueRef dady_y0   = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); 
-   LLVMValueRef attr_v0   = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0");
-   LLVMValueRef attr_0= LLVMBuildFSub(b, a0, attr_v0, "attr_0");
+   if (!accurate_a0) {
+  LLVMValueRef dadx_x0= LLVMBuildFMul(b, dadx, x0_center, "dadx_x0");
+  LLVMValueRef dady_y0= LLVMBuildFMul(b, dady, y0_center, "dady_y0");
+  LLVMValueRef attr_v0= LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0");
+  attr_0  = LLVMBuildFSub(b, a0, attr_v0, "attr_0");
+   }
+   else {
+  LLVMValueRef ao2 = LLVMBuildFMul(b, args->e01o, a2, "");
+  LLVMValueRef ao1 = LLVMBuildFMul(b, args->e20o, a1, "");
+  LLVMValueRef ao0 = LLVMBuildFMul(b, args->e12o, a0, "");
+  attr_0 = LLVMBuildFAdd(b, ao0, ao1, "");
+  attr_0 = LLVMBuildFAdd(b, attr_0, ao2, "");
+   }
 
store_coef(gallivm, args, slot, attr_0, dadx, dady);
 }
@@ -623,10 +659,11 @@ init_args(struct gallivm_state *gallivm,
LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0);
LLVMValueRef pixel_center, xy0_center, dxy01, dxy20, dyx20;
LLVMValueRef e, f, ef, ooa;
-   LLVMValueRef shuffles[4];
+   LLVMValueRef shuffles[4], shuf10;
LLVMValueRef attr_pos[3];
struct lp_type typef4 = lp_type_float_vec(32, 128);
struct lp_build_context bld;
+   bool accurate_a0 = ACCURATE_A0;
 
lp_build_context_init(&bld, gallivm, typef4);
args->bld = bld;
@@

[Mesa-dev] [PATCH] llvmpipe: increase number of queries which can be binned simultaneously to 64

2014-06-12 Thread sroland
From: Roland Scheidegger 

Gallium (but not OpenGL) does allow nesting of queries, but there's no
limit specified (d3d10 has no limit either). Nevertheless, for practical
purposes we need some limit in llvmpipe, otherwise we'd need more complex
handling of queries as we need to keep track of all binned queries (this
only affects queries which gather data past setup). A limit of 16 is too
small though, while 64 would suffice.
---
 src/gallium/drivers/llvmpipe/lp_rast.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h 
b/src/gallium/drivers/llvmpipe/lp_rast.h
index 6bd917d..c209f47 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -66,7 +66,7 @@ struct cmd_bin;
 /* Rasterizer output size going to jit fs, width/height */
 #define LP_RASTER_BLOCK_SIZE 4
 
-#define LP_MAX_ACTIVE_BINNED_QUERIES 16
+#define LP_MAX_ACTIVE_BINNED_QUERIES 64
 
 #define IMUL64(a, b) (((int64_t)(a)) * ((int64_t)(b)))
 
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix SCALED -> NORM conversions

2014-06-17 Thread sroland
From: Roland Scheidegger 

Such conversions (which are most likely rather pointless in practice) were
resulting in shifts with negative shift counts and shifts with counts the same
as the bit width. This was always undefined in llvm, the code generated was
rather horrendous but happened to work.
So make sure such shifts are filtered out and replaced with something that
works (the generated code is still just as horrendous as before).

This fixes lp_test_format, https://bugs.freedesktop.org/show_bug.cgi?id=73846.
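
(A scalar sketch of the guard, with made-up names - shifting an N-bit value
by N or more bits is undefined both in C and in LLVM IR; the negative-count
case is handled in the patch by simply skipping the shift:)

#include <stdint.h>

/* A left shift that stays defined for any count: counts >= width would
 * shift all bits out anyway, so the well-defined result is 0. */
static uint32_t
shl_clamped(uint32_t x, unsigned count, unsigned width)
{
   return (count >= width) ? 0 : (x << count);
}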
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 24 +++-
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index d3bf621..fe8f08b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -794,7 +794,8 @@ lp_build_conv(struct gallivm_state *gallivm,
   unsigned dst_offset = lp_const_offset(dst_type);
 
   /* Compensate for different offsets */
-  if (dst_offset > src_offset && src_type.width > dst_type.width) {
+  /* sscaled -> unorm and similar would cause negative shift count, skip */
+  if (dst_offset > src_offset && src_type.width > dst_type.width && 
src_shift > 0) {
  for (i = 0; i < num_tmps; ++i) {
 LLVMValueRef shifted;
 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, 
src_shift - 1);
@@ -903,11 +904,24 @@ lp_build_conv(struct gallivm_state *gallivm,
 
if (src_shift < dst_shift) {
   LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
-  LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, 
dst_shift - src_shift);
 
-  for (i = 0; i < num_tmps; ++i) {
- pre_shift[i] = tmp[i];
- tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+  if (dst_shift - src_shift < dst_type.width) {
+ LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
+ dst_shift - 
src_shift);
+ for (i = 0; i < num_tmps; ++i) {
+pre_shift[i] = tmp[i];
+tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+ }
+  }
+  else {
+ /*
+  * This happens for things like sscaled -> unorm conversions.
+  * Negative shift counts cause undefined results, so hack around 
it.
+  */
+ for (i = 0; i < num_tmps; ++i) {
+pre_shift[i] = tmp[i];
+tmp[i] = lp_build_zero(gallivm, dst_type);
+ }
   }
 
   /* Compensate for different offsets */
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix SCALED -> NORM conversions

2014-06-17 Thread sroland
From: Roland Scheidegger 

Such conversions (which are most likely rather pointless in practice) were
resulting in shifts with negative shift counts and shifts with counts the same
as the bit width. This was always undefined in llvm, the code generated was
rather horrendous but happened to work.
So make sure such shifts are filtered out and replaced with something that
works (the generated code is still just as horrendous as before).

This fixes lp_test_format, https://bugs.freedesktop.org/show_bug.cgi?id=73846.

v2: prettify by using build context shift helpers.
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c | 39 +
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c 
b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index d3bf621..1424447 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -792,29 +792,23 @@ lp_build_conv(struct gallivm_state *gallivm,
   unsigned dst_shift = lp_const_shift(dst_type);
   unsigned src_offset = lp_const_offset(src_type);
   unsigned dst_offset = lp_const_offset(dst_type);
+  struct lp_build_context bld;
+  lp_build_context_init(&bld, gallivm, tmp_type);
 
   /* Compensate for different offsets */
-  if (dst_offset > src_offset && src_type.width > dst_type.width) {
+  /* sscaled -> unorm and similar would cause negative shift count, skip */
+  if (dst_offset > src_offset && src_type.width > dst_type.width && 
src_shift > 0) {
  for (i = 0; i < num_tmps; ++i) {
 LLVMValueRef shifted;
-LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, 
src_shift - 1);
-if(src_type.sign)
-   shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
-else
-   shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
 
+shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
  }
   }
 
   if(src_shift > dst_shift) {
- LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
- src_shift - dst_shift);
  for(i = 0; i < num_tmps; ++i)
-if(src_type.sign)
-   tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
-else
-   tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
+tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
   }
}
 
@@ -900,14 +894,27 @@ lp_build_conv(struct gallivm_state *gallivm,
unsigned dst_shift = lp_const_shift(dst_type);
unsigned src_offset = lp_const_offset(src_type);
unsigned dst_offset = lp_const_offset(dst_type);
+   struct lp_build_context bld;
+   lp_build_context_init(&bld, gallivm, tmp_type);
 
if (src_shift < dst_shift) {
   LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
-  LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, 
dst_shift - src_shift);
 
-  for (i = 0; i < num_tmps; ++i) {
- pre_shift[i] = tmp[i];
- tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+  if (dst_shift - src_shift < dst_type.width) {
+ for (i = 0; i < num_tmps; ++i) {
+pre_shift[i] = tmp[i];
+tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
+ }
+  }
+  else {
+ /*
+  * This happens for things like sscaled -> unorm conversions. 
Shift
+  * counts equal to bit width cause undefined results, so hack 
around it.
+  */
+ for (i = 0; i < num_tmps; ++i) {
+pre_shift[i] = tmp[i];
+tmp[i] = lp_build_zero(gallivm, dst_type);
+ }
   }
 
   /* Compensate for different offsets */
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: set mcpu when initializing llvm execution engine

2014-06-18 Thread sroland
From: Roland Scheidegger 

Previously llvm detected cpu features automatically when the execution engine
was created (based on host cpu). This is no longer the case, which meant llvm
was then not able to emit some of the intrinsics we used, as we didn't
specify any sse attributes (only on avx-supporting systems this was not a
problem, since we always set that attribute manually anyway, and at least
some llvm versions enabled it regardless). So, instead of trying to figure
out which MAttrs to set, just set MCPU.

This fixes https://bugs.freedesktop.org/show_bug.cgi?id=77493.
---
 src/gallium/auxiliary/gallivm/lp_bld_misc.cpp | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp 
b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 38fbe1f..6bea964 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -468,8 +468,8 @@ 
lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
   /*
* AVX feature is not automatically detected from CPUID by the X86 target
* yet, because the old (yet default) JIT engine is not capable of
-   * emitting the opcodes.  But as we're using MCJIT here, it is safe to
-   * add set this attribute.
+   * emitting the opcodes. On newer llvm versions it is and at least some
+   * versions (tested with 3.3) will emit avx opcodes without this anyway.
*/
   MAttrs.push_back("+avx");
   if (util_cpu_caps.has_f16c) {
@@ -478,12 +478,30 @@ 
lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
   builder.setMAttrs(MAttrs);
}
 
+#if HAVE_LLVM >= 0x0305
+   StringRef MCPU = llvm::sys::getHostCPUName();
+   /*
+* The cpu bits are no longer set automatically, so need to set mcpu 
manually.
+* Note that the MAttrs set above will be sort of ignored (since we should
+* not set any which would not be set by specifying the cpu anyway).
+* It ought to be safe though since getHostCPUName() should include bits
+* not only from the cpu but environment as well (for instance if it's safe
+* to use avx instructions which need OS support). According to
+* http://llvm.org/bugs/show_bug.cgi?id=19429 however if I understand this
+* right it may be necessary to specify older cpu (or disable mattrs) though
+* when not using MCJIT so no instructions are generated which the old JIT
+* can't handle. Not entirely sure if we really need to do anything yet.
+*/
+   builder.setMCPU(MCPU);
+#endif
+
ShaderMemoryManager *MM = new ShaderMemoryManager();
*OutCode = MM->getGeneratedCode();
 
builder.setJITMemoryManager(MM);
 
ExecutionEngine *JIT;
+
 #if HAVE_LLVM >= 0x0302
JIT = builder.create();
 #else
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] draw: (trivial) fix clamping of viewport index

2014-06-23 Thread sroland
From: Roland Scheidegger 

The old logic would let all negative values go through unclamped, with
potentially disastrous results (probably trying to fetch viewport values
from random memory locations). GL has undefined rendering for vp indices
outside valid range but that's a bit too undefined...
(The logic is now the same as in llvmpipe.)
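
(Demonstration of the difference, with PIPE_MAX_VIEWPORTS assumed to be 16
here:)

static unsigned clamp_old(int idx) { return (16 > idx || idx < 0) ? idx : 0; }
static unsigned clamp_new(int idx) { return (16 > idx && idx >= 0) ? idx : 0; }
/* clamp_old(-5) passes -5 through; the implicit int->unsigned conversion
 * then yields 0xFFFFFFFB. clamp_new(-5) returns 0. */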

CC: "10.1 10.2" 
---
 src/gallium/auxiliary/draw/draw_private.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_private.h 
b/src/gallium/auxiliary/draw/draw_private.h
index 783c3ef..d8dc2ab 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -493,7 +493,7 @@ draw_stats_clipper_primitives(struct draw_context *draw,
 static INLINE unsigned
 draw_clamp_viewport_idx(int idx)
 {
-   return ((PIPE_MAX_VIEWPORTS > idx || idx < 0) ? idx : 0);
+   return ((PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0);
 }
 
 /**
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] softpipe: use last_level from sampler view, not from the resource

2014-06-25 Thread sroland
From: Roland Scheidegger 

The last_level from the sampler view may be limited by the state tracker
to a value lower than what the base texture provides.

Fixes https://bugs.freedesktop.org/show_bug.cgi?id=80541.
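
(The essence of the rule being fixed, as a hypothetical scalar helper - the
real code lives in the mip filter functions below:)

/* The view may narrow the resource's mip range, so clamp the selected
 * level with the view's last_level, never the resource's. */
static int
select_mip_level(int view_first_level, int view_last_level, float lod)
{
   int level = view_first_level + (int)(lod + 0.5f);
   return level > view_last_level ? view_last_level : level;
}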
---
 src/gallium/drivers/softpipe/sp_tex_sample.c | 39 ++--
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c 
b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 6d97664..8565a01 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -1860,22 +1860,22 @@ mip_filter_linear(struct sp_sampler_view *sp_sview,
   enum tgsi_sampler_control control,
   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
-   const struct pipe_resource *texture = sp_sview->base.texture;
+   const struct pipe_sampler_view *psview = &sp_sview->base;
int j;
float lod[TGSI_QUAD_SIZE];
 
compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
 
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-  int level0 = sp_sview->base.u.tex.first_level + (int)lod[j];
+  int level0 = psview->u.tex.first_level + (int)lod[j];
 
   if (lod[j] < 0.0)
  mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-sp_sview->base.u.tex.first_level,
+psview->u.tex.first_level,
 sp_sview->faces[j], &rgba[0][j]);
 
-  else if (level0 >= (int) texture->last_level)
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], texture->last_level,
+  else if (level0 >= (int) psview->u.tex.last_level)
+ min_filter(sp_sview, sp_samp, s[j], t[j], p[j], 
psview->u.tex.last_level,
 sp_sview->faces[j], &rgba[0][j]);
 
   else {
@@ -1918,7 +1918,7 @@ mip_filter_nearest(struct sp_sampler_view *sp_sview,
enum tgsi_sampler_control control,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
-   const struct pipe_resource *texture = sp_sview->base.texture;
+   const struct pipe_sampler_view *psview = &sp_sview->base;
float lod[TGSI_QUAD_SIZE];
int j;
 
@@ -1927,11 +1927,11 @@ mip_filter_nearest(struct sp_sampler_view *sp_sview,
for (j = 0; j < TGSI_QUAD_SIZE; j++) {
   if (lod[j] < 0.0)
  mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-sp_sview->base.u.tex.first_level,
+psview->u.tex.first_level,
 sp_sview->faces[j], &rgba[0][j]);
   else {
- int level = sp_sview->base.u.tex.first_level + (int)(lod[j] + 0.5F);
- level = MIN2(level, (int)texture->last_level);
+ int level = psview->u.tex.first_level + (int)(lod[j] + 0.5F);
+ level = MIN2(level, (int)psview->u.tex.last_level);
  min_filter(sp_sview, sp_samp, s[j], t[j], p[j],
 level, sp_sview->faces[j], &rgba[0][j]);
   }
@@ -2230,12 +2230,13 @@ mip_filter_linear_aniso(struct sp_sampler_view 
*sp_sview,
 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
const struct pipe_resource *texture = sp_sview->base.texture;
+   const struct pipe_sampler_view *psview = &sp_sview->base;
int level0;
float lambda;
float lod[TGSI_QUAD_SIZE];
 
-   float s_to_u = u_minify(texture->width0, sp_sview->base.u.tex.first_level);
-   float t_to_v = u_minify(texture->height0, sp_sview->base.u.tex.first_level);
+   float s_to_u = u_minify(texture->width0, psview->u.tex.first_level);
+   float t_to_v = u_minify(texture->height0, psview->u.tex.first_level);
float dudx = (s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]) * s_to_u;
float dudy = (s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]) * s_to_u;
float dvdx = (t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]) * t_to_v;
@@ -2292,15 +2293,15 @@ mip_filter_linear_aniso(struct sp_sampler_view 
*sp_sview,
/* XXX: Take into account all lod values.
 */
lambda = lod[0];
-   level0 = sp_sview->base.u.tex.first_level + (int)lambda;
+   level0 = psview->u.tex.first_level + (int)lambda;
 
/* If the ellipse covers the whole image, we can
 * simply return the average of the whole image.
 */
-   if (level0 >= (int) texture->last_level) {
+   if (level0 >= (int) psview->u.tex.last_level) {
   int j;
   for (j = 0; j < TGSI_QUAD_SIZE; j++)
- min_filter(sp_sview, sp_samp, s[j], t[j], p[j], texture->last_level,
+ min_filter(sp_sview, sp_samp, s[j], t[j], p[j], 
psview->u.tex.last_level,
 sp_sview->faces[j], &rgba[0][j]);
}
else {
@@ -2336,25 +2337,25 @@ mip_filter_linear_2d_linear_repeat_POT(
enum tgsi_sampler_control control,
float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
-   const struct pipe_resource *texture = sp_sview->base.texture;
+   const struct pipe_sampler_view *psview = &sp_sview->base;
int j;
float lod[TGSI_QUAD_SIZE];
 
compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, 

[Mesa-dev] [PATCH 6/6] llvmpipe: get rid of llvmpipe_get_texture_tile_linear

2014-07-01 Thread sroland
From: Roland Scheidegger 

Because the layout is always linear, this didn't really do much any longer -
at some point this triggered per-tile swizzled->linear conversion. The x/y
coords were ignored too.
Apart from triggering conversion, this also invoked alloc_image_data(), which
could only actually trigger mapping of display target resources. So, instead
just call resource_map in the callers (which also gives the ability to unmap
again). Note that mapping/unmapping of display target resources still isn't
really all that clean (map/unmap may be unmatched, and all such mappings use
the same pointer thus usage flags are a lie).
---
 src/gallium/drivers/llvmpipe/lp_surface.c | 86 ++-
 src/gallium/drivers/llvmpipe/lp_texture.c | 32 +---
 src/gallium/drivers/llvmpipe/lp_texture.h |  6 ---
 3 files changed, 17 insertions(+), 107 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c 
b/src/gallium/drivers/llvmpipe/lp_surface.c
index 09ca39d..08f968f 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -35,22 +35,6 @@
 #include "lp_query.h"
 
 
-/**
- * Adjust x, y, width, height to lie on tile bounds.
- */
-static void
-adjust_to_tile_bounds(unsigned x, unsigned y, unsigned width, unsigned height,
-  unsigned *x_tile, unsigned *y_tile,
-  unsigned *w_tile, unsigned *h_tile)
-{
-   *x_tile = x & ~(TILE_SIZE - 1);
-   *y_tile = y & ~(TILE_SIZE - 1);
-   *w_tile = ((x + width + TILE_SIZE - 1) & ~(TILE_SIZE - 1)) - *x_tile;
-   *h_tile = ((y + height + TILE_SIZE - 1) & ~(TILE_SIZE - 1)) - *y_tile;
-}
-
-
-
 static void
 lp_resource_copy(struct pipe_context *pipe,
  struct pipe_resource *dst, unsigned dst_level,
@@ -64,7 +48,6 @@ lp_resource_copy(struct pipe_context *pipe,
unsigned width = src_box->width;
unsigned height = src_box->height;
unsigned depth = src_box->depth;
-   unsigned z;
 
llvmpipe_flush_resource(pipe,
dst, dst_level,
@@ -94,61 +77,16 @@ lp_resource_copy(struct pipe_context *pipe,
   src_box->width, src_box->height, src_box->depth);
*/
 
-   for (z = 0; z < src_box->depth; z++){
-
-  /* set src tiles to linear layout */
-  {
- unsigned tx, ty, tw, th;
- unsigned x, y;
+   /* make sure display target resources (which cannot have levels/layers) are 
mapped */
+   if (src_tex->dt)
+  (void) llvmpipe_resource_map(src, src_level, 0, LP_TEX_USAGE_READ);
+   if (dst_tex->dt)
+  /*
+   * Could set this to WRITE_ALL if complete dst is covered but it gets
+   * ignored anyway.
+   */
+  (void) llvmpipe_resource_map(dst, dst_level, 0, LP_TEX_USAGE_READ_WRITE);
 
- adjust_to_tile_bounds(src_box->x, src_box->y, width, height,
-   &tx, &ty, &tw, &th);
-
- for (y = 0; y < th; y += TILE_SIZE) {
-for (x = 0; x < tw; x += TILE_SIZE) {
-   (void) llvmpipe_get_texture_tile_linear(src_tex,
-   src_box->z + z, 
src_level,
-   LP_TEX_USAGE_READ,
-   tx + x, ty + y);
-}
- }
-  }
-
-  /* set dst tiles to linear layout */
-  {
- unsigned tx, ty, tw, th;
- unsigned x, y;
- enum lp_texture_usage usage;
-
- adjust_to_tile_bounds(dstx, dsty, width, height, &tx, &ty, &tw, &th);
-
- for (y = 0; y < th; y += TILE_SIZE) {
-boolean contained_y = ty + y >= dsty &&
-  ty + y + TILE_SIZE <= dsty + height ?
-  TRUE : FALSE;
-
-for (x = 0; x < tw; x += TILE_SIZE) {
-   boolean contained_x = tx + x >= dstx &&
- tx + x + TILE_SIZE <= dstx + width ?
- TRUE : FALSE;
-
-   /*
-* Set the usage mode to WRITE_ALL for the tiles which are
-* completely contained by the dest rectangle.
-*/
-   if (contained_y && contained_x)
-  usage = LP_TEX_USAGE_WRITE_ALL;
-   else
-  usage = LP_TEX_USAGE_READ_WRITE;
-
-   (void) llvmpipe_get_texture_tile_linear(dst_tex,
-   dstz + z, dst_level,
-   usage,
-   tx + x, ty + y);
-}
- }
-  }
-   }
 
/* copy */
{
@@ -171,6 +109,12 @@ lp_resource_copy(struct pipe_context *pipe,
src_box->x, src_box->y, 0);
   }
}
+
+   if (src_tex->dt)
+  llvmpipe_resource_unmap(src, 0, 0);
+   if (dst_tex->dt)
+  llvmpipe_resource_unmap(dst, 0, 0);
+
 }
 
 
diff --git a

[Mesa-dev] [PATCH 3/6] llvmpipe: allocate regular texture memory upfront

2014-07-01 Thread sroland
From: Roland Scheidegger 

The deferred allocation doesn't really make much sense anymore, since we no
longer allocate swizzled/linear memory in chunks, nor per level / slice.
This means we could fail resource creation a bit more (it could already fail
in theory anyway) but should not fail maps later (right now, callers can't
really deal with either).
---
 src/gallium/drivers/llvmpipe/lp_texture.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index a156449..f95b2a2 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -59,6 +59,8 @@ static struct llvmpipe_resource resource_list;
 #endif
 static unsigned id_counter = 0;
 
+static void
+alloc_image_data(struct llvmpipe_resource *lpr);
 
 /**
  * Conventional allocation path for non-display textures:
@@ -247,6 +249,11 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
  /* texture map */
  if (!llvmpipe_texture_layout(screen, lpr))
 goto fail;
+
+ alloc_image_data(lpr);
+ if (!lpr->tex_data) {
+goto fail;
+ }
   }
}
else {
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/6] llvmpipe: get rid of linear_img struct

2014-07-01 Thread sroland
From: Roland Scheidegger 

Just use a tex_data pointer directly - the description was no longer correct
either.
---
 src/gallium/drivers/llvmpipe/lp_setup.c |  2 +-
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |  2 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   | 39 ++---
 src/gallium/drivers/llvmpipe/lp_texture.h   |  9 +-
 4 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index 046611a..cbf465e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -818,7 +818,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
 */
mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
 LP_TEX_USAGE_READ);
-   jit_tex->base = lp_tex->linear_img.data;
+   jit_tex->base = lp_tex->tex_data;
 }
 else {
mip_ptr = lp_tex->data;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 0b227ea..d204378 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -244,7 +244,7 @@ prepare_shader_sampling(
/* XXX this may fail due to OOM ? */
mip_ptr = llvmpipe_get_texture_image_all(lp_tex, 
view->u.tex.first_level,
 LP_TEX_USAGE_READ);
-   addr = lp_tex->linear_img.data;
+   addr = lp_tex->tex_data;
 
for (j = first_level; j <= last_level; j++) {
   mip_ptr = llvmpipe_get_texture_image_all(lp_tex, j,
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index d60d101..a156449 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -301,9 +301,9 @@ llvmpipe_resource_destroy(struct pipe_screen *pscreen,
}
else if (llvmpipe_resource_is_texture(pt)) {
   /* free linear image data */
-  if (lpr->linear_img.data) {
- align_free(lpr->linear_img.data);
- lpr->linear_img.data = NULL;
+  if (lpr->tex_data) {
+ align_free(lpr->tex_data);
+ lpr->tex_data = NULL;
   }
}
else if (!lpr->userBuffer) {
@@ -359,7 +359,7 @@ llvmpipe_resource_map(struct pipe_resource *resource,
   map = winsys->displaytarget_map(winsys, lpr->dt, dt_usage);
 
   /* install this linear image in texture data structure */
-  lpr->linear_img.data = map;
+  lpr->tex_data = map;
 
   return map;
}
@@ -726,16 +726,14 @@ ubyte *
 llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
unsigned face_slice, unsigned level)
 {
-   struct llvmpipe_texture_image *img;
unsigned offset;
 
-   img = &lpr->linear_img;
offset = lpr->mip_offsets[level];
 
if (face_slice > 0)
   offset += face_slice * tex_image_face_size(lpr, level);
 
-   return (ubyte *) img->data + offset;
+   return (ubyte *) lpr->tex_data + offset;
 }
 
 
@@ -759,7 +757,7 @@ alloc_image_data(struct llvmpipe_resource *lpr)
 
   assert(lpr->base.last_level == 0);
 
-  lpr->linear_img.data =
+  lpr->tex_data =
  winsys->displaytarget_map(winsys, lpr->dt,
PIPE_TRANSFER_READ_WRITE);
}
@@ -774,9 +772,9 @@ alloc_image_data(struct llvmpipe_resource *lpr)
  lpr->mip_offsets[level] = offset;
  offset += align(buffer_size, alignment);
   }
-  lpr->linear_img.data = align_malloc(offset, alignment);
-  if (lpr->linear_img.data) {
- memset(lpr->linear_img.data, 0, offset);
+  lpr->tex_data = align_malloc(offset, alignment);
+  if (lpr->tex_data) {
+ memset(lpr->tex_data, 0, offset);
   }
}
 }
@@ -795,7 +793,6 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
unsigned face_slice, unsigned level,
enum lp_texture_usage usage)
 {
-   struct llvmpipe_texture_image *target_img;
void *target_data;
unsigned target_offset;
unsigned *target_off_ptr;
@@ -805,17 +802,14 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
   usage == LP_TEX_USAGE_WRITE_ALL);
 
if (lpr->dt) {
-  assert(lpr->linear_img.data);
+  assert(lpr->tex_data);
}
 
-   target_img = &lpr->linear_img;
target_off_ptr = lpr->mip_offsets;
-   target_data = target_img->data;
 
-   if (!target_data) {
+   if (!lpr->tex_data) {
   /* allocate memory for the target image now */
   alloc_image_data(lpr);
-  target_data = target_img->data;
}
 
target_offset = target_off_ptr[level];
@@ -824,8 +818,8 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
   ta

[Mesa-dev] [PATCH 4/6] llvmpipe: get rid of llvmpipe_get_texture_image_all

2014-07-01 Thread sroland
From: Roland Scheidegger 

Once used for invoking swizzled->linear conversion for all needed images.
But we now have a single allocation for all images in a resource, thus looping
through all slices is rather pointless, and conversion doesn't happen either.
Also simplify the sampling setup code to use the mip_offsets array in the
resource directly - if the (non display target) resource exists its memory
will already be allocated as well.
---
 src/gallium/drivers/llvmpipe/lp_setup.c | 24 
 src/gallium/drivers/llvmpipe/lp_state_sampler.c | 20 +++-
 src/gallium/drivers/llvmpipe/lp_texture.c   | 23 ---
 src/gallium/drivers/llvmpipe/lp_texture.h   |  5 -
 4 files changed, 7 insertions(+), 65 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index cbf465e..d728e85 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -802,7 +802,6 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
 
  if (!lp_tex->dt) {
 /* regular texture - setup array of mipmap level offsets */
-void *mip_ptr;
 int j;
 unsigned first_level = 0;
 unsigned last_level = 0;
@@ -812,22 +811,14 @@ lp_setup_set_fragment_sampler_views(struct 
lp_setup_context *setup,
last_level = view->u.tex.last_level;
assert(first_level <= last_level);
assert(last_level <= res->last_level);
-
-   /*
-* The complexity here should no longer be necessary.
-*/
-   mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
-LP_TEX_USAGE_READ);
jit_tex->base = lp_tex->tex_data;
 }
 else {
-   mip_ptr = lp_tex->data;
-   jit_tex->base = mip_ptr;
+  jit_tex->base = lp_tex->data;
 }
 
-if ((LP_PERF & PERF_TEX_MEM) || !mip_ptr) {
-   /* out of memory - use dummy tile memory */
-   /* Note if using PERF_TEX_MEM will also skip tile conversion */
+if (LP_PERF & PERF_TEX_MEM) {
+   /* use dummy tile memory */
jit_tex->base = lp_dummy_tile;
jit_tex->width = TILE_SIZE/8;
jit_tex->height = TILE_SIZE/8;
@@ -847,14 +838,7 @@ lp_setup_set_fragment_sampler_views(struct 
lp_setup_context *setup,
 
if (llvmpipe_resource_is_texture(res)) {
   for (j = first_level; j <= last_level; j++) {
- mip_ptr = llvmpipe_get_texture_image_all(lp_tex, j,
-  
LP_TEX_USAGE_READ);
- jit_tex->mip_offsets[j] = (uint8_t *)mip_ptr - (uint8_t 
*)jit_tex->base;
- /*
-  * could get mip offset directly but need call above to
-  * invoke tiled->linear conversion.
-  */
- assert(lp_tex->mip_offsets[j] == jit_tex->mip_offsets[j]);
+ jit_tex->mip_offsets[j] = lp_tex->mip_offsets[j];
  jit_tex->row_stride[j] = lp_tex->row_stride[j];
  jit_tex->img_stride[j] = lp_tex->img_stride[j];
   }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index d204378..a14a64f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -232,29 +232,16 @@ prepare_shader_sampling(
 /* regular texture - setup array of mipmap level offsets */
 struct pipe_resource *res = view->texture;
 int j;
-void *mip_ptr;
 
 if (llvmpipe_resource_is_texture(res)) {
first_level = view->u.tex.first_level;
last_level = view->u.tex.last_level;
assert(first_level <= last_level);
assert(last_level <= res->last_level);
-
-   /* must trigger allocation first before we can get base ptr */
-   /* XXX this may fail due to OOM ? */
-   mip_ptr = llvmpipe_get_texture_image_all(lp_tex, 
view->u.tex.first_level,
-LP_TEX_USAGE_READ);
addr = lp_tex->tex_data;
 
for (j = first_level; j <= last_level; j++) {
-  mip_ptr = llvmpipe_get_texture_image_all(lp_tex, j,
-   LP_TEX_USAGE_READ);
-  mip_offsets[j] = (uint8_t *)mip_ptr - (uint8_t *)addr;
-  /*
-   * could get mip offset directly but need call above to
-   * invoke tiled->linear conversion.
-  

[Mesa-dev] [PATCH 5/6] llvmpipe: get rid of llvmpipe_get_texture_image

2014-07-01 Thread sroland
From: Roland Scheidegger 

The only caller left used it only for non display target textures,
hence it was really the same as llvmpipe_get_texture_image_address - it
also had a usage flag but this was ignored anyway.
---
 src/gallium/drivers/llvmpipe/lp_texture.c | 48 +--
 src/gallium/drivers/llvmpipe/lp_texture.h |  5 
 2 files changed, 1 insertion(+), 52 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index f5383dc..3cb421c 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -372,7 +372,7 @@ llvmpipe_resource_map(struct pipe_resource *resource,
}
else if (llvmpipe_resource_is_texture(resource)) {
 
-  map = llvmpipe_get_texture_image(lpr, layer, level, tex_usage);
+  map = llvmpipe_get_texture_image_address(lpr, layer, level);
   return map;
}
else {
@@ -787,52 +787,6 @@ alloc_image_data(struct llvmpipe_resource *lpr)
 }
 
 
-
-/**
- * Return pointer to texture image data
- * for a particular cube face or 3D texture slice.
- *
- * \param face_slice  the cube face or 3D slice of interest
- * \param usage  one of LP_TEX_USAGE_READ/WRITE_ALL/READ_WRITE
- */
-void *
-llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
-   unsigned face_slice, unsigned level,
-   enum lp_texture_usage usage)
-{
-   void *target_data;
-   unsigned target_offset;
-   unsigned *target_off_ptr;
-
-   assert(usage == LP_TEX_USAGE_READ ||
-  usage == LP_TEX_USAGE_READ_WRITE ||
-  usage == LP_TEX_USAGE_WRITE_ALL);
-
-   if (lpr->dt) {
-  assert(lpr->tex_data);
-   }
-
-   target_off_ptr = lpr->mip_offsets;
-
-   if (!lpr->tex_data) {
-  /* allocate memory for the target image now */
-  alloc_image_data(lpr);
-   }
-
-   target_offset = target_off_ptr[level];
-
-   if (face_slice > 0) {
-  target_offset += face_slice * tex_image_face_size(lpr, level);
-   }
-
-   if (lpr->tex_data) {
-  target_data = (uint8_t *) lpr->tex_data + target_offset;
-   }
-
-   return target_data;
-}
-
-
 /**
  * Get pointer to a linear image (not the tile!) at tile (x,y).
  * \return pointer to start of image/face (not the tile)
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h 
b/src/gallium/drivers/llvmpipe/lp_texture.h
index 5eb0f5a..b163226 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -217,11 +217,6 @@ ubyte *
 llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
unsigned face_slice, unsigned level);
 
-void *
-llvmpipe_get_texture_image(struct llvmpipe_resource *resource,
-   unsigned face_slice, unsigned level,
-   enum lp_texture_usage usage);
-
 ubyte *
 llvmpipe_get_texture_tile_linear(struct llvmpipe_resource *lpr,
  unsigned face_slice, unsigned level,
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/6] llvmpipe: (trivial) rename linear_mip_offsets to mip_offsets

2014-07-01 Thread sroland
From: Roland Scheidegger 

Since switching to non-swizzled rendering we only have "normal", aka linear,
offsets.
---
 src/gallium/drivers/llvmpipe/lp_setup.c | 2 +-
 src/gallium/drivers/llvmpipe/lp_state_sampler.c | 2 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   | 6 +++---
 src/gallium/drivers/llvmpipe/lp_texture.h   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index 77ac3af..046611a 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -854,7 +854,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
   * could get mip offset directly but need call above to
   * invoke tiled->linear conversion.
   */
- assert(lp_tex->linear_mip_offsets[j] == 
jit_tex->mip_offsets[j]);
+ assert(lp_tex->mip_offsets[j] == jit_tex->mip_offsets[j]);
  jit_tex->row_stride[j] = lp_tex->row_stride[j];
  jit_tex->img_stride[j] = lp_tex->img_stride[j];
   }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 09bc928..0b227ea 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -254,7 +254,7 @@ prepare_shader_sampling(
* could get mip offset directly but need call above to
* invoke tiled->linear conversion.
*/
-  assert(lp_tex->linear_mip_offsets[j] == mip_offsets[j]);
+  assert(lp_tex->mip_offsets[j] == mip_offsets[j]);
   row_stride[j] = lp_tex->row_stride[j];
   img_stride[j] = lp_tex->img_stride[j];
}
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 6df88d0..d60d101 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -730,7 +730,7 @@ llvmpipe_get_texture_image_address(struct llvmpipe_resource 
*lpr,
unsigned offset;
 
img = &lpr->linear_img;
-   offset = lpr->linear_mip_offsets[level];
+   offset = lpr->mip_offsets[level];
 
if (face_slice > 0)
   offset += face_slice * tex_image_face_size(lpr, level);
@@ -771,7 +771,7 @@ alloc_image_data(struct llvmpipe_resource *lpr)
*/
   for (level = 0; level <= lpr->base.last_level; level++) {
  uint buffer_size = tex_image_size(lpr, level);
- lpr->linear_mip_offsets[level] = offset;
+ lpr->mip_offsets[level] = offset;
  offset += align(buffer_size, alignment);
   }
   lpr->linear_img.data = align_malloc(offset, alignment);
@@ -809,7 +809,7 @@ llvmpipe_get_texture_image(struct llvmpipe_resource *lpr,
}
 
target_img = &lpr->linear_img;
-   target_off_ptr = lpr->linear_mip_offsets;
+   target_off_ptr = lpr->mip_offsets;
target_data = target_img->data;
 
if (!target_data) {
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h 
b/src/gallium/drivers/llvmpipe/lp_texture.h
index af11848..e26d044 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -74,7 +74,7 @@ struct llvmpipe_resource
/** Number of 3D slices or cube faces per level */
unsigned num_slices_faces[LP_MAX_TEXTURE_LEVELS];
/** Offset to start of mipmap level, in bytes */
-   unsigned linear_mip_offsets[LP_MAX_TEXTURE_LEVELS];
+   unsigned mip_offsets[LP_MAX_TEXTURE_LEVELS];
 
/**
 * Display target, for textures with the PIPE_BIND_DISPLAY_TARGET
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] target-helpers: don't use designated initializers

2014-07-01 Thread sroland
From: Roland Scheidegger 

It looks like since ce1a1372280d737a1b85279995529206586ae480 they are now included
in more places, in particular even for things buildable with msvc, and hence
those break the build.
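
(Sketch of the issue with an assumed struct layout - designated initializers
are C99, which msvc's C compiler rejects, so the positional C89 form is used
instead:)

struct conf_ret_example {
   int type;                      /* an enum in the real header */
   union { int val_int; } val;
};

/* C99 designated form - breaks the msvc build:
 * static const struct conf_ret_example r = { .type = 1, .val.val_int = 2 };
 */

/* portable positional C89 form: */
static const struct conf_ret_example r = { 1, { 2 } };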
---
 src/gallium/auxiliary/target-helpers/inline_drm_helper.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h 
b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
index a03db3a..3b36316 100644
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
@@ -210,13 +210,13 @@ dd_driver_name(void)
 }
 
 static const struct drm_conf_ret throttle_ret = {
-   .type = DRM_CONF_INT,
-   .val.val_int = 2,
+   DRM_CONF_INT,
+   {2},
 };
 
 static const struct drm_conf_ret share_fd_ret = {
-   .type = DRM_CONF_BOOL,
-   .val.val_int = true,
+   DRM_CONF_BOOL,
+   {true},
 };
 
 static const struct drm_conf_ret *
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] gallivm: fix up out-of-bounds level when using conformant out-of-bound behavior

2014-07-29 Thread sroland
From: Roland Scheidegger 

When using (d3d10) conformant out-of-bound behavior for texel fetching
(currently always enabled), the level still needs to be set to a safe value
even though the resulting offset won't get used in the end, because the
level is used to look up the mip offset itself and the actual strides, which
might otherwise crash.
For simplicity, we'll use level 0 in this case (this ought to be safe;
llvmpipe does not actually fill in level 0 information if first_level is
larger, but some random strides / offsets shouldn't hurt as ultimately we
always use offset 0 in this case).
Fixes a crash in some in-house test where random huge levels appear in
lp_build_fetch_texel() despite the test always using a fixed 0 for level
actually... But in any case the value comes from the shader and thus can
easily be outside the max mip level.
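
(Scalar sketch of the fixup below: out_of_bounds is an all-ones mask (-1)
for out-of-bounds lanes and 0 otherwise, so an andnot selects a safe level 0
for OOB lanes and leaves valid lanes untouched:)

#include <stdint.h>

static int32_t
fixup_level(int32_t level, int32_t oob_mask)
{
   return level & ~oob_mask;  /* oob_mask == -1 -> 0, oob_mask == 0 -> level */
}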

CC: 
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c 
b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index aeecba8..f1bf285 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -927,6 +927,7 @@ lp_build_nearest_mip_level(struct lp_build_sample_context 
*bld,
 
bld->int_coord_bld.type,
 out);
   }
+  level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
   *level_out = level;
}
else {
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] llvmpipe: integrate memory allocation into llvmpipe_texture_layout

2014-07-31 Thread sroland
From: Roland Scheidegger 

Seems pointless to just duplicate some of the calculations. (The actual
memory used could have differed slightly in some cases from what was
predicted in llvmpipe_texture_layout, due to different alignment rules,
though this should have been of no consequence.)
---
 src/gallium/drivers/llvmpipe/lp_texture.c | 58 +--
 1 file changed, 17 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 40e5815..3c87b28 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -59,16 +59,15 @@ static struct llvmpipe_resource resource_list;
 #endif
 static unsigned id_counter = 0;
 
-static void
-alloc_image_data(struct llvmpipe_resource *lpr);
 
 /**
  * Conventional allocation path for non-display textures:
- * Just compute row strides here.  Storage is allocated on demand later.
+ * Compute strides and allocate data (unless asked not to).
  */
 static boolean
 llvmpipe_texture_layout(struct llvmpipe_screen *screen,
-struct llvmpipe_resource *lpr)
+struct llvmpipe_resource *lpr,
+boolean allocate)
 {
struct pipe_resource *pt = &lpr->base;
unsigned level;
@@ -149,6 +148,8 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
  goto fail;
   }
 
+  lpr->mip_offsets[level] = total_size;
+
   total_size += (uint64_t) lpr->num_slices_faces[level]
   * (uint64_t) lpr->img_stride[level];
   if (total_size > LP_MAX_TEXTURE_SIZE) {
@@ -161,6 +162,16 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
   depth = u_minify(depth, 1);
}
 
+   if (allocate) {
+  lpr->tex_data = align_malloc(total_size, util_cpu_caps.cacheline);
+  if (!lpr->tex_data) {
+ return FALSE;
+  }
+  else {
+ memset(lpr->tex_data, 0, total_size);
+  }
+   }
+
return TRUE;
 
 fail:
@@ -179,7 +190,7 @@ llvmpipe_can_create_resource(struct pipe_screen *screen,
struct llvmpipe_resource lpr;
memset(&lpr, 0, sizeof(lpr));
lpr.base = *res;
-   return llvmpipe_texture_layout(llvmpipe_screen(screen), &lpr);
+   return llvmpipe_texture_layout(llvmpipe_screen(screen), &lpr, false);
 }
 
 
@@ -247,13 +258,8 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
   }
   else {
  /* texture map */
- if (!llvmpipe_texture_layout(screen, lpr))
-goto fail;
-
- alloc_image_data(lpr);
- if (!lpr->tex_data) {
+ if (!llvmpipe_texture_layout(screen, lpr, true))
 goto fail;
- }
   }
}
else {
@@ -747,36 +753,6 @@ llvmpipe_get_texture_image_address(struct 
llvmpipe_resource *lpr,
 
 
 /**
- * Allocate storage for a linear image
- * (all cube faces and all 3D slices, all levels).
- */
-static void
-alloc_image_data(struct llvmpipe_resource *lpr)
-{
-   uint alignment = MAX2(64, util_cpu_caps.cacheline);
-   uint level;
-   uint offset = 0;
-
-   assert(!lpr->dt);
-
-   /* not a display target - allocate regular memory */
-   /*
-* Offset calculation for start of a specific mip/layer is always
-* offset = lpr->linear_mip_offsets[level] + lpr->img_stride[level] * layer
-*/
-   for (level = 0; level <= lpr->base.last_level; level++) {
-  uint buffer_size = tex_image_size(lpr, level);
-  lpr->mip_offsets[level] = offset;
-  offset += align(buffer_size, alignment);
-   }
-   lpr->tex_data = align_malloc(offset, alignment);
-   if (lpr->tex_data) {
-  memset(lpr->tex_data, 0, offset);
-   }
-}
-
-
-/**
  * Return size of resource in bytes
  */
 unsigned
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] llvmpipe: get rid of impossible code in alloc_image_data

2014-07-31 Thread sroland
From: Roland Scheidegger 

Only used for non display target resources.
---
 src/gallium/drivers/llvmpipe/lp_texture.c | 39 +++
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index c07bd21..40e5815 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -757,34 +757,21 @@ alloc_image_data(struct llvmpipe_resource *lpr)
uint level;
uint offset = 0;
 
-   if (lpr->dt) {
-  /* we get the linear memory from the winsys, and it has
-   * already been zeroed
-   */
-  struct llvmpipe_screen *screen = llvmpipe_screen(lpr->base.screen);
-  struct sw_winsys *winsys = screen->winsys;
+   assert(!lpr->dt);
 
-  assert(lpr->base.last_level == 0);
-
-  lpr->tex_data =
- winsys->displaytarget_map(winsys, lpr->dt,
-   PIPE_TRANSFER_READ_WRITE);
+   /* not a display target - allocate regular memory */
+   /*
+* Offset calculation for start of a specific mip/layer is always
+* offset = lpr->linear_mip_offsets[level] + lpr->img_stride[level] * layer
+*/
+   for (level = 0; level <= lpr->base.last_level; level++) {
+  uint buffer_size = tex_image_size(lpr, level);
+  lpr->mip_offsets[level] = offset;
+  offset += align(buffer_size, alignment);
}
-   else {
-  /* not a display target - allocate regular memory */
-  /*
-   * Offset calculation for start of a specific mip/layer is always
-   * offset = lpr->linear_mip_offsets[level] + lpr->img_stride[level] * 
layer
-   */
-  for (level = 0; level <= lpr->base.last_level; level++) {
- uint buffer_size = tex_image_size(lpr, level);
- lpr->mip_offsets[level] = offset;
- offset += align(buffer_size, alignment);
-  }
-  lpr->tex_data = align_malloc(offset, alignment);
-  if (lpr->tex_data) {
- memset(lpr->tex_data, 0, offset);
-  }
+   lpr->tex_data = align_malloc(offset, alignment);
+   if (lpr->tex_data) {
+  memset(lpr->tex_data, 0, offset);
}
 }
 
-- 
1.9.1
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] llvmpipe: don't store number of layers per level

2014-07-31 Thread sroland
From: Roland Scheidegger 

This could be recalculated, though it turns out the only use of it after
resource allocation is for calculating the whole resource size (for scene
size accounting, though that isn't quite ideal either). Thus, instead just
store the whole resource size and drop it (saving a couple of bytes of
storage per resource). It makes things simpler too. Note that for the
accounting, winsys resources always come back with size 0, but this is
unchanged (we don't actually know the size in any case).
Also reformat llvmpipe_texture_layout (drop unneeded indentation).
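
(Minimal addressing sketch - parameter names follow this series, not the
exact driver code: with one allocation per resource, any mip/layer address
is base + mip offset + layer * image stride, so the per-level slice counts
are only ever needed once, when computing the total size:)

#include <stddef.h>
#include <stdint.h>

static uint8_t *
image_address(uint8_t *tex_data, const unsigned *mip_offsets,
              const unsigned *img_stride, unsigned level, unsigned layer)
{
   return tex_data + mip_offsets[level] + (size_t)layer * img_stride[level];
}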
---
 src/gallium/drivers/llvmpipe/lp_texture.c | 128 --
 src/gallium/drivers/llvmpipe/lp_texture.h |   4 +-
 2 files changed, 51 insertions(+), 81 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index 3c87b28..e37d5e8 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -81,77 +81,67 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS);
 
for (level = 0; level <= pt->last_level; level++) {
+  unsigned align_x, align_y, nblocksx, nblocksy, block_size, num_slices;
 
   /* Row stride and image stride */
-  {
- unsigned align_x, align_y, nblocksx, nblocksy, block_size;
-
- /* For non-compressed formats we need 4x4 pixel alignment
-  * so we can read/write LP_RASTER_BLOCK_SIZE when rendering to them.
-  * We also want cache line size in x direction,
-  * otherwise same cache line could end up in multiple threads.
-  * For explicit 1d resources however we reduce this to 4x1 and
-  * handle specially in render output code (as we need to do special
-  * handling there for buffers in any case).
-  */
- if (util_format_is_compressed(pt->format))
-align_x = align_y = 1;
- else {
-align_x = LP_RASTER_BLOCK_SIZE;
-if (llvmpipe_resource_is_1d(&lpr->base))
-   align_y = 1;
-else
-   align_y = LP_RASTER_BLOCK_SIZE;
- }
 
- nblocksx = util_format_get_nblocksx(pt->format,
- align(width, align_x));
- nblocksy = util_format_get_nblocksy(pt->format,
- align(height, align_y));
- block_size = util_format_get_blocksize(pt->format);
-
- if (util_format_is_compressed(pt->format))
-lpr->row_stride[level] = nblocksx * block_size;
+  /* For non-compressed formats we need 4x4 pixel alignment
+   * so we can read/write LP_RASTER_BLOCK_SIZE when rendering to them.
+   * We also want cache line size in x direction,
+   * otherwise same cache line could end up in multiple threads.
+   * For explicit 1d resources however we reduce this to 4x1 and
+   * handle specially in render output code (as we need to do special
+   * handling there for buffers in any case).
+   */
+  if (util_format_is_compressed(pt->format))
+ align_x = align_y = 1;
+  else {
+ align_x = LP_RASTER_BLOCK_SIZE;
+ if (llvmpipe_resource_is_1d(&lpr->base))
+align_y = 1;
  else
-lpr->row_stride[level] = align(nblocksx * block_size, 
util_cpu_caps.cacheline);
+align_y = LP_RASTER_BLOCK_SIZE;
+  }
 
- /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
- if ((uint64_t)lpr->row_stride[level] * nblocksy > 
LP_MAX_TEXTURE_SIZE) {
-/* image too large */
-goto fail;
- }
+  nblocksx = util_format_get_nblocksx(pt->format,
+  align(width, align_x));
+  nblocksy = util_format_get_nblocksy(pt->format,
+  align(height, align_y));
+  block_size = util_format_get_blocksize(pt->format);
+
+  if (util_format_is_compressed(pt->format))
+ lpr->row_stride[level] = nblocksx * block_size;
+  else
+ lpr->row_stride[level] = align(nblocksx * block_size, 
util_cpu_caps.cacheline);
 
- lpr->img_stride[level] = lpr->row_stride[level] * nblocksy;
+  /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
+  if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) {
+ /* image too large */
+ goto fail;
   }
 
-  /* Number of 3D image slices, cube faces or texture array layers */
-  {
- unsigned num_slices;
-
- if (lpr->base.target == PIPE_TEXTURE_CUBE)
-num_slices = 6;
- else if (lpr->base.target == PIPE_TEXTURE_3D)
-num_slices = depth;
- else if (lpr->base.target == PIPE_TEXTURE_1D_ARRAY ||
-  lpr->base.target == PIPE_TEXTURE_2D_ARRAY)
-num_slices = layers;
- else
-num_slices = 1;
+  lpr->

[Mesa-dev] [PATCH 1/3] llvmpipe: implement support for cube map arrays

2014-08-01 Thread sroland
From: Roland Scheidegger 

This just covers the resource side of things, not the actual sampling.
Here things are trivial, as cube map arrays are identical to 2d arrays in
all respects.
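
As a concrete illustration of why this is trivial (standard cube array
layout, not code from this patch): a cube map array with N cubes is stored
as 6*N ordinary 2d slices, so addressing reduces to plain array layer
arithmetic:

   /* face f (0..5) of array element a is just another 2d layer */
   unsigned layer = 6 * array_index + face;
   assert(layer < res->array_size);   /* array_size is a multiple of 6 */

This is also why the diff below only needs to add PIPE_TEXTURE_CUBE_ARRAY
to the existing array-texture paths and assert that the layer count is a
multiple of 6.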
---
 src/gallium/drivers/llvmpipe/lp_screen.c| 3 ++-
 src/gallium/drivers/llvmpipe/lp_setup.c | 6 +-
 src/gallium/drivers/llvmpipe/lp_state_sampler.c | 6 +-
 src/gallium/drivers/llvmpipe/lp_texture.c   | 3 ++-
 src/gallium/drivers/llvmpipe/lp_texture.h   | 2 ++
 5 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c 
b/src/gallium/drivers/llvmpipe/lp_screen.c
index e25d14e..e10a763 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -347,7 +347,8 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
   target == PIPE_TEXTURE_2D_ARRAY ||
   target == PIPE_TEXTURE_RECT ||
   target == PIPE_TEXTURE_3D ||
-  target == PIPE_TEXTURE_CUBE);
+  target == PIPE_TEXTURE_CUBE ||
+  target == PIPE_TEXTURE_CUBE_ARRAY);
 
if (sample_count > 1)
   return FALSE;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c 
b/src/gallium/drivers/llvmpipe/lp_setup.c
index d728e85..e85c4ca 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -844,7 +844,8 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
   }
 
   if (res->target == PIPE_TEXTURE_1D_ARRAY ||
-  res->target == PIPE_TEXTURE_2D_ARRAY) {
+  res->target == PIPE_TEXTURE_2D_ARRAY ||
+  res->target == PIPE_TEXTURE_CUBE_ARRAY) {
  /*
   * For array textures, we don't have first_layer, instead
   * adjust last_layer (stored as depth) plus the mip level 
offsets
@@ -856,6 +857,9 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context 
*setup,
 jit_tex->mip_offsets[j] += view->u.tex.first_layer *
lp_tex->img_stride[j];
  }
+ if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+assert(jit_tex->depth % 6 == 0);
+ }
  assert(view->u.tex.first_layer <= view->u.tex.last_layer);
  assert(view->u.tex.last_layer < res->array_size);
   }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c 
b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index a14a64f..0180e99 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -246,12 +246,16 @@ prepare_shader_sampling(
   img_stride[j] = lp_tex->img_stride[j];
}
if (res->target == PIPE_TEXTURE_1D_ARRAY ||
-   res->target == PIPE_TEXTURE_2D_ARRAY) {
+   res->target == PIPE_TEXTURE_2D_ARRAY ||
+   res->target == PIPE_TEXTURE_CUBE_ARRAY) {
   num_layers = view->u.tex.last_layer - 
view->u.tex.first_layer + 1;
   for (j = first_level; j <= last_level; j++) {
  mip_offsets[j] += view->u.tex.first_layer *
lp_tex->img_stride[j];
   }
+  if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+ assert(num_layers % 6 == 0);
+  }
   assert(view->u.tex.first_layer <= view->u.tex.last_layer);
   assert(view->u.tex.last_layer < res->array_size);
}
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c 
b/src/gallium/drivers/llvmpipe/lp_texture.c
index ec9e024..ba0e7f8 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -137,7 +137,8 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
   else if (lpr->base.target == PIPE_TEXTURE_3D)
  num_slices = depth;
   else if (lpr->base.target == PIPE_TEXTURE_1D_ARRAY ||
-   lpr->base.target == PIPE_TEXTURE_2D_ARRAY)
+   lpr->base.target == PIPE_TEXTURE_2D_ARRAY ||
+   lpr->base.target == PIPE_TEXTURE_CUBE_ARRAY)
  num_slices = layers;
   else
  num_slices = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h 
b/src/gallium/drivers/llvmpipe/lp_texture.h
index aa624b0..9fbd3a2 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -144,6 +144,7 @@ llvmpipe_resource_is_texture(const struct pipe_resource 
*resource)
case PIPE_TEXTURE_RECT:
case PIPE_TEXTURE_3D:
case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
   return TRUE;
default:
   assert(0);
@@ -165,6 +166,7 @@ llvmpipe_resource_is_1d(const struct pipe_resource 
*resource)
case PIPE

[Mesa-dev] [PATCH 2/3] gallivm: fix cube map array (and cube map shadow with bias) handling

2014-08-01 Thread sroland
From: Roland Scheidegger 

In particular, we need to handle the TEX2/TXB2/TXL2 opcodes (cube map shadow
could already have used TXB2, which clearly couldn't have worked, though
surprisingly no piglit results change), and add a bunch more switch cases.
The actual sampling code still won't handle cube map arrays.
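
As a sketch of the operand layout this implies for cube map arrays
(mirroring the emit_tex changes in the diff below; the coords[0..2] fetches
are assumed from the surrounding unchanged code):

   /* TXL2 with TGSI_TEXTURE_CUBE_ARRAY: direction in src0.xyz,
    * array layer in src0.w, explicit lod in src1.x */
   coords[0] = lp_build_emit_fetch(&bld->bld_base, inst, 0, 0);
   coords[1] = lp_build_emit_fetch(&bld->bld_base, inst, 0, 1);
   coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, 2);
   coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);  /* layer */
   lod       = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);  /* src1.x */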
---
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c |  20 
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c  | 134 ++-
 2 files changed, 125 insertions(+), 29 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
index 48d1c52..3bbf260 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -137,10 +137,21 @@ analyse_tex(struct analysis_context *ctx,
   case TGSI_TEXTURE_SHADOWCUBE:
  readmask = TGSI_WRITEMASK_XYZW;
  break;
+  case TGSI_TEXTURE_CUBE_ARRAY:
+ readmask = TGSI_WRITEMASK_XYZW;
+ break;
+  case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+ readmask = TGSI_WRITEMASK_XYZW;
+ break;
   default:
  assert(0);
  return;
   }
+  /* XXX
+   * For cube map arrays, this will not analyze lod or shadow argument.
+   * For shadow cube, this will not analyze lod bias argument.
+   * "Indirect" really has no meaning for such textures anyway though.
+   */
 
   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
  /* We don't track explicit derivatives, although we could */
@@ -296,6 +307,15 @@ analyse_instruction(struct analysis_context *ctx,
   case TGSI_OPCODE_TXP:
  analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED);
  break;
+  case TGSI_OPCODE_TEX2:
+ analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_NONE);
+ break;
+  case TGSI_OPCODE_TXB2:
+ analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS);
+ break;
+  case TGSI_OPCODE_TXL2:
+ analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD);
+ break;
   case TGSI_OPCODE_SAMPLE:
  analyse_sample(ctx, inst, LP_BLD_TEX_MODIFIER_NONE, FALSE);
  break;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 3d7df3e..02a4a52 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1938,9 +1938,10 @@ static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
   const struct tgsi_full_instruction *inst,
   enum lp_build_tex_modifier modifier,
-  LLVMValueRef *texel)
+  LLVMValueRef *texel,
+  unsigned sampler_reg)
 {
-   unsigned unit;
+   unsigned unit = inst->Src[sampler_reg].Register.Index;
LLVMValueRef lod_bias, explicit_lod;
LLVMValueRef oow = NULL;
LLVMValueRef coords[5];
@@ -2010,7 +2011,16 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
   num_derivs = 3;
   break;
case TGSI_TEXTURE_CUBE_ARRAY:
+  num_offsets = 2;
+  num_derivs = 3;
+  layer_coord = 3;
+  break;
case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+  num_offsets = 2;
+  num_derivs = 3;
+  layer_coord = 3;
+  shadow_coord = 4; /* shadow coord special different reg */
+  break;
case TGSI_TEXTURE_2D_MSAA:
case TGSI_TEXTURE_2D_ARRAY_MSAA:
default:
@@ -2021,7 +2031,15 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
/* Note lod and especially projected are illegal in a LOT of cases */
if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
-  LLVMValueRef lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+  LLVMValueRef lod;
+  if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
+  inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
+ /* note that shadow cube array with bias/explicit lod does not exist 
*/
+ lod = lp_build_emit_fetch(&bld->bld_base, inst, 1, 0);
+  }
+  else {
+ lod = lp_build_emit_fetch(&bld->bld_base, inst, 0, 3);
+  }
   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
  lod_bias = lod;
  explicit_lod = NULL;
@@ -2053,13 +2071,23 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 
/* Layer coord always goes into 3rd slot, except for cube map arrays */
if (layer_coord) {
-  coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+  if (layer_coord == 3) {
+ coords[3] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+  }
+  else {
+ coords[2] = lp_build_emit_fetch(&bld->bld_base, inst, 0, layer_coord);
+  }
   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
  coords[2] = lp_build_mul(&bld->bld_base.base, coords[2], oow);
}
/* Shadow coord occupies always 5th slot. */
if (shadow_coord) {
-  coords[4] = lp_build_emit_fetch(&bld->bld_base, inst, 0, shadow_coord);
+  if 

[Mesa-dev] [PATCH 3/3] gallium/docs: Document TEX2/TXL2/TXB2 instructions and fix up other tex doc

2014-08-01 Thread sroland
From: Roland Scheidegger 

Add documentation for the TEX2/TXL2/TXB2 tgsi opcodes. Also, the existing
texture opcode documentation wasn't very accurate, so fix it up a bit.
---
 src/gallium/docs/source/tgsi.rst | 127 +++
 1 file changed, 102 insertions(+), 25 deletions(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 093bf7d..ac0ea54 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -584,25 +584,44 @@ This instruction replicates its result.
   for array textures src0.y contains the slice for 1D,
   and src0.z contain the slice for 2D.
 
-  for shadow textures with no arrays, src0.z contains
-  the reference value.
+  for shadow textures with no arrays (and not cube map),
+  src0.z contains the reference value.
 
   for shadow textures with arrays, src0.z contains
   the reference value for 1D arrays, and src0.w contains
-  the reference value for 2D arrays.
+  the reference value for 2D arrays and cube maps.
 
-  There is no way to pass a bias in the .w value for
-  shadow arrays, and GLSL doesn't allow this.
-  GLSL does allow cube shadows maps to take a bias value,
-  and we have to determine how this will look in TGSI.
+  for cube map array shadow textures, the reference value
+  cannot be passed in src0.w, and TEX2 must be used instead.
 
 .. math::
 
   coord = src0
 
-  bias = 0.0
+  shadow_ref = src0.z or src0.w (optional)
+
+  unit = src1
+
+  dst = texture\_sample(unit, coord, shadow_ref)
+
+
+.. opcode:: TEX2 - Texture Lookup (for shadow cube map arrays only)
+
+  this is the same as TEX, but uses another reg to encode the
+  reference value.
+
+.. math::
+
+  coord = src0
+
+  shadow_ref = src1.x
+
+  unit = src2
+
+  dst = texture\_sample(unit, coord, shadow_ref)
+
+
 
-  dst = texture\_sample(unit, coord, bias)
 
 .. opcode:: TXD - Texture Lookup with Derivatives
 
@@ -614,26 +633,26 @@ This instruction replicates its result.
 
   ddy = src2
 
-  bias = 0.0
+  unit = src3
 
-  dst = texture\_sample\_deriv(unit, coord, bias, ddx, ddy)
+  dst = texture\_sample\_deriv(unit, coord, ddx, ddy)
 
 
 .. opcode:: TXP - Projective Texture Lookup
 
 .. math::
 
-  coord.x = src0.x / src.w
+  coord.x = src0.x / src0.w
 
-  coord.y = src0.y / src.w
+  coord.y = src0.y / src0.w
 
-  coord.z = src0.z / src.w
+  coord.z = src0.z / src0.w
 
   coord.w = src0.w
 
-  bias = 0.0
+  unit = src1
 
-  dst = texture\_sample(unit, coord, bias)
+  dst = texture\_sample(unit, coord)
 
 
 .. opcode:: UP2H - Unpack Two 16-Bit Floats
@@ -763,17 +782,46 @@ This instruction replicates its result.
 
 .. opcode:: TXB - Texture Lookup With Bias
 
+  for cube map array textures and shadow cube maps, the bias value
+  cannot be passed in src0.w, and TXB2 must be used instead.
+
+  if the target is a shadow texture, the reference value is always
+  in src.z (this prevents shadow 3d and shadow 2d arrays from
+  using this instruction, but this is not needed).
+
 .. math::
 
-  coord.x = src.x
+  coord.x = src0.x
+
+  coord.y = src0.y
 
-  coord.y = src.y
+  coord.z = src0.z
 
-  coord.z = src.z
+  coord.w = none
 
-  coord.w = 1.0
+  bias = src0.w
 
-  bias = src.z
+  unit = src1
+
+  dst = texture\_sample(unit, coord, bias)
+
+
+.. opcode:: TXB2 - Texture Lookup With Bias (some cube maps only)
+
+  this is the same as TXB, but uses another reg to encode the
+  lod bias value for cube map arrays and shadow cube maps.
+  Presumably shadow 2d arrays and shadow 3d targets could use
+  this encoding too, but this is not legal.
+
+  shadow cube map arrays are neither possible nor required.
+
+.. math::
+
+  coord = src0
+
+  bias = src1.x
+
+  unit = src2
 
   dst = texture\_sample(unit, coord, bias)
 
@@ -815,6 +863,13 @@ This instruction replicates its result.
 
 .. opcode:: TXL - Texture Lookup With explicit LOD
 
+  for cube map array textures, the explicit lod value
+  cannot be passed in src0.w, and TXL2 must be used instead.
+
+  if the target is a shadow texture, the reference value is always
+  in src.z (this prevents shadow 3d / 2d array / cube targets from
+  using this instruction, but this is not needed).
+
 .. math::
 
   coord.x = src0.x
@@ -823,10 +878,32 @@ This instruction replicates its result.
 
   coord.z = src0.z
 
-  coord.w = 1.0
+  coord.w = none
 
   lod = src0.w
 
+  unit = src1
+
+  dst = texture\_sample(unit, coord, lod)
+
+
+.. opcode:: TXL2 - Texture Lookup With explicit LOD (for cube map arrays only)
+
+  this is the same as TXL, but uses another reg to encode the
+  explicit lod value.
+  Presumably shadow 3d / 2d array / cube targets could use
+  this encoding too, but this is not legal.
+
+  shadow cube map arrays are neither possible nor required.
+
+.. math::
+
+  coord = src0
+
+  lod = src1.x
+
+  unit = src2
+
   dst = texture\_sample(unit, coord, lod)
 
 
@@ -954,9 +1031,9 @@ XXX doesn't look like most of the opcodes really belong 
here.
   As per NV_gpu_shader4, extract a single texel from a sp

[Mesa-dev] [PATCH] util/u_format_s3tc: handle srgb formats correctly.

2013-07-16 Thread sroland
From: Roland Scheidegger 

Instead of just ignoring the srgb/linear conversions, simply call the
corresponding conversion functions, for all of pack/unpack/fetch, both for
the float and unorm8 versions (though some combinations don't make a whole
lot of sense, i.e. the unorm8/unorm8 srgb/linear ones).
Refactored some functions a bit so we don't have to duplicate all the code.
(There's a slight change for packing dxt1_rgb: there will now always be 4
components initialized and sent to the external compression function, so the
same code can be used for all formats; the by-now quite horrid and ad-hoc
interface should always have worked with that.)

Fixes llvmpipe/softpipe piglit texwrap GL_EXT_texture_sRGB-s3tc.
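
For reference, the scalar conversions behind these helpers follow the
standard sRGB transfer function (a sketch only; u_format_srgb.h is the
actual implementation and may use table lookups instead):

   #include <math.h>

   static float
   srgb_to_linear(float s)
   {
      /* standard sRGB decode */
      return (s <= 0.04045f) ? s / 12.92f
                             : powf((s + 0.055f) / 1.055f, 2.4f);
   }

   static float
   linear_to_srgb(float l)
   {
      /* standard sRGB encode */
      return (l <= 0.0031308f) ? l * 12.92f
                               : 1.055f * powf(l, 1.0f / 2.4f) - 0.055f;
   }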
---
 src/gallium/auxiliary/util/u_format_s3tc.c |  434 
 src/gallium/auxiliary/util/u_format_srgb.h |5 +
 2 files changed, 254 insertions(+), 185 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c 
b/src/gallium/auxiliary/util/u_format_s3tc.c
index 35e3d34..788017d 100644
--- a/src/gallium/auxiliary/util/u_format_s3tc.c
+++ b/src/gallium/auxiliary/util/u_format_s3tc.c
@@ -27,6 +27,7 @@
 #include "u_math.h"
 #include "u_format.h"
 #include "u_format_s3tc.h"
+#include "u_format_srgb.h"
 
 
 #if defined(_WIN32) || defined(WIN32)
@@ -247,7 +248,7 @@ util_format_dxtn_rgb_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
 const uint8_t *src_row, unsigned 
src_stride,
 unsigned width, unsigned height,
 util_format_dxtn_fetch_t fetch,
-unsigned block_size)
+unsigned block_size, boolean srgb)
 {
const unsigned bw = 4, bh = 4, comps = 4;
unsigned x, y, i, j;
@@ -258,6 +259,11 @@ util_format_dxtn_rgb_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
 for(i = 0; i < bw; ++i) {
uint8_t *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + 
(x + i)*comps;
fetch(0, src, i, j, dst);
+   if (srgb) {
+  dst[0] = util_format_srgb_to_linear_8unorm(dst[0]);
+  dst[1] = util_format_srgb_to_linear_8unorm(dst[1]);
+  dst[2] = util_format_srgb_to_linear_8unorm(dst[2]);
+   }
 }
  }
  src += block_size;
@@ -274,7 +280,8 @@ util_format_dxt1_rgb_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
src_row, src_stride,
width, height,
-   util_format_dxt1_rgb_fetch, 8);
+   util_format_dxt1_rgb_fetch,
+   8, false);
 }
 
 void
@@ -285,7 +292,8 @@ util_format_dxt1_rgba_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
src_row, src_stride,
width, height,
-   util_format_dxt1_rgba_fetch, 8);
+   util_format_dxt1_rgba_fetch,
+   8, false);
 }
 
 void
@@ -296,7 +304,8 @@ util_format_dxt3_rgba_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
src_row, src_stride,
width, height,
-   util_format_dxt3_rgba_fetch, 16);
+   util_format_dxt3_rgba_fetch,
+   16, false);
 }
 
 void
@@ -307,7 +316,8 @@ util_format_dxt5_rgba_unpack_rgba_8unorm(uint8_t *dst_row, 
unsigned dst_stride,
util_format_dxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
src_row, src_stride,
width, height,
-   util_format_dxt5_rgba_fetch, 16);
+   util_format_dxt5_rgba_fetch,
+   16, false);
 }
 
 static INLINE void
@@ -315,7 +325,7 @@ util_format_dxtn_rgb_unpack_rgba_float(float *dst_row, 
unsigned dst_stride,
const uint8_t *src_row, unsigned 
src_stride,
unsigned width, unsigned height,
util_format_dxtn_fetch_t fetch,
-   unsigned block_size)
+   unsigned block_size, boolean srgb)
 {
unsigned x, y, i, j;
for(y = 0; y < height; y += 4) {
@@ -326,9 +33
