Recent testing uncovered that the PTX JIT may reject 'mul.u32' when it is used
as a non-widening 32-bit multiply instruction: the PTX ISA defines integer
'mul' with a '.hi', '.lo' or '.wide' mode. Use 'mul.lo.u32' to fix 32-bit
code generation and to conform to the PTX specification.
* config/nvptx/nvptx.c (nvptx_init_unisimt_predicate): Emit
'mul.lo.u32' instead of 'mul.u32' for 32-bit ABI target.
(nvptx_declare_function_name): Ditto.
---
gcc/ChangeLog.gomp-nvptx | 6 ++++++
gcc/config/nvptx/nvptx.c | 4 ++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 93bf781..bc187ea 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -955,7 +955,7 @@ nvptx_init_unisimt_predicate (FILE *file)
fprintf (file, "\t\t.reg.u%d %%ustmp2;\n", bits);
fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
- bits == 64 ? ".wide" : "");
+ bits == 64 ? ".wide" : ".lo");
fprintf (file, "\t\tmov.u%d %%ustmp2, __nvptx_uni;\n", bits);
fprintf (file, "\t\tadd.u%d %%ustmp2, %%ustmp2, %%ustmp1;\n", bits);
fprintf (file, "\t\tld.shared.u32 %%r%d, [%%ustmp2];\n", master);
@@ -1115,7 +1115,7 @@ nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
fprintf (file, "\t.reg.u%d %%fstmp2;\n", bits);
fprintf (file, "\tmov.u32 %%fstmp0, %%tid.y;\n");
fprintf (file, "\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
- bits == 64 ? ".wide" : "", bits / 8);
+ bits == 64 ? ".wide" : ".lo", bits / 8);
fprintf (file, "\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
/* fstmp2 = &__nvptx_stacks[tid.y]; */
fprintf (file, "\tadd.u%d %%fstmp2, %%fstmp2, %%fstmp1;\n", bits);