Re: [PATCH v3] i386: Check AX input in any_mul_highpart peepholes
On Sat, Dec 25, 2021 at 3:28 PM H.J. Lu wrote: > > When applying peephole optimization to transform > > mov imm, %reg0 > mov %reg1, %AX_REG > imul %reg0 > > to > > mov imm, %AX_REG > imul %reg1 > > disable peephole optimization if reg1 == AX_REG. > > gcc/ > > PR bootstrap/103785 > * config/i386/i386.md: Swap operand order in comments and check > AX input in any_mul_highpart peepholes. > > gcc/testsuite/ > > PR bootstrap/103785 > * gcc.target/i386/pr103785.c: New test. > --- > gcc/config/i386/i386.md | 9 -- > gcc/testsuite/gcc.target/i386/pr103785.c | 38 > 2 files changed, 44 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103785.c > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 284b9507466..1c1ec227f71 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -8578,7 +8578,7 @@ > (set_attr "mode" "SI")]) > > ;; Highpart multiplication peephole2s to tweak register allocation. > -;; mov %rdx,imm; mov %rax,%rdi; imulq %rdx -> mov %rax,imm; imulq %rdi > +;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx -> mov imm,%rax; imulq %rdi > (define_peephole2 >[(set (match_operand:SWI48 0 "general_reg_operand") > (match_operand:SWI48 1 "immediate_operand")) > @@ -8588,7 +8588,8 @@ >(any_mul_highpart:SWI48 (match_dup 2) (match_dup 0))) > (clobber (match_dup 2)) > (clobber (reg:CC FLAGS_REG))])] > - "REGNO (operands[0]) != REGNO (operands[2]) > + "REGNO (operands[3]) != AX_REG > + && REGNO (operands[0]) != REGNO (operands[2]) > && REGNO (operands[0]) != REGNO (operands[3]) > && (REGNO (operands[0]) == REGNO (operands[4]) > || peep2_reg_dead_p (3, operands[0]))" > @@ -8608,7 +8609,9 @@ > (any_mul_highpart:SI (match_dup 2) (match_dup 0 > (clobber (match_dup 2)) > (clobber (reg:CC FLAGS_REG))])] > - "REGNO (operands[0]) != REGNO (operands[2]) > + "REGNO (operands[3]) != AX_REG Please also add TARGET_64BIT here, this DImode zero-extend pattern applies to 64bit targets only. OK with the above change. Thanks, Uros. > + && REGNO (operands[0]) != REGNO (operands[2]) > + && REGNO (operands[2]) != REGNO (operands[3]) > && REGNO (operands[0]) != REGNO (operands[3]) > && (REGNO (operands[0]) == REGNO (operands[4]) > || peep2_reg_dead_p (3, operands[0]))" > diff --git a/gcc/testsuite/gcc.target/i386/pr103785.c > b/gcc/testsuite/gcc.target/i386/pr103785.c > new file mode 100644 > index 000..5503b965256 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103785.c > @@ -0,0 +1,38 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +#include > + > +struct wrapper_t > +{ > + long k; > + long e; > +}; > + > +struct wrapper_t **table; > + > +__attribute__ ((weak, regparm (2))) > +void > +update (long k, long e) > +{ > + struct wrapper_t *elmt; > + > + elmt = table[k % 3079]; > + if (elmt == 0) > +return; > + elmt->e = e; > +} > + > +int > +main () > +{ > + table = (struct wrapper_t **) malloc (20 * sizeof (struct wrapper_t *)); > + for (int i = 0; i < 20; i++) > +table[i] = (struct wrapper_t *) calloc (sizeof (struct wrapper_t), 1); > + if (table[10]->e != 0) > +abort (); > + update (10, 20); > + if (table[10]->e != 20) > +abort (); > + return 0; > +} > -- > 2.33.1 >
Re: [PATCH v3] i386: Check AX input in any_mul_highpart peepholes
On Sun, Dec 26, 2021 at 2:31 AM Uros Bizjak wrote: > > On Sat, Dec 25, 2021 at 3:28 PM H.J. Lu wrote: > > > > When applying peephole optimization to transform > > > > mov imm, %reg0 > > mov %reg1, %AX_REG > > imul %reg0 > > > > to > > > > mov imm, %AX_REG > > imul %reg1 > > > > disable peephole optimization if reg1 == AX_REG. > > > > gcc/ > > > > PR bootstrap/103785 > > * config/i386/i386.md: Swap operand order in comments and check > > AX input in any_mul_highpart peepholes. > > > > gcc/testsuite/ > > > > PR bootstrap/103785 > > * gcc.target/i386/pr103785.c: New test. > > --- > > gcc/config/i386/i386.md | 9 -- > > gcc/testsuite/gcc.target/i386/pr103785.c | 38 > > 2 files changed, 44 insertions(+), 3 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103785.c > > > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > > index 284b9507466..1c1ec227f71 100644 > > --- a/gcc/config/i386/i386.md > > +++ b/gcc/config/i386/i386.md > > @@ -8578,7 +8578,7 @@ > > (set_attr "mode" "SI")]) > > > > ;; Highpart multiplication peephole2s to tweak register allocation. > > -;; mov %rdx,imm; mov %rax,%rdi; imulq %rdx -> mov %rax,imm; imulq %rdi > > +;; mov imm,%rdx; mov %rdi,%rax; imulq %rdx -> mov imm,%rax; imulq %rdi > > (define_peephole2 > >[(set (match_operand:SWI48 0 "general_reg_operand") > > (match_operand:SWI48 1 "immediate_operand")) > > @@ -8588,7 +8588,8 @@ > >(any_mul_highpart:SWI48 (match_dup 2) (match_dup 0))) > > (clobber (match_dup 2)) > > (clobber (reg:CC FLAGS_REG))])] > > - "REGNO (operands[0]) != REGNO (operands[2]) > > + "REGNO (operands[3]) != AX_REG > > + && REGNO (operands[0]) != REGNO (operands[2]) > > && REGNO (operands[0]) != REGNO (operands[3]) > > && (REGNO (operands[0]) == REGNO (operands[4]) > > || peep2_reg_dead_p (3, operands[0]))" > > @@ -8608,7 +8609,9 @@ > > (any_mul_highpart:SI (match_dup 2) (match_dup 0 > > (clobber (match_dup 2)) > > (clobber (reg:CC FLAGS_REG))])] > > - "REGNO (operands[0]) != REGNO (operands[2]) > > + "REGNO (operands[3]) != AX_REG > > Please also add TARGET_64BIT here, this DImode zero-extend pattern > applies to 64bit targets only. > > OK with the above change. This is the patch I am checking in. Thanks. > Thanks, > Uros. > > > + && REGNO (operands[0]) != REGNO (operands[2]) > > + && REGNO (operands[2]) != REGNO (operands[3]) > > && REGNO (operands[0]) != REGNO (operands[3]) > > && (REGNO (operands[0]) == REGNO (operands[4]) > > || peep2_reg_dead_p (3, operands[0]))" > > diff --git a/gcc/testsuite/gcc.target/i386/pr103785.c > > b/gcc/testsuite/gcc.target/i386/pr103785.c > > new file mode 100644 > > index 000..5503b965256 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103785.c > > @@ -0,0 +1,38 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O2" } */ > > + > > +#include > > + > > +struct wrapper_t > > +{ > > + long k; > > + long e; > > +}; > > + > > +struct wrapper_t **table; > > + > > +__attribute__ ((weak, regparm (2))) > > +void > > +update (long k, long e) > > +{ > > + struct wrapper_t *elmt; > > + > > + elmt = table[k % 3079]; > > + if (elmt == 0) > > +return; > > + elmt->e = e; > > +} > > + > > +int > > +main () > > +{ > > + table = (struct wrapper_t **) malloc (20 * sizeof (struct wrapper_t *)); > > + for (int i = 0; i < 20; i++) > > +table[i] = (struct wrapper_t *) calloc (sizeof (struct wrapper_t), 1); > > + if (table[10]->e != 0) > > +abort (); > > + update (10, 20); > > + if (table[10]->e != 20) > > +abort (); > > + return 0; > > +} > > -- > > 2.33.1 > > -- H.J. From 53300b05644c9cc0ed4067d1128118e98a3584d8 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Fri, 24 Dec 2021 12:50:21 -0800 Subject: [PATCH] i386: Check AX input in any_mul_highpart peepholes When applying peephole optimization to transform mov imm, %reg0 mov %reg1, %AX_REG imul %reg0 to mov imm, %AX_REG imul %reg1 disable peephole optimization if reg1 == AX_REG. gcc/ PR target/103785 * config/i386/i386.md: Swap operand order in comments and check AX input in any_mul_highpart peepholes. gcc/testsuite/ PR target/103785 * gcc.target/i386/pr103785.c: New test. --- gcc/config/i386/i386.md | 10 +-- gcc/testsuite/gcc.target/i386/pr103785.c | 38 2 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr103785.c diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 284b9507466..eea5d5d3f01 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8578,7 +8578,7 @@ (set_attr "mode" "SI")]) ;; Highpart multiplication peephole2s to tweak register allocation. -;; mov %rdx,imm; mov
Re: [PATCH] Make integer output faster in libgfortran
Hi, > I tested this on x86_64-pc-linux-gnu with > make -k -j8 check-fortran RUNTESTFLAGS="--target_board=unix'{-m32,-m64}'" > and didn't see any problems. Thanks Thomas! Pushed. > (We could also do something like that for a 32-bit system, but > that is another kettle of fish). We probably wouldn’t get a speed-up that big. Even on 32-bit targets (at least common ones), the 64-bit type and its operations (notably division) are implemented via CPU instructions, not library calls. At this point, the output of integers is probably bound by the many layers of indirection of libgfortran's I/O system (which are necessary because of the rich I/O formatting allowed by the standard). Best, FX
Re: [PATCH] Make integer output faster in libgfortran
Hi FX, (We could also do something like that for a 32-bit system, but that is another kettle of fish). We probably wouldn’t get a speed-up that big. Even on 32-bit targets (at least common ones), the 64-bit type and its operations (notably division) are implemented via CPU instructions, not library calls. I'll look at this a bit more closely and report :-) At this point, the output of integers is probably bound by the many layers of indirection of libgfortran's I/O system (which are necessary because of the rich I/O formatting allowed by the standard). There are a few things we could do. Getting a LTO-capable version of libgfortran would be a huge step, because the compiler could then strip out all of these layers. The speed of character:: c write (*,'(A)', advance="no") c could stand some improvement :-) Regards Thomas
Re: [PATCH v2] ix86: Don't use the 'm' constraint for x86_64_general_operand
On Fri, Dec 24, 2021 at 12:19 AM Uros Bizjak wrote: > > On Thu, Dec 23, 2021 at 3:42 PM H.J. Lu wrote: > > > > On Mon, Dec 20, 2021 at 2:22 PM H.J. Lu wrote: > > > > > > On Mon, Dec 20, 2021 at 12:38 PM Jakub Jelinek wrote: > > > > > > > > On Mon, Dec 20, 2021 at 11:44:08AM -0800, H.J. Lu wrote: > > > > > The problem is in > > > > > > > > > > (define_memory_constraint "TARGET_MEM_CONSTRAINT" > > > > > "Matches any valid memory." > > > > > (and (match_code "mem") > > > > >(match_test "memory_address_addr_space_p (GET_MODE (op), XEXP > > > > > (op, 0), > > > > > MEM_ADDR_SPACE > > > > > (op))"))) > > > > > > > > > > define_register_constraint allows LRA to convert the operand to the > > > > > form > > > > > '(mem (reg X))', where X is a base register. I am testing the v2 > > > > > patch with > > > > > > > > If you mean replacing an immediate with a MEM containing that immediate, > > > > isn't that often the right thing though? > > > > I mean, if the register pressure is high and options are either spill > > > > some > > > > register, set it to immediate, use it in one instruction and then fill > > > > the > > > > spilled register (i.e. 2 memory loads), compared to a MEM use on the > > > > arithmetic instruction one MEM seems cheaper to me. With -fPIC and the > > > > cst needing runtime relocation slightly less so of course. > > > > > > > > > > We will check the performance impact on SPEC CPU 2017. > > > Here is the v2 patch. Liwei, can you help collect SPEC CPU 2017 > > > impact of the enclosed patch? Thanks. > > > > We checked SPEC CPU 2017 performance with -O2 and -Ofast. > > There is no performance regression. OK for master? > > OK if there are no further comments from Jakub. I will check it tomorrow if there are further comments from Jakub. Thanks. > Thanks, > Uros. > > > > > The code due to ivopts is trying to have something like > > > > size_t a = (size_t) &tunable_list; > > > > size_t b = 0xffa8 - a; > > > > size_t c = x + b; > > > > and for that cst - &symbol one needs actually 2 registers, one to hold > > > > the > > > > constant and one to hold the (%rip) based address. > > > > (insn 790 789 791 111 (set (reg:DI 292) > > > > (const_int -88 [0xffa8])) "dl-tunables.c":304:15 76 > > > > {*movdi_internal} > > > > (nil)) > > > > (insn 791 790 792 111 (set (reg:DI 293) > > > > (symbol_ref:DI ("tunable_list") [flags 0x2] > > > 0x7f3460aa9cf0 tunable_list>)) "dl-tunables.c":304:15 76 > > > > {*movdi_internal} > > > > (nil)) > > > > (insn 792 791 793 111 (parallel [ > > > > (set (reg:DI 291) > > > > (minus:DI (reg:DI 292) > > > > (reg:DI 293))) > > > > (clobber (reg:CC 17 flags)) > > > > ]) "dl-tunables.c":304:15 299 {*subdi_1} > > > > (nil)) > > > > (insn 793 792 794 111 (parallel [ > > > > (set (reg:DI 294) > > > > (plus:DI (reg:DI 291) > > > > (reg:DI 198 [ ivtmp.176 ]))) > > > > (clobber (reg:CC 17 flags)) > > > > ]) "dl-tunables.c":304:15 226 {*adddi_1} > > > > (nil)) > > > > It would be smarter to rewrite the above into a lea > > > > 88+tunable_list(%rip), %temp1 > > > > and use a subtraction instead of addition in the last insn above, or of > > > > course in the particular case even consider the following 2 instructions > > > > that do: > > > > (insn 794 793 795 111 (set (reg:DI 296) > > > > (symbol_ref:DI ("tunable_list") [flags 0x2] > > > 0x7f3460aa9cf0 tunable_list>)) "dl-tunables.c":304:15 76 > > > > {*movdi_internal} > > > > (nil)) > > > > (insn 795 794 796 111 (parallel [ > > > > (set (reg:DI 295 [ cur ]) > > > > (plus:DI (reg:DI 294) > > > > (reg:DI 296))) > > > > (clobber (reg:CC 17 flags)) > > > > ]) "dl-tunables.c":304:15 226 {*adddi_1} > > > > (nil)) > > > > and find out that &tuneble_list - &tuneble_list is 0 and we don't need > > > > it at > > > > all. Guess we don't figure that out due to the cast of one of those > > > > addresses to size_t and the other one used in POINTER_PLUS_EXPR as > > > > normal > > > > pointer. > > > > > > > > Jakub > > > > > > > > > > > > > -- > > > H.J. > > > > Thanks. > > > > -- > > H.J. -- H.J.
[PATCH] Emit correct types for CHARACTER(C_CHAR), VALUE
Hi, I’ve spent many hours going down a rabbit hole, namely that we emit incorrect types for C interoperable procedure parameters of type CHARACTER(C_CHAR), VALUE. The full details can be found here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103828 I will try here to be as brief as I can. This bug manifests on the new target aarche64-apple-darwin, and there only if a CHARACTER(C_CHAR), VALUE argument is the 11th (or greater) argument to the function. Then, we would pass it as a 32-bit value, instead of an 8-bit value (as should a char). The front-end emits the wrong type information about the argument, but it turns out for all other targets this simply does not matter, due to their ABI. Interoperable CHARACTER(C_CHAR), VALUE arguments are weird, because deep down they’re not actually character types (in the Fortran sense of the word), i.e. arrays, but scalar C char (i.e., integers!). So they’ve always been kind-of second class citizens in the front-end. A function was introduced in 2007 that basically patches their type on the fly: gfc_conv_scalar_char_value(). The problem is that function is simply wrong. It says (from comments): /* Modify the tree type for scalar character dummy arguments of bind(c) procedures if they are passed by value. The tree type for them will be promoted to INTEGER_TYPE for the middle end, which appears to be what C would do with characters passed by-value. The value attribute implies the dummy is a scalar. */ The error is a misunderstanding of the C standard. char (and integer types smaller than int) are promoted to int for argument passing in unprototyped (old-school K&R) functions. In prototyped functions, they are most definitely not promoted. Therefore, the function is wrong. More importantly, we can actually emit the right type much earlier in the front-end, and we already have a place to do so, in gfc_sym_type(). We already handle the result variables, but by-value arguments should have the same type, so the fix is easy. In fact, once we emit the right type in gfc_sym_type(), we actually never need to fix it up in gfc_conv_scalar_char_value(). Removing the hack, I decided to leave it as a series of assertions, because we have seen in the past that this case is difficult to catch, and can easily regress due to changes elsewhere in the front-end. Then, once the hack is removed, it becomes clear that the two callers of gfc_conv_scalar_char_value() expected different things from it: it’s called from generate_local_decl() to hack the sym->backend_decl; and it’s called from gfc_conv_procedure_call() for its true purpose, as a real conv_* function. For the first case, we can inline the checks (which have replaced the hack) into the caller in generate_local_decl(). Once only the second use case remains, we can make the function local to that file (make static, remove gfc_ prefix). The patch comes with a couple of extra testcases, exercising the passing of char by value, and as integer, and their interoperability. It was regtested on x86_64-pc-gnu-linux, on aarch64-apple-darwin (because its ABI is picky). OK to commit? FX 0001-Fortran-Emit-correct-types-for-CHARACTER-C_CHAR-VALU.patch Description: Binary data
[PATCH] Prune LTO warnings that some tests fail
The testcase g++.dg/torture/pr31863.C fails on *-apple-darwin with: FAIL: g++.dg/torture/pr31863.C -O2 -flto (test for excess errors) Excess errors: lto-wrapper: warning: using serial compilation of 2 LTRANS jobs The test otherwise works fine, but those lto warnings should be pruned. OK to commit? FX 0001-LTO-Prune-some-warnings-in-the-testsuite.patch Description: Binary data
[PATCH] Set __FLT_EVAL_METHOD__/__FLT_EVAL_METHOD_TS_18661_3__ to FLT_EVAL_METHOD_PROMOTE_TO_FLOAT when FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16.
Since FLT_EVAL_METHOD only accepts negative value, 0, 1 or 2. gcc/c-family/ChangeLog: PR c/100854 * c-common.c (excess_precision_mode_join): Return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT when both x and y are FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16. --- gcc/c-family/c-common.c | 4 1 file changed, 4 insertions(+) diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c index a25d59fa77b..4dab4d60773 100644 --- a/gcc/c-family/c-common.c +++ b/gcc/c-family/c-common.c @@ -8842,6 +8842,10 @@ excess_precision_mode_join (enum flt_eval_method x, || y == FLT_EVAL_METHOD_UNPREDICTABLE) return FLT_EVAL_METHOD_UNPREDICTABLE; + /* FLT_EVAL_METHOD only accepts negative values, 0, 1 or 2, but + FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 is 16. */ +if (x == y && x == FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16) + return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT; /* GCC only supports one interchange type right now, _Float16. If we're evaluating _Float16 in 16-bit precision, then flt_eval_method will be FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16. */ -- 2.18.1
Re: [PATCH] fixed testcase riscv/pr103302.c
Hi ShiHua: Thanks for the fix, committed with Andrew's comment and minor fix for commit comment. On Thu, Dec 23, 2021 at 3:41 PM Andrew Pinski via Gcc-patches wrote: > > On Wed, Dec 22, 2021 at 11:37 PM wrote: > > > > From: LiaoShihua > > > > because riscv32 not support __int128, so skip if -march=rv32*. > > > > gcc/testsuite\ChangeLog: > > * gcc.target/riscv/pr103302.c: skip if -march=rv32* > > --- > > gcc/testsuite/gcc.target/riscv/pr103302.c | 1 + > > 1 file changed, 1 insertion(+) > > > > diff --git a/gcc/testsuite/gcc.target/riscv/pr103302.c > > b/gcc/testsuite/gcc.target/riscv/pr103302.c > > index 822c4087416..2cfb12498a2 100644 > > --- a/gcc/testsuite/gcc.target/riscv/pr103302.c > > +++ b/gcc/testsuite/gcc.target/riscv/pr103302.c > > @@ -1,4 +1,5 @@ > > /* { dg-do run } */ > > +/* { dg-skip-if "rv32 not support _int128" { *-*-* } { "-march=rv32*" } } > > */ > > Better fix: > /* { dg-do run { target int128 } } */ > > Thanks, > Andrew Pinski > > > /* { dg-options "-Og -fharden-compares -fno-tree-dce -fno-tree-fre " } */ > > > > typedef unsigned char u8; > > -- > > 2.31.1.windows.1 > >