Re: [PATCH v3] i386: Allow -mlarge-data-threshold with -mcmodel=large
On 13.06.2023 05:28, Fangrui Song wrote: > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/large-data.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-options "-O2 -mcmodel=large -mlarge-data-threshold=4" } */ > +/* { dg-final { scan-assembler ".lbss" } } */ > +/* { dg-final { scan-assembler ".bss" } } */ > +/* { dg-final { scan-assembler ".ldata" } } */ > +/* { dg-final { scan-assembler ".data" } } */ > +/* { dg-final { scan-assembler ".lrodata" } } */ > +/* { dg-final { scan-assembler ".rodata" } } */ Aren't these regex-es, and hence the dots all need escaping or enclosing in square brackets? Jan
[PATCH] x86/AVX512: use VMOVDDUP for broadcast to V2DF
Like is already the case for the AVX/AVX2 form, VMOVDDUP - acting on double precision floating values - is more appropriate to use here, and it can also result in shorter insn encodings when source is memory or %xmm0...%xmm7, and no masking is applied (in allowing a 2-byte VEX prefix then instead of a 3-byte one). gcc/ * config/i386/sse.md (_vec_dup): Use vmovddup. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -25724,9 +25724,9 @@ "TARGET_AVX512F" { /* There is no DF broadcast (in AVX-512*) to 128b register. - Mimic it with integer variant. */ + Mimic it with vmovddup, just like vec_dupv2df does. */ if (mode == V2DFmode) -return "vpbroadcastq\t{%1, %0|%0, %q1}"; +return "vmovddup\t{%1, %0|%0, %q1}"; return "vbroadcast\t{%1, %0|%0, %1}"; }
[PATCH] x86: add Bk and Br to comment list B's sub-chars
gcc/ * config/i386/constraints.md: Mention k and r for B. --- a/gcc/config/i386/constraints.md +++ b/gcc/config/i386/constraints.md @@ -162,7 +162,9 @@ ;; g GOT memory operand. ;; m Vector memory operand ;; c Constant memory operand +;; k TLS address that allows insn using non-integer registers ;; n Memory operand without REX prefix +;; r Broadcast memory operand ;; s Sibcall memory operand, not valid for TARGET_X32 ;; w Call memory operand, not valid for TARGET_X32 ;; z Constant call address operand.
[PATCH] x86: make better use of VBROADCASTSS / VPBROADCASTD
... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are never longer (yet sometimes shorter) than the corresponding VSHUFPS / VPSHUFD, due to the immediate operand of the shuffle insns balancing the need for VEX3 in the broadcast ones. When EVEX encoding is required the broadcast insns are always shorter. Add two new alternatives each, one covering the AVX2 case and one covering AVX512. gcc/ * config/i386/sse.md (vec_dupv4sf): New AVX2 and AVX512F alternatives using vbroadcastss. (*vec_dupv4si): New AVX2 and AVX512F alternatives using vpbroadcastd. --- I'm working from the assumption that the isa attributes to the original 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 or avx_noavx2 as applicable), as the new earlier alternatives cover all operand forms already when at least AVX2 is enabled. Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss use? (Same further down in *vec_dupv4si and avx2_vbroadcasti128_ and elsewhere.) Is use of Yv for the source operand really necessary in *vec_dupv4si? I.e. would scalar integer values be put in XMM{16...31} when AVX512VL isn't enabled? If so (*movsi_internal / *movdi_internal suggest they might), wouldn't *vec_dupv2di need to use Yv as well in its 3rd alternative (or just m, as Yv is already covered by the 2nd one)? --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -25798,38 +25798,42 @@ (const_int 1)))]) (define_insn "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SF 0 "register_operand" "=Yv,v,v,v,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SF 1 "nonimmediate_operand" "v,vm,Yv,m,0")))] "TARGET_SSE" "@ + vbroadcastss\t{%1, %0|%0, %1} + vbroadcastss\t{%1, %g0|%g0, %1} vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "avx,avx,noavx") - (set_attr "type" "sseshuf1,ssemov,sseshuf1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_evex,maybe_evex,orig") - (set_attr "mode" "V4SF")]) + [(set_attr "isa" "avx2,avx512f,avx,avx,noavx") + (set_attr "type" "ssemov,ssemov,sseshuf1,ssemov,sseshuf1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "*,*,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_evex,maybe_evex,orig") + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF,V4SF")]) (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SI 0 "register_operand" "=Yv,v,v,v,x") (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SI 1 "nonimmediate_operand" "vm,vm,Yv,m,0")))] "TARGET_SSE" "@ + vpbroadcastd\t{%1, %0|%0, %1} + vpbroadcastd\t{%1, %g0|%g0, %1} %vpshufd\t{$0, %1, %0|%0, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,avx,noavx") - (set_attr "type" "sselog1,ssemov,sselog1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_vex,maybe_evex,orig") - (set_attr "mode" "TI,V4SF,V4SF") + [(set_attr "isa" "avx2,avx512f,sse2,avx,noavx") + (set_attr "type" "ssemov,ssemov,sselog1,ssemov,sselog1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "*,*,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") + (cond [(eq_attr "alternative" "3") (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") ] (symbol_ref "true")))])
[PATCH] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
There's no reason to constrain this to AVX512VL, as the wider operation is not usable for more narrow operands only when the possible memory source is a non-broadcast one. This way even the scalar copysign3 can benefit from the operation being a single-insn one (leaving aside moves which the compiler decides to insert for unclear reasons, and leaving aside the fact that bcst_mem_operand() is too restrictive for broadcast to be embedded right into VPTERNLOG*). Along with this also request value duplication in ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating excess space allocation in .rodata.*, filled with zeros which are never read. gcc/ * config/i386/i386-expand.cc (ix86_expand_copysign): Request value duplication by ix86_build_signbit_mask() when AVX512F and not HFmode. * config/i386/sse.md (*_vternlog_all): Convert to 2-alternative form. Adjust "mode" attribute. Add "enabled" attribute. (*_vpternlog_1): Relax to just TARGET_AVX512F. (*_vpternlog_2): Likewise. (*_vpternlog_3): Likewise. --- I guess the underlying pattern, going along the lines of what one_cmpl2 uses, can be applied elsewhere as well. HFmode could use embedded broadcast too for copysign and alike, but that would need to be V2HF -> V8HF (for which I don't think there are any existing patterns). --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[]) else dest = NULL_RTX; op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); - mask = ix86_build_signbit_mask (vmode, 0, 0); + mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0); if (CONST_DOUBLE_P (operands[1])) { --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12399,11 +12399,11 @@ (set_attr "mode" "")]) (define_insn "*_vternlog_all" - [(set (match_operand:V 0 "register_operand" "=v") + [(set (match_operand:V 0 "register_operand" "=v,v") (unspec:V - [(match_operand:V 1 "register_operand" "0") - (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "bcst_vector_operand" "vmBr") + [(match_operand:V 1 "register_operand" "0,0") + (match_operand:V 2 "register_operand" "v,v") + (match_operand:V 3 "bcst_vector_operand" "vBr,m") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F @@ -12411,10 +12411,22 @@ it's not real AVX512FP16 instruction. */ && (GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 || GET_CODE (operands[3]) != VEC_DUPLICATE)" - "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}" +{ + if (TARGET_AVX512VL) +return "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + else +return "vpternlog\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "evex") - (set_attr "mode" "")]) + (set (attr "mode") +(if_then_else (match_test "TARGET_AVX512VL") + (const_string "") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref " == 64 || TARGET_AVX512VL") + (const_string "*")))]) ;; There must be lots of other combinations like ;; @@ -12443,7 +12455,7 @@ (any_logic2:V (match_operand:V 3 "regmem_or_bitnot_regmem_operand") (match_operand:V 4 "regmem_or_bitnot_regmem_operand"] - "( == 64 || TARGET_AVX512VL) + "TARGET_AVX512F && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12527,7 +12539,7 @@ (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")) (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))] - "( == 64 || TARGET_AVX512VL) + "TARGET_AVX512F && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12610,7 +12622,7 @@ (match_operand:V 1 "regmem_or_bitnot_regmem_operand") (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")))] - "( == 64 || TARGET_AVX512VL) + "TARGET_AVX512F && ix86_pre_reload_split ()" "#" "&& 1"
Re: [PATCH] x86: make better use of VBROADCASTSS / VPBROADCASTD
On 14.06.2023 09:41, Hongtao Liu wrote: > On Wed, Jun 14, 2023 at 1:58 PM Jan Beulich via Gcc-patches > wrote: >> >> ... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are >> never longer (yet sometimes shorter) than the corresponding VSHUFPS / >> VPSHUFD, due to the immediate operand of the shuffle insns balancing the >> need for VEX3 in the broadcast ones. When EVEX encoding is required the >> broadcast insns are always shorter. >> >> Add two new alternatives each, one covering the AVX2 case and one >> covering AVX512. > I think you can just change assemble output for this first alternative > when TARGET_AVX2, use vbroadcastss, else use vshufps since > vbroadcastss only accept register operand when TARGET_AVX2. And no > need to support 2 extra alternatives which doesn't make sense just > make RA more confused about the same meaning of different > alternatives. You mean by switching from "@ ..." to C code using "switch (which_alternative)"? I can do that, sure. Yet that'll make for a more complicated "length_immediate" attribute then. Would be nice if you could confirm that this is what you want, as I may well have misunderstood you. But that'll be for vec_dupv4sf only, as vec_dupv4si is subtly different. >> --- >> I'm working from the assumption that the isa attributes to the original >> 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 >> or avx_noavx2 as applicable), as the new earlier alternatives cover all >> operand forms already when at least AVX2 is enabled. >> >> Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss >> use? (Same further down in *vec_dupv4si and avx2_vbroadcasti128_ >> and elsewhere.) > Not sure about this part. I grep prefix_extra, seems only used by > znver.md/znver4.md for schedule, and only for comi instructions(?the > reservation name seems so). define_attr "length_vex" and define_attr "length" use it, too. Otherwise I would have asked whether the attribute couldn't be purged from most insns. My present understanding is that the attribute is wrong on vec_dupv4sf (and hence wants dropping from there altogether), and it should be "prefix_data16" instead on *vec_dupv4si, evaluating to 1 only for the non-AVX pshufd case. I suspect at least the latter would be going to far for doing it "while here" right in this patch. Plus I think I have seen various other questionable uses of that attribute. >> Is use of Yv for the source operand really necessary in *vec_dupv4si? >> I.e. would scalar integer values be put in XMM{16...31} when AVX512VL > Yes, You can look at ix86_hard_regno_mode_ok, EXT_REX_SSE_REGNO is > allowed for scalar mode, but not for 128/256-bit vector modes. > > 20204 if (TARGET_AVX512F > 20205 && (VALID_AVX512F_REG_OR_XI_MODE (mode) > 20206 || VALID_AVX512F_SCALAR_MODE (mode))) > 20207return true; Okay, so I need to switch input constraints for relevant new alternatives to Yv (I actually wonder why I did use v in vec_dupv4sf, as it was clear to me that SFmode can be in the high 16 xmm registers with just AVX512F). >> isn't enabled? If so (*movsi_internal / *movdi_internal suggest they >> might), wouldn't *vec_dupv2di need to use Yv as well in its 3rd >> alternative (or just m, as Yv is already covered by the 2nd one)? > I guess xm is more suitable since we still want to allocate > operands[1] to register when sse3_noavx. > It didn't hit any error since for avx and above, alternative 1(2rd > one) is always matched than alternative 2. I'm afraid I don't follow: With just -mavx512f the source operand can be in, say, %xmm16 (as per your clarification above). This would not match Yv, but it would match vm. And hence wrongly create an AVX512VL form of vmovddup. I didn't try it out earlier, because unlike for SFmode / DFmode I thought it's not really clear how to get the compiler to reliably put a DImode variable in an xmm reg, but it just occurred to me that this can be done the same way there. And voila, typedef long long __attribute__((vector_size(16))) v2di; v2di bcst(long long ll) { register long long x asm("xmm16") = ll; asm("nop %%esp" : "+v" (x)); return (v2di){x, x}; } compiled with just -mavx512f (and -O2) produces an AVX512VL insn. I'll make another patch, yet for that I'm then also not sure why you say xm would be more suitable. Yvm allows for registers (with or without AVX, merely SSE being required) just as much as vm does, doesn't it? And I don't think I've found any combination of destination being v and source being xm anywhere. Plus we want to allow for the higher registers when AVX512VL is enabled. Jan
Re: [PATCH] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
On 14.06.2023 10:10, Hongtao Liu wrote: > On Wed, Jun 14, 2023 at 1:59 PM Jan Beulich via Gcc-patches > wrote: >> >> There's no reason to constrain this to AVX512VL, as the wider operation >> is not usable for more narrow operands only when the possible memory > But this may require more resources (on AMD znver4 processor a zmm > instruction will also be split into 2 uops, right?) And on some intel > processors(SKX/CLX) there will be frequency reduction. I'm afraid I don't follow: Largely the same AVX512 code would be generated when passing -mavx512vl, so how can power/performance considerations matter here? All I'm doing here (and in a few more patches I'm still in the process of testing) is relax when AVX512 insns can actually be used (reducing the copying between registers and/or the number of insns needed). My understanding on the Intel side is that it only matters whether AVX512 insns are used, not what vector length they are. You may be right about znver4, though. Nevertheless I agree ... > If it needs to be done, it is better guarded with > !TARGET_PREFER_AVX256, at least when micro-architecture AVX256_OPTIMAL > or users explicitly uses -mprefer-vector-width=256, we don't want to > produce any zmm instruction for surprise.(Although > -mprefer-vector-width=256 is supposed for auto-vectorizer, but backend > codegen also use it under such cases, i.e. in *movsf_internal > alternative 5 use zmm only TARGET_AVX512F && !TARGET_PREFER_AVX256.) ... that respecting such overrides is probably desirable, so I'll adjust. Jan >> source is a non-broadcast one. This way even the scalar copysign3 >> can benefit from the operation being a single-insn one (leaving aside >> moves which the compiler decides to insert for unclear reasons, and >> leaving aside the fact that bcst_mem_operand() is too restrictive for >> broadcast to be embedded right into VPTERNLOG*). >> >> Along with this also request value duplication in >> ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating >> excess space allocation in .rodata.*, filled with zeros which are never >> read. >> >> gcc/ >> >> * config/i386/i386-expand.cc (ix86_expand_copysign): Request >> value duplication by ix86_build_signbit_mask() when AVX512F and >> not HFmode. >> * config/i386/sse.md (*_vternlog_all): Convert to >> 2-alternative form. Adjust "mode" attribute. Add "enabled" >> attribute. >> (*_vpternlog_1): Relax to just TARGET_AVX512F. >> (*_vpternlog_2): Likewise. >> (*_vpternlog_3): Likewise.
[PATCH] x86: correct and improve "*vec_dupv2di"
The input constraint for the %vmovddup alternative was wrong, as the upper 16 XMM registers require AVX512VL to be used with this insn. To compensate, introduce a new alternative permitting all 32 registers, by broadcasting to the full 512 bits in that case if AVX512VL is not available. gcc/ * config/i386/sse.md (vec_dupv2di): Correct %vmovddup input constraint. Add new AVX512F alternative. --- Strictly speaking the new alternative could be enabled from AVX2 onwards, but vmovddup can frequently be a shorter encoding (VEX2 vs VEX3). --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -25851,19 +25851,39 @@ (symbol_ref "true")))]) (define_insn "*vec_dupv2di" - [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,x") + [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") (vec_duplicate:V2DI - (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,0")))] + (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,Yvm,0")))] "TARGET_SSE" - "@ - punpcklqdq\t%0, %0 - vpunpcklqdq\t{%d1, %0|%0, %d1} - %vmovddup\t{%1, %0|%0, %1} - movlhps\t%0, %0" - [(set_attr "isa" "sse2_noavx,avx,sse3,noavx") - (set_attr "type" "sselog1,sselog1,sselog1,ssemov") - (set_attr "prefix" "orig,maybe_evex,maybe_vex,orig") - (set_attr "mode" "TI,TI,DF,V4SF")]) +{ + switch (which_alternative) +{ +case 0: + return "punpcklqdq\t%0, %0"; +case 1: + return "vpunpcklqdq\t{%d1, %0|%0, %d1}"; +case 2: + if (TARGET_AVX512VL) + return "vpbroadcastq\t{%1, %0|%0, %1}"; + return "vpbroadcastq\t{%1, %g0|%g0, %1}"; +case 3: + return "%vmovddup\t{%1, %0|%0, %1}"; +case 4: + return "movlhps\t%0, %0"; +default: + gcc_unreachable (); +} +} + [(set_attr "isa" "sse2_noavx,avx,avx512f,sse3,noavx") + (set_attr "type" "sselog1,sselog1,ssemov,sselog1,ssemov") + (set_attr "prefix" "orig,maybe_evex,evex,maybe_vex,orig") + (set_attr "mode" "TI,TI,TI,DF,V4SF") + (set (attr "enabled") + (if_then_else + (eq_attr "alternative" "2") + (symbol_ref "TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)") + (const_string "*")))]) (define_insn "avx2_vbroadcasti128_" [(set (match_operand:VI_256 0 "register_operand" "=x,v,v")
Re: [PATCH] x86: make better use of VBROADCASTSS / VPBROADCASTD
On 15.06.2023 07:23, Hongtao Liu wrote: > On Wed, Jun 14, 2023 at 5:03 PM Jan Beulich wrote: >> >> On 14.06.2023 09:41, Hongtao Liu wrote: >>> On Wed, Jun 14, 2023 at 1:58 PM Jan Beulich via Gcc-patches >>> wrote: >>>> >>>> ... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are >>>> never longer (yet sometimes shorter) than the corresponding VSHUFPS / >>>> VPSHUFD, due to the immediate operand of the shuffle insns balancing the >>>> need for VEX3 in the broadcast ones. When EVEX encoding is required the >>>> broadcast insns are always shorter. >>>> >>>> Add two new alternatives each, one covering the AVX2 case and one >>>> covering AVX512. >>> I think you can just change assemble output for this first alternative >>> when TARGET_AVX2, use vbroadcastss, else use vshufps since >>> vbroadcastss only accept register operand when TARGET_AVX2. And no >>> need to support 2 extra alternatives which doesn't make sense just >>> make RA more confused about the same meaning of different >>> alternatives. >> >> You mean by switching from "@ ..." to C code using "switch >> (which_alternative)"? I can do that, sure. Yet that'll make for a >> more complicated "length_immediate" attribute then. Would be nice > Yes, you can also do something like >(set (attr "length_immediate") > (cond [(eq_attr "alternative" "0") >(if_then_else (match_test "TARGET_AVX2) > (const_string "") >(const_string "1")) > ...] Yes, that's along the lines of what I was thinking of. I'm uncertain about one aspect of what you spelled out above, though: What is the meaning of the empty string in (const_string "")? Shouldn't this be "0" or "*"? >> But that'll be for vec_dupv4sf only, as vec_dupv4si is subtly >> different. > Yes, but can we use vpbroadcastd for vec_dupv4si similarly? Well, the use there is similar, but the folding with the shuffle alternative won't be possible, because of the new first alternative also allowing m for the source, when the shuffle one allows for only Yv. The extra m is pointless to have in vec_dupv4sf (because a later alternative with a wider ISA [avx] has it already), while in vec_dupv4si the similar later alternative resolves to vbroadcastss, not vpbroadcastd. I should be able to fold the two vpbroadcastd alternatives, along the lines of what I've done in the vec_dupv2di patch just sent. (As I just realized the m in what are alternatives 1 each in patch v1 is pointless, since already taken care of by other alternatives.) Jan
Re: [PATCH] x86: correct and improve "*vec_dupv2di"
On 15.06.2023 09:45, Hongtao Liu wrote: > On Thu, Jun 15, 2023 at 3:07 PM Uros Bizjak via Gcc-patches > wrote: >> On Thu, Jun 15, 2023 at 8:03 AM Jan Beulich via Gcc-patches >> wrote: >>> +case 3: >>> + return "%vmovddup\t{%1, %0|%0, %1}"; >>> +case 4: >>> + return "movlhps\t%0, %0"; >>> +default: >>> + gcc_unreachable (); >>> +} >>> +} >>> + [(set_attr "isa" "sse2_noavx,avx,avx512f,sse3,noavx") >>> + (set_attr "type" "sselog1,sselog1,ssemov,sselog1,ssemov") >>> + (set_attr "prefix" "orig,maybe_evex,evex,maybe_vex,orig") >>> + (set_attr "mode" "TI,TI,TI,DF,V4SF") > alternative 2 should be XImode when !TARGET_AVX512VL. This gives me a chance to actually raise a related question I stumbled across several times: Which operand does the mode attribute actually describe? I've seen places where it's the source, but I've also seen places where it's the destination. Because of this mix I wasn't really sure that getting this attribute entirely correct is actually necessary, and hence I hoped it would be okay to not further complicate the attribute here. Jan
[PATCH v2] x86: correct and improve "*vec_dupv2di"
The input constraint for the %vmovddup alternative was wrong, as the upper 16 XMM registers require AVX512VL to be used with this insn. To compensate, introduce a new alternative permitting all 32 registers, by broadcasting to the full 512 bits in that case if AVX512VL is not available. gcc/ * config/i386/sse.md (vec_dupv2di): Correct %vmovddup input constraint. Add new AVX512F alternative. --- Strictly speaking the new alternative could be enabled from AVX2 onwards, but vmovddup can frequently be a shorter encoding (VEX2 vs VEX3). It was suggested that the previously flawed %vmovddup alternative could use "xm" as source constraint. But then its destination would better also use "x", I think? --- v2: Use "* return ..." form. Set "mode" to XI for new alternative without AVX512VL. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -26033,19 +26033,35 @@ (symbol_ref "true")))]) (define_insn "*vec_dupv2di" - [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,x") + [(set (match_operand:V2DI 0 "register_operand" "=x,v,v,v,x") (vec_duplicate:V2DI - (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,0")))] + (match_operand:DI 1 "nonimmediate_operand" " 0,Yv,vm,Yvm,0")))] "TARGET_SSE" "@ punpcklqdq\t%0, %0 vpunpcklqdq\t{%d1, %0|%0, %d1} + * return TARGET_AVX512VL ? \"vpbroadcastq\t{%1, %0|%0, %1}\" : \"vpbroadcastq\t{%1, %g0|%g0, %1}\"; %vmovddup\t{%1, %0|%0, %1} movlhps\t%0, %0" - [(set_attr "isa" "sse2_noavx,avx,sse3,noavx") - (set_attr "type" "sselog1,sselog1,sselog1,ssemov") - (set_attr "prefix" "orig,maybe_evex,maybe_vex,orig") - (set_attr "mode" "TI,TI,DF,V4SF")]) + [(set_attr "isa" "sse2_noavx,avx,avx512f,sse3,noavx") + (set_attr "type" "sselog1,sselog1,ssemov,sselog1,ssemov") + (set_attr "prefix" "orig,maybe_evex,evex,maybe_vex,orig") + (set (attr "mode") + (cond [(and (eq_attr "alternative" "2") + (match_test "!TARGET_AVX512VL")) +(const_string "XI") + (eq_attr "alternative" "3") +(const_string "DF") + (eq_attr "alternative" "4") +(const_string "V4SF") + ] + (const_string "TI"))) + (set (attr "enabled") + (if_then_else + (eq_attr "alternative" "2") + (symbol_ref "TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)") + (const_string "*")))]) (define_insn "avx2_vbroadcasti128_" [(set (match_operand:VI_256 0 "register_operand" "=x,v,v")
[PATCH v2] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
There's no reason to constrain this to AVX512VL, unless instructed so by -mprefer-vector-width=, as the wider operation is unusable for more narrow operands only when the possible memory source is a non-broadcast one. This way even the scalar copysign3 can benefit from the operation being a single-insn one (leaving aside moves which the compiler decides to insert for unclear reasons, and leaving aside the fact that bcst_mem_operand() is too restrictive for broadcast to be embedded right into VPTERNLOG*). Along with this also request value duplication in ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating excess space allocation in .rodata.*, filled with zeros which are never read. gcc/ * config/i386/i386-expand.cc (ix86_expand_copysign): Request value duplication by ix86_build_signbit_mask() when AVX512F and not HFmode. * config/i386/sse.md (*_vternlog_all): Convert to 2-alternative form. Adjust "mode" attribute. Add "enabled" attribute. (*_vpternlog_1): Also permit when TARGET_AVX512F && !TARGET_PREFER_AVX256. (*_vpternlog_2): Likewise. (*_vpternlog_3): Likewise. --- I guess the underlying pattern, going along the lines of what one_cmpl2 uses, can be applied elsewhere as well. HFmode could use embedded broadcast too for copysign and alike, but that would need to be V2HF -> V8HF (for which I don't think there are any existing patterns). --- v2: Respect -mprefer-vector-width=. --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[]) else dest = NULL_RTX; op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); - mask = ix86_build_signbit_mask (vmode, 0, 0); + mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0); if (CONST_DOUBLE_P (operands[1])) { --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12597,11 +12597,11 @@ (set_attr "mode" "")]) (define_insn "*_vternlog_all" - [(set (match_operand:V 0 "register_operand" "=v") + [(set (match_operand:V 0 "register_operand" "=v,v") (unspec:V - [(match_operand:V 1 "register_operand" "0") - (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "bcst_vector_operand" "vmBr") + [(match_operand:V 1 "register_operand" "0,0") + (match_operand:V 2 "register_operand" "v,v") + (match_operand:V 3 "bcst_vector_operand" "vBr,m") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] "TARGET_AVX512F @@ -12609,10 +12609,22 @@ it's not real AVX512FP16 instruction. */ && (GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 || GET_CODE (operands[3]) != VEC_DUPLICATE)" - "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}" +{ + if (TARGET_AVX512VL) +return "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + else +return "vpternlog\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "evex") - (set_attr "mode" "")]) + (set (attr "mode") +(if_then_else (match_test "TARGET_AVX512VL") + (const_string "") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref " == 64 || TARGET_AVX512VL") + (const_string "*")))]) ;; There must be lots of other combinations like ;; @@ -12641,7 +12653,8 @@ (any_logic2:V (match_operand:V 3 "regmem_or_bitnot_regmem_operand") (match_operand:V 4 "regmem_or_bitnot_regmem_operand"] - "( == 64 || TARGET_AVX512VL) + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12725,7 +12738,8 @@ (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")) (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))] - "( == 64 || TARGET_AVX512VL) + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12808,7 +12822,8 @@ (match_operand:V 1 "regmem_or_bitnot_regmem_operand") (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")))] - "( == 64 || TARGET_AVX512VL) + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split ()" "#" "&& 1"
[PATCH] x86: slightly enhance "vec_dupv2df"
Introduce a new alternative permitting all 32 registers to be used as source without AVX512VL, by broadcasting to the full 512 bits in that case. (The insn would also permit all registers to be used as destination, but V2DFmode doesn't.) gcc/ * config/i386/sse.md (vec_dupv2df): Add new AVX512F alternative. Move AVX512VL part of condition to new "enabled" attribute. --- Because of the V2DF restriction, in principle the new source constraint could also omit 'm'. Can't the latter two of the original alternatives be folded, by using Yvm instead of xm/vm? --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13761,18 +13761,27 @@ (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")]) (define_insn "vec_dupv2df" - [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") + [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,v") (vec_duplicate:V2DF - (match_operand:DF 1 "nonimmediate_operand" " 0,xm,vm")))] - "TARGET_SSE2 && " + (match_operand:DF 1 "nonimmediate_operand" "0,xm,vm,vm")))] + "TARGET_SSE2" "@ unpcklpd\t%0, %0 %vmovddup\t{%1, %0|%0, %1} - vmovddup\t{%1, %0|%0, %1}" - [(set_attr "isa" "noavx,sse3,avx512vl") - (set_attr "type" "sselog1") - (set_attr "prefix" "orig,maybe_vex,evex") - (set_attr "mode" "V2DF,DF,DF")]) + vmovddup\t{%1, %0|%0, %1} + vbroadcastsd\t{%1, }%g0{|, %1}" + [(set_attr "isa" "noavx,sse3,avx512vl,*") + (set_attr "type" "sselog1,ssemov,ssemov,ssemov") + (set_attr "prefix" "orig,maybe_vex,evex,evex") + (set_attr "mode" "V2DF,DF,DF,V8DF") + (set (attr "enabled") + (cond [(eq_attr "alternative" "3") +(symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (match_test "") +(const_string "*") + ] + (symbol_ref "false")))]) (define_insn "vec_concatv2df" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,x,x, v,x,x")
[PATCH] x86: avoid maybe_gen_...()
In the (however unlikely) event that no insn can be found for the requested mode, using maybe_gen_...() without (really) checking its result for being a null rtx would lead to silent bad code generation. gcc/ * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate): Use gen_vec_set_0. (ix86_expand_vector_extract): Use gen_vec_extract_lo / gen_vec_extract_hi. (expand_vec_perm_broadcast_1): Use gen_vec_interleave_high / gen_vec_interleave_low. Rename local variable. --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -15456,8 +15456,7 @@ ix86_expand_vector_init_duplicate (bool { tmp1 = force_reg (GET_MODE_INNER (mode), val); tmp2 = gen_reg_rtx (mode); - emit_insn (maybe_gen_vec_set_0 (mode, tmp2, - CONST0_RTX (mode), tmp1)); + emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1)); tmp1 = gen_lowpart (mode, tmp2); } else @@ -17419,9 +17418,9 @@ ix86_expand_vector_extract (bool mmx_ok, ? gen_reg_rtx (V16HFmode) : gen_reg_rtx (V16BFmode)); if (elt < 16) - emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec)); + emit_insn (gen_vec_extract_lo (mode, tmp, vec)); else - emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec)); + emit_insn (gen_vec_extract_hi (mode, tmp, vec)); ix86_expand_vector_extract (false, target, tmp, elt & 15); return; } @@ -17435,9 +17434,9 @@ ix86_expand_vector_extract (bool mmx_ok, ? gen_reg_rtx (V8HFmode) : gen_reg_rtx (V8BFmode)); if (elt < 8) - emit_insn (maybe_gen_vec_extract_lo (mode, tmp, vec)); + emit_insn (gen_vec_extract_lo (mode, tmp, vec)); else - emit_insn (maybe_gen_vec_extract_hi (mode, tmp, vec)); + emit_insn (gen_vec_extract_hi (mode, tmp, vec)); ix86_expand_vector_extract (false, target, tmp, elt & 7); return; } @@ -22501,18 +22500,18 @@ expand_vec_perm_broadcast_1 (struct expa if (d->testing_p) return true; - rtx (*maybe_gen) (machine_mode, int, rtx, rtx, rtx); + rtx (*gen_interleave) (machine_mode, int, rtx, rtx, rtx); if (elt >= nelt2) { - maybe_gen = maybe_gen_vec_interleave_high; + gen_interleave = gen_vec_interleave_high; elt -= nelt2; } else - maybe_gen = maybe_gen_vec_interleave_low; + gen_interleave = gen_vec_interleave_low; nelt2 /= 2; dest = gen_reg_rtx (vmode); - emit_insn (maybe_gen (vmode, 1, dest, op0, op0)); + emit_insn (gen_interleave (vmode, 1, dest, op0, op0)); vmode = V4SImode; op0 = gen_lowpart (vmode, dest);
[PATCH] x86: replace "extendhfdf2" expander
The corresponding insn serves this purpose quite fine, and leads to slightly less (generated) code. All we need is the insn to not have a leading * in its name, while retaining that * for "extendhfsf2". Introduce a mode attribute in exchange to achieve that. gcc/ * config/i386/i386.md (extendhfdf2): Delete expander. (extendhf): New mode attribute. (*extendhf2): Use it. --- Of course the mode attribute could as well supply the full names. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5221,13 +5221,9 @@ } }) -(define_expand "extendhfdf2" - [(set (match_operand:DF 0 "register_operand") - (float_extend:DF - (match_operand:HF 1 "nonimmediate_operand")))] - "TARGET_AVX512FP16") +(define_mode_attr extendhf [(SF "*") (DF "")]) -(define_insn "*extendhf2" +(define_insn "extendhf2" [(set (match_operand:MODEF 0 "register_operand" "=v") (float_extend:MODEF (match_operand:HF 1 "nonimmediate_operand" "vm")))]
Re: [PATCH] x86: replace "extendhfdf2" expander
On 14.07.2023 12:10, Uros Bizjak wrote: > On Fri, Jul 14, 2023 at 11:44 AM Jan Beulich wrote: >> >> The corresponding insn serves this purpose quite fine, and leads to >> slightly less (generated) code. All we need is the insn to not have a >> leading * in its name, while retaining that * for "extendhfsf2". >> Introduce a mode attribute in exchange to achieve that. >> >> gcc/ >> >> * config/i386/i386.md (extendhfdf2): Delete expander. >> (extendhf): New mode attribute. >> (*extendhf2): Use it. > > No, please leave the expander, it is there due to extendhfsf2 that > prevents effective macroization. Well, okay then. > FYI, there is no less generated code when the named pattern is used, > the same code is generated from the named pattern as from the > expander. Source code can be shrinked, but in this particular case, > forced macroization complicates things more. Hmm, I'm pretty sure I checked and found some reduction. Jan
Re: [PATCH] x86: slightly enhance "vec_dupv2df"
On 17.07.2023 08:09, Hongtao Liu wrote: > On Fri, Jul 14, 2023 at 5:40 PM Jan Beulich via Gcc-patches > wrote: >> >> Introduce a new alternative permitting all 32 registers to be used as >> source without AVX512VL, by broadcasting to the full 512 bits in that >> case. (The insn would also permit all registers to be used as >> destination, but V2DFmode doesn't.) > The patch looks technically ok, but considering we don't have a real > CPU with only AVX512F but no AVX512VL, these optimisations for AVX512F > only don't make much sense, but rather increase the burden for > maintenance. Well, I can of course ignore this aspect going forward. It seemed relevant to me for two reasons: For one, I expect I'm not the only one to simply pass -mavx512f when caring about basic AVX512. And then isn't the Knights line of processors (Xeon Phi) lacking VL? (I'm getting the impression though that this line is discontinued now.) >> Can't the latter two of the original alternatives be folded, by using >> Yvm instead of xm/vm? > I think yes. I guess I'll make a follow-on patch for that then. Jan
[PATCH] x86: fold two of vec_dupv2df's alternatives
By using Yvm in the source, both can be expressed in one. gcc/ * sse.md (vec_dupv2df): Fold the middle two of the alternatives. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13784,21 +13784,20 @@ (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")]) (define_insn "vec_dupv2df" - [(set (match_operand:V2DF 0 "register_operand" "=x,x,v,v") + [(set (match_operand:V2DF 0 "register_operand" "=x,v,v") (vec_duplicate:V2DF - (match_operand:DF 1 "nonimmediate_operand" "0,xm,vm,vm")))] + (match_operand:DF 1 "nonimmediate_operand" "0,Yvm,vm")))] "TARGET_SSE2" "@ unpcklpd\t%0, %0 %vmovddup\t{%1, %0|%0, %1} - vmovddup\t{%1, %0|%0, %1} vbroadcastsd\t{%1, }%g0{|, %1}" - [(set_attr "isa" "noavx,sse3,avx512vl,*") - (set_attr "type" "sselog1,ssemov,ssemov,ssemov") - (set_attr "prefix" "orig,maybe_vex,evex,evex") - (set_attr "mode" "V2DF,DF,DF,V8DF") + [(set_attr "isa" "noavx,sse3,*") + (set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "orig,maybe_evex,evex") + (set_attr "mode" "V2DF,DF,V8DF") (set (attr "enabled") - (cond [(eq_attr "alternative" "3") + (cond [(eq_attr "alternative" "2") (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL && !TARGET_PREFER_AVX256") (match_test "")
[PATCH RESEND] libatomic: drop redundant all-multi command
./multilib.am already specifies this same command, and make warns about the earlier one being ignored when seeing the later one. All that needs retaining to still satisfy the preceding comment is the extra dependency. libatomic/ * Makefile.am (all-multi): Drop commands. * Makefile.in: Update accordingly. --- While originally sent over a year ago and pinged subsequently, I can't quite view changes like this as "trivial" ... --- a/libatomic/Makefile.am +++ b/libatomic/Makefile.am @@ -149,12 +149,11 @@ endif libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES) libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD) -# Override the automake generated all-multi rule to guarantee that all-multi +# Amend the automake generated all-multi rule to guarantee that all-multi # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo # makefile fragments to avoid broken *.Ppo getting included into the Makefile # when it is reloaded during the build of all-multi. all-multi: $(libatomic_la_LIBADD) - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) # target overrides -include $(tmake_file) --- a/libatomic/Makefile.in +++ b/libatomic/Makefile.in @@ -892,12 +892,11 @@ vpath % $(strip $(search_path)) %_.lo: Makefile $(LTCOMPILE) $(M_DEPS) $(M_SIZE) $(M_IFUNC) -c -o $@ $(M_SRC) -# Override the automake generated all-multi rule to guarantee that all-multi +# Amend the automake generated all-multi rule to guarantee that all-multi # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo # makefile fragments to avoid broken *.Ppo getting included into the Makefile # when it is reloaded during the build of all-multi. all-multi: $(libatomic_la_LIBADD) - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) # target overrides -include $(tmake_file)
[PATCH] MAINTAINERS: correct my email address
The @novell.com one has been out of use for quite some time. ChangeLog: * MAINTAINERS: Correct my email address. --- a/MAINTAINERS +++ b/MAINTAINERS @@ -344,7 +344,7 @@ Andrew Bennett Daniel Berlin Pat Bernardi -Jan Beulich +Jan Beulich David Billinghurst Tomas Bily Laurynas Biveinis
[PATCH 00/10] x86: (mainly) "prefix_extra" adjustments
Having noticed various bogus uses, I thought I'd go through and audit them all. This is the result, with some other attributes also adjusted as noticed in the process. (I think this tidying also is a good thing to have ahead of APX further complicating insn length calculations.) 01: "prefix_extra" tidying 02: "sse4arg" adjustments 03: "ssemuladd" adjustments 04: "prefix_extra" can't really be "2" 05: replace/correct bogus "prefix_extra" 06: drop stray "prefix_extra" 07: add (adjust) XOP insn attributes 08: add missing "prefix" attribute to VF{,C}MULC 09: correct "length_immediate" in a few cases 10: drop redundant "prefix_data16" attributes Jan
[PATCH 01/10] x86: "prefix_extra" tidying
Drop SSE5 leftovers from both its comment and its default calculation. A value of 2 simply cannot occur anymore. Instead extend the comment to mention the use of the attribute in "length_vex", clarifying why "prefix_extra" can actually be meaningful on VEX-encoded insns despite those not having any real prefixes except possibly segment overrides. gcc/ * config/i386/i386.md (prefix_extra): Correct comment. Fold cases yielding 2 into ones yielding 1. --- I question the 3DNow! aspect here: There's no extra prefix there. It's an immediate instead which "sub-divides" major opcode 0f0f. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -620,13 +620,11 @@ (const_int 0))) ;; There are also additional prefixes in 3DNOW, SSSE3. -;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, -;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. +;; While generally inapplicable to VEX/XOP/EVEX encodings, "length_vex" uses +;; the attribute evaluating to zero to know that VEX2 encoding may be usable. (define_attr "prefix_extra" "" - (cond [(eq_attr "type" "ssemuladd,sse4arg") - (const_int 2) -(eq_attr "type" "sseiadd1,ssecvt1") + (cond [(eq_attr "type" "ssemuladd,sse4arg,sseiadd1,ssecvt1") (const_int 1) ] (const_int 0)))
[PATCH 02/10] x86: "sse4arg" adjustments
Record common properties in other attributes' default calculations: There's always a 1-byte immediate, and they're always encoded in a VEX3- like manner (note that "prefix_extra" already evaluates to 1 in this case). The drop now (or already previously) redundant explicit attributes, adding "mode" ones where they were missing. Furthermore use "sse4arg" consistently for all VPCOM* insns; so far signed comparisons did use it, while unsigned ones used "ssecmp". Note that while they have (not counting the explicit or implicit immediate operand) they really only have 3 operands, the operator is also counted in those patterns. That's relevant for establishing the "memory" attribute's value, and at the same time benign when there are only register operands. Note that despite also having 4 operands, multiply-add insns aren't affected by this change, as they use "ssemuladd" for "type". gcc/ * config/i386/i386.md (length_immediate): Handle "sse4arg". (prefix): Likewise. (*xop_pcmov_): Add "mode" attribute. * config/i386/mmx.md (*xop_maskcmp3): Drop "prefix_data16", "prefix_rep", "prefix_extra", and "length_immediate" attributes. (*xop_maskcmp_uns3): Likewise. Switch "type" to "sse4arg". (*xop_pcmov_): Add "mode" attribute. * config/i386/sse.md (xop_pcmov_): Add "mode" attribute. (xop_maskcmp3): Drop "prefix_data16", "prefix_rep", "prefix_extra", and "length_immediate" attributes. (xop_maskcmp_uns3): Likewise. Switch "type" to "sse4arg". (xop_maskcmp_uns23): Drop "prefix_data16", "prefix_extra", and "length_immediate" attributes. Switch "type" to "sse4arg". (xop_pcom_tf3): Likewise. (xop_vpermil23): Drop "length_immediate" attribute. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -536,6 +536,8 @@ (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave, bitmanip,imulx,msklog,mskmov") (const_int 0) +(eq_attr "type" "sse4arg") + (const_int 1) (eq_attr "unit" "i387,sse,mmx") (const_int 0) (eq_attr "type" "alu,alu1,negnot,imovx,ishift,ishiftx,ishift1, @@ -635,6 +637,8 @@ (const_string "vex") (eq_attr "mode" "XI,V16SF,V8DF") (const_string "evex") +(eq_attr "type" "sse4arg") + (const_string "vex") ] (const_string "orig"))) @@ -23286,7 +23290,8 @@ (match_operand:MODEF 3 "register_operand" "x")))] "TARGET_XOP" "vpcmov\t{%1, %3, %2, %0|%0, %2, %3, %1}" - [(set_attr "type" "sse4arg")]) + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) ;; These versions of the min/max patterns are intentionally ignorant of ;; their behavior wrt -0.0 and NaN (via the commutative operand mark). --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2909,10 +2909,6 @@ "TARGET_XOP" "vpcom%Y1\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "sse4arg") - (set_attr "prefix_data16" "0") - (set_attr "prefix_rep" "0") - (set_attr "prefix_extra" "2") - (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) (define_insn "*xop_maskcmp3" @@ -2923,10 +2919,6 @@ "TARGET_XOP" "vpcom%Y1\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "sse4arg") - (set_attr "prefix_data16" "0") - (set_attr "prefix_rep" "0") - (set_attr "prefix_extra" "2") - (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) (define_insn "*xop_maskcmp_uns3" @@ -2936,11 +2928,7 @@ (match_operand:MMXMODEI 3 "register_operand" "x")]))] "TARGET_XOP" "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" - [(set_attr "type" "ssecmp") - (set_attr "prefix_data16" "0") - (set_attr "prefix_rep" "0") - (set_attr "prefix_extra" "2") - (set_attr "length_immediate" "1") + [(set_attr "type" "sse4arg") (set_attr "mode" "TI")]) (define_insn "*xop_maskcmp_uns3" @@ -2950,11 +2938,7 @@ (match_operand:VI_16_32 3 "register_operand" "x")]))] "TARGET_XOP" "vpcom%Y1u\t{%3, %2, %0|%0, %2, %3}" - [(set_attr "type" "ssecmp") - (set_attr "prefix_data16" "0") - (set_attr "prefix_rep" "0") - (set_attr "prefix_extra" "2") - (set_attr "length_immediate" "1") + [(set_attr "type" "sse4arg") (set_attr "mode" "TI")]) (define_expand "vec_cmp" @@ -3144,7 +3128,8 @@ (match_operand:MMXMODE124 2 "register_operand" "x")))] "TARGET_XOP && TARGET_MMX_WITH_SSE" "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "sse4arg")]) + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) (define_insn "*xop_pcmov_" [(set (match_operand:VI_16_32 0 "register_operand" "=x") @@ -3154,7 +3139,8 @@ (match_operand:VI_16_32 2 "register_operand" "x")))] "TARGET_XOP" "vpcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "sse4arg")]) + [(set_attr "type" "sse4arg") + (set_attr "mode" "TI")]) ;; XOP permute instructions (define_insn "mmx_pp
[PATCH 03/10] x86: "ssemuladd" adjustments
They're all VEX3- (also covering XOP) or EVEX-encoded. Express that in the default calculation of "prefix". FMA4 insns also all have a 1-byte immediate operand. Where the default calculation is not sufficient / applicable, add explicit "prefix" attributes. While there also add a "mode" attribute to fma___pair. gcc/ * config/i386/i386.md (isa): Move up. (length_immediate): Handle "fma4". (prefix): Handle "ssemuladd". * config/i386/sse.md (*fma_fmadd_): Add "prefix" attribute. (fma_fmadd_): Likewise. (_fmadd__mask): Likewise. (_fmadd__mask3): Likewise. (fma_fmsub_): Likewise. (_fmsub__mask): Likewise. (_fmsub__mask3): Likewise. (*fma_fnmadd_): Likewise. (fma_fnmadd_): Likewise. (_fnmadd__mask): Likewise. (_fnmadd__mask3): Likewise. (fma_fnmsub_): Likewise. (_fnmsub__mask): Likewise. (_fnmsub__mask3): Likewise. (fma_fmaddsub_): Likewise. (_fmaddsub__mask): Likewise. (_fmaddsub__mask3): Likewise. (fma_fmsubadd_): Likewise. (_fmsubadd__mask): Likewise. (_fmsubadd__mask3): Likewise. (*fmai_fmadd_): Likewise. (*fmai_fmsub_): Likewise. (*fmai_fnmadd_): Likewise. (*fmai_fnmsub_): Likewise. (avx512f_vmfmadd__mask): Likewise. (avx512f_vmfmadd__mask3): Likewise. (avx512f_vmfmadd__maskz_1): Likewise. (*avx512f_vmfmsub__mask): Likewise. (avx512f_vmfmsub__mask3): Likewise. (*avx512f_vmfmsub__maskz_1): Likewise. (avx512f_vmfnmadd__mask): Likewise. (avx512f_vmfnmadd__mask3): Likewise. (avx512f_vmfnmadd__maskz_1): Likewise. (*avx512f_vmfnmsub__mask): Likewise. (*avx512f_vmfnmsub__mask3): Likewise. (*avx512f_vmfnmsub__maskz_1): Likewise. (*fma4i_vmfmadd_): Likewise. (*fma4i_vmfmsub_): Likewise. (*fma4i_vmfnmadd_): Likewise. (*fma4i_vmfnmsub_): Likewise. (fma__): Likewise. (___mask): Likewise. (avx512fp16_fma_sh_v8hf): Likewise. (avx512fp16_sh_v8hf_mask): Likewise. (xop_p): Likewise. (xop_pdql): Likewise. (xop_pdqh): Likewise. (xop_pwd): Likewise. (xop_pwd): Likewise. (fma___pair): Likewise. Add "mode" attribute. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -531,12 +531,23 @@ (const_string "unknown")] (const_string "integer"))) +;; Used to control the "enabled" attribute on a per-instruction basis. +(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx, + x64_avx,x64_avx512bw,x64_avx512dq,aes, + sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, + avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, + avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl, + avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma, + avx512ifmavl,avxneconvert,avx512bf16vl,vpclmulqdqvl" + (const_string "base")) + ;; The (bounding maximum) length of an instruction immediate. (define_attr "length_immediate" "" (cond [(eq_attr "type" "incdec,setcc,icmov,str,lea,other,multi,idiv,leave, bitmanip,imulx,msklog,mskmov") (const_int 0) -(eq_attr "type" "sse4arg") +(ior (eq_attr "type" "sse4arg") + (eq_attr "isa" "fma4")) (const_int 1) (eq_attr "unit" "i387,sse,mmx") (const_int 0) @@ -637,6 +648,10 @@ (const_string "vex") (eq_attr "mode" "XI,V16SF,V8DF") (const_string "evex") +(eq_attr "type" "ssemuladd") + (if_then_else (eq_attr "isa" "fma4") +(const_string "vex") +(const_string "maybe_evex")) (eq_attr "type" "sse4arg") (const_string "vex") ] @@ -842,16 +857,6 @@ ;; Define attribute to indicate unaligned ssemov insns (define_attr "movu" "0,1" (const_string "0")) -;; Used to control the "enabled" attribute on a per-instruction basis. -(define_attr "isa" "base,x64,nox64,x64_sse2,x64_sse4,x64_sse4_noavx, - x64_avx,x64_avx512bw,x64_avx512dq,aes, - sse_noavx,sse2,sse2_noavx,sse3,sse3_noavx,sse4,sse4_noavx, - avx,noavx,avx2,noavx2,bmi,bmi2,fma4,fma,avx512f,noavx512f, - avx512bw,noavx512bw,avx512dq,noavx512dq,fma_or_avx512vl, - avx512vl,noavx512vl,avxvnni,avx512vnnivl,avx512fp16,avxifma, - avx512ifmavl,avxneconvert,avx512bf16vl,vpclmulqdqvl" - (const_string "base")) - ;; Define instruction set of MMX instructions (define_attr "mmx_isa" "base,native,sse,sse_noavx,avx" (const_string "base")) --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -5422,6 +5422,7 @@ vfmadd213\t{%3, %2
[PATCH 04/10] x86: "prefix_extra" can't really be "2"
In the three remaining instances separate "prefix_0f" and "prefix_rep" are what is wanted instead. gcc/ * config/i386/i386.md (rdbase): Add "prefix_0f" and "prefix_rep". Drop "prefix_extra". (wrbase): Likewise. (ptwrite): Likewise. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -25914,7 +25914,8 @@ "TARGET_64BIT && TARGET_FSGSBASE" "rdbase\t%0" [(set_attr "type" "other") - (set_attr "prefix_extra" "2")]) + (set_attr "prefix_0f" "1") + (set_attr "prefix_rep" "1")]) (define_insn "wrbase" [(unspec_volatile [(match_operand:SWI48 0 "register_operand" "r")] @@ -25922,7 +25923,8 @@ "TARGET_64BIT && TARGET_FSGSBASE" "wrbase\t%0" [(set_attr "type" "other") - (set_attr "prefix_extra" "2")]) + (set_attr "prefix_0f" "1") + (set_attr "prefix_rep" "1")]) (define_insn "ptwrite" [(unspec_volatile [(match_operand:SWI48 0 "nonimmediate_operand" "rm")] @@ -25930,7 +25932,8 @@ "TARGET_PTWRITE" "ptwrite\t%0" [(set_attr "type" "other") - (set_attr "prefix_extra" "2")]) + (set_attr "prefix_0f" "1") + (set_attr "prefix_rep" "1")]) (define_insn "@rdrand" [(set (match_operand:SWI248 0 "register_operand" "=r")
[PATCH 07/10] x86: add (adjust) XOP insn attributes
Many were lacking "prefix" and "prefix_extra", some had a bogus value of 2 for "prefix_extra" (presumably inherited from their SSE5 counterparts, which are long gone) and a meaningless "prefix_data16" one. Where missing, "mode" attributes are also added. (Note that "sse4arg" and "ssemuladd" ones don't need further adjustment in this regard.) gcc/ * config/i386/sse.md (xop_phaddbw): Add "prefix", "prefix_extra", and "mode" attributes. (xop_phaddbd): Likewise. (xop_phaddbq): Likewise. (xop_phaddwd): Likewise. (xop_phaddwq): Likewise. (xop_phadddq): Likewise. (xop_phsubbw): Likewise. (xop_phsubwd): Likewise. (xop_phsubdq): Likewise. (xop_rotl3): Add "prefix" and "prefix_extra" attributes. (xop_rotr3): Likewise. (xop_frcz2): Likewise. (*xop_vmfrcz2): Likewise. (xop_vrotl3): Add "prefix" attribute. Change "prefix_extra" to 1. (xop_sha3): Likewise. (xop_shl3): Likewise. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -24897,7 +24897,10 @@ (const_int 13) (const_int 15)])] "TARGET_XOP" "vphaddbw\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phaddbd" [(set (match_operand:V4SI 0 "register_operand" "=x") @@ -24926,7 +24929,10 @@ (const_int 11) (const_int 15)]))] "TARGET_XOP" "vphaddbd\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phaddbq" [(set (match_operand:V2DI 0 "register_operand" "=x") @@ -24971,7 +24977,10 @@ (parallel [(const_int 7) (const_int 15)])))] "TARGET_XOP" "vphaddbq\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phaddwd" [(set (match_operand:V4SI 0 "register_operand" "=x") @@ -24988,7 +24997,10 @@ (const_int 5) (const_int 7)])] "TARGET_XOP" "vphaddwd\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phaddwq" [(set (match_operand:V2DI 0 "register_operand" "=x") @@ -25013,7 +25025,10 @@ (parallel [(const_int 3) (const_int 7)]))] "TARGET_XOP" "vphaddwq\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phadddq" [(set (match_operand:V2DI 0 "register_operand" "=x") @@ -25028,7 +25043,10 @@ (parallel [(const_int 1) (const_int 3)])] "TARGET_XOP" "vphadddq\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phsubbw" [(set (match_operand:V8HI 0 "register_operand" "=x") @@ -25049,7 +25067,10 @@ (const_int 13) (const_int 15)])] "TARGET_XOP" "vphsubbw\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phsubwd" [(set (match_operand:V4SI 0 "register_operand" "=x") @@ -25066,7 +25087,10 @@ (const_int 5) (const_int 7)])] "TARGET_XOP" "vphsubwd\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) (define_insn "xop_phsubdq" [(set (match_operand:V2DI 0 "register_operand" "=x") @@ -25081,7 +25105,10 @@ (parallel [(const_int 1) (const_int 3)])] "TARGET_XOP" "vphsubdq\t{%1, %0|%0, %1}" - [(set_attr "type" "sseiadd1")]) + [(set_attr "type" "sseiadd1") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) ;; XOP permute instructions (define_insn "xop_pperm" @@ -25209,6 +25236,8 @@ "TARGET_XOP" "vprot\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) @@ -25224,6 +25253,8 @@ return \"vprot\t{%3, %1, %0|%0, %1, %3}\"; } [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "mode" "TI")]) @@ -25264,8 +25295,8 @@ "TARGET_XOP && !(MEM_P (operands[1]) &&
[PATCH 09/10] x86: correct "length_immediate" in a few cases
When first added explicitly in 3ddffba914b2 ("i386.md (sse4_1_round2): Add avx512f alternative"), "*" should not have been used for the pre-existing alternative. The attribute was plain missing. Subsequent changes adding more alternatives then generously extended the bogus pattern. Apparently something similar happened to the two mmx_pblendvb_* insns. gcc/ * config/i386/i386.md (sse4_1_round2): Make "length_immediate" uniformly 1. * config/i386/mmx.md (mmx_pblendvb_v8qi): Likewise. (mmx_pblendvb_): Likewise. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -21594,7 +21594,7 @@ vrndscale\t{%2, %1, %d0|%d0, %1, %2}" [(set_attr "type" "ssecvt") (set_attr "prefix_extra" "1,1,1,*,*") - (set_attr "length_immediate" "*,*,*,1,1") + (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,evex,evex") (set_attr "isa" "noavx512f,noavx512f,noavx512f,avx512f,avx512f") (set_attr "avx_partial_xmm_update" "false,false,true,false,true") --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3094,7 +3094,7 @@ [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "*,*,1") + (set_attr "length_immediate" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "btver2_decode" "vector") (set_attr "mode" "TI")]) @@ -3114,7 +3114,7 @@ [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") - (set_attr "length_immediate" "*,*,1") + (set_attr "length_immediate" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "btver2_decode" "vector") (set_attr "mode" "TI")])
[PATCH 05/10] x86: replace/correct bogus "prefix_extra"
In the rdrand and rdseed cases "prefix_0f" is meant instead. For mmx_floatv2siv2sf2 1 is correct only for the first alternative. For the integer min/max cases 1 uniformly applies to legacy and VEX encodings (the UB and SW variants are dealt with separately anyway). Same for {,V}MOVNTDQA. Unlike {,V}PEXTRW, which has two encoding forms, {,V}PINSRW only has a single form in 0f space. (In *vec_extract note that the dropped part if the condition also referenced non-existing alternative 2.) Of the integer compare insns, only the 64-bit element forms are encoded in 0f38 space. gcc/ * config/i386/i386.md (@rdrand): Add "prefix_0f". Drop "prefix_extra". (@rdseed): Likewise. * config/i386/mmx.md (3 [smaxmin and umaxmin cases]): Adjust "prefix_extra". * config/i386/sse.md (@vec_set_0): Likewise. (*sse4_1_3): Likewise. (*avx2_eq3): Likewise. (avx2_gt3): Likewise. (_pinsr): Likewise. (*vec_extract): Likewise. (_movntdqa): Likewise. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -25943,7 +25943,7 @@ "TARGET_RDRND" "rdrand\t%0" [(set_attr "type" "other") - (set_attr "prefix_extra" "1")]) + (set_attr "prefix_0f" "1")]) (define_insn "@rdseed" [(set (match_operand:SWI248 0 "register_operand" "=r") @@ -25953,7 +25953,7 @@ "TARGET_RDSEED" "rdseed\t%0" [(set_attr "type" "other") - (set_attr "prefix_extra" "1")]) + (set_attr "prefix_0f" "1")]) (define_expand "pause" [(set (match_dup 0) --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -2483,7 +2483,7 @@ vp\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) @@ -2532,7 +2532,7 @@ vpb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) @@ -2561,7 +2561,7 @@ vp\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) @@ -2623,7 +2623,7 @@ vpw\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11064,7 +11064,7 @@ (const_string "1") (const_string "*"))) (set (attr "prefix_extra") - (if_then_else (eq_attr "alternative" "5,6,7,8,9") + (if_then_else (eq_attr "alternative" "5,6,9") (const_string "1") (const_string "*"))) (set (attr "length_immediate") @@ -16779,7 +16779,7 @@ vp\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1,1,*") + (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) @@ -16813,7 +16813,10 @@ "TARGET_AVX2 && !(MEM_P (operands[1]) && MEM_P (operands[2]))" "vpcmpeq\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") + (set (attr "prefix_extra") + (if_then_else (eq (const_string "mode") (const_string "V4DImode")) + (const_string "1") + (const_string "*"))) (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -17048,7 +17051,10 @@ "TARGET_AVX2" "vpcmpgt\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") + (set (attr "prefix_extra") + (if_then_else (eq (const_string "mode") (const_string "V4DImode")) + (const_string "1") + (const_string "*"))) (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -18843,7 +18849,7 @@ (const_string "*"))) (set (attr "prefix_extra") (if_then_else - (and (not (match_test "TARGET_AVX")) + (ior (eq_attr "prefix" "evex") (match_test "GET_MODE_NUNITS (mode) == 8")) (const_string "*") (const_string "1"))) @@ -20004,8 +20010,7 @@ (set_attr "prefix_data16" "1") (set (attr "prefix_extra") (if_then_else - (and (eq_attr "alternative" "0,2") - (eq (const_string "mode") (const_string "V8HImode"))) + (eq (const_string "mode") (const_string "V8HImode")) (const_string "*") (const_string "1"))) (set_attr "length_immediate" "1") @@ -22349,7 +22354,7 @@ "%vmovntdqa\t{%1, %0|%0, %1}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssemov") - (set_a
[PATCH 06/10] x86: drop stray "prefix_extra"
While the attribute is relevant for legacy- and VEX-encoded insns, it is of no relevance for EVEX-encoded ones. While there in avx512dq_broadcast_1 add the missing "length_immediate". gcc/ * config/i386/sse.md (*_eq3_1): Drop "prefix_extra". (avx512dq_vextract64x2_1_mask): Likewise. (*avx512dq_vextract64x2_1): Likewise. (avx512f_vextract32x4_1_mask): Likewise. (*avx512f_vextract32x4_1): Likewise. (vec_extract_lo__mask [AVX512 forms]): Likewise. (vec_extract_lo_ [AVX512 forms]): Likewise. (vec_extract_hi__mask [AVX512 forms]): Likewise. (vec_extract_hi_ [AVX512 forms]): Likewise. (@vec_extract_lo_ [AVX512 forms]): Likewise. (@vec_extract_hi_ [AVX512 forms]): Likewise. (vec_extract_lo_v64qi): Likewise. (vec_extract_hi_v64qi): Likewise. (*vec_widen_umult_even_v16si): Likewise. (*vec_widen_smult_even_v16si): Likewise. (*avx512f_3): Likewise. (*vec_extractv4ti): Likewise. (avx512bw_v32qiv32hi2): Likewise. (avx512dq_broadcast_1): Likewise. Add "length_immediate". --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4030,7 +4030,6 @@ vpcmpeq\t{%2, %1, %0|%0, %1, %2} vptestnm\t{%1, %1, %0|%0, %1, %1}" [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -4128,7 +4127,6 @@ vpcmpeq\t{%2, %1, %0|%0, %1, %2} vptestnm\t{%1, %1, %0|%0, %1, %1}" [(set_attr "type" "ssecmp") - (set_attr "prefix_extra" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11487,7 +11485,6 @@ return "vextract64x2\t{%2, %1, %0%{%5%}%N4|%0%{%5%}%N4, %1, %2}"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11506,7 +11503,6 @@ return "vextract64x2\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11554,7 +11550,6 @@ return "vextract32x4\t{%2, %1, %0%{%7%}%N6|%0%{%7%}%N6, %1, %2}"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11577,7 +11572,6 @@ return "vextract32x4\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11671,7 +11665,6 @@ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[2]))" "vextract64x4\t{$0x0, %1, %0%{%3%}%N2|%0%{%3%}%N2, %1, 0x0}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "evex") @@ -11691,7 +11684,6 @@ return "#"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store,load") (set_attr "prefix" "evex") @@ -11710,7 +11702,6 @@ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[2]))" "vextract64x4\t{$0x1, %1, %0%{%3%}%N2|%0%{%3%}%N2, %1, 0x1}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11724,7 +11715,6 @@ "TARGET_AVX512F" "vextract64x4\t{$0x1, %1, %0|%0, %1, 0x1}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11744,7 +11734,6 @@ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[2]))" "vextract32x8\t{$0x1, %1, %0%{%3%}%N2|%0%{%3%}%N2, %1, 0x1}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -11762,7 +11751,6 @@ vextract32x8\t{$0x1, %1, %0|%0, %1, 0x1} vextracti64x4\t{$0x1, %1, %0|%0, %1, 0x1}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "isa" "avx512dq,noavx512dq") (set_attr "length_immediate" "1") (set_attr "prefix" "evex") @@ -11850,7 +11838,6 @@ && (!MEM_P (operands[0]) || rtx_equal_p (operands[0], operands[2]))" "vextract32x8\t{$0x0, %1, %0%{%3%}%N2|%0%{%3%}%N2, %1, 0x0}" [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,store") (set_attr "prefix" "evex") @@ -11880,7 +11867,6 @@ return "#"; } [(set_attr "type" "sselog1") - (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "memory" "none,load,store") (set_attr "prefix" "evex") @@ -11923,7 +11909,6 @@ && (!MEM_P (o
[PATCH 08/10] x86: add missing "prefix" attribute to VF{,C}MULC
gcc/ * config/i386/sse.md (__): Add "prefix" attribute. (avx512fp16_sh_v8hf): Likewise. --- Talking of "prefix": Shouldn't at least V32HF and V32BF have it also default to "evex"? (It won't matter right here, but it may matter elsewhere.) --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6790,6 +6790,7 @@ return "v\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "ssemul") + (set_attr "prefix" "evex") (set_attr "mode" "")]) (define_expand "avx512fp16_fmaddcsh_v8hf_maskz" @@ -6993,6 +6994,7 @@ return "vsh\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "ssemul") + (set_attr "prefix" "evex") (set_attr "mode" "V8HF")]) ;
[PATCH 10/10] x86: drop redundant "prefix_data16" attributes
The attribute defaults to 1 for TI-mode insns of type sselog, sselog1, sseiadd, sseimul, and sseishft. In *v8hi3 [smaxmin] and *v16qi3 [umaxmin] also drop the similarly stray "prefix_extra" at this occasion. These two max/min flavors are encoded in 0f space. gcc/ * config/i386/mmx.md (*mmx_pinsrd): Drop "prefix_data16". (*mmx_pinsrb): Likewise. (*mmx_pextrb): Likewise. (*mmx_pextrb_zext): Likewise. (mmx_pshufbv8qi3): Likewise. (mmx_pshufbv4qi3): Likewise. (mmx_pswapdv2si2): Likewise. (*pinsrb): Likewise. (*pextrb): Likewise. (*pextrb_zext): Likewise. * config/i386/sse.md (*sse4_1_mulv2siv2di3): Likewise. (*sse2_eq3): Likewise. (*sse2_gt3): Likewise. (_pinsr): Likewise. (*vec_extract): Likewise. (*vec_extract_zext): Likewise. (*vec_extractv16qi_zext): Likewise. (ssse3_phwv8hi3): Likewise. (ssse3_pmaddubsw128): Likewise. (*_pmulhrsw3): Likewise. (_pshufb3): Likewise. (_psign3): Likewise. (_palignr): Likewise. (*abs2): Likewise. (sse4_2_pcmpestr): Likewise. (sse4_2_pcmpestri): Likewise. (sse4_2_pcmpestrm): Likewise. (sse4_2_pcmpestr_cconly): Likewise. (sse4_2_pcmpistr): Likewise. (sse4_2_pcmpistri): Likewise. (sse4_2_pcmpistrm): Likewise. (sse4_2_pcmpistr_cconly): Likewise. (vgf2p8affineinvqb_): Likewise. (vgf2p8affineqb_): Likewise. (vgf2p8mulb_): Likewise. (*v8hi3 [smaxmin]): Drop "prefix_data16" and "prefix_extra". (*v16qi3 [umaxmin]): Likewise. --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -3863,7 +3863,6 @@ } } [(set_attr "isa" "noavx,avx") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "type" "sselog") (set_attr "length_immediate" "1") @@ -3950,7 +3949,6 @@ } [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "orig,vex") @@ -4002,7 +4000,6 @@ %vpextrb\t{%2, %1, %k0|%k0, %1, %2} %vpextrb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog1") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") @@ -4017,7 +4014,6 @@ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE" "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") @@ -4035,7 +4031,6 @@ vpshufb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") - (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector") @@ -4053,7 +4048,6 @@ vpshufb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") - (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector") @@ -4191,7 +4185,6 @@ (set_attr "mmx_isa" "native,*") (set_attr "type" "mmxcvt,sselog1") (set_attr "prefix_extra" "1,*") - (set_attr "prefix_data16" "*,1") (set_attr "length_immediate" "*,1") (set_attr "mode" "DI,TI")]) @@ -4531,7 +4524,6 @@ } [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "orig,vex") @@ -4575,7 +4567,6 @@ %vpextrb\t{%2, %1, %k0|%k0, %1, %2} %vpextrb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog1") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") @@ -4590,7 +4581,6 @@ "TARGET_SSE4_1" "%vpextrb\t{%2, %1, %k0|%k0, %1, %2}" [(set_attr "type" "sselog1") - (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") (set_attr "prefix" "maybe_vex") --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -15614,7 +15614,6 @@ vpmuldq\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "sseimul") - (set_attr "prefix_data16" "1,1,*") (set_attr "prefix_extra" "1") (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "TI")]) @@ -16688,8 +16687,6 @@ vpw\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseiadd") - (set_attr "prefix_data16" "1,*") - (set_attr "prefix_extra" "*,1") (set_attr "prefix" "orig,vex") (set_attr "mode" "TI")]) @@ -16772,8 +16769,6 @@ vpb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sse
Re: Intel AVX10.1 Compiler Design and Support
On 09.08.2023 04:14, Hongtao Liu wrote: > On Wed, Aug 9, 2023 at 9:21 AM Hongtao Liu wrote: >> >> On Wed, Aug 9, 2023 at 3:55 AM Joseph Myers wrote: >>> >>> Do you have any comments on the interaction of AVX10 with the >>> micro-architecture levels defined in the ABI (and supported with >>> glibc-hwcaps directories in glibc)? Given that the levels are cumulative, >>> should we take it that any future levels will be ones supporting 512-bit >>> vector width for AVX10 (because x86-64-v4 requires the current AVX512F, >>> AVX512BW, AVX512CD, AVX512DQ and AVX512VL) - and so any future processors >>> that only support 256-bit vector width will be considered to match the >>> x86-64-v3 micro-architecture level but not any higher level? >> This is actually something we really want to discuss in the community, >> our proposal for x86-64-v5: AVX10.2-256(Implying AVX10.1-256) + APX. >> One big reason is Intel E-core will only support AVX10 256-bit, if we >> want to use x86-64-v5 accross server and client, it's better to >> 256-bit default. Aiui these ABI levels were intended to be incremental, i.e. higher versions would include everything earlier ones cover. Without such a guarantee, how would you propose compatibility checks to be implemented in a way applicable both forwards and backwards? If a new level is wanted here, then I guess it could only be something like v3.5. Jan
Re: Intel AVX10.1 Compiler Design and Support
On 09.08.2023 09:38, Hongtao Liu wrote: > On Wed, Aug 9, 2023 at 3:17 PM Jan Beulich wrote: >> >> On 09.08.2023 04:14, Hongtao Liu wrote: >>> On Wed, Aug 9, 2023 at 9:21 AM Hongtao Liu wrote: On Wed, Aug 9, 2023 at 3:55 AM Joseph Myers wrote: > > Do you have any comments on the interaction of AVX10 with the > micro-architecture levels defined in the ABI (and supported with > glibc-hwcaps directories in glibc)? Given that the levels are cumulative, > should we take it that any future levels will be ones supporting 512-bit > vector width for AVX10 (because x86-64-v4 requires the current AVX512F, > AVX512BW, AVX512CD, AVX512DQ and AVX512VL) - and so any future processors > that only support 256-bit vector width will be considered to match the > x86-64-v3 micro-architecture level but not any higher level? This is actually something we really want to discuss in the community, our proposal for x86-64-v5: AVX10.2-256(Implying AVX10.1-256) + APX. One big reason is Intel E-core will only support AVX10 256-bit, if we want to use x86-64-v5 accross server and client, it's better to 256-bit default. >> >> Aiui these ABI levels were intended to be incremental, i.e. higher versions >> would include everything earlier ones cover. Without such a guarantee, how >> would you propose compatibility checks to be implemented in a way > Are there many software implemenation based on this assumption? > At least in GCC, it's not a big problem, we can adjust code for the > new micro-architecture level. >> applicable both forwards and backwards? If a new level is wanted here, then >> I guess it could only be something like v3.5. > But if we use avx10.1 as v3.5, it's still not subset of > x86-64-v4(avx10.1 contains avx512fp16,avx512bf16 .etc which are not in > x86-64-v4), there will be still a diverge. Hmm, yes. But something will end up being odd in any event. Versions no longer being integral values is kind of indicating a "branch", i.e. v4 not being a successor. Maybe v3.1 would be better, for it to then have possible successors v3.2, v3.3, etc. Of course it would be possible to "merge" branches back then, into e.g. v5 covering AVX10.2/512 (and thus fully covering everything that's in v4). Jan > Then 256-bit of x86-64-v4 as v3.5? that's too weired to me. > > Our main proposal is to make AVX10.x as new micro-architecture level > with 256-bit default, either v3.5 or v5 would be acceptable if it's > just the name.
Re: Intel AVX10.1 Compiler Design and Support
On 10.08.2023 15:12, Phoebe Wang wrote: >> The psABI should have some simple rule covering all of the above I think. > > psABI has a rule for the case doesn't mean the rule is a well defined ABI > in practice. A well defined ABI should guarantee 1) interlinkable across > different compile options within the same compiler; 2) interlinkable across > different compilers. Both aspects are failed in the non 512-bit version. > > 1) is more important than 2) and becomes more critical on AVX10 targets. > Because we expect AVX10-256 is a general setting for binaries that can run > on both AVX10-256 and AVX10-512. It would be common that binaries compiled > with AVX10-256 may link with native built binaries on AVX10-512 targets. But you're only describing a pre-existing problem here afaict. Code compiled with -mavx51f passing __m512 type data to a function compiled with only, say, -maxv2 won't interoperate properly either. What's worse, imo the psABI doesn't sufficiently define what __m256 etc actually are. After all these aren't types defined by the C standard (as opposed to at least most other types in the respective table there), and you can't really make assumptions like "this is what certain compilers think this is". Jan
Re: [PATCH v2] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
On 19.06.2023 04:07, Liu, Hongtao wrote: >> -Original Message- >> From: Jan Beulich >> Sent: Friday, June 16, 2023 2:22 PM >> >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -12597,11 +12597,11 @@ >> (set_attr "mode" "")]) >> >> (define_insn "*_vternlog_all" >> - [(set (match_operand:V 0 "register_operand" "=v") >> + [(set (match_operand:V 0 "register_operand" "=v,v") >> (unspec:V >> - [(match_operand:V 1 "register_operand" "0") >> - (match_operand:V 2 "register_operand" "v") >> - (match_operand:V 3 "bcst_vector_operand" "vmBr") >> + [(match_operand:V 1 "register_operand" "0,0") >> + (match_operand:V 2 "register_operand" "v,v") >> + (match_operand:V 3 "bcst_vector_operand" "vBr,m") >> (match_operand:SI 4 "const_0_to_255_operand")] >>UNSPEC_VTERNLOG))] >>"TARGET_AVX512F > Change condition to == 64 || TARGET_AVX512VL || (TARGET_AVX512F > && !TARGET_PREFER_AVX256) May I ask why you think this is necessary? The condition of the insn already wasn't in sync with the condition used in all three splitters, and I didn't see any reason why now they would need to be brought in sync. First and foremost because of the use of the UNSPEC (equally before and after this patch). Furthermore, isn't it the case that I'm already mostly expressing this with the "enabled" attribute? At the very least I think I should drop that again then if following your request? > Also please add a testcase for case TARGET_AVX512F && !TARGET_PREFER_AVX256. Especially in a case like this one I'm wondering about the usefulness of a contrived testcase: It won't test more than one minor sub-case of the whole set of constructs covered here. But well, here as well as for the other change I'll invent something. Jan
[PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
There's no reason to constrain this to AVX512VL, unless instructed so by -mprefer-vector-width=, as the wider operation is unusable for more narrow operands only when the possible memory source is a non-broadcast one. This way even the scalar copysign3 can benefit from the operation being a single-insn one (leaving aside moves which the compiler decides to insert for unclear reasons, and leaving aside the fact that bcst_mem_operand() is too restrictive for broadcast to be embedded right into VPTERNLOG*). While there also bring *_vternlog_all's in sync with that of the three splitters. Along with this also request value duplication in ix86_expand_copysign()'s call to ix86_build_signbit_mask(), eliminating excess space allocation in .rodata.*, filled with zeros which are never read. gcc/ * config/i386/i386-expand.cc (ix86_expand_copysign): Request value duplication by ix86_build_signbit_mask() when AVX512F and not HFmode. * config/i386/sse.md (*_vternlog_all): Convert to 2-alternative form. Adjust "mode" attribute. Add "enabled" attribute. (*_vpternlog_1): Also permit when TARGET_AVX512F && !TARGET_PREFER_AVX256. (*_vpternlog_2): Likewise. (*_vpternlog_3): Likewise. gcc/testsuite/ * gcc.target/i386/avx512f-copysign.c: New test. --- I haven't been able to find documentation on the dejagnu(?) regex syntax (?:...). With ordinary (...) failing (producing twice as many matches), I could only derive this from other scan-assembler patterns. I guess the underlying pattern, going along the lines of what one_cmpl2 uses, can be applied elsewhere as well. HFmode could use embedded broadcast too for copysign and alike, but that would need to be V2HF -> V8HF (for which I don't think there are any existing patterns). --- v3: Adjust insn conditional as well. Add testcase. v2: Respect -mprefer-vector-width=. --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2266,7 +2266,7 @@ ix86_expand_copysign (rtx operands[]) else dest = NULL_RTX; op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode); - mask = ix86_build_signbit_mask (vmode, 0, 0); + mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0); if (CONST_DOUBLE_P (operands[1])) { --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12399,22 +12399,35 @@ (set_attr "mode" "")]) (define_insn "*_vternlog_all" - [(set (match_operand:V 0 "register_operand" "=v") + [(set (match_operand:V 0 "register_operand" "=v,v") (unspec:V - [(match_operand:V 1 "register_operand" "0") - (match_operand:V 2 "register_operand" "v") - (match_operand:V 3 "bcst_vector_operand" "vmBr") + [(match_operand:V 1 "register_operand" "0,0") + (match_operand:V 2 "register_operand" "v,v") + (match_operand:V 3 "bcst_vector_operand" "vBr,m") (match_operand:SI 4 "const_0_to_255_operand")] UNSPEC_VTERNLOG))] - "TARGET_AVX512F + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) /* Disallow embeded broadcast for vector HFmode since it's not real AVX512FP16 instruction. */ && (GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 || GET_CODE (operands[3]) != VEC_DUPLICATE)" - "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}" +{ + if (TARGET_AVX512VL) +return "vpternlog\t{%4, %3, %2, %0|%0, %2, %3, %4}"; + else +return "vpternlog\t{%4, %g3, %g2, %g0|%g0, %g2, %g3, %4}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "evex") - (set_attr "mode" "")]) + (set (attr "mode") +(if_then_else (match_test "TARGET_AVX512VL") + (const_string "") + (const_string "XI"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref " == 64 || TARGET_AVX512VL") + (const_string "*")))]) ;; There must be lots of other combinations like ;; @@ -12443,7 +12456,8 @@ (any_logic2:V (match_operand:V 3 "regmem_or_bitnot_regmem_operand") (match_operand:V 4 "regmem_or_bitnot_regmem_operand"] - "( == 64 || TARGET_AVX512VL) + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12527,7 +12541,8 @@ (match_operand:V 2 "regmem_or_bitnot_regmem_operand")) (match_operand:V 3 "regmem_or_bitnot_regmem_operand")) (match_operand:V 4 "regmem_or_bitnot_regmem_operand")))] - "( == 64 || TARGET_AVX512VL) + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) && ix86_pre_reload_split () && (rtx_equal_p (STRIP_UNARY (operands[1]), STRIP_UNARY (operands[4])) @@ -12610,7 +12625,8 @@ (match_operand:V 1 "regmem_or_bitnot_regmem_o
Re: [PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
On 20.06.2023 10:33, Hongtao Liu wrote: > On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches > wrote: >> >> I guess the underlying pattern, going along the lines of what >> one_cmpl2 uses, can be applied elsewhere >> as well. > That should be guarded with !TARGET_PREFER_AVX256, let's handle that > in a separate patch. Sure, and as indicated there are more places where similar things could be done. >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c >> @@ -0,0 +1,32 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ > Please explicitly add -mprefer-vector-width=512, our tester will also > test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the > - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options > can overwrite that. Oh, I see. Will do. And I expect I then also need to adjust the newly added avx512f-dupv2di.c from the earlier patch. I guess I could commit that option addition there as obvious? > Others LGTM. May I take this as "okay with that change", or should I submit v4? Jan
[PATCH] x86: add -mprefer-vector-width=512 to new avx512f-dupv2di.c testcase
This is to cover testing also being done with -march=cascadelake. --- Committing as obvious. --- a/gcc/testsuite/gcc.target/i386/avx512f-dupv2di.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-dupv2di.c @@ -1,5 +1,5 @@ /* { dg-do compile { target { ! ia32 } } } */ -/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ /* { dg-final { scan-assembler-not "vmovddup\[^\n\]*%xmm16" } } */ typedef long long __attribute__ ((vector_size (16))) v2di;
[PATCH v2] x86: make better use of VBROADCASTSS / VPBROADCASTD
... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are never longer (yet sometimes shorter) than the corresponding VSHUFPS / VPSHUFD, due to the immediate operand of the shuffle insns balancing the possible need for VEX3 in the broadcast ones. When EVEX encoding is required the broadcast insns are always shorter. Add new alternatives to cover the AVX2 and AVX512 cases as appropriate. gcc/ * config/i386/sse.md (vec_dupv4sf): Make first alternative use vbroadcastss for AVX2. New AVX512F alternative. (*vec_dupv4si): New AVX2 and AVX512F alternatives using vpbroadcastd. --- Especially with the added "enabled" attribute I didn't really see how to (further) fold alternatives 0 and 1. Instead *vec_dupv4si might benefit from using sse2_noavx2 instead of sse2 for alternative 2, except that there is no sse2_noavx2, only sse2_noavx. Is there a reason why vec_dupv4sf uses sseshuf1 for its shuffle alternatives, but *vec_dupv4si uses sselog1? I'd be happy to correct this in whichever is the appropriate direction, while touching this anyway. I'm working from the assumption that the isa attributes to the original 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 or avx_noavx2 as applicable), as the new earlier alternatives cover all operand forms already when at least AVX2 is enabled. Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss use? (Same further down in *vec_dupv4si and avx2_vbroadcasti128_ and elsewhere.) --- v2: Correct operand constraints. Respect -mprefer-vector-width=. Fold two alternatives of vec_dupv4sf. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -26141,41 +26141,64 @@ (const_int 1)))]) (define_insn "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] "TARGET_SSE" "@ - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; + vbroadcastss\t{%1, %g0|%g0, %1} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "avx,avx,noavx") - (set_attr "type" "sseshuf1,ssemov,sseshuf1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_evex,maybe_evex,orig") - (set_attr "mode" "V4SF")]) + [(set_attr "isa" "avx,*,avx,noavx") + (set (attr "type") + (cond [(and (eq_attr "alternative" "0") + (match_test "!TARGET_AVX2")) +(const_string "sseshuf1") + (eq_attr "alternative" "3") +(const_string "sseshuf1") + ] + (const_string "ssemov"))) + (set (attr "length_immediate") + (if_then_else (eq_attr "type" "sseshuf1") + (const_string "1") + (const_string "0"))) + (set_attr "prefix_extra" "0,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] "TARGET_SSE" "@ + vpbroadcastd\t{%1, %0|%0, %1} + vpbroadcastd\t{%1, %g0|%g0, %1} %vpshufd\t{$0, %1, %0|%0, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,avx,noavx") - (set_attr "type" "sselog1,ssemov,sselog1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_vex,maybe_evex,orig") - (set_attr "mode" "TI,V4SF,V4SF") + [(set_attr "isa" "avx2,*,sse2,avx,noavx") + (set_attr "type" "ssemov,ssemov,sselog1,ssemov,sselog1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "0,0,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") (set (attr "preferred_for_speed") - (cond [(eq_attr "alternative" "1") + (cond [(eq_attr "alternative" "3") (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC") ] - (symbol_ref "true")))]) + (symbol_ref "true"))) + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL +
[PATCH 0/5] x86: make better use of VPTERNLOG{D,Q}
While there are some quite sophisticated 4-operand expanders, 2-operand binary logic which can't be expressed by just VPAND, VPANDN, VPOR, or VPXOR doesn't utilize this insn to carry out such operations in a single insn. Therefore the first two patches address one of the sub-aspects of PR target/93768 (which imo was closed prematurely), while the latter three ones extend what was done for PR target/100711. 1: use VPTERNLOG for further bitwise two-vector operations 2: use VPTERNLOG also for certain andnot forms 3: allow memory operand for AVX2 splitter for PR target/100711 4: further PR target/100711-like splitting 5: yet more PR target/100711-like splitting Jan
[PATCH 1/5] x86: use VPTERNLOG for further bitwise two-vector operations
All combinations of and, ior, xor, and not involving two operands can be expressed that way in a single insn. gcc/ PR target/93768 * config/i386/i386.cc (ix86_rtx_costs): Further special-case bitwise vector operations. * config/i386/sse.md (*iornot3): New insn. (*xnor3): Likewise. (*3): Likewise. (andor): New code iterator. (nlogic): New code attribute. (ternlog_nlogic): Likewise. gcc/testsuite/ PR target/93768 gcc.target/i386/avx512-binop-not-1.h: New. gcc.target/i386/avx512-binop-not-2.h: New. gcc.target/i386/avx512f-orn-si-zmm-1.c: New test. gcc.target/i386/avx512f-orn-si-zmm-2.c: New test. --- The use of VI matches that in e.g. one_cmpl2 / one_cmpl2 and *andnot3, despite (here and there) - V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled, - VTI not being covered, - vector modes more narrow than 16 bytes not being covered. --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode return false; case IOR: + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* (ior (not ...) ...) can be a single insn in AVX512. */ + if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16 + { + rtx right = GET_CODE (XEXP (x, 1)) != NOT + ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0); + + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (right, mode, outer_code, opno, speed); + return true; + } + *total = ix86_vec_cost (mode, cost->sse_op); + } + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; + else + *total = cost->add; + return false; + case XOR: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) *total = ix86_vec_cost (mode, cost->sse_op); @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode /* pandn is a single instruction. */ if (GET_CODE (XEXP (x, 0)) == NOT) { + rtx right = XEXP (x, 1); + + /* (and (not ...) (not ...)) can be a single insn in AVX512. */ + if (GET_CODE (right) == NOT && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16 + right = XEXP (right, 0); + *total = ix86_vec_cost (mode, cost->sse_op) + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, opno, speed) - + rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); + + rtx_cost (right, mode, outer_code, opno, speed); return true; } else if (GET_CODE (XEXP (x, 1)) == NOT) @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode case NOT: if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - // vnot is pxor -1. - *total = ix86_vec_cost (mode, cost->sse_op) + 1; + { + /* (not (xor ...)) can be a single insn in AVX512. */ + if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F + && (GET_MODE_SIZE (mode) == 64 + || (TARGET_AVX512VL + && (GET_MODE_SIZE (mode) == 32 + || GET_MODE_SIZE (mode) == 16 + { + *total = ix86_vec_cost (mode, cost->sse_op) + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed) + + rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + return true; + } + + // vnot is pxor -1. + *total = ix86_vec_cost (mode, cost->sse_op) + 1; + } else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) *total = cost->add * 2; else --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17616,6 +17616,98 @@ operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode)); }) +(define_insn "*iornot3" + [(set (match_operand:VI 0 "register_operand" "=v,v,v,v") + (ior:VI + (not:VI + (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m")) + (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))] + "( == 64 || TARGET_AVX512VL +|| (TARGET_AVX512F && !TARGET_PREFER_AVX256)) + && (register_operand
[PATCH 2/5] x86: use VPTERNLOG also for certain andnot forms
When it's the memory operand which is to be inverted, using VPANDN* requires a further load instruction. The same can be achieved by a single VPTERNLOG*. Add two new alternatives (for plain memory and embedded broadcast), adjusting the predicate for the first operand accordingly. Two pre-existing testcases actually end up being affected (improved) by the change, which is reflected in updated expectations there. gcc/ PR target/93768 * config/i386/sse.md (*andnot3): Add new alternatives for memory form operand 1. gcc/testsuite/ PR target/93768 * gcc.target/i386/avx512f-andn-di-zmm-2.c: New test. * gcc.target/i386/avx512f-andn-si-zmm-2.c: Adjust expecations towards generated code. * gcc.target/i386/pr100711-3.c: Adjust expectations for 32-bit code. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17210,11 +17210,13 @@ "TARGET_AVX512F") (define_insn "*andnot3" - [(set (match_operand:VI 0 "register_operand" "=x,x,v") + [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v") (and:VI - (not:VI (match_operand:VI 1 "vector_operand" "0,x,v")) - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr")))] - "TARGET_SSE" + (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br")) + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))] + "TARGET_SSE + && (register_operand (operands[1], mode) + || register_operand (operands[2], mode))" { char buf[64]; const char *ops; @@ -17281,6 +17283,15 @@ case 2: ops = "v%s%s\t{%%2, %%1, %%0|%%0, %%1, %%2}"; break; +case 3: +case 4: + tmp = "pternlog"; + ssesuffix = ""; + if (which_alternative != 4 || TARGET_AVX512VL) + ops = "v%s%s\t{$0x44, %%1, %%2, %%0|%%0, %%2, %%1, $0x44}"; + else + ops = "v%s%s\t{$0x44, %%g1, %%g2, %%g0|%%g0, %%g2, %%g1, $0x44}"; + break; default: gcc_unreachable (); } @@ -17289,7 +17300,7 @@ output_asm_insn (buf, operands); return ""; } - [(set_attr "isa" "noavx,avx,avx") + [(set_attr "isa" "noavx,avx,avx,*,*") (set_attr "type" "sselog") (set (attr "prefix_data16") (if_then_else @@ -17297,9 +17308,12 @@ (eq_attr "mode" "TI")) (const_string "1") (const_string "*"))) - (set_attr "prefix" "orig,vex,evex") + (set_attr "prefix" "orig,vex,evex,evex,evex") (set (attr "mode") - (cond [(match_test "TARGET_AVX2") + (cond [(and (eq_attr "alternative" "3,4") + (match_test " < 64 && !TARGET_AVX512VL")) +(const_string "XI") + (match_test "TARGET_AVX2") (const_string "") (match_test "TARGET_AVX") (if_then_else @@ -17310,7 +17324,15 @@ (match_test "optimize_function_for_size_p (cfun)")) (const_string "V4SF") ] - (const_string "")))]) + (const_string ""))) + (set (attr "enabled") + (cond [(eq_attr "alternative" "3") +(symbol_ref " == 64 || TARGET_AVX512VL") + (eq_attr "alternative" "4") +(symbol_ref " == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)") + ] + (const_string "*")))]) ;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn (define_split --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-andn-di-zmm-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ +/* { dg-final { scan-assembler-times "vpternlogq\[ \\t\]+\\\$0x44, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ + +#define type __m512i +#define vec 512 +#define op andnot +#define suffix epi64 +#define SCALAR long long + +#include "avx512-binop-2.h" --- a/gcc/testsuite/gcc.target/i386/avx512f-andn-si-zmm-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-andn-si-zmm-2.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vpbroadcastd\[^\n\]*%zmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vpandnd\[^\n\]*%zmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0x44, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */ +/* { dg-final { scan-assembler-not "vpbroadcast" } } */ #define type __m512i #define vec 512 --- a/gcc/testsuite/gcc.target/i386/pr100711-3.c +++ b/gcc/testsuite/gcc.target/i386/pr100711-3.c @@ -37,4 +37,6 @@ v8di foo_v8di (long long a, v8di b) return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b; } -/* { dg-final { scan-assembler-times "vpandn" 4 } } */ +/* { dg-final { scan-assembler-times "vpandn" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-a
[PATCH 3/5] x86: allow memory operand for AVX2 splitter for PR target/100711
The intended broadcast (with AVX512) can very well be done right from memory. gcc/ * config/i386/sse.md: Permit non-immediate operand 1 in AVX2 form of splitter for PR target/100711. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17356,7 +17356,7 @@ (and:VI_AVX2 (vec_duplicate:VI_AVX2 (not: - (match_operand: 1 "register_operand"))) + (match_operand: 1 "nonimmediate_operand"))) (match_operand:VI_AVX2 2 "vector_operand")))] "TARGET_AVX2" [(set (match_dup 3)
[PATCH 4/5] x86: further PR target/100711-like splitting
With respective two-operand bitwise operations now expressable by a single VPTERNLOG, add splitters to also deal with ior and xor counterparts of the original and-only case. Note that the splitters need to be separate, as the placement of "not" differs in the final insns (*iornot3, *xnor3) which are intended to pick up one half of the result. gcc/ * config/i386/sse.md: New splitters to simplify not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. gcc/testsuite/ * gcc.target/i386/pr100711-4.c: New test. * gcc.target/i386/pr100711-5.c: New test. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17366,6 +17366,36 @@ (match_dup 2)))] "operands[3] = gen_reg_rtx (mode);") +(define_split + [(set (match_operand:VI 0 "register_operand") + (ior:VI + (vec_duplicate:VI + (not: + (match_operand: 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + " == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] + "operands[3] = gen_reg_rtx (mode);") + +(define_split + [(set (match_operand:VI 0 "register_operand") + (xor:VI + (vec_duplicate:VI + (not: + (match_operand: 1 "nonimmediate_operand"))) + (match_operand:VI 2 "vector_operand")))] + " == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 3) + (vec_duplicate:VI (match_dup 1))) + (set (match_dup 0) + (not:VI (xor:VI (match_dup 3) (match_dup 2] + "operands[3] = gen_reg_rtx (mode);") + (define_insn "*andnot3_mask" [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v") (vec_merge:VI48_AVX512VL --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-4.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ +return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v32hi foo_v32hi (short a, v32hi b) +{ +return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v16si foo_v16si (int a, v16si b) +{ +return (__extension__ (v16si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +v8di foo_v8di (long long a, v8di b) +{ +return (__extension__ (v8di) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) | b; +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xbb" 2 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0xdd" 2 { target { ia32 } } } } */ --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-5.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef short v32hi __attribute__ ((vector_size (64))); +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v64qi foo_v64qi (char a, v64qi b) +{ +return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) ^ b; +} + +v32hi foo_v32hi (short a, v32hi b) +{ +return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, + ~a, ~a, ~a, ~a, ~a, ~a, ~
[PATCH 5/5] x86: yet more PR target/100711-like splitting
Following two-operand bitwise operations, add another splitter to also deal with not followed by broadcast all on its own, which can be expressed as simple embedded broadcast instead once a broadcast operand is actually permitted in the respective insn. While there also permit a broadcast operand in the corresponding expander. gcc/ * config/i386/sse.md: New splitters to simplify not;vec_duplicate as a singular vpternlog. (one_cmpl2): Allow broadcast for operand 1. (one_cmpl2): Likewise. gcc/testsuite/ * gcc.target/i386/pr100711-6.c: New test. --- For the purpose here (and elsewhere) bcst_vector_operand() (really: bcst_mem_operand()) isn't permissive enough: We'd want it to allow 128-bit and 256-bit types as well irrespective of AVX512VL being enabled. This would likely require a new predicate (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name selection it will want considering that this is applicable to certain non-calculational FP operations as well.) --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17156,7 +17156,7 @@ (define_expand "one_cmpl2" [(set (match_operand:VI 0 "register_operand") - (xor:VI (match_operand:VI 1 "vector_operand") + (xor:VI (match_operand:VI 1 "bcst_vector_operand") (match_dup 2)))] "TARGET_SSE" { @@ -17168,7 +17168,7 @@ (define_insn "one_cmpl2" [(set (match_operand:VI 0 "register_operand" "=v,v") - (xor:VI (match_operand:VI 1 "nonimmediate_operand" "v,m") + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m") (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))] "TARGET_AVX512F && (! @@ -17191,6 +17191,19 @@ (symbol_ref " == 64 || TARGET_AVX512VL") (const_int 1)))]) +(define_split + [(set (match_operand:VI48_AVX512F 0 "register_operand") + (vec_duplicate:VI48_AVX512F + (not: + (match_operand: 1 "nonimmediate_operand"] + " == 64 || TARGET_AVX512VL + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" + [(set (match_dup 0) + (xor:VI48_AVX512F + (vec_duplicate:VI48_AVX512F (match_dup 1)) + (match_dup 2)))] + "operands[2] = CONSTM1_RTX (mode);") + (define_expand "_andnot3" [(set (match_operand:VI_AVX2 0 "register_operand") (and:VI_AVX2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr100711-6.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ + +typedef int v16si __attribute__ ((vector_size (64))); +typedef long long v8di __attribute__((vector_size (64))); + +v16si foo_v16si (const int *a) +{ +return (__extension__ (v16si) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, + ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a}); +} + +v8di foo_v8di (const long long *a) +{ +return (__extension__ (v8di) {~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a, ~*a}); +} + +/* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$0x55, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}" 2 } } */
Re: [PATCH v2] x86: make better use of VBROADCASTSS / VPBROADCASTD
On 21.06.2023 09:37, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches > wrote: >> >> Is there a reason why vec_dupv4sf uses sseshuf1 for its shuffle >> alternatives, but *vec_dupv4si uses sselog1? I'd be happy to correct >> this in whichever is the appropriate direction, while touching this >> anyway. > It should be sseshuf1(or sseshuf depending on input operands number in > the pattern) for shufps, sselog means logical instructions. Would you be okay for me to fold in that adjustment, or do you insist on a separate patch? >> I'm working from the assumption that the isa attributes to the original >> 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 >> or avx_noavx2 as applicable), as the new earlier alternatives cover all >> operand forms already when at least AVX2 is enabled. >> >> Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss > According to comments, yes, no extra prefix is needed. > > ;; There are also additional prefixes in 3DNOW, SSSE3. > ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, > ;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. > ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. Right, that's what triggered my question. I guess dropping these "prefix_extra" really wants to be a separate patch (or maybe even multiple, but it's hard to see how to split), dealing with all of the instances which likely have accumulated simply via copy-and-paste. >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -26141,41 +26141,64 @@ >> (const_int 1)))]) >> >> (define_insn "vec_dupv4sf" >> - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") >> + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") >> (vec_duplicate:V4SF >> - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] >> + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] >>"TARGET_SSE" >>"@ >> - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} >> + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : >> \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; >> + vbroadcastss\t{%1, %g0|%g0, %1} >> vbroadcastss\t{%1, %0|%0, %1} >> shufps\t{$0, %0, %0|%0, %0, 0}" >> - [(set_attr "isa" "avx,avx,noavx") >> - (set_attr "type" "sseshuf1,ssemov,sseshuf1") >> - (set_attr "length_immediate" "1,0,1") >> - (set_attr "prefix_extra" "0,1,*") >> - (set_attr "prefix" "maybe_evex,maybe_evex,orig") >> - (set_attr "mode" "V4SF")]) >> + [(set_attr "isa" "avx,*,avx,noavx") >> + (set (attr "type") >> + (cond [(and (eq_attr "alternative" "0") >> + (match_test "!TARGET_AVX2")) >> +(const_string "sseshuf1") >> + (eq_attr "alternative" "3") >> +(const_string "sseshuf1") >> + ] >> + (const_string "ssemov"))) >> + (set (attr "length_immediate") >> + (if_then_else (eq_attr "type" "sseshuf1") >> + (const_string "1") >> + (const_string "0"))) >> + (set_attr "prefix_extra" "0,0,1,*") >> + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") >> + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") >> + (set (attr "enabled") >> + (if_then_else (eq_attr "alternative" "1") >> + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL >> + && !TARGET_PREFER_AVX256") >> + (const_string "*")))]) >> >> (define_insn "*vec_dupv4si" >> - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") >> + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") >> (vec_duplicate:V4SI >> - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] >> + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] >>"TARGET_SSE" >>"@ >> + vpbroadcastd\t{%1, %0|%0, %1}
Re: [PATCH v2] x86: make better use of VBROADCASTSS / VPBROADCASTD
On 21.06.2023 09:44, Jan Beulich wrote: > On 21.06.2023 09:37, Hongtao Liu wrote: >> On Wed, Jun 21, 2023 at 2:06 PM Jan Beulich via Gcc-patches >> wrote: >>> >>> Isn't prefix_extra use bogus here? What extra prefix does vbroadcastss >> According to comments, yes, no extra prefix is needed. >> >> ;; There are also additional prefixes in 3DNOW, SSSE3. >> ;; ssemuladd,sse4arg default to 0f24/0f25 and DREX byte, >> ;; sseiadd1,ssecvt1 to 0f7a with no DREX byte. >> ;; 3DNOW has 0f0f prefix, SSSE3 and SSE4_{1,2} 0f38/0f3a. > > Right, that's what triggered my question. I guess dropping these > "prefix_extra" really wants to be a separate patch (or maybe even > multiple, but it's hard to see how to split), dealing with all of the > instances which likely have accumulated simply via copy-and-paste. Or wait - I'm altering those lines anyway, so I could as well drop them right away (and slightly shrink patch size), if that's okay with you. Of course I should then not forget to also mention this in the changelog entry. Jan
Re: [PATCH 1/5] x86: use VPTERNLOG for further bitwise two-vector operations
On 25.06.2023 06:42, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches > wrote: >> >> +(define_code_iterator andor [and ior]) >> +(define_code_attr nlogic [(and "nor") (ior "nand")]) >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) >> + >> +(define_insn "*3" >> + [(set (match_operand:VI 0 "register_operand" "=v,v") >> + (andor:VI >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"] > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform > (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) This wouldn't be a win (not + andn) -> (or + not), but what's more important is ... > (ior (not op1) (not op2)) -> (not : (and op1 op2)) > > Even w/o avx512f, the transformation should also benefit since it > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). ... that these transformations (from the, as per the doc, canonical representation of nand and nor) are already occurring in common code, _if_ no suitable insn can be found. That was at least the conclusion I drew from looking around a lot, supported by the code that's generated prior to this change. Jan
Re: [PATCH 4/5] x86: further PR target/100711-like splitting
On 25.06.2023 07:06, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:28 PM Jan Beulich via Gcc-patches > wrote: >> >> With respective two-operand bitwise operations now expressable by a >> single VPTERNLOG, add splitters to also deal with ior and xor >> counterparts of the original and-only case. Note that the splitters need >> to be separate, as the placement of "not" differs in the final insns >> (*iornot3, *xnor3) which are intended to pick up one half of >> the result. >> >> gcc/ >> >> * config/i386/sse.md: New splitters to simplify >> not;vec_duplicate;{ior,xor} as vec_duplicate;{iornot,xnor}. >> >> gcc/testsuite/ >> >> * gcc.target/i386/pr100711-4.c: New test. >> * gcc.target/i386/pr100711-5.c: New test. >> >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -17366,6 +17366,36 @@ >> (match_dup 2)))] >>"operands[3] = gen_reg_rtx (mode);") >> >> +(define_split >> + [(set (match_operand:VI 0 "register_operand") >> + (ior:VI >> + (vec_duplicate:VI >> + (not: >> + (match_operand: 1 "nonimmediate_operand"))) >> + (match_operand:VI 2 "vector_operand")))] >> + " == 64 || TARGET_AVX512VL >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" >> + [(set (match_dup 3) >> + (vec_duplicate:VI (match_dup 1))) >> + (set (match_dup 0) >> + (ior:VI (not:VI (match_dup 3)) (match_dup 2)))] >> + "operands[3] = gen_reg_rtx (mode);") >> + >> +(define_split >> + [(set (match_operand:VI 0 "register_operand") >> + (xor:VI >> + (vec_duplicate:VI >> + (not: >> + (match_operand: 1 "nonimmediate_operand"))) >> + (match_operand:VI 2 "vector_operand")))] >> + " == 64 || TARGET_AVX512VL >> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256)" >> + [(set (match_dup 3) >> + (vec_duplicate:VI (match_dup 1))) >> + (set (match_dup 0) >> + (not:VI (xor:VI (match_dup 3) (match_dup 2] >> + "operands[3] = gen_reg_rtx (mode);") >> + > Can we merge this splitter(xor:not) into ior:not one with a code > iterator for xor,ior, They look the same except for the xor/ior. They're only almost the same: Note (ior (not )) vs (not (xor )) as the result of the splitting. The difference is necessary to fit with what patch 1 introduces (which in turn is the way it is to fit with what generic code transforms things to up front). (I had it the way you suggest initially, until I figured why one of the two would end up never being used.) Jan
Re: [PATCH 5/5] x86: yet more PR target/100711-like splitting
On 25.06.2023 07:12, Hongtao Liu wrote: > On Wed, Jun 21, 2023 at 2:29 PM Jan Beulich via Gcc-patches > wrote: >> >> --- >> For the purpose here (and elsewhere) bcst_vector_operand() (really: >> bcst_mem_operand()) isn't permissive enough: We'd want it to allow >> 128-bit and 256-bit types as well irrespective of AVX512VL being >> enabled. This would likely require a new predicate >> (bcst_intvec_operand()?) and a new constraint (BR? Bi?). (Yet for name >> selection it will want considering that this is applicable to certain >> non-calculational FP operations as well.) > I think so. Any preference towards predicate and constraint naming? Plus I think there's a more general question behind this: A new predicate / constraint pair is likely just one way of dealing with the issue. Another would appear to be to remove the restriction of 128- and 256-byte types when AVX512VL is not enabled, but AVX512F is. While that would require touching a lot of insn constraints, it looks as if lifting that restriction would "merely" require much wider use of Yv where v is used right now. But of course I may well be unaware of (some of) the reasons why that restriction was put in place in the first place (it can't really be the lack of suitable move insns, as those can be synthesized by using e.g. vextract{32,64}x4). Jan
Re: [PATCH 1/5] x86: use VPTERNLOG for further bitwise two-vector operations
On 25.06.2023 09:30, Hongtao Liu wrote: > On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu wrote: >> >> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu wrote: >>> >>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich wrote: >>>> >>>> On 25.06.2023 06:42, Hongtao Liu wrote: >>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches >>>>> wrote: >>>>>> >>>>>> +(define_code_iterator andor [and ior]) >>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")]) >>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")]) >>>>>> + >>>>>> +(define_insn "*3" >>>>>> + [(set (match_operand:VI 0 "register_operand" "=v,v") >>>>>> + (andor:VI >>>>>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v")) >>>>>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"] >>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform >>>>> (and (not op1)) (not op2)) -> (not: (ior: op1 op2)) >>>> >>>> This wouldn't be a win (not + andn) -> (or + not), but what's >>>> more important is ... >>>> >>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2)) >>>>> >>>>> Even w/o avx512f, the transformation should also benefit since it >>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn). >>>> >>>> ... that these transformations (from the, as per the doc, >>>> canonical representation of nand and nor) are already occurring >>> I see, there're already such simplifications in the gimple phase, so >>> the question: is there any need for and/ior:not not pattern? >>> Can you provide a testcase to demonstrate that and/ior: not not >>> pattern is needed? >> >> typedef int v4si __attribute__((vector_size(16))); >> v4si >> foo1 (v4si a, v4si b) >> { >> return ~a & ~b; >> } >> >> I only gimple have optimized it to >> >>[local count: 1073741824]: >> # DEBUG BEGIN_STMT >> _1 = a_2(D) | b_3(D); >> _4 = ~_1; >> return _4; >> >> >> But rtl still try to match >> >> (set (reg:V4SI 86) >> (and:V4SI (not:V4SI (reg:V4SI 88)) >> (not:V4SI (reg:V4SI 89 >> >> Hmm. > In rtl, we're using xor -1 for not, so it's > > (insn 8 7 9 2 (set (reg:V4SI 87) > (ior:V4SI (reg:V4SI 88) > (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3} > (expr_list:REG_DEAD (reg:V4SI 89) > (expr_list:REG_DEAD (reg:V4SI 88) > (nil > (insn 9 8 14 2 (set (reg:V4SI 86) > (xor:V4SI (reg:V4SI 87) > (const_vector:V4SI [ > (const_int -1 [0x]) repeated x4 > ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2} > > Then simplified to >> (set (reg:V4SI 86) >> (and:V4SI (not:V4SI (reg:V4SI 88)) >> (not:V4SI (reg:V4SI 89 >> > > by > > 3565case XOR: > 3566 if (trueop1 == CONST0_RTX (mode)) > 3567return op0; > 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode)) > 3569return simplify_gen_unary (NOT, mode, op0, mode); > > and > > 1018 /* Apply De Morgan's laws to reduce number of patterns for machines > 1019 with negating logical insns (and-not, nand, etc.). If result has > 1020 only one NOT, put it first, since that is how the patterns are > 1021 coded. */ > 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND) > 1023{ > 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1); > 1025 machine_mode op_mode; > 1026 > 1027 op_mode = GET_MODE (in1); > 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode); > 1029 > 1030 op_mode = GET_MODE (in2); > 1031 if (op_mode == VOIDmode) > 1032op_mode = mode; > 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode); > 1034 > 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT) > 1036std::swap (in1, in2); > 1037 > 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR, > 1039 mode, in1, in2); > 1040} > > > Ok, got it, and/ior:not not pattern LGTM then. Just to avoid misunderstandings - together with your initial reply that's then an "okay" to the patch as a whole, right? Thanks, Jan
Re: [PATCH v3] x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F
On 27.06.2023 07:11, Hongtao Liu wrote: > On Tue, Jun 20, 2023 at 5:34 PM Hongtao Liu wrote: >> >> On Tue, Jun 20, 2023 at 5:03 PM Jan Beulich wrote: >>> >>> On 20.06.2023 10:33, Hongtao Liu wrote: >>>> On Tue, Jun 20, 2023 at 3:07 PM Jan Beulich via Gcc-patches >>>> wrote: >>>>> >>>>> I guess the underlying pattern, going along the lines of what >>>>> one_cmpl2 uses, can be applied elsewhere >>>>> as well. >>>> That should be guarded with !TARGET_PREFER_AVX256, let's handle that >>>> in a separate patch. >>> >>> Sure, and as indicated there are more places where similar things could >>> be done. >>> >>>>> --- /dev/null >>>>> +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c >>>>> @@ -0,0 +1,32 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-mavx512f -mno-avx512vl -O2" } */ >>>> Please explicitly add -mprefer-vector-width=512, our tester will also >>>> test unix{-m32 \-march=cascadelake,\ -march=cascadelake} which set the >>>> - mprefer-vector-width=256, -mprefer-vector-width=512 in dg-options >>>> can overwrite that. >>> >>> Oh, I see. Will do. And I expect I then also need to adjust the newly >>> added avx512f-dupv2di.c from the earlier patch. I guess I could commit >>> that option addition there as obvious? >> Still need to send out the patch, and commit as an obvious fix. >>> >>>> Others LGTM. >>> >>> May I take this as "okay with that change", or should I submit v4? >> Okay. no need for a v4 version. >>> > avx512f-copysign.c failed for -m32, we need to add -mfpmath=sse to dg-options. Oh, of course. I will take care of this, but it may take me a couple of days, as I just came back from a week of vacation. One question though: Elsewhere such tests are simply suppressed for 32-bit. Personally I'd prefer going that route, but if you think adding -mfpmath=sse is indeed better, I'll follow your request. Jan
[PATCH] x86: suppress avx512f-copysign.c testcase for 32-bit
The test installed by "x86: make VPTERNLOG* usable on less than 512-bit operands with just AVX512F" won't succeed on 32-bit, for floating point operations being done there (by default) without using SIMD insns. gcc/testsuite/ * gcc.target/i386/avx512f-copysign.c: Suppress for 32-bit. --- Committing right away based on previous communication with maintainer. --- a/gcc/testsuite/gcc.target/i386/avx512f-copysign.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-copysign.c @@ -1,4 +1,4 @@ -/* { dg-do compile } */ +/* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */ /* { dg-final { scan-assembler-times "vpternlog\[dq\]\[ \\t\]+\\\$(?:216|228|0xd8|0xe4)," 5 } } */
[PATCH 0/2] x86: vec_extract_* adjustments
1: correct / simplify @vec_extract_hi_ and vec_extract_hi_v32qi 2: slightly correct / simplify *vec_extractv2ti Jan
[PATCH 1/2] x86: correct / simplify @vec_extract_hi_ and vec_extract_hi_v32qi
The middle alternative each was unusable without enabling AVX512DQ (in addition to AVX512VL), which is entirely unrelated here. The last alternative is usable with AVX512VL only (due to type restrictions on what may be put in the upper 16 YMM registers), and hence is pointlessly forcing 512-bit mode (without actually reflecting that in the "mode" attribute). gcc/ * config/i386/sse.md (@vec_extract_hi_): Drop last alternative. Switch new last alternative's "isa" attribute to "avx512vl". (vec_extract_hi_v32qi): Likewise. --- Like elsewhere I suspect "prefix_extra" is bogus here and should be dropped. Is "sselog1" actually appropriate here? Extracts are special forms of moves after all, not logical operations. Even "sseshuf1" would seem to come closer. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -12029,9 +12029,9 @@ "operands[1] = gen_lowpart (mode, operands[1]);") (define_insn "@vec_extract_hi_" - [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm,vm") + [(set (match_operand: 0 "nonimmediate_operand" "=xm,vm") (vec_select: - (match_operand:V16_256 1 "register_operand" "x,v,v") + (match_operand:V16_256 1 "register_operand" "x,v") (parallel [(const_int 8) (const_int 9) (const_int 10) (const_int 11) (const_int 12) (const_int 13) @@ -12039,13 +12039,12 @@ "TARGET_AVX" "@ vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1} - vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1} - vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}" + vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") - (set_attr "isa" "*,avx512dq,avx512f") - (set_attr "prefix" "vex,evex,evex") + (set_attr "isa" "*,avx512vl") + (set_attr "prefix" "vex,evex") (set_attr "mode" "OI")]) (define_insn_and_split "vec_extract_lo_v64qi" @@ -12144,9 +12143,9 @@ "operands[1] = gen_lowpart (V16QImode, operands[1]);") (define_insn "vec_extract_hi_v32qi" - [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm,vm") + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=xm,vm") (vec_select:V16QI - (match_operand:V32QI 1 "register_operand" "x,v,v") + (match_operand:V32QI 1 "register_operand" "x,v") (parallel [(const_int 16) (const_int 17) (const_int 18) (const_int 19) (const_int 20) (const_int 21) @@ -12158,13 +12157,12 @@ "TARGET_AVX" "@ vextract%~128\t{$0x1, %1, %0|%0, %1, 0x1} - vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1} - vextracti32x4\t{$0x1, %g1, %0|%0, %g1, 0x1}" + vextracti32x4\t{$0x1, %1, %0|%0, %1, 0x1}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1") - (set_attr "isa" "*,avx512dq,avx512f") - (set_attr "prefix" "vex,evex,evex") + (set_attr "isa" "*,avx512vl") + (set_attr "prefix" "vex,evex") (set_attr "mode" "OI")]) ;; NB: *vec_extract_0 must be placed before *vec_extracthf.
[PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti
V2TImode values cannot appear in the upper 16 YMM registers without AVX512VL being enabled. Therefore forcing 512-bit mode (also not reflected in the "mode" attribute) is pointless. gcc/ * config/i386/sse.md (*vec_extractv2ti): Drop g modifiers. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -20115,7 +20115,7 @@ "TARGET_AVX" "@ vextract%~128\t{%2, %1, %0|%0, %1, %2} - vextracti32x4\t{%2, %g1, %0|%0, %g1, %2}" + vextracti32x4\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") (set_attr "length_immediate" "1")
Re: [PATCH 1/2] x86: correct / simplify @vec_extract_hi_ and vec_extract_hi_v32qi
On 05.07.2023 10:40, Hongtao Liu wrote: > On Wed, Jul 5, 2023 at 4:00 PM Jan Beulich via Gcc-patches > wrote: >> >> The middle alternative each was unusable without enabling AVX512DQ (in >> addition to AVX512VL), which is entirely unrelated here. The last >> alternative is usable with AVX512VL only (due to type restrictions on >> what may be put in the upper 16 YMM registers), and hence is pointlessly >> forcing 512-bit mode (without actually reflecting that in the "mode" >> attribute). > Ok. Thanks. >> --- >> Like elsewhere I suspect "prefix_extra" is bogus here and should be >> dropped. >> >> Is "sselog1" actually appropriate here? Extracts are special forms of >> moves after all, not logical operations. Even "sseshuf1" would seem to >> come closer. > Honestly, I don't know why it's marked as sselog1, but looking at the > code, almost all vec_extract patterns are marked as sselog1, guess > it's originally from pextr. > Agree that it's should be more close to shuffle instructions. Yet as said I think these are special forms of moves. To me "shuffle" involves more than one element. Yet then I don't really know what the "type" attributes are used for (other than vaguely "for scheduling"), and hence whether treating extracts as shuffles would be more appropriate. (IOW I'd be happy to make a patch to convert all extracts, but I'd need to know whether the conversion should be to "sseshuf", "sseshuf1", or "ssemov". In the former two cases knowing the "Why?" would also help, especially for writing a sensible description. I also haven't found any explanation towards the difference between sse and sse1: The "memory" attribute evaluates to "both" for the 1 forms if operand 1 is in memory, yet that doesn't seem to fit any of the uses here.) Jan
Re: [PATCH 2/2] x86: slightly correct / simplify *vec_extractv2ti
On 05.07.2023 10:47, Hongtao Liu wrote: > On Wed, Jul 5, 2023 at 4:01 PM Jan Beulich via Gcc-patches > wrote: >> >> V2TImode values cannot appear in the upper 16 YMM registers without >> AVX512VL being enabled. Therefore forcing 512-bit mode (also not >> reflected in the "mode" attribute) is pointless. > Please set isa attribute for alternative 1 to avx512vl. Since that looks redundant to me (as per the description), would you mind explaining why that's necessary / wanted? It also feels orthogonal to the change I'm making, as there was no "isa" attribute so far (which would have wanted to be "avx512f" as per what you ask for, prior to the change I'm making). Again me asking back is primarily to properly describe the changes I'm making, of course along with me still needing to properly understand when what attribute needs specifying explicitly. Jan
Re: [r14-2310 Regression] FAIL: gcc.target/i386/pr53652-1.c scan-assembler-times pandn[ \\t] 2 on Linux/x86_64
On 06.07.2023 13:57, haochen.jiang wrote: > On Linux/x86_64, > > 2d11c99dfca3cc603dbbfafb3afc41689a68e40f is the first bad commit > commit 2d11c99dfca3cc603dbbfafb3afc41689a68e40f > Author: Jan Beulich > Date: Wed Jul 5 09:41:09 2023 +0200 > > x86: use VPTERNLOG also for certain andnot forms > > caused > > FAIL: gcc.target/i386/pr53652-1.c scan-assembler-not vpternlogq[ \\t] The respective expectation was never valid to add without excluding cases where -march= overrides (extends) the -msse2 that the test specifies explicitly. I'm afraid I don't know how to tweak a testcase to properly deal with that. Perhaps (like iirc was suggested elsewhere) -mno-avx512f, but honestly this approach feels clumsy to me. Cc-ing Hongtao, who I think suggested that approach elsewhere. > FAIL: gcc.target/i386/pr53652-1.c scan-assembler-times pandn[ \\t] 2 Aiui this is merely a knock-on effect. Jan > with GCC configured with > > ../../gcc/configure > --prefix=/export/users/haochenj/src/gcc-bisect/master/master/r14-2310/usr > --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld > --with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl > --enable-libmpx x86_64-linux --disable-bootstrap > > To reproduce: > > $ cd {build_dir}/gcc && make check > RUNTESTFLAGS="i386.exp=gcc.target/i386/pr53652-1.c --target_board='unix{-m32\ > -march=cascadelake}'" > $ cd {build_dir}/gcc && make check > RUNTESTFLAGS="i386.exp=gcc.target/i386/pr53652-1.c --target_board='unix{-m64\ > -march=cascadelake}'" > > (Please do not reply to this email, for question about this report, contact > me at haochen dot jiang at intel.com)
Re: [r14-2314 Regression] FAIL: gcc.target/i386/pr100711-2.c scan-assembler-times vpandn 8 on Linux/x86_64
On 06.07.2023 13:57, haochen.jiang wrote: > On Linux/x86_64, > > e007369c8b67bcabd57c4fed8cff2a6db82e78e6 is the first bad commit > commit e007369c8b67bcabd57c4fed8cff2a6db82e78e6 > Author: Jan Beulich > Date: Wed Jul 5 09:49:16 2023 +0200 > > x86: yet more PR target/100711-like splitting > > caused > > FAIL: gcc.target/i386/pr100711-1.c scan-assembler-times pandn 2 > FAIL: gcc.target/i386/pr100711-2.c scan-assembler-times vpandn 8 I expect the same applies here - -mno-avx512f (or -mno-avx512vl) might address this failure. But whether that's really the way to go I'm not sure of. Plus of course such adjustments should have been done ahead of time, when it was decided that testing with certain -march= settings is a goal. My changes have merely uncovered the prior omissions. Jan > with GCC configured with > > ../../gcc/configure > --prefix=/export/users/haochenj/src/gcc-bisect/master/master/r14-2314/usr > --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld > --with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl > --enable-libmpx x86_64-linux --disable-bootstrap > > To reproduce: > > $ cd {build_dir}/gcc && make check > RUNTESTFLAGS="i386.exp=gcc.target/i386/pr100711-1.c > --target_board='unix{-m32\ -march=cascadelake}'" > $ cd {build_dir}/gcc && make check > RUNTESTFLAGS="i386.exp=gcc.target/i386/pr100711-2.c > --target_board='unix{-m32\ -march=cascadelake}'" > > (Please do not reply to this email, for question about this report, contact > me at haochen dot jiang at intel.com)
Re: [r14-2310 Regression] FAIL: gcc.target/i386/pr53652-1.c scan-assembler-times pandn[ \\t] 2 on Linux/x86_64
On 07.07.2023 09:30, Hongtao Liu wrote: > On Fri, Jul 7, 2023 at 3:13 PM Jan Beulich via Gcc-regression > wrote: >> >> On 06.07.2023 13:57, haochen.jiang wrote: >>> On Linux/x86_64, >>> >>> 2d11c99dfca3cc603dbbfafb3afc41689a68e40f is the first bad commit >>> commit 2d11c99dfca3cc603dbbfafb3afc41689a68e40f >>> Author: Jan Beulich >>> Date: Wed Jul 5 09:41:09 2023 +0200 >>> >>> x86: use VPTERNLOG also for certain andnot forms >>> >>> caused >>> >>> FAIL: gcc.target/i386/pr53652-1.c scan-assembler-not vpternlogq[ \\t] >> >> The respective expectation was never valid to add without excluding >> cases where -march= overrides (extends) the -msse2 that the test >> specifies explicitly. I'm afraid I don't know how to tweak a testcase >> to properly deal with that. Perhaps (like iirc was suggested elsewhere) >> -mno-avx512f, but honestly this approach feels clumsy to me. Cc-ing >> Hongtao, who I think suggested that approach elsewhere. >> >>> FAIL: gcc.target/i386/pr53652-1.c scan-assembler-times pandn[ \\t] 2 > There're a false dependence when using pternlog for andnot(and other > newly added) pattern, i'm working on a patch to avoid that(PR110438). > Let me handle the test case. Of course I'm happy to see you handle the testcase, but if you don't mind I'm curious towards the connection you see between that false dependency issue and the adjustments missing in this (and other) testcase(s). Jan
Re: [r14-2314 Regression] FAIL: gcc.target/i386/pr100711-2.c scan-assembler-times vpandn 8 on Linux/x86_64
On 07.07.2023 09:46, Hongtao Liu wrote: > On Fri, Jul 7, 2023 at 3:18 PM Jan Beulich via Gcc-regression > wrote: >> >> On 06.07.2023 13:57, haochen.jiang wrote: >>> On Linux/x86_64, >>> >>> e007369c8b67bcabd57c4fed8cff2a6db82e78e6 is the first bad commit >>> commit e007369c8b67bcabd57c4fed8cff2a6db82e78e6 >>> Author: Jan Beulich >>> Date: Wed Jul 5 09:49:16 2023 +0200 >>> >>> x86: yet more PR target/100711-like splitting >>> >>> caused >>> >>> FAIL: gcc.target/i386/pr100711-1.c scan-assembler-times pandn 2 >>> FAIL: gcc.target/i386/pr100711-2.c scan-assembler-times vpandn 8 >> >> I expect the same applies here - -mno-avx512f (or -mno-avx512vl) might > For this one, we can just add -mno-avx512f to the testcase,it aims to > optimize pandn for avx2 target. >> address this failure. But whether that's really the way to go I'm not >> sure of. Plus of course such adjustments should have been done ahead >> of time, when it was decided that testing with certain -march= settings >> is a goal. My changes have merely uncovered the prior omissions. > It's not a standard request, it's just our private tester which is > used to find gcc bugs and miss-optimizations. > It sometimes generates false positive reports (usually adding > -mno-avx512f to the testcase can fix that), hope that's not too > annoying. Wouldn't that then better be done once uniformly for all affected tests, rather than being discovered piecemeal? Anyway, in this case: Since you said you'd take care of the other test, will/can you do so for the two ones here as well, or am I on the hook? Jan
[PATCH v3] x86: make better use of VBROADCASTSS / VPBROADCASTD
... in vec_dupv4sf / *vec_dupv4si. The respective broadcast insns are never longer (yet sometimes shorter) than the corresponding VSHUFPS / VPSHUFD, due to the immediate operand of the shuffle insns balancing the (uniform) need for VEX3 in the broadcast ones. When EVEX encoding is respective the broadcast insns are always shorter. Add new alternatives to cover the AVX2 and AVX512 cases as appropriate. While touching this anyway, switch to consistently using "sseshuf1" in the "type" attributes for all shuffle forms. gcc/ * config/i386/sse.md (vec_dupv4sf): Make first alternative use vbroadcastss for AVX2. New AVX512F alternative. (*vec_dupv4si): New AVX2 and AVX512F alternatives using vpbroadcastd. Replace sselog1 by sseshuf1 in "type" attribute. gcc/testsuite/ * gcc.target/i386/avx2-dupv4sf.c: New test. * gcc.target/i386/avx2-dupv4si.c: Likewise. * gcc.target/i386/avx512f-dupv4sf.c: Likewise. * gcc.target/i386/avx512f-dupv4si.c: Likewise. --- Note that unlike originally intended, "prefix_extra" isn't dropped: "length_vex" uses it to determine whether 2-byte VEX encoding is possible (which it isn't for VBROADCASTSS / VPBROADCASTD). "length" itself specifically does not use it for VEX/EVEX encoded insns. Especially with the added "enabled" attribute I didn't really see how to (further) fold alternatives 0 and 1. Instead *vec_dupv4si might benefit from using sse2_noavx2 instead of sse2 for alternative 2, except that there is no sse2_noavx2, only sse2_noavx. I'm working from the assumption that the isa attributes to the original 1st and 2nd alternatives don't need further restricting (to sse2_noavx2 or avx_noavx2 as applicable), as the new earlier alternatives cover all operand forms already when at least AVX2 is enabled. --- v3: Testcases for new alternatives. "type" and "prefix_extra" adjustments. v2: Correct operand constraints. Respect -mprefer-vector-width=. Fold two alternatives of vec_dupv4sf. --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -25969,41 +25969,64 @@ (const_int 1)))]) (define_insn "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SF 0 "register_operand" "=v,v,v,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SF 1 "nonimmediate_operand" "Yv,v,m,0")))] "TARGET_SSE" "@ - vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} + * return TARGET_AVX2 ? \"vbroadcastss\t{%1, %0|%0, %1}\" : \"vshufps\t{$0, %d1, %0|%0, %d1, 0}\"; + vbroadcastss\t{%1, %g0|%g0, %1} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "avx,avx,noavx") - (set_attr "type" "sseshuf1,ssemov,sseshuf1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_evex,maybe_evex,orig") - (set_attr "mode" "V4SF")]) + [(set_attr "isa" "avx,*,avx,noavx") + (set (attr "type") + (cond [(and (eq_attr "alternative" "0") + (match_test "!TARGET_AVX2")) +(const_string "sseshuf1") + (eq_attr "alternative" "3") +(const_string "sseshuf1") + ] + (const_string "ssemov"))) + (set (attr "length_immediate") + (if_then_else (eq_attr "type" "sseshuf1") + (const_string "1") + (const_string "0"))) + (set_attr "prefix_extra" "0,1,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_evex,orig") + (set_attr "mode" "V4SF,V16SF,V4SF,V4SF") + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "1") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=v,v,x") + [(set (match_operand:V4SI 0 "register_operand" "=v,v,v,v,x") (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" "Yv,m,0")))] + (match_operand:SI 1 "nonimmediate_operand" "Yvm,v,Yv,m,0")))] "TARGET_SSE" "@ + vpbroadcastd\t{%1, %0|%0, %1} + vpbroadcastd\t{%1, %g0|%g0, %1} %vpshufd\t{$0, %1, %0|%0, %1, 0} vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,avx,noavx") - (set_attr "type" "sselog1,ssemov,sselog1") - (set_attr "length_immediate" "1,0,1") - (set_attr "prefix_extra" "0,1,*") - (set_attr "prefix" "maybe_vex,maybe_evex,orig") - (set_attr "mode" "TI,V4SF,V4SF") + [(set_attr "isa" "avx2,*,sse2,avx,noavx") + (set_attr "type" "ssemov,ssemov,sseshuf1,ssemov,sseshuf1") + (set_attr "length_immediate" "0,0,1,0,1") + (set_attr "prefix_extra" "1,1,0,1,*") + (set_attr "prefix" "maybe_evex,evex,maybe_vex,maybe_evex,orig") + (set_attr "mode" "TI,XI,TI,V4SF,V4SF") (set (attr "preferred_for_speed") -
[PATCH] x86: improve fast bfloat->float conversion
There's nothing AVX512BW-ish in here, so no reason to use Yw as the constraints for the AVX alternative. Furthermore by using the 512-bit form of VPSSLD (in a new alternative) all 32 registers can be used directly by the insn without AVX512VL needing to be enabled. Also adjust the originally last alternative's "prefix" attribute to maybe_evex. gcc/ * config/i386/i386.md (extendbfsf2_1): Add new AVX512F alternative. Adjust original last alternative's "prefix" attribute to maybe_evex. --- The corresponding expander, "extendbfsf2", looks to have been dead since its introduction in a1ecc5600464 ("Fix incorrect _mm_cvtsbh_ss"): The builtin references the insn (extendbfsf2_1), not the expander. Can't the expander be deleted and the name of the insn then pruned of the _1 suffix? If so, that further raises the question of the significance of the "!HONOR_NANS (BFmode)" that the expander has, but the insn doesn't have. Which may instead suggest the builtin was meant to reference the expander. Yet then I can't see what would the builtin would expand to when HONOR_NANS (BFmode) it true. I further wonder whether the nearby "extendhfdf2" expander is really needed. It doesn't look to specify anything that the corresponding insn doesn't also specify. --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -5181,21 +5181,27 @@ ;; Don't use float_extend since psrlld doesn't raise ;; exceptions and turn a sNaN into a qNaN. (define_insn "extendbfsf2_1" - [(set (match_operand:SF 0 "register_operand" "=x,Yw") + [(set (match_operand:SF 0 "register_operand" "=x,Yv,v") (unspec:SF - [(match_operand:BF 1 "register_operand" " 0,Yw")] + [(match_operand:BF 1 "register_operand" " 0,Yv,v")] UNSPEC_CVTBFSF))] "TARGET_SSE2" "@ pslld\t{$16, %0|%0, 16} - vpslld\t{$16, %1, %0|%0, %1, 16}" - [(set_attr "isa" "noavx,avx") + vpslld\t{$16, %1, %0|%0, %1, 16} + vpslld\t{$16, %g1, %g0|%g0, %g1, 16}" + [(set_attr "isa" "noavx,avx,*") (set_attr "type" "sseishft1") (set_attr "length_immediate" "1") - (set_attr "prefix_data16" "1,*") - (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI") - (set_attr "memory" "none")]) + (set_attr "prefix_data16" "1,*,*") + (set_attr "prefix" "orig,maybe_evex,evex") + (set_attr "mode" "TI,TI,XI") + (set_attr "memory" "none") + (set (attr "enabled") + (if_then_else (eq_attr "alternative" "2") + (symbol_ref "TARGET_AVX512F && !TARGET_AVX512VL + && !TARGET_PREFER_AVX256") + (const_string "*")))]) (define_expand "extendxf2" [(set (match_operand:XF 0 "nonimmediate_operand")
Re: [PATCH] x86: improve fast bfloat->float conversion
On 11.07.2023 08:45, Liu, Hongtao wrote: >> -Original Message- >> From: Jan Beulich >> Sent: Tuesday, July 11, 2023 2:08 PM >> >> There's nothing AVX512BW-ish in here, so no reason to use Yw as the >> constraints for the AVX alternative. Furthermore by using the 512-bit form of >> VPSSLD (in a new alternative) all 32 registers can be used directly by the >> insn >> without AVX512VL needing to be enabled. > Yes, the instruction vpslld doesn't need AVX512BW, the patch LGTM. Thanks. >> --- >> The corresponding expander, "extendbfsf2", looks to have been dead since >> its introduction in a1ecc5600464 ("Fix incorrect _mm_cvtsbh_ss"): The builtin >> references the insn (extendbfsf2_1), not the expander. Can't the expander >> be deleted and the name of the insn then pruned of the _1 suffix? If so, that >> further raises the question of the significance of the "!HONOR_NANS >> (BFmode)" that the expander has, but the insn doesn't have. Which may >> instead suggest the builtin was meant to reference the expander. Yet then I >> can't see what would the builtin would expand to when HONOR_NANS >> (BFmode) it true. > > Quote from what Jakub said in [1]. > --- > This is not correct. > While using such code for _mm_cvtsbh_ss is fine if it is documented not to > raise exceptions and turn a sNaN into a qNaN, it is not fine for HONOR_NANS > (i.e. when -ffast-math is not on), because a __bf16 -> float conversion > on sNaN should raise invalid exception and turn it into a qNaN. > We could have extendbfsf2 expander that would FAIL; if HONOR_NANS and > emit extendbfsf2_1 otherwise. > --- > [1] https://gcc.gnu.org/pipermail/gcc-patches/2022-November/607108.html I'm not sure I understand: It sounds like what Jakub said matches my observation, yet then it seems unlikely that the issue wasn't fixed in over half a year. Also having the expander FAIL when HONOR_NANS (matching what I was thinking) still doesn't clarify to me what then would happen to uses of the builtin. Is there any (common code) fallback for such a case? I didn't think there would be, in which case wouldn't this result in an internal compiler error? Jan
Re: [PATCH v2] i386: Allow -mlarge-data-threshold with -mcmodel=large
On 25.05.2023 17:16, Fangrui Song wrote: > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -32942,9 +32942,10 @@ the cache line size. @samp{compat} is the default. > > @opindex mlarge-data-threshold > @item -mlarge-data-threshold=@var{threshold} > -When @option{-mcmodel=medium} is specified, data objects larger than > -@var{threshold} are placed in the large data section. This value must be the > -same across all objects linked into the binary, and defaults to 65535. > +When @option{-mcmodel=medium} or @option{-mcmodel=large} is specified, data > +objects larger than @var{threshold} are placed in large data sections. This > +value must be the same across all objects linked into the binary, and > defaults > +to 65535. Where's the "must be the same" requirement coming from? As to the default - to remain compatible with earlier versions, shouldn't large model code default to "infinity"? Jan
Re: [PATCH v2] i386: Allow -mlarge-data-threshold with -mcmodel=large
On 25.05.2023 18:11, Fangrui Song wrote: > On 2023-05-25, Jan Beulich wrote: >> On 25.05.2023 17:16, Fangrui Song wrote: >>> --- a/gcc/doc/invoke.texi >>> +++ b/gcc/doc/invoke.texi >>> @@ -32942,9 +32942,10 @@ the cache line size. @samp{compat} is the default. >>> >>> @opindex mlarge-data-threshold >>> @item -mlarge-data-threshold=@var{threshold} >>> -When @option{-mcmodel=medium} is specified, data objects larger than >>> -@var{threshold} are placed in the large data section. This value must be >>> the >>> -same across all objects linked into the binary, and defaults to 65535. >>> +When @option{-mcmodel=medium} or @option{-mcmodel=large} is specified, data >>> +objects larger than @var{threshold} are placed in large data sections. >>> This >>> +value must be the same across all objects linked into the binary, and >>> defaults >>> +to 65535. >> >> Where's the "must be the same" requirement coming from? > > It's an existing requirement. I think it may be related to discouraging > different COMDAT sections names due to different -mlarge-data-threshold=. > I don't think it makes sense but did not feel strongly dropping it. > > Happy to drop the requirement if I revise this patch. I understand that this isn't something you introduce, but it still stuck me as odd. Therefore I thought I'd suggest to take the opportunity to at least soften the language, unless of course there's a real reason behind it. >> As to the default - to remain compatible with earlier versions, shouldn't >> large model code default to "infinity"? >> >> Jan > > I have thought about this compatibility need and feel that it is very > unlikly to be needed. GNU ld has supported large data sections since > 2005 > (https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=3b22753a67cf616514de804ef6d5ed5e90a7d883). > Users' programs with the internal linker scripts will still be working > and -fdata-sections sections will be combined. Well, the concern clearly is about custom scripts. Imo ... > First, -mcmodel=large use cases are rare enough. Rare perhaps > -mcmodel=largel was considered theoretic excercise in > trying to reach feature completion > (https://groups.google.com/g/x86-64-abi/c/jnQdJeabxiU/m/NNuA0P7pAQAJ), > without this patch -mcmodel=large object files don't interract well with > existing -mcmodel=small object files. ... the more exotic a project, the more likely it is that they're using custom scripts. > Moreover, if a user expects a specific section prefix with > -mcmodel=large, that's a brittle assumption. I think it's fair to say > that the fault is on the user side and GCC doesn't need to work around > their issues. I guess I don't really see what you base this on. Without any special options, expecting data to end up in .data/.bss/.rodata (and variants thereof) looks like quite reasonable an assumption to me. Jan
[PATCH] testsuite: adjust NOP expectations for RISC-V
RISC-V will emit ".option nopic" when -fno-pie is in effect, which matches the generic pattern. Just like done for Alpha, special-case RISC-V. --- A couple more targets look to be affected as well, simply because their "no-operation" insn doesn't match the expectation. With the apparently necessary further special casing I then also question the presence of "SWYM" in the generic pattern. An alternative here might be to use dg-additional-options to add e.g. -fpie. I don't think I know all possible implications of doing so, though. --- a/gcc/testsuite/c-c++-common/patchable_function_entry-decl.c +++ b/gcc/testsuite/c-c++-common/patchable_function_entry-decl.c @@ -1,8 +1,9 @@ /* { dg-do compile { target { ! { nvptx*-*-* visium-*-* } } } } */ /* { dg-options "-O2 -fpatchable-function-entry=3,1" } */ /* { dg-additional-options "-fno-pie" { target sparc*-*-* } } */ -/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 2 { target { ! { alpha*-*-* } } } } } */ +/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 2 { target { ! { alpha*-*-* riscv*-*-* } } } } } */ /* { dg-final { scan-assembler-times "bis" 2 { target alpha*-*-* } } } */ +/* { dg-final { scan-assembler-times "nop\n" 2 { target riscv*-*-* } } } */ extern int a; --- a/gcc/testsuite/c-c++-common/patchable_function_entry-default.c +++ b/gcc/testsuite/c-c++-common/patchable_function_entry-default.c @@ -4,8 +4,9 @@ /* See PR99888, one single preceding nop isn't allowed on powerpc_elfv2, so overriding with two preceding nops to make it pass there. */ /* { dg-additional-options "-fpatchable-function-entry=3,2" { target powerpc_elfv2 } } */ -/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 3 { target { ! { alpha*-*-* } } } } } */ +/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 3 { target { ! { alpha*-*-* riscv*-*-* } } } } } */ /* { dg-final { scan-assembler-times "bis" 3 { target alpha*-*-* } } } */ +/* { dg-final { scan-assembler-times "nop\n" 3 { target riscv*-*-* } } } */ extern int a; --- a/gcc/testsuite/c-c++-common/patchable_function_entry-definition.c +++ b/gcc/testsuite/c-c++-common/patchable_function_entry-definition.c @@ -1,8 +1,9 @@ /* { dg-do compile { target { ! { nvptx*-*-* visium-*-* } } } } */ /* { dg-options "-O2 -fpatchable-function-entry=3,1" } */ /* { dg-additional-options "-fno-pie" { target sparc*-*-* } } */ -/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 1 { target { ! { alpha*-*-* } } } } } */ +/* { dg-final { scan-assembler-times "nop|NOP|SWYM" 1 { target { ! { alpha*-*-* riscv*-*-* } } } } } */ /* { dg-final { scan-assembler-times "bis" 1 { target alpha*-*-* } } } */ +/* { dg-final { scan-assembler-times "nop\n" 1 { target riscv*-*-* } } } */ extern int a;
[PATCH v2] testsuite/C++: cope with IPv6 being unavailable
When IPv6 is disabled in the kernel, the error message coming back from Cody::OpenInet6() is different from the sole so far expected one. --- v2: Re-base. --- a/gcc/testsuite/g++.dg/modules/bad-mapper-3.C +++ b/gcc/testsuite/g++.dg/modules/bad-mapper-3.C @@ -1,6 +1,6 @@ // { dg-additional-options "-fmodules-ts -fmodule-mapper=localhost:172477262" } import unique3.bob; -// { dg-error {failed (connecting|disabled) mapper 'localhost:172477262'} "" { target *-*-* } 0 } +// { dg-error {failed (socket|connecting|disabled) mapper 'localhost:172477262'} "" { target *-*-* } 0 } // { dg-prune-output "fatal error:" } // { dg-prune-output "failed to read" } // { dg-prune-output "compilation terminated" }
Ping: [PATCH] testsuite/C++: suppress filename canonicalization in module tests
On 28.06.2022 16:06, Jan Beulich wrote: > The pathname underneath gcm.cache/ is determined from the effective name > used for the main input file of a particular module. When modules are > built, no canonicalization occurs for the main input file. Hence the > module file wouldn't be found if a different (the canonicalized) file > name was used when importing that same module. (This is an effect of > importing happening in the preprocessor, just like #include handling.) > > Since it doesn't look easy to make module generation use libcpp's > maybe_shorter_path() (in fact I'd consider this a layering violation, > while cloning the logic would - at least in principle - be prone to both > going out of sync), simply suppress system header path canonicalization > for the respective tests. Ping: This still looks to apply as is. Thanks, Jan > --- > Strictly speaking it could be necessary to also suppress > canonicalization when generating the modules, but for now they're self- > contained, i.e. don't include any "real" system headers. IOW at the > moment the tests aren't susceptible to the issue at generation time. > > --- a/gcc/testsuite/g++.dg/modules/alias-1_b.C > +++ b/gcc/testsuite/g++.dg/modules/alias-1_b.C > @@ -1,4 +1,4 @@ > -// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem > [srcdir]" } > +// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem > [srcdir] -fno-canonical-system-headers" } > > // Alias at the header file. We have one CMI file > import "alias-1_a.H"; > --- a/gcc/testsuite/g++.dg/modules/alias-1_d.C > +++ b/gcc/testsuite/g++.dg/modules/alias-1_d.C > @@ -1,4 +1,4 @@ > -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } > +// { dg-additional-options "-fmodules-ts -isystem [srcdir] > -fno-canonical-system-headers" } > // { dg-module-cmi kevin } > > export module kevin; > --- a/gcc/testsuite/g++.dg/modules/alias-1_e.C > +++ b/gcc/testsuite/g++.dg/modules/alias-1_e.C > @@ -1,4 +1,4 @@ > -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } > +// { dg-additional-options "-fmodules-ts -isystem [srcdir] > -fno-canonical-system-headers" } > > import bob; > import kevin; > --- a/gcc/testsuite/g++.dg/modules/alias-1_f.C > +++ b/gcc/testsuite/g++.dg/modules/alias-1_f.C > @@ -1,4 +1,4 @@ > -// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem > [srcdir]" } > +// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem > [srcdir] -fno-canonical-system-headers" } > > import kevin; > import bob; > --- a/gcc/testsuite/g++.dg/modules/cpp-6_c.C > +++ b/gcc/testsuite/g++.dg/modules/cpp-6_c.C > @@ -1,5 +1,5 @@ > // { dg-do preprocess } > -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } > +// { dg-additional-options "-fmodules-ts -isystem [srcdir] > -fno-canonical-system-headers" } > > #define empty > #define nop(X) X > --- a/gcc/testsuite/g++.dg/modules/dir-only-2_b.C > +++ b/gcc/testsuite/g++.dg/modules/dir-only-2_b.C > @@ -1,5 +1,5 @@ > // { dg-do preprocess } > -// { dg-additional-options "-fmodules-ts -fdirectives-only -isystem > [srcdir]" } > +// { dg-additional-options "-fmodules-ts -fdirectives-only -isystem [srcdir] > -fno-canonical-system-headers" } > // a comment > module; // line > frob
Re: [PATCH] testsuite: adjust NOP expectations for RISC-V
On 26.04.2023 17:45, Palmer Dabbelt wrote: > On Wed, 26 Apr 2023 08:26:26 PDT (-0700), gcc-patches@gcc.gnu.org wrote: >> >> >> On 4/25/23 08:50, Jan Beulich via Gcc-patches wrote: >>> RISC-V will emit ".option nopic" when -fno-pie is in effect, which >>> matches the generic pattern. Just like done for Alpha, special-case >>> RISC-V. >>> --- >>> A couple more targets look to be affected as well, simply because their >>> "no-operation" insn doesn't match the expectation. With the apparently >>> necessary further special casing I then also question the presence of >>> "SWYM" in the generic pattern. >>> >>> An alternative here might be to use dg-additional-options to add e.g. >>> -fpie. I don't think I know all possible implications of doing so, >>> though. > > Looks like there's already a no-pie for SPARC. Nothing's jumping out as > to why, but I'm not super familiar with `-fpatchable-function-entry`. > >> I think this is fine. Go ahead and install it. > > We run into this sort of thing somewhat frequently. Maybe we want a DG > matcher that avoids matching assembler directives? Or maybe even a > "scan-assembler-nop-times" type thing, given that different ports have > different names for the instruction? > > I don't see reason to block fixing the test on something bigger, though, > so seems fine for trunk. Presumably we'd want to backport this as well? Perhaps, but in order to do so I'd need to be given the respective okay. Jan
Re: Ping: [PATCH] testsuite/C++: suppress filename canonicalization in module tests
On 28.04.2023 00:24, Nathan Sidwell wrote: > On 4/25/23 11:04, Jan Beulich wrote: >> On 28.06.2022 16:06, Jan Beulich wrote: >>> The pathname underneath gcm.cache/ is determined from the effective name >>> used for the main input file of a particular module. When modules are >>> built, no canonicalization occurs for the main input file. Hence the >>> module file wouldn't be found if a different (the canonicalized) file >>> name was used when importing that same module. (This is an effect of >>> importing happening in the preprocessor, just like #include handling.) >>> >>> Since it doesn't look easy to make module generation use libcpp's >>> maybe_shorter_path() (in fact I'd consider this a layering violation, >>> while cloning the logic would - at least in principle - be prone to both >>> going out of sync), simply suppress system header path canonicalization >>> for the respective tests. >> >> Ping: This still looks to apply as is. > > ok -- I was unaware of this. might be sensible to file a defect about this? Sure: 109660. Jan
libatomic: drop redundant all-multi command
./multilib.am already specifies this same command, and make warns about the earlier one being ignored when seeing the later one. All that needs retaining to still satisfy the preceding comment is the extra dependency. libatomic/ 2022-05-XX Jan Beulich * Makefile.am (all-multi): Drop commands. * Makefile.in: Update accordingly. --- a/libatomic/Makefile.am +++ b/libatomic/Makefile.am @@ -147,12 +147,11 @@ libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES) libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD) -# Override the automake generated all-multi rule to guarantee that all-multi +# Amend the automake generated all-multi rule to guarantee that all-multi # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo # makefile fragments to avoid broken *.Ppo getting included into the Makefile # when it is reloaded during the build of all-multi. all-multi: $(libatomic_la_LIBADD) - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) # target overrides -include $(tmake_file) --- a/libatomic/Makefile.in +++ b/libatomic/Makefile.in @@ -858,12 +858,11 @@ %_.lo: Makefile $(LTCOMPILE) $(M_DEPS) $(M_SIZE) $(M_IFUNC) -c -o $@ $(M_SRC) -# Override the automake generated all-multi rule to guarantee that all-multi +# Amend the automake generated all-multi rule to guarantee that all-multi # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo # makefile fragments to avoid broken *.Ppo getting included into the Makefile # when it is reloaded during the build of all-multi. all-multi: $(libatomic_la_LIBADD) - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) # target overrides -include $(tmake_file)
[PATCH] x86: correct bmi2_umul3_1's MEM_P() uses
It's pretty clear that the operand numbers in the MEM_P() checks are off by one, perhaps due to a copy-and-paste oversight (unlike in most other places here we're dealing with two outputs). --- What I don't understand is why operand 2 is "nonimmediate_operand", not "register_operand" (which afaict would eliminate the need for these MEM_P() checks). This would then also extend to e.g. the subsequent umul3_1 and mul3_1 (and apparently quite a few more). --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -8465,7 +8465,7 @@ (zero_extend: (match_dup 3))) (match_operand:QI 4 "const_int_operand" "n"] "TARGET_BMI2 && INTVAL (operands[4]) == * BITS_PER_UNIT - && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + && !(MEM_P (operands[2]) && MEM_P (operands[3]))" "mulx\t{%3, %0, %1|%1, %0, %3}" [(set_attr "type" "imulx") (set_attr "prefix" "vex")
[PATCH] x86: {,v}psadbw have commutative source operands
Like noticed for gas as well (binutils-gdb commit c8cad9d389b7), the "absolute difference" aspect of the insns makes their source operands commutative. gcc/ 2022-05-XX Jan Beulich * config/i386/mmx.md (mmx_psadbw): Mark as commutative. * config/i386/sse.md (_psadbw): Likewise. --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -4407,7 +4407,7 @@ (define_insn "mmx_psadbw" [(set (match_operand:V1DI 0 "register_operand" "=y,x,Yw") -(unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") +(unspec:V1DI [(match_operand:V8QI 1 "register_operand" "%0,0,Yw") (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yw")] UNSPEC_PSADBW))] "(TARGET_MMX || TARGET_MMX_WITH_SSE) --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19983,7 +19983,7 @@ (define_insn "_psadbw" [(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand" "=x,YW") (unspec:VI8_AVX2_AVX512BW - [(match_operand: 1 "register_operand" "0,YW") + [(match_operand: 1 "register_operand" "%0,YW") (match_operand: 2 "vector_operand" "xBm,YWm")] UNSPEC_PSADBW))] "TARGET_SSE2"
Re: [PATCH] x86: correct bmi2_umul3_1's MEM_P() uses
On 27.05.2022 10:57, Uros Bizjak wrote: > On Fri, May 27, 2022 at 10:05 AM Jan Beulich wrote: >> >> It's pretty clear that the operand numbers in the MEM_P() checks are >> off by one, perhaps due to a copy-and-paste oversight (unlike in most >> other places here we're dealing with two outputs). >> --- >> What I don't understand is why operand 2 is "nonimmediate_operand", not >> "register_operand" (which afaict would eliminate the need for these >> MEM_P() checks). This would then also extend to e.g. the subsequent >> umul3_1 and mul3_1 (and apparently quite a few >> more). > > Because they are commutative (due to % operand modifier) and reload > can put memory operand into each operand. > > Patch is OK with the appropriate ChangeLog entry. Thanks, and yes, I did notice I failed to add a ChangeLog entry right after sending (being a result of such no longer be required in binutils, which I work more frequently with), sorry. This is what I did add already: gcc/ 2022-05-XX Jan Beulich * config/i386/i386.md (bmi2_umul3_1): Correct MEM_P() arguments. Jan
Re: [PATCH] x86: {,v}psadbw have commutative source operands
On 27.05.2022 11:05, Uros Bizjak wrote: > On Fri, May 27, 2022 at 10:13 AM Jan Beulich wrote: >> >> Like noticed for gas as well (binutils-gdb commit c8cad9d389b7), the >> "absolute difference" aspect of the insns makes their source operands >> commutative. > > You will need to expand via ix86_fixup_binary_operands_no_copy, use > register_mmxmem_operand on both input operands and use > ix86_binary_operator insn constraint. Please see many examples w/ > commutative operands throughout .md files. Hmm, yes, I see. As to the use of ix86_binary_operator_ok(): In particular in sse.md I see many uses of ix86_fixup_binary_operands_no_copy() in expanders where the corresponding insns don't use ix86_binary_operator_ok(), e.g. the immediately preceding uavg. Is there a(n) (anti-)pattern? My simplistic initial version was based on observations while putting together the inverse change for vgf2p8affine{,inv}qb_ (commit c0569d342ca4), which aren't commutative. Are you suggesting that the remaining (for indeed being commutative) vgf2p8mulb_ also is incomplete, requiring an expander as well? And maybe the same then in v1ti3 for any_logic:V1TI, avx512bw_umulhrswv32hi3, or _dp (and likely a few more)? At least a few pmadd* appear to lack commutativity marking altogether. Jan >> --- a/gcc/config/i386/mmx.md >> +++ b/gcc/config/i386/mmx.md >> @@ -4407,7 +4407,7 @@ >> >> (define_insn "mmx_psadbw" >>[(set (match_operand:V1DI 0 "register_operand" "=y,x,Yw") >> -(unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") >> +(unspec:V1DI [(match_operand:V8QI 1 "register_operand" "%0,0,Yw") >> (match_operand:V8QI 2 "register_mmxmem_operand" >> "ym,x,Yw")] >> UNSPEC_PSADBW))] >>"(TARGET_MMX || TARGET_MMX_WITH_SSE) >> --- a/gcc/config/i386/sse.md >> +++ b/gcc/config/i386/sse.md >> @@ -19983,7 +19983,7 @@ >> (define_insn "_psadbw" >>[(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand" "=x,YW") >> (unspec:VI8_AVX2_AVX512BW >> - [(match_operand: 1 "register_operand" "0,YW") >> + [(match_operand: 1 "register_operand" "%0,YW") >>(match_operand: 2 "vector_operand" "xBm,YWm")] >> UNSPEC_PSADBW))] >>"TARGET_SSE2" >> >
[PATCH v2] x86: {,v}psadbw have commutative source operands
Like noticed for gas as well (binutils-gdb commit c8cad9d389b7), the "absolute difference" aspect of the insns makes their source operands commutative. gcc/ * config/i386/mmx.md (mmx_psadbw): Convert to expander. (*mmx_psadbw): New. Mark as commutative. * config/i386/sse.md (_psadbw): Convert to expander. (*_psadbw): New. Mark as commutative. --- v2: Introduce expanders. --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -4405,13 +4405,21 @@ (set_attr "type" "sseiadd") (set_attr "mode" "TI")]) -(define_insn "mmx_psadbw" +(define_expand "mmx_psadbw" + [(set (match_operand:V1DI 0 "register_operand") + (unspec:V1DI [(match_operand:V8QI 1 "register_mmxmem_operand") + (match_operand:V8QI 2 "register_mmxmem_operand")] +UNSPEC_PSADBW))] + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A)" + "ix86_fixup_binary_operands_no_copy (PLUS, V8QImode, operands);") + +(define_insn "*mmx_psadbw" [(set (match_operand:V1DI 0 "register_operand" "=y,x,Yw") -(unspec:V1DI [(match_operand:V8QI 1 "register_operand" "0,0,Yw") + (unspec:V1DI [(match_operand:V8QI 1 "register_mmxmem_operand" "%0,0,Yw") (match_operand:V8QI 2 "register_mmxmem_operand" "ym,x,Yw")] UNSPEC_PSADBW))] - "(TARGET_MMX || TARGET_MMX_WITH_SSE) - && (TARGET_SSE || TARGET_3DNOW_A)" + "(TARGET_MMX || TARGET_MMX_WITH_SSE) && (TARGET_SSE || TARGET_3DNOW_A) + && ix86_binary_operator_ok (PLUS, V8QImode, operands)" "@ psadbw\t{%2, %0|%0, %2} psadbw\t{%2, %0|%0, %2} --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19981,13 +19981,22 @@ ;; The correct representation for this is absolutely enormous, and ;; surely not generally useful. -(define_insn "_psadbw" +(define_expand "_psadbw" + [(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand") + (unspec:VI8_AVX2_AVX512BW + [(match_operand: 1 "vector_operand") + (match_operand: 2 "vector_operand")] + UNSPEC_PSADBW))] + "TARGET_SSE2" + "ix86_fixup_binary_operands_no_copy (PLUS, mode, operands);") + +(define_insn "*_psadbw" [(set (match_operand:VI8_AVX2_AVX512BW 0 "register_operand" "=x,YW") (unspec:VI8_AVX2_AVX512BW - [(match_operand: 1 "register_operand" "0,YW") + [(match_operand: 1 "vector_operand" "%0,YW") (match_operand: 2 "vector_operand" "xBm,YWm")] UNSPEC_PSADBW))] - "TARGET_SSE2" + "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, mode, operands)" "@ psadbw\t{%2, %0|%0, %2} vpsadbw\t{%2, %1, %0|%0, %1, %2}"
[PATCH] x86: harmonize __builtin_ia32_psadbw*() types
The 64-bit, 128-bit, and 512-bit variants have VDI return type, in line with instruction behavior. Make the 256-bit builtin match, thus also making it match the insn it expands to (using VI8_AVX2_AVX512BW). gcc/ * config/i386/i386-builtin.def (__builtin_ia32_psadbw256): Change type. * config/i386/i386-builtin-types.def: New function type (V4DI, V32QI, V32QI). * config/i386/i386-expand.cc (ix86_expand_args_builtin): Handle V4DI_FTYPE_V32QI_V32QI. --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -1217,7 +1217,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI) -BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI) +BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V4DI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT) BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT) --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -516,6 +516,7 @@ DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT DEF_FUNCTION_TYPE (V8DI, V8DI, V2DI, INT, V8DI, UQI) DEF_FUNCTION_TYPE (V8DI, V8DI, V4DI, INT, V8DI, UQI) DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI) +DEF_FUNCTION_TYPE (V4DI, V32QI, V32QI) DEF_FUNCTION_TYPE (V8DI, V64QI, V64QI) DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI) DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI) --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -10359,6 +10359,7 @@ ix86_expand_args_builtin (const struct b case V8SI_FTYPE_V16HI_V16HI: case V4DI_FTYPE_V4DI_V4DI: case V4DI_FTYPE_V8SI_V8SI: +case V4DI_FTYPE_V32QI_V32QI: case V8DI_FTYPE_V64QI_V64QI: if (comparison == UNKNOWN) return ix86_expand_binop_builtin (icode, exp, target);
[PATCH] x86-64: make "length_vex" also account for VEX.B use by register operand
The length attribute ought to be "the (bounding maximum) length of an instruction" according to the comment next to its definition. A register operand encoded using the ModR/M.rm field will additionally use VEX.B for encoding the highest bit of the register number. Hence for the high 8 GPR registers as well as the [xy]mm{8..15} ones 3-byte VEX encoding may be needed. Since it isn't known to the function calculating the length which register goes where in the insn encoding, be conservative and assume a 3-byte VEX prefix whenever any such register operand is present and there's no memory operand. gcc/ * config/i386/i386.cc (ix86_attr_length_vex_default): Take REX.B into account for reg-only insns. --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -16820,7 +16820,8 @@ int ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, bool has_vex_w) { - int i; + int i, reg_only = 2 + 1; + bool has_mem = false; /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 byte VEX prefix. */ @@ -16840,16 +16841,23 @@ ix86_attr_length_vex_default (rtx_insn * if (GET_MODE (recog_data.operand[i]) == DImode && GENERAL_REG_P (recog_data.operand[i])) return 3 + 1; + + /* REX.B bit requires 3-byte VEX. Right here we don't know which + operand will be encoded using VEX.B, so be conservative. */ + if (REX_INT_REGNO_P (recog_data.operand[i]) + || REX_SSE_REGNO_P (recog_data.operand[i])) + reg_only = 3 + 1; } -else +else if (MEM_P (recog_data.operand[i])) { /* REX.X or REX.B bits use 3 byte VEX prefix. */ - if (MEM_P (recog_data.operand[i]) - && x86_extended_reg_mentioned_p (recog_data.operand[i])) + if (x86_extended_reg_mentioned_p (recog_data.operand[i])) return 3 + 1; + + has_mem = true; } - return 2 + 1; + return has_mem ? 2 + 1 : reg_only; }
[PATCH] configure: arrange to use appropriate objcopy
Using the system objcopy is wrong when other configure checks have probed a different set of binutils (I've noticed the problem on a system where the base objcopy can't deal with compressed debug sections). Arrange for the matching one to be picked up, first and foremost if an "in tree" one is available, by mirroring respective logic already present for nm. gcc/ * Makefile.in (ORIGINAL_OBJCOPY_FOR_TARGET): New. * configure.ac: Check for objcopy, producing ORIGINAL_OBJCOPY_FOR_TARGET. * configure: Update accordingly. * exec-tool.in (ORIGINAL_OBJCOPY_FOR_TARGET): New. Handle objcopy. --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -499,6 +499,7 @@ RANLIB_FOR_TARGET := $(shell \ ORIGINAL_LD_FOR_TARGET = @ORIGINAL_LD_FOR_TARGET@ ORIGINAL_NM_FOR_TARGET = @ORIGINAL_NM_FOR_TARGET@ NM_FOR_TARGET = ./nm +ORIGINAL_OBJCOPY_FOR_TARGET = @ORIGINAL_OBJCOPY_FOR_TARGET@ STRIP_FOR_TARGET := $(shell \ if [ -f $(objdir)/../binutils/strip-new ] ; then \ echo $(objdir)/../binutils/strip-new ; \ --- a/gcc/configure +++ b/gcc/configure @@ -733,6 +733,8 @@ gcc_cv_readelf gcc_cv_objdump ORIGINAL_NM_FOR_TARGET gcc_cv_nm +ORIGINAL_OBJCOPY_FOR_TARGET +gcc_cv_objcopy ORIGINAL_LD_GOLD_FOR_TARGET ORIGINAL_LD_BFD_FOR_TARGET ORIGINAL_LD_FOR_TARGET @@ -23436,6 +23438,83 @@ case "$ORIGINAL_NM_FOR_TARGET" in ;; esac +# Figure out what objcopy we will be using. +if ${gcc_cv_objcopy+:} false; then : + +else + +if test -f $gcc_cv_binutils_srcdir/configure.ac \ + && test -f ../binutils/Makefile \ + && test x$build = x$host; then + gcc_cv_objcopy=../binutils/objcopy$build_exeext +elif test -x objcopy$build_exeext; then + gcc_cv_objcopy=./objcopy$build_exeext +elif ( set dummy $OBJCOPY_FOR_TARGET; test -x $2 ); then +gcc_cv_objcopy="$OBJCOPY_FOR_TARGET" +else +# Extract the first word of "$OBJCOPY_FOR_TARGET", so it can be a program name with args. +set dummy $OBJCOPY_FOR_TARGET; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_path_gcc_cv_objcopy+:} false; then : + $as_echo_n "(cached) " >&6 +else + case $gcc_cv_objcopy in + [\\/]* | ?:[\\/]*) + ac_cv_path_gcc_cv_objcopy="$gcc_cv_objcopy" # Let the user override the test with a path. + ;; + *) + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. +for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then +ac_cv_path_gcc_cv_objcopy="$as_dir/$ac_word$ac_exec_ext" +$as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 +break 2 + fi +done + done +IFS=$as_save_IFS + + ;; +esac +fi +gcc_cv_objcopy=$ac_cv_path_gcc_cv_objcopy +if test -n "$gcc_cv_objcopy"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_objcopy" >&5 +$as_echo "$gcc_cv_objcopy" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking what objcopy to use" >&5 +$as_echo_n "checking what objcopy to use... " >&6; } +if test "$gcc_cv_objcopy" = ../binutils/objcopy$build_exeext; then + # Single tree build which includes binutils. + { $as_echo "$as_me:${as_lineno-$LINENO}: result: newly built objcopy" >&5 +$as_echo "newly built objcopy" >&6; } + in_tree_objcopy=yes +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $gcc_cv_objcopy" >&5 +$as_echo "$gcc_cv_objcopy" >&6; } + in_tree_objcopy=no +fi + +ORIGINAL_OBJCOPY_FOR_TARGET=$gcc_cv_objcopy + +case "$ORIGINAL_OBJCOPY_FOR_TARGET" in + ./objcopy | ./objcopy$build_exeext) ;; + *) ac_config_files="$ac_config_files objcopy:exec-tool.in" + ;; +esac # Figure out what objdump we will be using. if ${gcc_cv_objdump+:} false; then : @@ -33176,6 +33255,7 @@ do "as") CONFIG_FILES="$CONFIG_FILES as:exec-tool.in" ;; "collect-ld") CONFIG_FILES="$CONFIG_FILES collect-ld:exec-tool.in" ;; "nm") CONFIG_FILES="$CONFIG_FILES nm:exec-tool.in" ;; +"objcopy") CONFIG_FILES="$CONFIG_FILES objcopy:exec-tool.in" ;; "dsymutil") CONFIG_FILES="$CONFIG_FILES dsymutil:exec-tool.in" ;; "clearcap.map") CONFIG_LINKS="$CONFIG_LINKS clearcap.map:${srcdir}/config/$clearcap_map" ;; "$all_outputs") CONFIG_FILES="$CONFIG_FILES $all_outputs" ;; @@ -33811,6 +33891,7 @@ $as_echo "$as_me: executing $ac_file com "as":F) chmod +x as ;; "collect-ld":F) chmod +x collect-ld ;; "nm":F) chmod +x nm ;; +"objcopy":F) chmod +x objcopy ;; "dsymutil":F) chmod +x dsymutil ;; "default":C) case ${CONFIG_HEADERS} in --- a/gcc/configure.ac +++ b/gcc/configure.ac @@ -2815,6 +2815,36 @@ case "$ORIGINAL_NM_FOR_TARGET" in *) AC_CONFIG_FILES(nm:exec-tool.in, [chmod +x nm]) ;; esac +# Figure out what objcopy we will be using. +AS_VAR_SET_IF(gcc_cv_objcopy,, [ +if t
Re: [PATCH] configure: arrange to use appropriate objcopy
On 04.06.2022 10:32, Jakub Jelinek wrote: > On Thu, Jun 02, 2022 at 05:32:10PM +0200, Jan Beulich via Gcc-patches wrote: >> Using the system objcopy is wrong when other configure checks have >> probed a different set of binutils (I've noticed the problem on a system >> where the base objcopy can't deal with compressed debug sections). >> Arrange for the matching one to be picked up, first and foremost if an >> "in tree" one is available, by mirroring respective logic already >> present for nm. >> >> gcc/ >> >> * Makefile.in (ORIGINAL_OBJCOPY_FOR_TARGET): New. >> * configure.ac: Check for objcopy, producing >> ORIGINAL_OBJCOPY_FOR_TARGET. >> * configure: Update accordingly. >> * exec-tool.in (ORIGINAL_OBJCOPY_FOR_TARGET): New. >> Handle objcopy. > > This regressed > Executing on host: /home/jakub/src/gcc/obj44/gcc/xgcc > -B/home/jakub/src/gcc/obj44/gcc/ -fdiagnostics-plain-output -flto -g > -gsplit-dwarf -c -o c_lto_pr83719_0.o > /home/jakub/src/gcc/gcc/testsuite/gcc.dg/lto/pr83719_0.c(timeout = 300) > spawn -ignore SIGHUP /home/jakub/src/gcc/obj44/gcc/xgcc > -B/home/jakub/src/gcc/obj44/gcc/ -fdiagnostics-plain-output -flto -g > -gsplit-dwarf -c -o c_lto_pr83719_0.o > /home/jakub/src/gcc/gcc/testsuite/gcc.dg/lto/pr83719_0.c > cc1: note: '-gsplit-dwarf' is not supported with LTO, disabling > /home/jakub/src/gcc/obj44/gcc/objcopy: line 120: exec: --: invalid option > exec: usage: exec [-cl] [-a name] [command [argument ...]] [redirection ...] > compiler exited with status 1 > FAIL: gcc.dg/lto/pr83719 c_lto_pr83719_0.o assemble, -flto -g -gsplit-dwarf > for me, both on x86_64-linux and i686-linux. Hmm, it surely worked for me for both, with and without in-tree binutils (you don't say which variant you saw the failure with). > For some reason, I have > grep OBJCOPY *gcc/Makefile > gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = > prev-gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = > stage1-gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = What about the corresponding ORIGINAL_NM_FOR_TARGET? And could you provide one of the config.log instances? Jan
Re: [PATCH] configure: arrange to use appropriate objcopy
On 07.06.2022 09:41, Jakub Jelinek wrote: > On Tue, Jun 07, 2022 at 08:12:26AM +0200, Jan Beulich via Gcc-patches wrote: >>> This regressed >>> Executing on host: /home/jakub/src/gcc/obj44/gcc/xgcc >>> -B/home/jakub/src/gcc/obj44/gcc/ -fdiagnostics-plain-output -flto -g >>> -gsplit-dwarf -c -o c_lto_pr83719_0.o >>> /home/jakub/src/gcc/gcc/testsuite/gcc.dg/lto/pr83719_0.c(timeout = 300) >>> spawn -ignore SIGHUP /home/jakub/src/gcc/obj44/gcc/xgcc >>> -B/home/jakub/src/gcc/obj44/gcc/ -fdiagnostics-plain-output -flto -g >>> -gsplit-dwarf -c -o c_lto_pr83719_0.o >>> /home/jakub/src/gcc/gcc/testsuite/gcc.dg/lto/pr83719_0.c >>> cc1: note: '-gsplit-dwarf' is not supported with LTO, disabling >>> /home/jakub/src/gcc/obj44/gcc/objcopy: line 120: exec: --: invalid option >>> exec: usage: exec [-cl] [-a name] [command [argument ...]] [redirection ...] >>> compiler exited with status 1 >>> FAIL: gcc.dg/lto/pr83719 c_lto_pr83719_0.o assemble, -flto -g >>> -gsplit-dwarf >>> for me, both on x86_64-linux and i686-linux. >> >> Hmm, it surely worked for me for both, with and without in-tree binutils >> (you don't say which variant you saw the failure with). > > System binutils. > grep ORIGINAL_ gcc/Makefile > ORIGINAL_AS_FOR_TARGET = /usr/bin/as > ORIGINAL_LD_FOR_TARGET = /usr/bin/ld > ORIGINAL_NM_FOR_TARGET = /usr/bin/nm > ORIGINAL_OBJCOPY_FOR_TARGET = > ls -l /usr/bin/{as,ld,nm,objcopy} > -rwxr-xr-x. 1 root root 439192 Mar 10 12:51 /usr/bin/as > lrwxrwxrwx. 1 root root 20 May 20 13:28 /usr/bin/ld -> > /etc/alternatives/ld > -rwxr-xr-x. 1 root root 47928 Mar 10 12:51 /usr/bin/nm > -rwxr-xr-x. 1 root root 184304 Mar 10 12:51 /usr/bin/objcopy > (but ditto grep ORIGINAL_ stage1-gcc/Makefile > or grep ORIGINAL_ prev-gcc/Makefile). > >>> For some reason, I have >>> grep OBJCOPY *gcc/Makefile >>> gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = >>> prev-gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = >>> stage1-gcc/Makefile:ORIGINAL_OBJCOPY_FOR_TARGET = >> >> What about the corresponding ORIGINAL_NM_FOR_TARGET? And could you provide >> one of the config.log instances? > > config.log has: > configure:23317: checking what linker to use > configure:23351: result: /usr/bin/ld > configure:23379: checking for nm > configure:23397: found /usr/bin/nm > configure:23409: result: /usr/bin/nm > configure:23420: checking what nm to use > configure:23428: result: /usr/bin/nm > configure:23498: checking what objcopy to use > configure:23506: result: > configure:23536: checking for objdump > configure:23554: found /usr/bin/objdump > configure:23566: result: /usr/bin/objdump > configure:23577: checking what objdump to use > configure:23587: result: /usr/bin/objdump > > It is a bootstrapped compiler: > ../configure --enable-languages=default,obj-c++,lto,go,d > --enable-checking=yes,rtl,extra --enable-libstdcxx-backtrace=yes && make -j32 > bootstrap > LOG 2>&1 > on Fedora 36 x86_64-linux (ada left out because it is currently broken). > > Comparing the toplevel Makefile, I see some differences: > grep NM_FOR_TARGET Makefile > NM_FOR_TARGET="$(NM_FOR_TARGET)"; export NM_FOR_TARGET; \ > NM="$(COMPILER_NM_FOR_TARGET)"; export NM; \ > NM_FOR_TARGET=$(NM) > COMPILER_NM_FOR_TARGET=$$r/$(HOST_SUBDIR)/gcc/nm > "NM_FOR_TARGET=$(NM_FOR_TARGET)" \ > 'NM=$(COMPILER_NM_FOR_TARGET)' \ > grep OBJCOPY_FOR_TARGET Makefile > OBJCOPY_FOR_TARGET="$(OBJCOPY_FOR_TARGET)"; export OBJCOPY_FOR_TARGET; \ > OBJCOPY="$(OBJCOPY_FOR_TARGET)"; export OBJCOPY; \ > OBJCOPY_FOR_TARGET=$(OBJCOPY) > "OBJCOPY_FOR_TARGET=$(OBJCOPY_FOR_TARGET)" \ > 'OBJCOPY=$$(OBJCOPY_FOR_TARGET)' \ > E.g. the COMPILER_*_FOR_TARGET line is missing completely for OBJCOPY > and the last line is different too. > Also: > grep ^NM[[:space:]]*= Makefile; echo end > NM = nm > end > grep ^OBJCOPY[[:space:]]*= Makefile; echo end > end > > Note, I see > S["OBJDUMP"]="objdump" > S["OBJCOPY"]="objcopy" > S["WINDMC"]="windmc" > S["WINDRES"]="windres" > S["STRIP"]="strip" > S["RANLIB"]="ranlib" > S["NM"]="nm" > in toplevel status, it is just that toplevel: > grep ^NM Makefile.tpl > NM_FOR_BUILD = @NM_FOR_BUILD@ > NM = @NM@ > NM_FOR_TARGET=@NM_FOR_TARGET@ > grep ^OBJCOPY Makefile.tpl > OBJCOPY_FOR_TARGET=@OBJCOPY_FOR_TARGET@ > doesn't have the OBJCOPY = @OBJCOPY@ line, and perhaps the > COMPILER_OBJCOPY_FOR_TARGET stuff. Let me revert the change - I've just realized that I only thought I would have tested this with system binutils as well. I'm sorry for the breakage. Jan
[PATCH] testsuite/ix86: prune MMX ABI warning
So far on 32-bit hosts this test failed (for both C and C++) because of the ABI change warning occurring without (explictly) enabling MMX. gcc/testsuite/ * c-c++-common/torture/builtin-shufflevector-2.c: Prune ix86 MMX ABI warning. --- a/gcc/testsuite/c-c++-common/torture/builtin-shufflevector-2.c +++ b/gcc/testsuite/c-c++-common/torture/builtin-shufflevector-2.c @@ -24,3 +24,5 @@ main (void) __builtin_abort (); return 0; } + +// { dg-prune-output "MMX vector (argument|return) without MMX enabled changes the ABI" }
[PATCH] testsuite/ix86: SSE2 is a prereq to _Float16 use
When enabling AVX512FP via attribute or pragma, the _Float16 type would remain unavailable when at initialization time SSE2 wouldn't be seen as available for use. While this may hint at a wider underlying issue (like the feature, the type may want providing dynamically, albeit this may be challenging in particular for functions returning _Float16 yet having the attribute specified after their return type), for now simply make SSE2 available when targeting ix86. gcc/testsuite/ * gcc.target/i386/avx512fp16-reduce-op-2.c: Force SSE2 for i?86. * gcc.target/i386/pr99464.c: Likewise. --- a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c @@ -1,5 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-O2 -mprefer-vector-width=512 -fdump-tree-optimized" } */ +/* { dg-additional-options "-msse2" { target i?86-*-* } } */ /* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 3 "optimized" } } */ /* { dg-final { scan-tree-dump-times "\.REDUC_MIN" 3 "optimized" } } */ --- a/gcc/testsuite/gcc.target/i386/pr99464.c +++ b/gcc/testsuite/gcc.target/i386/pr99464.c @@ -1,6 +1,7 @@ /* PR target/99464 */ /* { dg-do compile } */ /* { dg-options "-O2" } */ +/* { dg-additional-options "-msse2" { target i?86-*-* } } */ #pragma GCC target("arch=cannonlake")
Ping: [PATCH] libatomic: drop redundant all-multi command
On 27.05.2022 10:01, Jan Beulich wrote: > ./multilib.am already specifies this same command, and make warns about > the earlier one being ignored when seeing the later one. All that needs > retaining to still satisfy the preceding comment is the extra > dependency. > > libatomic/ > > * Makefile.am (all-multi): Drop commands. > * Makefile.in: Update accordingly. Ping? Thanks, Jan > --- a/libatomic/Makefile.am > +++ b/libatomic/Makefile.am > @@ -147,12 +147,11 @@ > libatomic_convenience_la_SOURCES = $(libatomic_la_SOURCES) > libatomic_convenience_la_LIBADD = $(libatomic_la_LIBADD) > > -# Override the automake generated all-multi rule to guarantee that all-multi > +# Amend the automake generated all-multi rule to guarantee that all-multi > # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo > # makefile fragments to avoid broken *.Ppo getting included into the Makefile > # when it is reloaded during the build of all-multi. > all-multi: $(libatomic_la_LIBADD) > - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) > > # target overrides > -include $(tmake_file) > --- a/libatomic/Makefile.in > +++ b/libatomic/Makefile.in > @@ -858,12 +858,11 @@ > %_.lo: Makefile > $(LTCOMPILE) $(M_DEPS) $(M_SIZE) $(M_IFUNC) -c -o $@ $(M_SRC) > > -# Override the automake generated all-multi rule to guarantee that all-multi > +# Amend the automake generated all-multi rule to guarantee that all-multi > # is not run in parallel with the %_.lo rules which generate $(DEPDIR)/*.Ppo > # makefile fragments to avoid broken *.Ppo getting included into the Makefile > # when it is reloaded during the build of all-multi. > all-multi: $(libatomic_la_LIBADD) > - $(MULTIDO) $(AM_MAKEFLAGS) DO=all multi-do # $(MAKE) > > # target overrides > -include $(tmake_file)
[PATCH] testsuite/C++: suppress filename canonicalization in module tests
The pathname underneath gcm.cache/ is determined from the effective name used for the main input file of a particular module. When modules are built, no canonicalization occurs for the main input file. Hence the module file wouldn't be found if a different (the canonicalized) file name was used when importing that same module. (This is an effect of importing happening in the preprocessor, just like #include handling.) Since it doesn't look easy to make module generation use libcpp's maybe_shorter_path() (in fact I'd consider this a layering violation, while cloning the logic would - at least in principle - be prone to both going out of sync), simply suppress system header path canonicalization for the respective tests. gcc/testsuite/ * g++.dg/modules/alias-1_b.C: Add -fno-canonical-system-headers. * g++.dg/modules/alias-1_d.C: Likewise. * g++.dg/modules/alias-1_e.C: Likewise. * g++.dg/modules/alias-1_f.C: Likewise. * g++.dg/modules/cpp-6_c.C: Likewise. * g++.dg/modules/dir-only-2_b.C: Likewise. --- Strictly speaking it could be necessary to also suppress canonicalization when generating the modules, but for now they're self- contained, i.e. don't include any "real" system headers. IOW at the moment the tests aren't susceptible to the issue at generation time. --- a/gcc/testsuite/g++.dg/modules/alias-1_b.C +++ b/gcc/testsuite/g++.dg/modules/alias-1_b.C @@ -1,4 +1,4 @@ -// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem [srcdir] -fno-canonical-system-headers" } // Alias at the header file. We have one CMI file import "alias-1_a.H"; --- a/gcc/testsuite/g++.dg/modules/alias-1_d.C +++ b/gcc/testsuite/g++.dg/modules/alias-1_d.C @@ -1,4 +1,4 @@ -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -isystem [srcdir] -fno-canonical-system-headers" } // { dg-module-cmi kevin } export module kevin; --- a/gcc/testsuite/g++.dg/modules/alias-1_e.C +++ b/gcc/testsuite/g++.dg/modules/alias-1_e.C @@ -1,4 +1,4 @@ -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -isystem [srcdir] -fno-canonical-system-headers" } import bob; import kevin; --- a/gcc/testsuite/g++.dg/modules/alias-1_f.C +++ b/gcc/testsuite/g++.dg/modules/alias-1_f.C @@ -1,4 +1,4 @@ -// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -fdump-lang-module -isystem [srcdir] -fno-canonical-system-headers" } import kevin; import bob; --- a/gcc/testsuite/g++.dg/modules/cpp-6_c.C +++ b/gcc/testsuite/g++.dg/modules/cpp-6_c.C @@ -1,5 +1,5 @@ // { dg-do preprocess } -// { dg-additional-options "-fmodules-ts -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -isystem [srcdir] -fno-canonical-system-headers" } #define empty #define nop(X) X --- a/gcc/testsuite/g++.dg/modules/dir-only-2_b.C +++ b/gcc/testsuite/g++.dg/modules/dir-only-2_b.C @@ -1,5 +1,5 @@ // { dg-do preprocess } -// { dg-additional-options "-fmodules-ts -fdirectives-only -isystem [srcdir]" } +// { dg-additional-options "-fmodules-ts -fdirectives-only -isystem [srcdir] -fno-canonical-system-headers" } // a comment module; // line frob
[PATCH] testsuite/C++: cope with IPv6 being unavailable
When IPv6 is disabled in the kernel, the error message coming back from Cody::OpenInet6() is different from the sole so far expected one. gcc/testsuite/ * g++.dg/modules/bad-mapper-3.C: Relax failure pattern. --- a/gcc/testsuite/g++.dg/modules/bad-mapper-3.C +++ b/gcc/testsuite/g++.dg/modules/bad-mapper-3.C @@ -1,6 +1,6 @@ // { dg-additional-options "-fmodules-ts -fmodule-mapper=localhost:172477262" } import unique3.bob; -// { dg-error {failed connecting mapper 'localhost:172477262'} "" { target *-*-* } 0 } +// { dg-error {failed (socket|connecting) mapper 'localhost:172477262'} "" { target *-*-* } 0 } // { dg-prune-output "fatal error:" } // { dg-prune-output "failed to read" } // { dg-prune-output "compilation terminated" }