[PATCH 8/8] AArch64: rules for CMPBR instructions

2025-05-07 Thread Karl Meakin
Add rules for lowering `cbranch4` to CBB/CBH/CB when the CMPBR
extension is enabled.

gcc/ChangeLog:

* config/aarch64/aarch64.md (cbranch4): emit CMPBR
instructions if possible.
(cbranch4): new expand rule.
(aarch64_cb): likewise.
(aarch64_cb): likewise.
* config/aarch64/iterators.md (cmpbr_suffix): new mode attr.
* config/aarch64/predicates.md (const_0_to_63_operand): new
predicate.
(aarch64_cb_immediate): likewise.
(aarch64_cb_operand): likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/cmpbr.c: update tests.
---
 gcc/config/aarch64/aarch64.md            |  87 +++-
 gcc/config/aarch64/iterators.md          |   5 +
 gcc/config/aarch64/predicates.md         |  17 +
 gcc/testsuite/gcc.target/aarch64/cmpbr.c | 484 ---
 4 files changed, 275 insertions(+), 318 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 256df0dcc04..73f3e062e57 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -720,18 +720,41 @@ (define_constants
 ;; Conditional jumps
 ;; ---
 
-(define_expand "cbranch4"
+(define_expand "cbranch4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
[(match_operand:GPI 1 "register_operand")
 (match_operand:GPI 2 "aarch64_plus_operand")])
   (label_ref (match_operand 3))
   (pc)))]
   ""
-  "
-  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
-operands[2]);
-  operands[2] = const0_rtx;
-  "
+  {
+  if (TARGET_CMPBR && aarch64_cb_operand (operands[2], mode))
+{
+  emit_jump_insn (gen_aarch64_cb (operands[0], operands[1],
+   operands[2], operands[3]));
+  DONE;
+}
+  else
+{
+  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]),
+operands[1], operands[2]);
+  operands[2] = const0_rtx;
+}
+  }
+)
+
+(define_expand "cbranch4"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand:SHORT 1 "register_operand")
+(match_operand:SHORT 2 "aarch64_cb_short_operand")])
+  (label_ref (match_operand 3))
+  (pc)))]
+  "TARGET_CMPBR"
+  {
+  emit_jump_insn (gen_aarch64_cb (operands[0], operands[1],
+   operands[2], operands[3]));
+  DONE;
+  }
 )
 
 (define_expand "cbranch4"
@@ -758,6 +781,58 @@ (define_expand "cbranchcc4"
   ""
 )
 
+;; Emit a `CB (register)` or `CB (immediate)` instruction.
+(define_insn "aarch64_cb"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand:GPI 1 "register_operand")
+(match_operand:GPI 2 "aarch64_cb_operand")])
+  (label_ref (match_operand 3))
+  (pc)))]
+  "TARGET_CMPBR"
+  "cb%m0\\t%1, %2, %l3";
+  [(set_attr "type" "branch")
+   (set (attr "length")
+   (if_then_else (and (ge (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_N_1Kib))
+  (lt (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_P_1Kib)))
+ (const_int 4)
+ (const_int 8)))
+   (set (attr "far_branch")
+   (if_then_else (and (ge (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_N_1Kib))
+  (lt (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_P_1Kib)))
+ (const_string "no")
+ (const_string "yes")))]
+)
+
+;; Emit a `CBB (register)` or `CBH (register)` instruction.
+(define_insn "aarch64_cb"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand:SHORT 1 "register_operand")
+(match_operand:SHORT 2 "aarch64_cb_short_operand")])
+  (label_ref (match_operand 3))
+  (pc)))]
+  "TARGET_CMPBR"
+  "cb%m0\\t%1, %2, %l3";
+  [(set_attr "type" "branch")
+   (set (attr "length")
+   (if_then_else (and (ge (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_N_1Kib))
+  (lt (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_P_1Kib)))
+ (const_int 4)
+ (const_int 8)))
+   (set (attr "far_branch")
+   (if_then_else (and (ge (minus (match_dup 3) (pc))
+  (const_int BRANCH_LEN_N_1Kib))
+  (lt (minus (
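
To make the intended codegen concrete, here is a minimal sketch (my own
illustration, not taken from the patch; the exact CB<cond> mnemonic spelling
and label names are assumptions):

    /* Compiled with: gcc -O2 -march=armv9.5-a+cmpbr */
    int taken(void);
    int not_taken(void);

    int f(int x) {
      /* 42 is in the CB immediate range (0 to 63), so instead of the
         usual two-instruction sequence
             cmp   w0, 42
             beq   .L1
         a single compare-and-branch can be emitted:
             cbeq  w0, 42, .L1  */
      return (x == 42) ? taken() : not_taken();
    }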

[PATCH 3/8] AArch64: rename branch instruction rules

2025-05-07 Thread Karl Meakin
Give the `define_insn` rules used in lowering `cbranch4` to RTL
more descriptive and consistent names: from now on, each rule is named
after the AArch64 instruction that it generates. Also add comments to
document each rule.

gcc/ChangeLog:

* config/aarch64/aarch64.md (condjump): rename to ...
(aarch64_bcond): ...here.
(*compare_condjump): rename to ...
(*aarch64_bcond_wide_imm): ...here.
(restore_stack_nonlocal): handle rename.
(stack_protect_combined_test): likewise.
* config/aarch64/aarch64-simd.md (cbranch4): likewise.
* config/aarch64/aarch64-sme.md (aarch64_restore_za): likewise.
* config/aarch64/aarch64.cc (aarch64_gen_test_and_branch): likewise.
---
 gcc/config/aarch64/aarch64-simd.md |  2 +-
 gcc/config/aarch64/aarch64-sme.md  |  3 ++-
 gcc/config/aarch64/aarch64.cc  |  2 +-
 gcc/config/aarch64/aarch64.md  | 15 +--
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e2afe87e513..197a5f65f34 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3946,7 +3946,7 @@ (define_expand "cbranch4"
 
   rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
   rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
-  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+  emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[3]));
   DONE;
 })
 
diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
index c49affd0dd3..6a7c31acf0a 100644
--- a/gcc/config/aarch64/aarch64-sme.md
+++ b/gcc/config/aarch64/aarch64-sme.md
@@ -389,7 +389,8 @@ (define_insn_and_split "aarch64_restore_za"
 auto label = gen_label_rtx ();
 auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM);
 emit_insn (gen_aarch64_read_tpidr2 (tpidr2));
-auto jump = emit_likely_jump_insn (gen_aarch64_cbnedi1 (tpidr2, label));
+auto jump = emit_likely_jump_insn (
+   gen_aarch64_cbnedi1 (tpidr2, label));
 JUMP_LABEL (jump) = label;
 
 aarch64_restore_za (operands[0]);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..c0afdeb87ee 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2879,7 +2879,7 @@ aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
   emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
   rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
   rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
-  return gen_condjump (x, cc_reg, label);
+  return gen_aarch64_bcond (x, cc_reg, label);
 }
   return gen_aarch64_tb (code, mode, mode,
 x, gen_int_mode (bitnum, mode), label);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 45b2283c5c0..23775ec58ca 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -740,7 +740,8 @@ (define_expand "cbranchcc4"
   ""
 )
 
-(define_insn "condjump"
+;; Emit `B`, assuming that the condition is already in the CC register.
+(define_insn "aarch64_bcond"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
[(match_operand 1 "cc_register")
 (const_int 0)])
@@ -780,7 +781,7 @@ (define_insn "condjump"
 ;; sub x0, x1, #(CST & 0xfff000)
 ;; subsx0, x0, #(CST & 0x000fff)
 ;; b .Label
-(define_insn_and_split "*compare_condjump"
+(define_insn_and_split "*aarch64_bcond_wide_imm"
   [(set (pc) (if_then_else (EQL
 (match_operand:GPI 0 "register_operand" "r")
 (match_operand:GPI 1 "aarch64_imm24" "n"))
@@ -801,11 +802,12 @@ (define_insn_and_split "*compare_condjump"
 rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
 rtx cmp_rtx = gen_rtx_fmt_ee (, mode,
  cc_reg, const0_rtx);
-emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
+emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2]));
 DONE;
   }
 )
 
+;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ`
 (define_insn "aarch64_cb1"
   [(set (pc) (if_then_else (EQL
 (match_operand:GPI 0 "register_operand" "r")
@@ -832,6 +834,7 @@ (define_insn "aarch64_cb1"
  (const_int 1)))]
 )
 
+;; For an LT/GE comparison against zero, emit `TBZ`/`TBNZ`
 (define_insn "*cb1"
   [(set (pc) (if_then_else (LTGE
 (match_operand:ALLI 0 "register_operand" "r")
@@ -1325,13 +1328,13 @@ (define_expand "restore_stack_nonlocal"
   emit_insn (gen_subdi3_compare1 (gcs_now, gcs_old, gcs_now));
   rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
   rtx cmp_rtx = gen_rtx_fmt_ee (EQ, DImode, cc_reg, const0_rtx);
-  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, done_label));
+  

[PATCH 0/8] AArch64: CMPBR support

2025-05-07 Thread Karl Meakin
This patch series adds support for the CMPBR extension. It includes the
new `+cmpbr` option and rules to generate the new instructions when
lowering conditional branches.

Karl Meakin (8):
  AArch64: place branch instruction rules together
  AArch64: reformat branch instruction rules
  AArch64: rename branch instruction rules
  AArch64: add constants for branch displacements
  AArch64: make `far_branch` attribute a boolean
  AArch64: recognize `+cmpbr` option
  AArch64: precommit test for CMPBR instructions
  AArch64: rules for CMPBR instructions

 .../aarch64/aarch64-option-extensions.def |2 +
 gcc/config/aarch64/aarch64-simd.md|2 +-
 gcc/config/aarch64/aarch64-sme.md |3 +-
 gcc/config/aarch64/aarch64.cc |2 +-
 gcc/config/aarch64/aarch64.h  |3 +
 gcc/config/aarch64/aarch64.md |  557 +---
 gcc/config/aarch64/iterators.md   |5 +
 gcc/config/aarch64/predicates.md  |   17 +
 gcc/doc/invoke.texi   |3 +
 gcc/testsuite/gcc.target/aarch64/cmpbr.c  | 1238 +
 10 files changed, 1615 insertions(+), 217 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c

-- 
2.45.2



Re: [PATCH] libstdc++: Use _Padding_sink in __formatter_chrono to produce padded output.

2025-05-07 Thread Tomasz Kaminski
On Wed, May 7, 2025 at 1:01 PM Tomasz Kamiński  wrote:

> The formatting code is extracted into an _M_format_to function that produces
> output to the specified iterator. This function is now invoked either with
> __fc.out() directly (if no width is specified) or with _Padding_sink::out().
>
> This avoids formatting to a temporary string if no padding is requested,
> and minimizes allocations otherwise. For more details see the commit message
> of r16-142-g01e5ef3e8b91288f5d387a27708f9f8979a50edf.
>
> This should not increase the number of instantiations, as the implementation
> only produces basic_format_context with _Sink_iter as the iterator, which is
> also the _Padding_sink iterator.
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/chrono_io.h (__formatter_chrono::_M_format_to):
> Extracted from _M_format.
> (__formatter_chrono::_M_format): Use _Padding_sink and delegate
> to _M_format_to.
> ---
> I have checked that there are no other calls to out() in this file,
> so _M_format_to uses only __out, and not iterator from __fc.
> Testing on x86_64-linux. OK for trunk?
>
All tests passed.

>
>  libstdc++-v3/include/bits/chrono_io.h | 55 ++-
>  1 file changed, 20 insertions(+), 35 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/chrono_io.h b/libstdc++-v3/include/bits/chrono_io.h
> index 620227a9f35..ace8b9f2629 100644
> --- a/libstdc++-v3/include/bits/chrono_io.h
> +++ b/libstdc++-v3/include/bits/chrono_io.h
> @@ -503,9 +503,7 @@ namespace __format
> _M_format(const _Tp& __t, _FormatContext& __fc,
>   bool __is_neg = false) const
> {
> - auto __first = _M_spec._M_chrono_specs.begin();
> - const auto __last = _M_spec._M_chrono_specs.end();
> - if (__first == __last)
> + if (_M_spec._M_chrono_specs.empty())
> return _M_format_to_ostream(__t, __fc, __is_neg);
>
>  #if defined _GLIBCXX_USE_NL_LANGINFO_L && __CHAR_BIT__ == 8
> @@ -525,29 +523,29 @@ namespace __format
> __fc._M_loc =  __with_encoding_conversion(__loc);
> }
>  #endif
> -
> - _Sink_iter<_CharT> __out;
> - __format::_Str_sink<_CharT> __sink;
> - bool __write_direct = false;
> - if constexpr (is_same_v<typename _FormatContext::iterator,
> -_Sink_iter<_CharT>>)
> -   {
> - if (_M_spec._M_width_kind == __format::_WP_none)
> -   {
> - __out = __fc.out();
> - __write_direct = true;
> -   }
> - else
> -   __out = __sink.out();
> -   }
> - else
> -   __out = __sink.out();
> -
>   // formatter passes the correct value of __is_neg
>   // for durations but for hh_mm_ss we decide it here.
>   if constexpr (__is_specialization_of<_Tp, chrono::hh_mm_ss>)
> __is_neg = __t.is_negative();
>
> + const size_t __padwidth = _M_spec._M_get_width(__fc);
> + if (__padwidth == 0)
> +   return _M_format_to(__t, __fc.out(), __fc, __is_neg);
> +
> + using _Out = typename _FormatContext::iterator;
> + _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth);
> + _M_format_to(__t, __sink.out(), __fc, __is_neg);
> + return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
> +   }
> +
> +  template
> +   _Out
> +   _M_format_to(const _Tp& __t, _Out __out, _FormatContext& __fc,
> +bool __is_neg) const
> +   {
> + auto __first = _M_spec._M_chrono_specs.begin();
> + const auto __last = _M_spec._M_chrono_specs.end();
> +
>   auto __print_sign = [&__is_neg, &__out] {
> if constexpr (chrono::__is_duration_v<_Tp>
> || __is_specialization_of<_Tp,
> chrono::hh_mm_ss>)
> @@ -699,20 +697,7 @@ namespace __format
> }
> }
>   while (__first != __last);
> -
> - if constexpr (is_same_v<typename _FormatContext::iterator,
> -_Sink_iter<_CharT>>)
> -   if (__write_direct)
> - return __out;
> -
> - auto __str = __sink.view();
> - size_t __width;
> - if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
> -   __width = __unicode::__field_width(__str);
> - else
> -   __width = __str.size();
> - return __format::__write_padded_as_spec(__str, __width,
> - __fc, _M_spec);
> + return std::move(__out);
> }
>
>_ChronoSpec<_CharT> _M_spec;
> --
> 2.49.0
>
>
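
To illustrate the two paths this patch distinguishes, a small usage sketch
(my own example, not part of the patch; requires a C++20 standard library
with chrono formatting):

    #include <chrono>
    #include <format>
    #include <iostream>

    int main() {
      std::chrono::seconds s{75};
      // No width in the spec: _M_format_to now writes straight to
      // __fc.out(), with no temporary buffer at all.
      std::cout << std::format("{:%M:%S}", s) << '\n';      // "01:15"
      // Width + alignment: output goes through _Padding_sink, which
      // buffers only as much as the padding decision needs.
      std::cout << std::format("[{:>10%M:%S}]", s) << '\n'; // "[     01:15]"
    }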


Re: [PATCH 2/8] AArch64: reformat branch instruction rules

2025-05-07 Thread Kyrylo Tkachov



> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Make the formatting of the RTL templates in the rules for branch
> instructions more consistent with each other.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (cbranch4): reformat.
> (cbranchcc4): likewise.
> (condjump): likewise.
> (*compare_condjump): likewise.
> (aarch64_cb1): likewise.
> (*cb1): likewise.
> (tbranch_3): likewise.
> (@aarch64_tb): likewise.

Ok with similar comments on Changelog entries as in 1/8.

Thanks,
Kyrill

> ---
> gcc/config/aarch64/aarch64.md | 82 ++-
> 1 file changed, 42 insertions(+), 40 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 4d556d886bc..45b2283c5c0 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -706,7 +706,7 @@ (define_expand "cbranch4"
>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>[(match_operand:GPI 1 "register_operand")
> (match_operand:GPI 2 "aarch64_plus_operand")])
> -   (label_ref (match_operand 3 "" ""))
> +   (label_ref (match_operand 3))
>   (pc)))]
>   ""
>   "
> @@ -717,34 +717,34 @@ (define_expand "cbranch4"
> )
> 
> (define_expand "cbranch4"
> -  [(set (pc) (if_then_else
> - (match_operator 0 "aarch64_comparison_operator"
> - [(match_operand:GPF_F16 1 "register_operand")
> -  (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
> - (label_ref (match_operand 3 "" ""))
> - (pc)))]
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand:GPF_F16 1 "register_operand")
> + (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
> +   (label_ref (match_operand 3))
> +   (pc)))]
>   ""
> -  "
> +  {
>   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
> operands[2]);
>   operands[2] = const0_rtx;
> -  "
> +  }
> )
> 
> (define_expand "cbranchcc4"
> -  [(set (pc) (if_then_else
> -  (match_operator 0 "aarch64_comparison_operator"
> -   [(match_operand 1 "cc_register")
> -(match_operand 2 "const0_operand")])
> -  (label_ref (match_operand 3 "" ""))
> -  (pc)))]
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand 1 "cc_register")
> + (match_operand 2 "const0_operand")])
> +   (label_ref (match_operand 3))
> +   (pc)))]
>   ""
> -  "")
> +  ""
> +)
> 
> (define_insn "condjump"
>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> -[(match_operand 1 "cc_register" "") (const_int 0)])
> -   (label_ref (match_operand 2 "" ""))
> +[(match_operand 1 "cc_register")
> + (const_int 0)])
> +   (label_ref (match_operand 2))
>   (pc)))]
>   ""
>   {
> @@ -782,9 +782,9 @@ (define_insn "condjump"
> ;; b .Label
> (define_insn_and_split "*compare_condjump"
>   [(set (pc) (if_then_else (EQL
> -  (match_operand:GPI 0 "register_operand" "r")
> -  (match_operand:GPI 1 "aarch64_imm24" "n"))
> -   (label_ref:P (match_operand 2 "" ""))
> + (match_operand:GPI 0 "register_operand" "r")
> + (match_operand:GPI 1 "aarch64_imm24" "n"))
> +   (label_ref:P (match_operand 2))
>   (pc)))]
>   "!aarch64_move_imm (INTVAL (operands[1]), mode)
>&& !aarch64_plus_operand (operands[1], mode)
> @@ -807,9 +807,10 @@ (define_insn_and_split "*compare_condjump"
> )
> 
> (define_insn "aarch64_cb1"
> -  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
> - (const_int 0))
> -   (label_ref (match_operand 1 "" ""))
> +  [(set (pc) (if_then_else (EQL
> + (match_operand:GPI 0 "register_operand" "r")
> + (const_int 0))
> +   (label_ref (match_operand 1))
>   (pc)))]
>   "!aarch64_track_speculation"
>   {
> @@ -832,9 +833,10 @@ (define_insn "aarch64_cb1"
> )
> 
> (define_insn "*cb1"
> -  [(set (pc) (if_then_else (LTGE (match_operand:ALLI 0 "register_operand" 
> "r")
> - (const_int 0))
> -   (label_ref (match_operand 1 "" ""))
> +  [(set (pc) (if_then_else (LTGE
> + (match_operand:ALLI 0 "register_operand" "r")
> + (const_int 0))
> +   (label_ref (match_operand 1))
>   (pc)))
>(clobber (reg:CC CC_REGNUM))]
>   "!aarch64_track_speculation"
> @@ -875,11 +877,11 @@ (define_insn "*cb1"
> ;; ---
> 
> (define_expand "tbranch_3"
> -  [(set (pc) (if_then_else
> -  (EQL (match_operand:SHORT 0 "register_operand")
> -   (match_operand 1 "const0_operand"))
> -  (label_ref (match_operand 2 ""))
> -  (pc)))]
> +  [(set (pc) (if_then_else (EQL
> + (match_operand:SHORT 0 "register_operand")
> + (match_operand 1 "const0_operand"))
> +   (label_ref (match_operand 2 ""))
> +   (pc)))]
>   ""
> {
>   rtx bitvalue = gen_reg_rtx (mode);
> @@ -893,14 +895,14 @@ (define_expand "tbranch_3"
> })
> 
> (define_insn "@aarch64_tb"
> -  [(set (pc) (if_then_else
> -  (EQL (zero_extract:GPI (match_operand:ALLI 0 "register_operand" "r")
> - (const_int 1)
> - (match_operand 1
> -

Re: [PATCH] [PR117978] AArch64: Fold SVE load/store with certain ptrue patterns to LDR/STR.

2025-05-07 Thread Richard Sandiford
Jennifer Schmitz  writes:
> @@ -3698,6 +3706,24 @@ aarch64_partial_ptrue_length (rtx_vector_builder &builder,
>return vl;
>  }
>  
> +/* Return:
> +
> +  * -1 if all bits of PRED are set
> +  * N if PRED has N leading set bits followed by all clear bits
> +  * 0 if PRED does not have any of these forms.  */

Sorry for the formatting nits, but the indentation looks off here.
Each nonempty line should be indented by three spaces rather than two.

> [...]
> @@ -23526,6 +23567,39 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info *info,
>return false;
>  }
>  
> +/* Try to optimize the expansion of a maskload or maskstore with
> +  the operands in OPERANDS, given that the vector being loaded or
> +  stored has mode MODE.  Return true on success or false if the normal
> +  expansion should be used.  */

Same here.

> +
> +bool
> +aarch64_expand_maskloadstore (rtx *operands, machine_mode mode)
> +{
> +  /* If the predicate in operands[2] is a patterned SVE PTRUE predicate
> +   with patterns VL1, VL2, VL4, VL8, or VL16 and at most the bottom
> +   128 bits are loaded/stored, emit an ASIMD load/store.  */

Same here (five spaces rather than three).

OK with those nits fixed, thanks.

Richard
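
As a concrete illustration of that contract (my own example, not part of the
patch), for a predicate viewed as 8 element-sized bits:

    /* aarch64_partial_ptrue_length, illustrative inputs and results:
       1 1 1 1 1 1 1 1   ->  -1   (all bits set)
       1 1 1 0 0 0 0 0   ->   3   (3 leading set bits, then all clear)
       1 0 1 0 0 0 0 0   ->   0   (neither of the above forms)  */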


Re: [PATCH] [PR117978] AArch64: Fold SVE load/store with certain ptrue patterns to LDR/STR.

2025-05-07 Thread Jennifer Schmitz


> On 6 May 2025, at 23:02, Richard Sandiford  wrote:
> 
> 
> Jennifer Schmitz  writes:
>> About the tests: Non-power-of-2 patterns are already being tested in
>> gcc.target/aarch64/sve/acle/general/whilelt_5.c.
> 
> OK
> 
>> For the case of svptrue_b16 ()
>> with 8-bit load, I added a test case for it. Currently, it has a single test 
>> case,
>> but if necessary I can add more tests for other data types and for stores as 
>> well.
> 
> Nah, that should be fine.
> 
>> 
>> I bootstrapped and tested and the check-function-bodies test in 
>> gcc.target/aarch64/pr117557.c
>> is currently failing.
>> The current GCC trunk produces:
>> f:
>>  add x3, x1, 8
>>  cmp x2, x3
>>  add x3, x2, 60
>>  ccmp x1, x3, 2, cc
>>  bcc .L2
>>  ptrue p7.h, vl8
>>  index z31.s, #0, #8
>>  ld1b z0.h, p7/z, [x1]
>>  punpklo p6.h, p7.b
>>  lsl z0.h, z0.h, #2
>>  punpkhi p7.h, p7.b
>>  uunpklo z1.s, z0.h
>>  uunpkhi z0.s, z0.h
>>  ld1w z29.s, p6/z, [x0, z1.s, sxtw]
>>  ld1w z30.s, p7/z, [x0, z0.s, sxtw]
>>  st1w z29.s, p6, [x2, z31.s, sxtw]
>>  incb x2, all, mul #2
>>  st1w z30.s, p7, [x2, z31.s, sxtw]
>>  ret
>>  …
>> 
>> WITH my patch it produces:
>> f:
>>  add x3, x1, 8
>>  cmp x2, x3
>>  add x3, x2, 60
>>  ccmp    x1, x3, 2, cc
>>  bcc .L2
>>  ldr d31, [x1]
>>  addvl   sp, sp, #-1
>>  ptrue   p5.b, all
>>  addpl   x1, sp, #4
>>  ptrue   p7.h, vl8
>>  punpklo p6.h, p7.b
>>  punpkhi p7.h, p7.b
>>  str d31, [x1]
>>  ld1b    z31.h, p5/z, [sp, #1, mul vl]
>>  lsl z31.h, z31.h, #2
>>  uunpklo z30.s, z31.h
>>  uunpkhi z31.s, z31.h
>>  ld1w    z29.s, p6/z, [x0, z30.s, sxtw]
>>  ld1w    z30.s, p7/z, [x0, z31.s, sxtw]
>>  index   z31.s, #0, #8
>>  st1w    z29.s, p6, [x2, z31.s, sxtw]
>>  incb    x2, all, mul #2
>>  st1w    z30.s, p7, [x2, z31.s, sxtw]
>>  addvl   sp, sp, #1
>>  ret
>>  …
>> 
>> The patch seems to fold the ptrue p7.h, vl8 and ld1b z0.h, p7/z, [x1] to an 
>> LDR,
>> but because the ptrue predicate in p7 is needed again later, this is not 
>> really
>> an optimization. Could we prevent the fold somehow in case the predicate is 
>> used
>> multiple times?
> 
> Using LDR and STR is still better in many cases, because it has one
> fewer register to rename and because there's the possibility that the
> compiler might form LDPs and STPs.
The test case now passes. Adding the aarch64_classify_vector_mode (mode) ==
VEC_SVE_DATA check again prevented the application of the optimization in
this case, because mode VNx8QI had been used.
> 
>> [...]
>> @@ -3698,6 +3706,19 @@ aarch64_partial_ptrue_length (rtx_vector_builder &builder,
>>   return vl;
>> }
>> 
>> +/* For predicate PRED, return the number of active lanes.  */
> 
> That's not really what the function does.  How about:
> 
> /* Return:
> 
>   * -1 if all bits of PRED are set
>   * N if PRED has N leading set bits followed by all clear bits
>   * 0 if PRED does not have any of these forms.  */
> 
Done.
>> +int
>> +aarch64_partial_ptrue_length (rtx pred)
>> +{
>> +  rtx_vector_builder builder;
>> +  if (!aarch64_get_sve_pred_bits (builder, pred))
>> +return 0;
>> +
>> +  auto elt_size = vector_element_size (GET_MODE_BITSIZE (GET_MODE (pred)),
>> +GET_MODE_NUNITS (GET_MODE (pred)));
>> +  return aarch64_partial_ptrue_length (builder, elt_size);
>> +}
>> +
>> /* See if there is an svpattern that encodes an SVE predicate of mode
>>PRED_MODE in which the first VL bits are set and the rest are clear.
>>Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
>> @@ -6410,8 +6431,32 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
>>   return gen_rtx_MEM (mode, force_reg (Pmode, addr));
>> }
>> 
>> -/* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
>> -   that is known to contain PTRUE.  */
>> +/* Emit a load/store from a subreg of SRC to a subreg of DEST.
>> +   The subregs have mode NEW_MODE. Use only for reg<->mem moves.  */
>> +void
>> +aarch64_emit_load_store_through_mode (rtx dest, rtx src, machine_mode new_mode)
>> +{
>> +  gcc_assert ((REG_P (src) && MEM_P (dest))
>> +   || (REG_P (dest) && MEM_P (src)));
> 
> We should allow subregs too, so how about:
> 
>  gcc_assert ((MEM_P (dest) && register_operand (src, VOIDmode))
>  || (MEM_P (src) && register_operand (dest, VOIDmode)));
> 
Done.
>> +  auto mode = GET_MODE (dest);
>> +  auto int_mode = aarch64_sve_int_mode (mode);
>> +  if (MEM_P (src))
>> +{
>> +  rtx tmp = force_reg (new_mode, adjust_address (src, new_mode, 0));
>> +  tmp = force_lowpart_subreg (int_mode, tmp, new_mode);
>> +  emit_move_insn (dest, force_lowpart_subreg (mode, tmp, int_mode));
>> +}
>> +  else
>> +{
>> +  src = force_lowpart_subreg (int

Re: [PATCH 5/8] AArch64: make `far_branch` attribute a boolean

2025-05-07 Thread Kyrylo Tkachov



> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> The `far_branch` attribute only ever takes the values 0 or 1, so make it
> a `no/yes` valued string attribute instead.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (far_branch): replace 0/1 with
> no/yes.
> (aarch64_bcond): handle rename.
> (aarch64_cb1): likewise.
> (*cb1): likewise.
> (@aarch64_tb): likewise.

Ok (with fixed Changelog entries).
Thanks,
Kyrill

> ---
> gcc/config/aarch64/aarch64.md | 22 ++
> 1 file changed, 10 insertions(+), 12 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index ca5bd96a754..256df0dcc04 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -561,9 +561,7 @@ (define_attr "enabled" "no,yes"
> ;; Attribute that specifies whether we are dealing with a branch to a
> ;; label that is far away, i.e. further away than the maximum/minimum
> ;; representable in a signed 21-bits number.
> -;; 0 :=: no
> -;; 1 :=: yes
> -(define_attr "far_branch" "" (const_int 0))
> +(define_attr "far_branch" "no,yes" (const_string "no"))
> 
> ;; Attribute that specifies whether the alternative uses MOVPRFX.
> (define_attr "movprfx" "no,yes" (const_string "no"))
> @@ -791,8 +789,8 @@ (define_insn "aarch64_bcond"
>   (const_int BRANCH_LEN_N_1Mib))
>   (lt (minus (match_dup 2) (pc))
>   (const_int BRANCH_LEN_P_1Mib)))
> -  (const_int 0)
> -  (const_int 1)))]
> +  (const_string "no")
> +  (const_string "yes")))]
> )
> 
> ;; For a 24-bit immediate CST we can optimize the compare for equality
> @@ -858,8 +856,8 @@ (define_insn "aarch64_cb1"
>   (const_int BRANCH_LEN_N_1Mib))
>   (lt (minus (match_dup 2) (pc))
>   (const_int BRANCH_LEN_P_1Mib)))
> -  (const_int 0)
> -  (const_int 1)))]
> +  (const_string "no")
> +  (const_string "yes")))]
> )
> 
> ;; For an LT/GE comparison against zero, emit `TBZ`/`TBNZ`
> @@ -874,7 +872,7 @@ (define_insn "*cb1"
>   {
> if (get_attr_length (insn) == 8)
>   {
> - if (get_attr_far_branch (insn) == 1)
> + if (get_attr_far_branch (insn) == FAR_BRANCH_YES)
>  return aarch64_gen_far_branch (operands, 1, "Ltb",
> "\\t%0, , ");
> else
> @@ -903,8 +901,8 @@ (define_insn "*cb1"
>   (const_int BRANCH_LEN_N_1Mib))
>   (lt (minus (match_dup 1) (pc))
>   (const_int BRANCH_LEN_P_1Mib)))
> -  (const_int 0)
> -  (const_int 1)))]
> +  (const_string "no")
> +  (const_string "yes")))]
> )
> 
> ;; ---
> @@ -968,8 +966,8 @@ (define_insn "@aarch64_tb"
>   (const_int BRANCH_LEN_N_1Mib))
>   (lt (minus (match_dup 2) (pc))
>   (const_int BRANCH_LEN_P_1Mib)))
> -  (const_int 0)
> -  (const_int 1)))]
> +  (const_string "no")
> +  (const_string "yes")))]
> 
> )
> 
> -- 
> 2.45.2
> 
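
As background, the relaxation this attribute tracks works like the following
sketch (my own illustration, not from the patch): B.cond only reaches
+/-1MiB, so for a further-away label GCC emits an inverted branch over an
unconditional B, which reaches +/-128MiB.

    /* near label, far_branch "no", length 4:
           b.eq  .Ltarget

       far label, far_branch "yes", length 8:
           b.ne  .Lskip        // inverted condition
           b     .Ltarget
       .Lskip:                                  */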



Re: [PATCH 6/8] AArch64: recognize `+cmpbr` option

2025-05-07 Thread Kyrylo Tkachov


> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Add the `+cmpbr` option to enable the FEAT_CMPBR architectural
> extension.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64-option-extensions.def (cmpbr): new
> option.
> * config/aarch64/aarch64.h (TARGET_CMPBR): new macro.
> * doc/invoke.texi (cmpbr): new option.

Looks ok to me.
Not a blocker here, but does this need any FMV handling? I guess this is one of 
those transparent codegen features and maybe doesn’t need FMV clones…
Thanks,
Kyrill


> ---
> gcc/config/aarch64/aarch64-option-extensions.def | 2 ++
> gcc/config/aarch64/aarch64.h | 3 +++
> gcc/doc/invoke.texi  | 3 +++
> 3 files changed, 8 insertions(+)
> 
> diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
> index dbbb021f05a..1c3e69799f5 100644
> --- a/gcc/config/aarch64/aarch64-option-extensions.def
> +++ b/gcc/config/aarch64/aarch64-option-extensions.def
> @@ -249,6 +249,8 @@ AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops")
> 
> AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc")
> 
> +AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr")
> +
> AARCH64_OPT_EXTENSION("lse128", LSE128, (LSE), (), (), "lse128")
> 
> AARCH64_OPT_EXTENSION("d128", D128, (LSE128), (), (), "d128")
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index e8bd8c73c12..d5c4a42e96d 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -410,6 +410,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
> /* CSSC instructions are enabled through +cssc.  */
> #define TARGET_CSSC AARCH64_HAVE_ISA (CSSC)
> 
> +/* CB instructions are enabled through +cmpbr.  */
> +#define TARGET_CMPBR AARCH64_HAVE_ISA (CMPBR)
> +
> /* Make sure this is always defined so we don't have to check for ifdefs
>but rather use normal ifs.  */
> #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 32bc45725de..3f05e5e0e34 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -22252,6 +22252,9 @@ Enable the FlagM2 flag conversion instructions.
> Enable the Pointer Authentication Extension.
> @item cssc
> Enable the Common Short Sequence Compression instructions.
> +@item cmpbr
> +Enable the shorter compare and branch instructions, @code{cbb}, @code{cbh} 
> and
> +@code{cb}.
> @item sme
> Enable the Scalable Matrix Extension.  This is only supported when SVE2 is 
> also
> enabled.
> -- 
> 2.45.2
> 
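
For reference, with this patch the extension is enabled like any other
architecture modifier; an illustrative command line and per-function
attribute (my own examples, not from the patch):

    /* gcc -O2 -march=armv9.5-a+cmpbr ...  enables TARGET_CMPBR globally,
       or per function: */
    __attribute__((target("arch=armv9.5-a+cmpbr")))
    int f(int x) { return x == 0; }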



Re: [PATCH] RISC-V: Minimal support for sdtrig and ssstrict extensions.

2025-05-07 Thread Dongyan Chen

Okay, thanks.

Dongyan Chen

在 2025/5/7 7:11, Jeff Law 写道:



On 4/18/25 2:47 AM, Dongyan Chen wrote:

This patch supports the sdtrig and ssstrict extensions [1], enabling GCC to
recognize and process them correctly at compile time.


[1]https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc 



gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: New extension.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-45.c: New test.
Thanks.  I renamed the test to arch-47.c since -45 and -46 were
already taken, and pushed this to the trunk.


jeff




Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Kyrylo Tkachov


> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Commit the test file `cmpbr.c` before rules for generating the new
> instructions are added, so that the changes in codegen are more obvious
> in the next commit.

I guess that’s an LLVM best practice.
In GCC, since we have the check-function-bodies mechanism, we usually prefer
to include the relevant test together with the patch that adds the optimization.
But this is not wrong either.


> 
> gcc/testsuite/ChangeLog:
> 
> * gcc.target/aarch64/cmpbr.c: New test.
> ---
> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
> 1 file changed, 1378 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> new file mode 100644
> index 000..728d6ead91c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> @@ -0,0 +1,1378 @@
> +/* Test that the instructions added by FEAT_CMPBR are emitted */
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */

As you’ll be adding new instructions to the compiler it’d be good to make this
a dg-do assemble test where possible.
For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use it
like so to fall back to dg-do compile when the assembler is too old:
/* { dg-do assemble { target aarch64_asm_cmpbr_ok } } */
/* { dg-do compile { target { ! aarch64_asm_cmpbr_ok } } } */
Look in lib/target-supports.exp for “aarch64_asm” for how to define it.

Ok otherwise.
Thanks,
Kyrill

> +
> +#include 
> +
> +typedef uint8_t u8;
> +typedef int8_t i8;
> +
> +typedef uint16_t u16;
> +typedef int16_t i16;
> +
> +typedef uint32_t u32;
> +typedef int32_t i32;
> +
> +typedef uint64_t u64;
> +typedef int64_t i64;
> +
> +int taken();
> +int not_taken();
> +
> +#define COMPARE(ty, name, op, rhs)                                           \
> +  int ty##_x0_##name##_##rhs(ty x0, ty x1) {                                 \
> +    return (x0 op rhs) ? taken() : not_taken();                              \
> +  }
> +
> +#define COMPARE_ALL(unsigned_ty, signed_ty, rhs)                             \
> +  COMPARE(unsigned_ty, eq, ==, rhs);                                         \
> +  COMPARE(unsigned_ty, ne, !=, rhs);                                         \
> +                                                                             \
> +  COMPARE(unsigned_ty, ult, <, rhs);                                         \
> +  COMPARE(unsigned_ty, ule, <=, rhs);                                        \
> +  COMPARE(unsigned_ty, ugt, >, rhs);                                         \
> +  COMPARE(unsigned_ty, uge, >=, rhs);                                        \
> +                                                                             \
> +  COMPARE(signed_ty, slt, <, rhs);                                           \
> +  COMPARE(signed_ty, sle, <=, rhs);                                          \
> +  COMPARE(signed_ty, sgt, >, rhs);                                           \
> +  COMPARE(signed_ty, sge, >=, rhs);
> +
> +//  CBB (register) 
> +COMPARE_ALL(u8, i8, x1);
> +
> +//  CBH (register) 
> +COMPARE_ALL(u16, i16, x1);
> +
> +//  CB (register) 
> +COMPARE_ALL(u32, i32, x1);
> +COMPARE_ALL(u64, i64, x1);
> +
> +//  CB (immediate) 
> +COMPARE_ALL(u32, i32, 42);
> +COMPARE_ALL(u64, i64, 42);
> +
> +//  Special cases 
> +// CBB and CBH cannot have immediate operands. Instead we have to do a MOV+CB
> +COMPARE_ALL(u8, i8, 42);
> +COMPARE_ALL(u16, i16, 42);
> +
> +// 65 is out of the range for immediate operands (0 to 63).
> +// * For 8/16-bit types, use a MOV+CB as above.
> +// * For 32/64-bit types, use a CMP+B instead, because
> +//   B has a longer range than CB.
> +COMPARE_ALL(u8, i8, 65);
> +COMPARE_ALL(u16, i16, 65);
> +COMPARE_ALL(u32, i32, 65);
> +COMPARE_ALL(u64, i64, 65);
> +
> +// Comparisons against zero can use the wzr/xzr register.
> +COMPARE_ALL(u8, i8, 0);
> +COMPARE_ALL(u16, i16, 0);
> +COMPARE_ALL(u32, i32, 0);
> +COMPARE_ALL(u64, i64, 0);
> +
> +/*
> +** u8_x0_eq_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** beq .L4
> +** b not_taken
> +** b taken
> +*/
> +
> +/*
> +** u8_x0_ne_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** beq .L6
> +** b taken
> +** b not_taken
> +*/
> +
> +/*
> +** u8_x0_ult_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** bls .L8
> +** b taken
> +** b not_taken
> +*/
> +
> +/*
> +** u8_x0_ule_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** bcc .L10
> +** b taken
> +** b not_taken
> +*/
> +
> +/*
> +** u8_x0_ugt_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** bcs .L12
> +** b taken
> +** b not_taken
> +*/
> +
> +/*
> +** u8_x0_uge_x1:
> +** and w1, w1, 255
> +** cmp w1, w0, uxtb
> +** bhi .L14
> +** b taken

Re: [PATCH 8/8] AArch64: rules for CMPBR instructions

2025-05-07 Thread Kyrylo Tkachov


> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Add rules for lowering `cbranch4` to CBB/CBH/CB when the CMPBR
> extension is enabled.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (cbranch4): emit CMPBR
> instructions if possible.
> (cbranch4): new expand rule.
> (aarch64_cb): likewise.
> (aarch64_cb): likewise.
> * config/aarch64/iterators.md (cmpbr_suffix): new mode attr.
> * config/aarch64/predicates.md (const_0_to_63_operand): new
> predicate.
> (aarch64_cb_immediate): likewise.
> (aarch64_cb_operand): likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> * gcc.target/aarch64/cmpbr.c: update tests.
> ---
> gcc/config/aarch64/aarch64.md            |  87 +++-
> gcc/config/aarch64/iterators.md          |   5 +
> gcc/config/aarch64/predicates.md         |  17 +
> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 484 ---
> 4 files changed, 275 insertions(+), 318 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 256df0dcc04..73f3e062e57 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -720,18 +720,41 @@ (define_constants
> ;; Conditional jumps
> ;; ---
> 
> -(define_expand "cbranch4"
> +(define_expand "cbranch4"
>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>[(match_operand:GPI 1 "register_operand")
> (match_operand:GPI 2 "aarch64_plus_operand")])
>   (label_ref (match_operand 3))
>   (pc)))]
>   ""
> -  "
> -  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
> - operands[2]);
> -  operands[2] = const0_rtx;
> -  "
> +  {
> +  if (TARGET_CMPBR && aarch64_cb_operand (operands[2], mode))
> +{
> +  emit_jump_insn (gen_aarch64_cb (operands[0], operands[1],
> +operands[2], operands[3]));
> +  DONE;
> +}
> +  else
> +{
> +  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]),
> + operands[1], operands[2]);
> +  operands[2] = const0_rtx;
> +}
> +  }
> +)
> +
> +(define_expand "cbranch4"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand:SHORT 1 "register_operand")
> + (match_operand:SHORT 2 "aarch64_cb_short_operand")])
> +   (label_ref (match_operand 3))
> +   (pc)))]
> +  "TARGET_CMPBR"
> +  {
> +  emit_jump_insn (gen_aarch64_cb (operands[0], operands[1],
> + operands[2], operands[3]));
> +  DONE;
> +  }

If you just need to expand the standard cbranch4 name to this
pattern and you don’t need to adjust any of the operands, you shouldn’t need
the C code and the DONE; here.
This can be just:
(define_expand "cbranch4”
  [(set (pc) (if_then_else (match_operator 0 “aarch64_comparison_operator"
   [(match_operand:SHORT 1 "register_operand”)
 (match_operand:SHORT 2 "aarch64_cb_short_operand")])
   (label_ref (match_operand 3))
   (pc)))]
  “TARGET_CMPBR”
  {}

> )
> 
> (define_expand "cbranch4"
> @@ -758,6 +781,58 @@ (define_expand "cbranchcc4"
>   ""
> )
> 
> +;; Emit a `CB (register)` or `CB (immediate)` instruction.
> +(define_insn "aarch64_cb"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand:GPI 1 "register_operand")
> + (match_operand:GPI 2 "aarch64_cb_operand")])
> +   (label_ref (match_operand 3))
> +   (pc)))]

define_insn operands, particularly the register operands, need to have
constraints (e.g. an "r" constraint on the "register_operand" operands, as in
the existing branch patterns).

> +  "TARGET_CMPBR"
> +  "cb%m0\\t%1, %2, %l3";
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_N_1Kib))
> +   (lt (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_P_1Kib)))
> +  (const_int 4)
> +  (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_N_1Kib))
> +   (lt (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_P_1Kib)))
> +  (const_string "no")
> +  (const_string "yes")))]
> +)
> +
> +;; Emit a `CBB (register)` or `CBH (register)` instruction.
> +(define_insn "aarch64_cb"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand:SHORT 1 "register_operand")
> + (match_operand:SHORT 2 "aarch64_cb_short_operand")])
> +   (label_ref (match_operand 3))
> +   (pc)))]

Likewise.
Thanks,
Kyrill

> +  "TARGET_CMPBR"
> +  "cb%m0\\t%1, %2, %l3";
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_N_1Kib))
> +   (lt (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_P_1Kib)))
> +  (const_int 4)
> +  (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_N_1Kib))
> +   (lt (minus (match_dup 3) (pc))
> +   (const_int BRANCH_LEN_P_1Kib)))
> +  (const_string "no")
> +  (const_string "yes"

[committed] libgomp.fortran/map-alloc-comp-9{,-usm}.f90: Add unified_shared_memory variant

2025-05-07 Thread Tobias Burnus
Committed as r16-445-g9565076f9b8105.

This test supports mapping + accessing the vtab of the polymorphic variable
on the host. Obviously, this only works if the host pointer is device
accessible ("unified shared memory"). In principle, we want to check for
this - and enable some subtests. The enabling/disabling works, but there is
no simple USM check. Currently, only the host gets the extra tests; the new
test also checks it with devices, but unfortunately the USM requirement
unconditionally switches to self maps. Thus, while it is useful to test this
(hence, this commit), it does not check what we actually want ...

Tobias

PS: My plans for the future are to permit more fine tuning and some default
changes. In particular, if a device is an APU, it should by default use self
mapping. Additionally, the user should have the possibility to switch
between mapping and self mapping for devices. Depending on the access
pattern, the way USM is implemented, and the location of the host vs. device
process, copying (mapping) or direct access (self map) is faster.
Additionally, copying might run into storage-size issues.
Additionally, copying might run into storage-size issues.
commit 9565076f9b810541aeb63cb621d694326aa12216
Author: Tobias Burnus 
Date:   Wed May 7 13:46:51 2025 +0200

libgomp.fortran/map-alloc-comp-9{,-usm}.f90: Add unified_shared_memory variant

When host memory is device accessible - independently of whether mapping is
done or not (i.e. self map) - the 'vtab' pointer becomes accessible, which stores the
dynamic type's type and size information.

In principle, we want to test: USM available but mapping is still done, but
as there is no simple + reliable not-crashing way to test for this, those
checks are skipped in the (pre)existing test file map-alloc-comp-9.f90.

Or rather: those are only active with self-maps, which is currently only true
for the host.

This commit adds map-alloc-comp-9-usm.f90 which runs the same test with
'omp requires unified_shared_memory'.  While OpenMP permits both actual
mapping and self maps with this flag, it in theory covers the missing cases.
However, currently, GCC always uses self maps with USM. Still, having a
device-run self-maps check is better than nothing, even if it misses the
most interesting case.

libgomp/ChangeLog:

* testsuite/libgomp.fortran/map-alloc-comp-9.f90: Process differently
when USE_USM_REQUIREMENT is set.
* testsuite/libgomp.fortran/map-alloc-comp-9-usm.f90: New test.
---
 .../libgomp.fortran/map-alloc-comp-9-usm.f90  | 11 +++
 .../testsuite/libgomp.fortran/map-alloc-comp-9.f90| 19 +++
 2 files changed, 30 insertions(+)

diff --git a/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9-usm.f90 b/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9-usm.f90
new file mode 100644
index 000..90378c0e42a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9-usm.f90
@@ -0,0 +1,11 @@
+! { dg-additional-options "-cpp -DUSE_USM_REQUIREMENT=1 -Wno-openmp" }
+!
+! We silence the warning:
+!  Mapping of polymorphic list item '...' is unspecified behavior [-Wopenmp]
+!
+! Ensure that polymorphic mapping is diagnosed as undefined behavior
+! Ensure that static access to polymorphic variables works
+
+! Run map-alloc-comp-9.f90 in unified-shared-memory mode
+
+#include "map-alloc-comp-9.f90"
diff --git a/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9.f90 b/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9.f90
index 3cec39218f5..26c73d75c09 100644
--- a/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9.f90
+++ b/libgomp/testsuite/libgomp.fortran/map-alloc-comp-9.f90
@@ -1,8 +1,19 @@
+! { dg-additional-options "-cpp" }
+!
 ! Ensure that polymorphic mapping is diagnosed as undefined behavior
 ! Ensure that static access to polymorphic variables works
 
+! Some extended tests are only run with shared memory
+! To enforce this (where possible) on the device side:
+!   #define USE_USM_REQUIREMENT
+! which is done in map-alloc-comp-9-usm.f90
+
 subroutine test(case)
 implicit none(type, external)
+#ifdef USE_USM_REQUIREMENT
+  !$omp requires unified_shared_memory
+#endif
+
 type t
   integer :: x(4)
 end type t
@@ -73,10 +84,14 @@ var4%y2(2)%y%x%x = -7 * [,,,]
 var4%y2(2)%y%x2(1)%x = -8 * [,,,]
 var4%y2(2)%y%x2(2)%x = -9 * [,,,]
 
+#ifdef USE_USM_REQUIREMENT
+is_shared_mem = .true.
+#else
 is_shared_mem = .false.
 !$omp target map(to: is_shared_mem)
   is_shared_mem = .true.
 !$omp end target
+#endif
 
 if (case == 1) then
   ! implicit mapping
@@ -532,6 +547,10 @@ end subroutine test
 program main
   use omp_lib
   implicit none(type, external)
+#ifdef USE_USM_REQUIREMENT
+  !$omp requires unified_shared_memory
+#endif
+
   interface
 subroutine test(case)
   integer, value :: case


Re: [PATCH 0/8] AArch64: CMPBR support

2025-05-07 Thread Kyrylo Tkachov
Hi Karl,

> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> This patch series adds support for the CMPBR extension. It includes the
> new `+cmpbr` option and rules to generate the new instructions when
> lowering conditional branches.

Thanks for the series.
You didn’t state it explicitly, but have you run a bootstrap and testsuite run 
with this series?
It’s usually best to include testing information in the patches to help 
reviewers.

Thanks,
Kyrill 


> 
> Karl Meakin (8):
>  AArch64: place branch instruction rules together
>  AArch64: reformat branch instruction rules
>  AArch64: rename branch instruction rules
>  AArch64: add constants for branch displacements
>  AArch64: make `far_branch` attribute a boolean
>  AArch64: recognize `+cmpbr` option
>  AArch64: precommit test for CMPBR instructions
>  AArch64: rules for CMPBR instructions
> 
> .../aarch64/aarch64-option-extensions.def |2 +
> gcc/config/aarch64/aarch64-simd.md|2 +-
> gcc/config/aarch64/aarch64-sme.md |3 +-
> gcc/config/aarch64/aarch64.cc |2 +-
> gcc/config/aarch64/aarch64.h  |3 +
> gcc/config/aarch64/aarch64.md |  557 +---
> gcc/config/aarch64/iterators.md   |5 +
> gcc/config/aarch64/predicates.md  |   17 +
> gcc/doc/invoke.texi   |3 +
> gcc/testsuite/gcc.target/aarch64/cmpbr.c  | 1238 +
> 10 files changed, 1615 insertions(+), 217 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
> 
> -- 
> 2.45.2
> 



[PATCH] libfortran: Readd 15 accidentally removed libgfortran symbols [PR120152]

2025-05-07 Thread Jakub Jelinek
Hi!

The r15-4124-gc0002a675a92e76d change seems to have accidentally
dropped 5 source files from i_maxloc1_c, which resulted in dropping
15 GFORTRAN_8 symbols on x86_64 and 6 on i686.

The following patch adds them back, so that we export those symbols
again, fixing the ABI problem.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk
and 15.2?

2025-05-07  Jakub Jelinek  

PR libfortran/120152
* Makefile.am (i_maxloc1_c): Readd generated/maxloc1_4_i8.c,
generated/maxloc1_8_i8.c, generated/maxloc1_16_i8.c,
generated/maxloc1_4_i16.c, generated/maxloc1_8_i16.c.  Move
generated/maxloc1_16_i16.c entry earlier in the list.
* Makefile.in: Regenerated.

--- libgfortran/Makefile.am.jj  2025-04-08 14:10:04.882256861 +0200
+++ libgfortran/Makefile.am 2025-05-07 10:44:54.534238786 +0200
@@ -400,6 +400,12 @@ generated/maxloc1_16_i2.c \
 generated/maxloc1_4_i4.c \
 generated/maxloc1_8_i4.c \
 generated/maxloc1_16_i4.c \
+generated/maxloc1_4_i8.c \
+generated/maxloc1_8_i8.c \
+generated/maxloc1_16_i8.c \
+generated/maxloc1_4_i16.c \
+generated/maxloc1_8_i16.c \
+generated/maxloc1_16_i16.c \
 generated/maxloc1_4_m1.c \
 generated/maxloc1_8_m1.c \
 generated/maxloc1_16_m1.c \
@@ -414,7 +420,6 @@ generated/maxloc1_8_m8.c \
 generated/maxloc1_16_m8.c \
 generated/maxloc1_4_m16.c \
 generated/maxloc1_8_m16.c \
-generated/maxloc1_16_i16.c \
 generated/maxloc1_4_r4.c \
 generated/maxloc1_8_r4.c \
 generated/maxloc1_16_r4.c \
--- libgfortran/Makefile.in.jj  2025-05-07 10:47:40.361973724 +0200
+++ libgfortran/Makefile.in 2025-05-07 10:46:41.288779475 +0200
@@ -265,22 +265,24 @@ am__objects_8 = generated/maxloc1_4_i1.l
generated/maxloc1_16_i1.lo generated/maxloc1_4_i2.lo \
generated/maxloc1_8_i2.lo generated/maxloc1_16_i2.lo \
generated/maxloc1_4_i4.lo generated/maxloc1_8_i4.lo \
-   generated/maxloc1_16_i4.lo generated/maxloc1_4_m1.lo \
+   generated/maxloc1_16_i4.lo generated/maxloc1_4_i8.lo \
+   generated/maxloc1_8_i8.lo generated/maxloc1_16_i8.lo \
+   generated/maxloc1_4_i16.lo generated/maxloc1_8_i16.lo \
+   generated/maxloc1_16_i16.lo generated/maxloc1_4_m1.lo \
generated/maxloc1_8_m1.lo generated/maxloc1_16_m1.lo \
generated/maxloc1_4_m2.lo generated/maxloc1_8_m2.lo \
generated/maxloc1_16_m2.lo generated/maxloc1_4_m4.lo \
generated/maxloc1_8_m4.lo generated/maxloc1_16_m4.lo \
generated/maxloc1_4_m8.lo generated/maxloc1_8_m8.lo \
generated/maxloc1_16_m8.lo generated/maxloc1_4_m16.lo \
-   generated/maxloc1_8_m16.lo generated/maxloc1_16_i16.lo \
-   generated/maxloc1_4_r4.lo generated/maxloc1_8_r4.lo \
-   generated/maxloc1_16_r4.lo generated/maxloc1_4_r8.lo \
-   generated/maxloc1_8_r8.lo generated/maxloc1_16_r8.lo \
-   generated/maxloc1_4_r10.lo generated/maxloc1_8_r10.lo \
-   generated/maxloc1_16_r10.lo generated/maxloc1_4_r16.lo \
-   generated/maxloc1_8_r16.lo generated/maxloc1_16_r16.lo \
-   generated/maxloc1_4_r17.lo generated/maxloc1_8_r17.lo \
-   generated/maxloc1_16_r17.lo
+   generated/maxloc1_8_m16.lo generated/maxloc1_4_r4.lo \
+   generated/maxloc1_8_r4.lo generated/maxloc1_16_r4.lo \
+   generated/maxloc1_4_r8.lo generated/maxloc1_8_r8.lo \
+   generated/maxloc1_16_r8.lo generated/maxloc1_4_r10.lo \
+   generated/maxloc1_8_r10.lo generated/maxloc1_16_r10.lo \
+   generated/maxloc1_4_r16.lo generated/maxloc1_8_r16.lo \
+   generated/maxloc1_16_r16.lo generated/maxloc1_4_r17.lo \
+   generated/maxloc1_8_r17.lo generated/maxloc1_16_r17.lo
 am__objects_9 = generated/maxval_i1.lo generated/maxval_i2.lo \
generated/maxval_i4.lo generated/maxval_i8.lo \
generated/maxval_i16.lo generated/maxval_m1.lo \
@@ -1205,6 +1207,12 @@ generated/maxloc1_16_i2.c \
 generated/maxloc1_4_i4.c \
 generated/maxloc1_8_i4.c \
 generated/maxloc1_16_i4.c \
+generated/maxloc1_4_i8.c \
+generated/maxloc1_8_i8.c \
+generated/maxloc1_16_i8.c \
+generated/maxloc1_4_i16.c \
+generated/maxloc1_8_i16.c \
+generated/maxloc1_16_i16.c \
 generated/maxloc1_4_m1.c \
 generated/maxloc1_8_m1.c \
 generated/maxloc1_16_m1.c \
@@ -1219,7 +1227,6 @@ generated/maxloc1_8_m8.c \
 generated/maxloc1_16_m8.c \
 generated/maxloc1_4_m16.c \
 generated/maxloc1_8_m16.c \
-generated/maxloc1_16_i16.c \
 generated/maxloc1_4_r4.c \
 generated/maxloc1_8_r4.c \
 generated/maxloc1_16_r4.c \
@@ -2311,6 +2318,18 @@ generated/maxloc1_8_i4.lo: generated/$(a
generated/$(DEPDIR)/$(am__dirstamp)
 generated/maxloc1_16_i4.lo: generated/$(am__dirstamp) \
generated/$(DEPDIR)/$(am__dirstamp)
+generated/maxloc1_4_i8.lo: generated/$(am__dirstamp) \
+   generated/$(DEPDIR)/$(am__dirstamp)
+generated/maxloc1_8_i8.lo: generated/$(am__dirstamp) \
+   generated/$(DEPDIR)/$(am__dirstamp)
+generated/maxloc1_16_i8.lo: generated/$(am__dirstamp) \
+   generated/$(DEPDIR)/$(am__dirstamp)
+generated/maxloc1_4_i16.lo: genera

Re: [PATCH 1/8] AArch64: place branch instruction rules together

2025-05-07 Thread Kyrylo Tkachov


> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> The rules for conditional branches were spread throughout `aarch64.md`.
> Group them together so it is easier to understand how `cbranch4`
> is lowered to RTL.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (condjump): move.
> (*compare_condjump): likewise.
> (aarch64_cb1): likewise.
> (*cb1): likewise.
> (tbranch_3): likewise.
> (@aarch64_tb): likewise.

Changelog entries should start with a capital letter.


> ---
> gcc/config/aarch64/aarch64.md | 387 ++
> 1 file changed, 201 insertions(+), 186 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c678f7afb1a..4d556d886bc 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -674,6 +674,10 @@ (define_insn "aarch64_write_sysregti"
>  "msrr\t%x0, %x1, %H1"
> )
> 
> +;; ---
> +;; Unconditional jumps
> +;; ---
> +
> (define_insn "indirect_jump"
>   [(set (pc) (match_operand:DI 0 "register_operand" "r"))]
>   ""
> @@ -692,6 +696,12 @@ (define_insn "jump"
>   [(set_attr "type" "branch")]
> )
> 
> +
> +
> +;; ---
> +;; Conditional jumps
> +;; ---
> +
> (define_expand "cbranch4"
>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>[(match_operand:GPI 1 "register_operand")
> @@ -731,6 +741,197 @@ (define_expand "cbranchcc4"
>   ""
>   "")
> 
> +(define_insn "condjump"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> +[(match_operand 1 "cc_register" "") (const_int 0)])
> +   (label_ref (match_operand 2 "" ""))
> +   (pc)))]
> +  ""
> +  {
> +/* GCC's traditional style has been to use "beq" instead of "b.eq", etc.,
> +   but the "." is required for SVE conditions.  */
> +bool use_dot_p = GET_MODE (operands[1]) == CC_NZCmode;
> +if (get_attr_length (insn) == 8)
> +  return aarch64_gen_far_branch (operands, 2, "Lbcond",
> + use_dot_p ? "b.%M0\\t" : "b%M0\\t");
> +else
> +  return use_dot_p ? "b.%m0\\t%l2" : "b%m0\\t%l2";
> +  }
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> +   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> +  (const_int 4)
> +  (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> +   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> +  (const_int 0)
> +  (const_int 1)))]
> +)
> +
> +;; For a 24-bit immediate CST we can optimize the compare for equality
> +;; and branch sequence from:
> +;; mov x0, #imm1
> +;; movk x0, #imm2, lsl 16 /* x0 contains CST.  */
> +;; cmp x1, x0
> +;; b .Label
> +;; into the shorter:
> +;; sub x0, x1, #(CST & 0xfff000)
> +;; subs x0, x0, #(CST & 0x000fff)
> +;; b .Label
> +(define_insn_and_split "*compare_condjump"
> +  [(set (pc) (if_then_else (EQL
> +  (match_operand:GPI 0 "register_operand" "r")
> +  (match_operand:GPI 1 "aarch64_imm24" "n"))
> +   (label_ref:P (match_operand 2 "" ""))
> +   (pc)))]
> +  "!aarch64_move_imm (INTVAL (operands[1]), mode)
> +   && !aarch64_plus_operand (operands[1], mode)
> +   && !reload_completed"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +  {
> +HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
> +HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
> +rtx tmp = gen_reg_rtx (mode);
> +emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm)));
> +emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
> +rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
> +rtx cmp_rtx = gen_rtx_fmt_ee (, mode,
> +  cc_reg, const0_rtx);
> +emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
> +DONE;
> +  }
> +)

This pattern isn’t really related to lowering jumps; it’s a splitting
optimization.
So I wouldn’t group it with other patterns moved here.
Ok otherwise.
Thanks,
Kyrill

> +
> +(define_insn "aarch64_cb1"
> +  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
> + (const_int 0))
> +   (label_ref (match_operand 1 "" ""))
> +   (pc)))]
> +  "!aarch64_track_speculation"
> +  {
> +if (get_attr_length (insn) == 8)
> +  return aarch64_gen_far_branch (operands, 1, "Lcb", "\\t%0, 
> ");
> +else
> +  return "\\t%0, %l1";
> +  }
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
> +   (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
> +  (const_int 4)
> +  (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> +   (lt (minus (match_
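
To make the 24-bit immediate split in the quoted comment concrete, a worked
example (the constant is my own choice for illustration):

    /* Branch on x1 == CST with CST = 0x123456 (24 bits):
           sub   x0, x1, #0x123000   // subtract CST & 0xfff000
           subs  x0, x0, #0x456      // subtract CST & 0x000fff, set flags
           beq   .Label              // taken iff x1 == CST          */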

[PATCH] libfortran: Add 5 missing UNSIGNED symbols [PR120153]

2025-05-07 Thread Jakub Jelinek
Hi!

While looking at PR120152, I have noticed that libgfortran.so doesn't
export 5 *m16* symbols I would have expected that should be exported.
This is caused by 2 issues, one filename was forgotten to be added in r15-4124
to i_maxloc1_c (guess because generated/maxloc1_16_i16.c was kept in the
position after generated/maxloc1_8_m16.c and the i -> m difference wasn't
spotted), and one some garbage prefix on HAVE_GFC_UINTEGER_16 macro.

The first two hunks of this patch fix that.
Though, as GCC 15.1 has been released already, we can't add these symbols
to GFORTRAN_15 symbol version as they've never been there, so the patch
adds them to a new GFORTRAN_15.2 symbol version instead.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 15.2?

2025-05-07  Jakub Jelinek  

PR libfortran/120153
* Makefile.am (i_maxloc1_c): Add generated/maxloc1_16_m16.c.
* intrinsics/random.c (arandom_m16): Use #ifdef HAVE_GFC_UINTEGER_16
guard rather than #ifdef GFC_HAVE_GFC_UINTEGER_16.
* gfortran.map (GFORTRAN_15): Remove _gfortran_arandom_m16,
_gfortran_maxloc1_16_m16, _gfortran_mmaxloc1_16_m16 and
_gfortran_smaxloc1_16_m16.
(GFORTRAN_15.2): New symbol version, add those 4 symbols to it.
* generated/maxloc1_16_m16.c: New file.
* Makefile.in: Regenerate.

--- libgfortran/Makefile.am.jj  2025-05-07 10:56:25.857806018 +0200
+++ libgfortran/Makefile.am 2025-05-07 11:25:33.819973194 +0200
@@ -420,6 +420,7 @@ generated/maxloc1_8_m8.c \
 generated/maxloc1_16_m8.c \
 generated/maxloc1_4_m16.c \
 generated/maxloc1_8_m16.c \
+generated/maxloc1_16_m16.c \
 generated/maxloc1_4_r4.c \
 generated/maxloc1_8_r4.c \
 generated/maxloc1_16_r4.c \
--- libgfortran/intrinsics/random.c.jj  2025-01-02 20:54:32.790120772 +0100
+++ libgfortran/intrinsics/random.c 2025-05-07 11:26:13.451431511 +0200
@@ -1215,7 +1215,7 @@ arandom_m8 (gfc_array_m8 *x)
 }
 }
 
-#ifdef GFC_HAVE_GFC_UINTEGER_16
+#ifdef HAVE_GFC_UINTEGER_16
 
 /* Fill an unsigned array with random bytes.  */
 
--- libgfortran/gfortran.map.jj 2025-03-21 22:40:04.748803949 +0100
+++ libgfortran/gfortran.map2025-05-07 11:31:26.706149955 +0200
@@ -1786,7 +1786,6 @@ GFORTRAN_15 {
 _gfortran_arandom_m2;
 _gfortran_arandom_m4;
 _gfortran_arandom_m8;
-_gfortran_arandom_m16;
 _gfortran_minval_m16;
 _gfortran_minval_m1;
 _gfortran_minval_m2;
@@ -1832,7 +1831,6 @@ GFORTRAN_15 {
 _gfortran_maxloc0_8_m2;
 _gfortran_maxloc0_8_m4;
 _gfortran_maxloc0_8_m8;
-_gfortran_maxloc1_16_m16;
 _gfortran_maxloc1_16_m1;
 _gfortran_maxloc1_16_m2;
 _gfortran_maxloc1_16_m4;
@@ -1862,7 +1860,6 @@ GFORTRAN_15 {
 _gfortran_mmaxloc0_8_m2;
 _gfortran_mmaxloc0_8_m4;
 _gfortran_mmaxloc0_8_m8;
-_gfortran_mmaxloc1_16_m16;
 _gfortran_mmaxloc1_16_m1;
 _gfortran_mmaxloc1_16_m2;
 _gfortran_mmaxloc1_16_m4;
@@ -1892,7 +1889,6 @@ GFORTRAN_15 {
 _gfortran_smaxloc0_8_m2;
 _gfortran_smaxloc0_8_m4;
 _gfortran_smaxloc0_8_m8;
-_gfortran_smaxloc1_16_m16;
 _gfortran_smaxloc1_16_m1;
 _gfortran_smaxloc1_16_m2;
 _gfortran_smaxloc1_16_m4;
@@ -2028,3 +2024,11 @@ GFORTRAN_15 {
 _gfortran_reduce_c;
 _gfortran_reduce_scalar_c;
 } GFORTRAN_14;
+
+GFORTRAN_15.2 {
+  global:
+_gfortran_arandom_m16;
+_gfortran_maxloc1_16_m16;
+_gfortran_mmaxloc1_16_m16;
+_gfortran_smaxloc1_16_m16;
+} GFORTRAN_15;
--- libgfortran/generated/maxloc1_16_m16.c.jj   2025-05-07 11:35:20.094959988 +0200
+++ libgfortran/generated/maxloc1_16_m16.c  2025-05-07 11:34:17.131820570 +0200
@@ -0,0 +1,591 @@
+/* Implementation of the MAXLOC intrinsic
+   Copyright (C) 2002-2025 Free Software Foundation, Inc.
+   Contributed by Paul Brook 
+
+This file is part of the GNU Fortran runtime library (libgfortran).
+
+Libgfortran is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public
+License as published by the Free Software Foundation; either
+version 3 of the License, or (at your option) any later version.
+
+Libgfortran is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+.  */
+
+#include "libgfortran.h"
+#include 
+
+
+#if defined (HAVE_GFC_UINTEGER_16) && defined (HAVE_GFC_INTEGER_16)
+
+#define HAVE_BACK_ARG 1
+
+
+extern void maxloc1_16_m16 (gfc_array_i16 * const restrict,
+   gfc_array_m16 * cons

Re: [PATCH] libcpp, v2: Further fixes for incorrect line numbers in large files [PR120061]

2025-05-07 Thread Jason Merrill

On 5/7/25 1:50 AM, Jakub Jelinek wrote:

On Tue, May 06, 2025 at 05:11:51PM -0400, Jason Merrill wrote:

Well, that's all very complicated but seems to make sense.  Can you also add
short rationale comments to the changes in linemap_add and plugin_init?


So like this?


That's great, thanks.  This patch is OK.


2025-05-07  Jakub Jelinek  

PR preprocessor/108900
PR preprocessor/116047
PR preprocessor/120061
* files.cc (_cpp_stack_file): Revert 2025-03-28 change.
* line-map.cc (linemap_add): Use
SOURCE_LINE (from, linemap_included_from (map - 1)) + 1; instead of
SOURCE_LINE (from, from[1].start_location); to compute to_line
for LC_LEAVE.  For LC_ENTER included_from computation, look at
map[-2] or even lower if map[-1] has the same start_location as
map[0].

* gcc.dg/plugin/plugin.exp: Add location-overflow-test-pr116047.c
and location-overflow-test-pr120061.c.
* gcc.dg/plugin/location_overflow_plugin.c (plugin_init): Don't error
on unknown values, instead just break.  Handle 0x4fHH arguments
differently.
* gcc.dg/plugin/location-overflow-test-pr116047.c: New test.
* gcc.dg/plugin/location-overflow-test-pr116047-1.h: New test.
* gcc.dg/plugin/location-overflow-test-pr116047-2.h: New test.
* gcc.dg/plugin/location-overflow-test-pr120061.c: New test.
* gcc.dg/plugin/location-overflow-test-pr120061-1.h: New test.
* gcc.dg/plugin/location-overflow-test-pr120061-2.h: New test.

--- libcpp/files.cc.jj  2025-05-03 11:02:02.502647404 +0200
+++ libcpp/files.cc 2025-05-05 21:09:18.042680877 +0200
@@ -1006,14 +1006,6 @@ _cpp_stack_file (cpp_reader *pfile, _cpp
&& (pfile->line_table->highest_location
!= LINE_MAP_MAX_LOCATION - 1));
  
-  if (decrement && LINEMAPS_ORDINARY_USED (pfile->line_table))
-{
-  const line_map_ordinary *map
-   = LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table);
-  if (map && map->start_location == pfile->line_table->highest_location)
-   decrement = false;
-}
-
if (decrement)
  pfile->line_table->highest_location--;
  
--- libcpp/line-map.cc.jj	2024-04-26 11:47:02.244168816 +0200
+++ libcpp/line-map.cc  2025-05-07 07:44:14.351845653 +0200
@@ -621,8 +621,8 @@ linemap_add (line_maps *set, enum lc_rea
 #include "included", inside the same "includer" file.  */
  
linemap_assert (!MAIN_FILE_P (map - 1));

-  /* (MAP - 1) points to the map we are leaving. The
-map from which (MAP - 1) got included should be the map
+  /* (MAP - 1) points to the map we are leaving.  The
+map from which (MAP - 1) got included should be usually the map
 that comes right before MAP in the same file.  */
from = linemap_included_from_linemap (set, map - 1);
  
@@ -630,7 +630,24 @@ linemap_add (line_maps *set, enum lc_rea

if (to_file == NULL)
{
  to_file = ORDINARY_MAP_FILE_NAME (from);
- to_line = SOURCE_LINE (from, from[1].start_location);
+ /* Compute the line on which the map resumes, for #include this
+should be the line after the #include line.  Usually FROM is
+the map right before LC_ENTER map - the first map of the included
+file, and in that case SOURCE_LINE (from, from[1].start_location);
+computes the right line (and does handle even some special cases
+(e.g. where for returning from  we still want to
+be at line 0 or some -traditional-cpp cases)).  In rare cases
+FROM can be followed by LC_RENAME created by linemap_line_start
+for line right after #include line.  If that happens,
+start_location of the FROM[1] map will be the same as
+start_location of FROM[2] LC_ENTER, but FROM[1] start_location
+might not have advanced enough for moving to a full next line.
+In that case compute the line of #include line and add 1 to it
+to advance to the next line.  See PR120061.  */
+ if (from[1].reason == LC_RENAME)
+   to_line = SOURCE_LINE (from, linemap_included_from (map - 1)) + 1;
+ else
+   to_line = SOURCE_LINE (from, from[1].start_location);
  sysp = ORDINARY_MAP_IN_SYSTEM_HEADER_P (from);
}
else
@@ -660,11 +677,26 @@ linemap_add (line_maps *set, enum lc_rea
if (set->depth == 0)
map->included_from = 0;
else
-   /* The location of the end of the just-closed map.  */
-   map->included_from
- = (((map[0].start_location - 1 - map[-1].start_location)
- & ~((loc_one << map[-1].m_column_and_range_bits) - 1))
-+ map[-1].start_location);
+   {
+ /* Compute location from whence this line map was included.
+For #include this should be preferably column 0 of the
+line on which

Re: [PATCH 8/8] AArch64: rules for CMPBR instructions

2025-05-07 Thread Richard Sandiford
Karl Meakin  writes:
> Add rules for lowering `cbranch4` to CBB/CBH/CB when CMPBR
> extension is enabled.
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (cbranch4): emit CMPBR
>   instructions if possible.
>   (cbranch4): new expand rule.
>   (aarch64_cb): likewise.
>   (aarch64_cb): likewise.
>   * config/aarch64/iterators.md (cmpbr_suffix): new mode attr.
>   * config/aarch64/predicates.md (const_0_to_63_operand): new
>   predicate.
>   (aarch64_cb_immediate): likewise.
>   (aarch64_cb_operand): likewise.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/cmpbr.c: update tests.

In addition to Kyrill's comments (which I agree with):

> @@ -720,18 +720,41 @@ (define_constants
>  ;; Conditional jumps
>  ;; ---
>  
> -(define_expand "cbranch4"
> +(define_expand "cbranch4"
>[(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>   [(match_operand:GPI 1 "register_operand")
>(match_operand:GPI 2 "aarch64_plus_operand")])
>  (label_ref (match_operand 3))
>  (pc)))]
>""
> -  "
> -  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
> -  operands[2]);
> -  operands[2] = const0_rtx;
> -  "
> +  {
> +  if (TARGET_CMPBR && aarch64_cb_operand (operands[2], mode))
> +{
> +  emit_jump_insn (gen_aarch64_cb (operands[0], operands[1],
> + operands[2], operands[3]));
> +  DONE;
> +}

There is an implicit choice here to use a separate CMP + Bcc if the
immediate is out of range, rather than force out-of-range immediates into
a temporary register.  That can be the right choice for immediates in the
range of CMP, but whether it is or not depends on global information that
we don't have.  If the immediate is needed for multiple branches, it would
be better (sizewise) to load the immediate into a temporary register and
use it for each branch, provided that there's a call-clobbered register
free and that the branches are in the 1KiB range.  In other situations,
what the patch does is best.

But perhaps it would be worth forcing values that are outside the
range of CMP into a register and using the new form, rather than
emitting an immediate move, a CMP, and a branch.

Either way, I think it's worth a comment saying what we do with
out-of-range immediates.
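
As a concrete illustration of both situations (values are hypothetical):

extern void g (void);

void
f1 (long x)
{
  if (x == 100)		/* in CMP's immediate range but above CB's 0..63
			   limit: the patch falls back to CMP + Bcc */
    g ();
}

void
f2 (long x)
{
  if (x == 74565)	/* 0x12345, outside CMP's range too: currently
			   MOV+MOVK + CMP + Bcc; forcing the constant into
			   a register and using CB (register) would be one
			   instruction shorter */
    g ();
}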

> +  else
> +{
> +  operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]),
> +  operands[1], operands[2]);
> +  operands[2] = const0_rtx;
> +}
> +  }
> +)
> +
> @@ -758,6 +781,58 @@ (define_expand "cbranchcc4"
>""
>  )
>  
> +;; Emit a `CB (register)` or `CB (immediate)` instruction.
> +(define_insn "aarch64_cb"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> + [(match_operand:GPI 1 "register_operand")
> +  (match_operand:GPI 2 "aarch64_cb_operand")])
> +(label_ref (match_operand 3))
> +(pc)))]
> +  "TARGET_CMPBR"
> +  "cb%m0\\t%1, %2, %l3";
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_N_1Kib))
> +(lt (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_P_1Kib)))
> +   (const_int 4)
> +   (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_N_1Kib))
> +(lt (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_P_1Kib)))
> +   (const_string "no")
> +   (const_string "yes")))]
> +)
> +
> +;; Emit a `CBB (register)` or `CBH (register)` instruction.
> +(define_insn "aarch64_cb"
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> + [(match_operand:SHORT 1 "register_operand")
> +  (match_operand:SHORT 2 
> "aarch64_cb_short_operand")])
> +(label_ref (match_operand 3))
> +(pc)))]
> +  "TARGET_CMPBR"
> +  "cb%m0\\t%1, %2, %l3";
> +  [(set_attr "type" "branch")
> +   (set (attr "length")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_N_1Kib))
> +(lt (minus (match_dup 3) (pc))
> +(const_int BRANCH_LEN_P_1Kib)))
> +   (const_int 4)
> +   (const_int 8)))
> +   (set (attr "far_branch")
> + (if_then_else (and (ge (minus (match_dup 3) (pc))
> +(const_int BRAN

Re: [PATCH] RISC-V: Minimal support for zama16b extension.

2025-05-07 Thread Jeff Law




On 4/18/25 2:46 AM, Dongyan Chen wrote:

This patch supports the zama16b extension[1], enabling GCC to recognize
and process it correctly at compile time.

[1] https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: New extension.
* config/riscv/riscv.opt: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-45.c: New test.

Thanks.  I've pushed this to the trunk after renaming the test.

jeff



Re: [GCC16,RFC,V2 03/14] aarch64: add new insn definition for st2g

2025-05-07 Thread Indu Bhagat

On 5/6/25 3:53 AM, Richard Sandiford wrote:

Indu Bhagat  writes:

On 4/15/25 9:21 AM, Richard Sandiford wrote:

Indu Bhagat  writes:

Store Allocation Tags (st2g) is an Armv8.5-A memory tagging (MTE)
instruction. It stores an allocation tag to two tag granules of memory.

TBD:
- Not too sure what is the best way to generate the st2g yet; A
  subsequent patch will emit them in one of the target hooks.


Regarding the previous thread about this:

  https://gcc.gnu.org/pipermail/gcc-patches/2024-November/668671.html

and your question about whether all types of store tag instructions
should be volatile: if we went for that approach, then yeah, I think so.

As I mentioned there, I don't think we should use (unspec ...) memory
addresses.

But thinking more about it: can we guarantee that GCC will only use
these instruction patterns with base registers that are aligned to
16 bytes?  If so, then perhaps an alternative would be to model
them as read-modify-write operations to the whole granule (even though
the actual instructions leave normal memory untouched and only change
the tags).  That is, rather than:



gcc/ChangeLog:

* config/aarch64/aarch64.md (st2g): New definition.
---
   gcc/config/aarch64/aarch64.md | 20 
   1 file changed, 20 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 0c7aebb838cd..d3223e275c51 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -8475,6 +8475,26 @@
 [(set_attr "type" "memtag")]
   )
   
+;; ST2G updates allocation tags for two memory granules (i.e. 32 bytes) at

+;; once, without zero initialization.
+(define_insn "st2g"
+  [(set (mem:QI (unspec:DI
+[(plus:DI (match_operand:DI 1 "register_operand" "rk")
+  (match_operand:DI 2 "aarch64_granule16_simm9" "i"))]
+UNSPEC_TAG_SPACE))
+   (and:QI (lshiftrt:DI (match_operand:DI 0 "register_operand" "rk")
+(const_int 56)) (const_int 15)))
+   (set (mem:QI (unspec:DI
+[(plus:DI (match_dup 1)
+  (match_operand:DI 3 "aarch64_granule16_simm9" "i"))]
+UNSPEC_TAG_SPACE))
+   (and:QI (lshiftrt:DI (match_dup 0)
+(const_int 56)) (const_int 15)))]
+  "TARGET_MEMTAG && (INTVAL (operands[2]) - 16 == INTVAL (operands[3]))"
+  "st2g\\t%0, [%1, #%2]"
+  [(set_attr "type" "memtag")]
+)
+


...this, we could do:

(set (match_operand:OI 0 "aarch64_granule_memory_operand" "+")
   (unspec_volatile:OI
 [(match_dup 0)
  (match_operand:DI 1 "register_operand" "rk")]
 UNSPECV...))

Using OImode (256 bits, i.e. 32 bytes) indicates that two full granules
are affected by the store, but that no other memory is affected.  The
(match_dup 0) read indicates that this store does not kill any previous
store to the same 32 bytes (since the contents of normal memory don't
change).
The unspec_volatile should ensure that nothing tries to remove the
store as dead (which would especially be a problem when clearing tags).



I don't understand the statement: "The (match_dup 0) read indicates that
this store does not kill any previous store to the same 32 bytes".


The problem is that if we had, say:

   (set (mem:TI x) (const_int 0))
   (set (mem:TI x) (unspec_volatile [(reg:DI base)] UNSPECV...))

the (mem:TI x) in the second instruction would seem to overwrite the
result of the first instruction, making the first instruction dead.
I would expect DSE to get rid of the zeroing in this case.


I am currently seeing an issue (mentioned below).


Using a single memory operand for the whole instruction has the advantage
of only requiring one offset to be represented, rather than having both
operands 2 and 3 in the original pattern.  It also copes more easily
with cases where the offset is zero for the first or second address,
since no (plus ...) should be present in that case.



Currently I am using:

(define_insn "stg"
[(set (match_operand:TI 0 "aarch64_granule16_memory_operand" "+Umg")
(unspec_volatile:TI
  [(match_dup 0)
   (match_operand:DI 1 "register_operand" "rk")]
  UNSPECV_TAG_SPACE))]
"TARGET_MEMTAG"
"stg\\t%1, %0"
[(set_attr "type" "memtag")]
)

...

(define_predicate "aarch64_granule16_memory_operand"
(and (match_test "TARGET_MEMTAG")
 (and (match_code "mem")
  (match_test "aarch64_granule16_memory_address_p (op)"

where aarch64_granule16_memory_address_p () simply checks for
aarch64_granule16_simm9 immediate for now.

Basically, I was expecting the generation of a POST_MODIFY for:
  stg x0, [x2]
  add x2, x2, 16

But in the rtl dump (XX.c.300r.auto_inc_dec):

(insn 31 44 32 3 (set (mem:TI (plus:DI (reg/f:DI 122 [ _10 ])
  (const_int 0 [0])) [0  S16 A8])


Not related to the issue you're hitting, but this indicates that
something has gone wrong somewhere.  We shouldn't see an unfolded
(plus

[PATCH v2 1/3] Support symbol reference in jump label and jump table

2025-05-07 Thread H.J. Lu
Conditional and unconditional branch targets can be either a label or
a symbol.  For conditional jump:

(jump_insn 7 6 14 2 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(label_ref:DI 23)
(pc))) "x.c":8:5 1458 {jcc}
 (expr_list:REG_DEAD (reg:CCZ 17 flags)
(int_list:REG_BR_PROB 217325348 (nil)))
...
(code_label 23 20 8 4 4 (nil) [1 uses])
(note 8 23 9 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 9 8 10 4 (call (mem:QI (symbol_ref:DI ("bar") [flags 0x41]  
) [0 bar S1 A8])
(const_int 0 [0])) "x.c":8:14 discrim 1 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar") [flags 0x41]  )
(nil))
(nil))

they can be changed to

(jump_insn 7 6 14 2 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
((symbol_ref:DI ("bar") [flags 0x41] )
(pc))) "x.c":8:5 1458 {jcc}
 (expr_list:REG_DEAD (reg:CCZ 17 flags)
(int_list:REG_BR_PROB 217325348 (nil)))

if the call is a sibcall.  For jump table:

(jump_table_data 16 15 17 (addr_vec:DI [
(label_ref:DI 18)
(label_ref:DI 22)
(label_ref:DI 26)
(label_ref:DI 30)
(label_ref:DI 34)
]))
...
(code_label 30 17 31 4 5 (nil) [1 uses])
(note 31 30 32 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 32 31 33 4 (call (mem:QI (symbol_ref:DI ("bar3") [flags 0x41]  
) [0 bar3 S1 A8])
(const_int 0 [0])) "j.c":15:13 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar3") [flags 0x41]  
)
(nil))
(nil))

They can be changed to

(jump_table_data 16 15 17 (addr_vec:DI [
(symbol_ref:DI ("bar0") [flags 0x41]  )
(symbol_ref:DI ("bar1") [flags 0x41]  )
(symbol_ref:DI ("bar2") [flags 0x41]  )
(symbol_ref:DI ("bar3") [flags 0x41]  )
(symbol_ref:DI ("bar4") [flags 0x41]  )
]))

if bar0/bar1/bar2/bar3/bar4 calls are sibcalls.

Instead of supporting symbol reference in jump label and jump table in
the full RTL optimization pipeline, which requires very invasive changes
to GCC RTL infrastructure, support symbol reference in jump label and
jump table for the pass which turns REG_EH_REGION notes back into
NOTE_INSN_EH_REGION notes, and for the passes after it:

1. Add a set_jump_target method to assign symbol reference to jump label.
2. Add condsibcall_p for conditional sibling call.
3. Add anycall_p to return true for call and conditional sibcall.
4. Replace CALL_P with anycall_p in except.cc, final.cc and function-abi.cc
to support conditional sibcall.
5. Return false for symbol reference in jump table check.
6. Update create_trace_edges and rtx_writer::print_rtx_operand_code_0 to
handle symbol reference in jump label.
7. Update final_scan_insn_1 to handle symbol reference in jump table.
8. Document limitation of symbol reference support in jump label.

* dwarf2cfi.cc (create_trace_edges): Skip symbol reference in
jump table and in JUMP_LABEL.  Short-circuit JUMP for the pure
sibcall.
* except.cc (sjlj_mark_call_sites): Replace CALL_P with
anycall_p.
(finish_eh_generation): Likewise.
(insn_could_throw_p): Likewise.
(can_nonlocal_goto): Likewise.
(set_nothrow_function_flags): Also call condsibcall_p to check
conditional sibcall.
* final.cc (final_scan_insn_1): Support symbol reference in jump
table.
(collect_fn_hard_reg_usage): Replace CALL_P with anycall_p.
* function-abi.cc (insn_callee_abi): Likewise.
* jump.cc (condsibcall_p): New.
* print-rtl.cc (rtx_writer::print_rtx_operand_code_0): Support
symbol reference in JUMP_LABEL.
* rtl.h (rtx_jump_insn::set_jump_target): New, with the rtx
argument.
* rtl.h (condsibcall_p): New.
(anycall_p): Likewise.
* rtlanal.cc (tablejump_p): Return false if JUMP_LABEL is a
symbol reference.
* config/i386/i386-expand.cc (ix86_notrack_prefixed_insn_p):
Likewise.
* doc/rtl.texi (addr_vec): Also allow symbol reference.
(JUMP_LABEL): Likewise.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386-expand.cc |  5 -
 gcc/doc/rtl.texi   | 24 +--
 gcc/dwarf2cfi.cc   | 20 ++-
 gcc/except.cc  | 11 ++-
 gcc/final.cc   | 26 +---
 gcc/function-abi.cc|  2 +-
 gcc/jump.cc| 36 ++
 gcc/print-rtl.cc   |  2 ++
 gcc/rtl.h  | 32 ++
 gcc/rtlanal.cc |  5 -
 10 files changed, 141 insertions(+), 22 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 7f0fdb6fa9e..0d0802692d1 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expan

[PATCH v2 0/3] x86: Add a pass to fold tail call

2025-05-07 Thread H.J. Lu
Changes in v2:

1. Search backward from exit basic blocks with only sibcalls.
2. Add anycall_p to return true for call and conditional sibcall.
3. Replace CALL_P with anycall_p in except.cc to handle exceptions with
conditional sibcalls.
4. Update the gate function to follow the tree tailcall pass.

---
Conditional and unconditional branch targets can be either a label or
a symbol.  For conditional jump:

(jump_insn 7 6 14 2 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(label_ref:DI 23)
(pc))) "x.c":8:5 1458 {jcc}
 (expr_list:REG_DEAD (reg:CCZ 17 flags)
(int_list:REG_BR_PROB 217325348 (nil)))
...
(code_label 23 20 8 4 4 (nil) [1 uses])
(note 8 23 9 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 9 8 10 4 (call (mem:QI (symbol_ref:DI ("bar") [flags 0x41]  
) [0 bar S1 A8])
(const_int 0 [0])) "x.c":8:14 discrim 1 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar") [flags 0x41]  )
(nil))
(nil))

they can be changed to

(jump_insn 7 6 14 2 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
((symbol_ref:DI ("bar") [flags 0x41] )
(pc))) "x.c":8:5 1458 {jcc}
 (expr_list:REG_DEAD (reg:CCZ 17 flags)
(int_list:REG_BR_PROB 217325348 (nil)))

if the call is a sibcall.  For jump table:

(jump_table_data 16 15 17 (addr_vec:DI [
(label_ref:DI 18)
(label_ref:DI 22)
(label_ref:DI 26)
(label_ref:DI 30)
(label_ref:DI 34)
]))
...
(code_label 30 17 31 4 5 (nil) [1 uses])
(note 31 30 32 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 32 31 33 4 (call (mem:QI (symbol_ref:DI ("bar3") [flags 0x41]  
) [0 bar3 S1 A8])
(const_int 0 [0])) "j.c":15:13 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar3") [flags 0x41]  
)
(nil))
(nil))

They can be changed to

(jump_table_data 16 15 17 (addr_vec:DI [
(symbol_ref:DI ("bar0") [flags 0x41]  )
(symbol_ref:DI ("bar1") [flags 0x41]  )
(symbol_ref:DI ("bar2") [flags 0x41]  )
(symbol_ref:DI ("bar3") [flags 0x41]  )
(symbol_ref:DI ("bar4") [flags 0x41]  )
]))

if bar0/bar1/bar2/bar3/bar4 calls are sibcalls.

Instead of supporting symbol reference in jump label and jump table in
the full RTL optimization pipeline, which requires very invasive changes
to GCC RTL infrastructure, support symbol reference in jump label and
jump table for the pass which turns REG_EH_REGION notes back into
NOTE_INSN_EH_REGION notes, and for the passes after it.

Searching backward from exit basic blocks with only sibcalls, check the
last instruction in each predecessor.  If the last instruction is a
conditional jump and its target is the exit block, change the conditional
jump target to the sibcall target, decrement the destination basic block
entry label use count, redirect the edge to the exit basic block and call
delete_unreachable_blocks to delete the unreachable basic blocks.  Repeat
it until there is no conditional jump to update.

If the jump table entry points to a target basic block with only a direct
sibcall, change the entry to point to the sibcall target, decrement the
target basic block entry label use count and redirect the edge to the
exit basic block.
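
As an illustration, the conditional-jump case corresponds to a source
shape roughly like this (a reconstructed sketch; the actual x.c test is
not shown here):

extern void bar (void);

void
foo (int x)
{
  if (x == 0)
    bar ();	/* tail position: today this compiles to jne/je around a
		   jmp bar; the pass folds the pair into a single jcc bar */
}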

H.J. Lu (3):
  Support symbol reference in jump label and jump table
  x86: Add a pass to fold tail call
  x86: Fold sibcall targets into jump table

 gcc/config/i386/i386-expand.cc |   5 +-
 gcc/config/i386/i386-features.cc   | 327 +
 gcc/config/i386/i386-passes.def|   1 +
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/i386.cc|  12 +
 gcc/config/i386/i386.md|   9 +-
 gcc/config/i386/predicates.md  |   4 +
 gcc/doc/rtl.texi   |  24 +-
 gcc/dwarf2cfi.cc   |  20 +-
 gcc/except.cc  |  11 +-
 gcc/final.cc   |  26 +-
 gcc/function-abi.cc|   2 +-
 gcc/jump.cc|  36 +++
 gcc/print-rtl.cc   |   2 +
 gcc/rtl.h  |  32 ++
 gcc/rtlanal.cc |   5 +-
 gcc/testsuite/gcc.target/i386/pr14721-1a.c |  54 
 gcc/testsuite/gcc.target/i386/pr14721-1b.c |  37 +++
 gcc/testsuite/gcc.target/i386/pr14721-1c.c |  37 +++
 gcc/testsuite/gcc.target/i386/pr14721-2a.c |  58 
 gcc/testsuite/gcc.target/i386/pr14721-2b.c |  41 +++
 gcc/testsuite/gcc.target/i386/pr14721-2c.c |  43 +++
 gcc/testsuite/gcc.target/i386/pr14721-3a.c |  56 
 gcc/testsuite/gcc.target/i386/pr14721-3b.c |  40 +++
 gcc/testsuite/gcc.target/i386/pr14721-3c.c |  39 +++
 gcc/testsuite/gcc.target/i386/pr47253-10.c |  15 +
 gcc/testsuite/gcc.target/i386/pr47253-1a.c |  24 ++
 gcc/testsuite/gcc.target/i386/pr47

[PATCH v2 3/3] x86: Fold sibcall targets into jump table

2025-05-07 Thread H.J. Lu
Enhance the fold sibcall pass to fold sibcall targets into the jump table by
turning:

foo:
.cfi_startproc
cmpl$4, %edi
ja  .L1
movl%edi, %edi
jmp *.L4(,%rdi,8)
.section.rodata
.L4:
.quad   .L8
.quad   .L7
.quad   .L6
.quad   .L5
.quad   .L3
.text
.L5:
jmp bar3
.L3:
jmp bar4
.L8:
jmp bar0
.L7:
jmp bar1
.L6:
jmp bar2
.L1:
ret
.cfi_endproc

into:

foo:
.cfi_startproc
cmpl$4, %edi
ja  .L1
movl%edi, %edi
jmp *.L4(,%rdi,8)
.section.rodata
.L4:
.quad   bar0
.quad   bar1
.quad   bar2
.quad   bar3
.quad   bar4
.text
.L1:
ret
.cfi_endproc
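
For reference, assembly of this shape comes from a switch dispatching to
sibcalls, compiled at -O2, e.g. (a hypothetical source, not one of the
included tests):

extern void bar0 (void), bar1 (void), bar2 (void), bar3 (void), bar4 (void);

void
foo (unsigned int i)
{
  switch (i)
    {
    case 0: bar0 (); break;	/* each call is in tail position, so it */
    case 1: bar1 (); break;	/* becomes a sibcall (a bare jmp barN)  */
    case 2: bar2 (); break;
    case 3: bar3 (); break;
    case 4: bar4 (); break;
    }
}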

Before DWARF frame generation pass, jump tables look like:

(jump_table_data 16 15 17 (addr_vec:DI [
(label_ref:DI 18)
(label_ref:DI 22)
(label_ref:DI 26)
(label_ref:DI 30)
(label_ref:DI 34)
]))
...
(code_label 30 17 31 4 5 (nil) [1 uses])
(note 31 30 32 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 32 31 33 4 (call (mem:QI (symbol_ref:DI ("bar3") [flags 0x41]  
) [0 bar3 S1 A8])
(const_int 0 [0])) "j.c":15:13 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar3") [flags 0x41]  
)
(nil))
(nil))

If the jump table entry points to a target basic block with only a direct
sibcall, change the entry to point to the sibcall target, decrement the
target basic block entry label use count and redirect the edge to the
exit basic block.

gcc/

PR target/14721
* config/i386/i386-features.cc (fold_sibcall): Fold the sibcall
targets into jump table.

gcc/testsuite/

PR target/14721
* gcc.target/i386/pr14721-1a.c: New.
* gcc.target/i386/pr14721-1b.c: Likewise.
* gcc.target/i386/pr14721-1c.c: Likewise.
* gcc.target/i386/pr14721-2a.c: Likewise.
* gcc.target/i386/pr14721-2b.c: Likewise.
* gcc.target/i386/pr14721-2c.c: Likewise.
* gcc.target/i386/pr14721-3a.c: Likewise.
* gcc.target/i386/pr14721-3b.c: Likewise.
* gcc.target/i386/pr14721-3c.c: Likewise.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386-features.cc   | 92 +-
 gcc/testsuite/gcc.target/i386/pr14721-1a.c | 54 +
 gcc/testsuite/gcc.target/i386/pr14721-1b.c | 37 +
 gcc/testsuite/gcc.target/i386/pr14721-1c.c | 37 +
 gcc/testsuite/gcc.target/i386/pr14721-2a.c | 58 ++
 gcc/testsuite/gcc.target/i386/pr14721-2b.c | 41 ++
 gcc/testsuite/gcc.target/i386/pr14721-2c.c | 43 ++
 gcc/testsuite/gcc.target/i386/pr14721-3a.c | 56 +
 gcc/testsuite/gcc.target/i386/pr14721-3b.c | 40 ++
 gcc/testsuite/gcc.target/i386/pr14721-3c.c | 39 +
 10 files changed, 496 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr14721-3c.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index b58fd79e68d..feb7384baac 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3702,13 +3702,103 @@ fold_one_sibcall (void)
  EXIT_BLOCK_PTR_FOR_FN (cfun));
  branch_edge->flags |= EDGE_SIBCALL | EDGE_ABNORMAL;
}
+ else if (label && !ANY_RETURN_P (label))
+   {
+ /* Check if it is a jump table with addresses.  */
+ rtx_insn *target = as_a (label);
+ rtx_insn *table = next_insn (target);
+ if (!table
+ || !JUMP_TABLE_DATA_P (table)
+ || GET_CODE (PATTERN (table)) != ADDR_VEC)
+   continue;
+
+ basic_block dest_bb;
+ rtx body = PATTERN (table);
+ unsigned int i, len = XVECLEN (body, 0);
+ rtx *sibcall_targets = new rtx [len]();
+ rtx *sibcall_notes = new rtx [len]();
+ bool matched_bb = false;
+
+ for (i = 0; i < len; i++)
+   {
+ label = XVECEXP (body, 0, i);
+ label = XEXP (label, 0);
+ dest_bb = BLOCK_FOR_INSN (label);
+ if (dest_bb == exit_bb)
+   {
+ matched_bb = true;
+ sibcall_targets[i] =

[PATCH v2 2/3] x86: Add a pass to fold tail call

2025-05-07 Thread H.J. Lu
An x86 conditional branch (jcc) target can be either a label or a symbol.
Add a pass to fold tail calls with jcc by turning:

jcc .L6
...
.L6:
jmp tailcall

into:

jcc tailcall

Immediately before the pass which turns REG_EH_REGION notes back into
NOTE_INSN_EH_REGION notes, conditional branches look like

(jump_insn 7 6 14 2 (set (pc)
(if_then_else (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(label_ref:DI 23)
(pc))) "x.c":8:5 1458 {jcc}
 (expr_list:REG_DEAD (reg:CCZ 17 flags)
(int_list:REG_BR_PROB 217325348 (nil)))
...
(code_label 23 20 8 4 4 (nil) [1 uses])
(note 8 23 9 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
(call_insn/j 9 8 10 4 (call (mem:QI (symbol_ref:DI ("bar") [flags 0x41]  ) [0 bar S1 A8])
(const_int 0 [0])) "x.c":8:14 discrim 1 1469 {sibcall_di}
 (expr_list:REG_CALL_DECL (symbol_ref:DI ("bar") [flags 0x41]  )
(nil))
(nil))

Searching backward from exit basic blocks with only sibcalls, check the
last instruction in each predecessor.  If the last instruction is a
conditional jump and its target is the exit block, change the conditional
jump target to the sibcall target, decrement the destination basic block
entry label use count, redirect the edge to the exit basic block and call
delete_unreachable_blocks to delete the unreachable basic blocks.  Repeat
it until there is no conditional jump to update.

gcc/

PR target/47253
* config/i386/i386-features.cc: Include "cfgcleanup.h".
(sibcall_only_bb): New.
(reg_eh_region_note_ok_p): Likewise.
(fold_one_sibcall): Likewise.
(fold_sibcall): Likewise.
(pass_data_fold_sibcall): Likewise.
(pass_fold_sibcall): Likewise.
(make_pass_fold_sibcall): Likewise.
* config/i386/i386-passes.def: Add pass_fold_sibcall before
pass_convert_to_eh_region_ranges.
* config/i386/i386-protos.h (ix86_output_jcc_insn): New.
(make_pass_fold_sibcall): Likewise.
* config/i386/i386.cc (ix86_output_jcc_insn): Likewise.
* config/i386/i386.md (*jcc): Renamed to ...
(jcc): This.  Replace label_ref with symbol_label_operand.  Use
ix86_output_jcc_insn.  Set length to 6 if the branch target
isn't a label.

gcc/testsuite/

PR target/47253
* gcc.target/i386/pr47253-1a.c: New file.
* gcc.target/i386/pr47253-1b.c: Likewise.
* gcc.target/i386/pr47253-2a.c: Likewise.
* gcc.target/i386/pr47253-2b.c: Likewise.
* gcc.target/i386/pr47253-3a.c: Likewise.
* gcc.target/i386/pr47253-3b.c: Likewise.
* gcc.target/i386/pr47253-3c.c: Likewise.
* gcc.target/i386/pr47253-4a.c: Likewise.
* gcc.target/i386/pr47253-4b.c: Likewise.
* gcc.target/i386/pr47253-5.c: Likewise.
* gcc.target/i386/pr47253-6.c: Likewise.
* gcc.target/i386/pr47253-7a.c: Likewise.
* gcc.target/i386/pr47253-7b.c: Likewise.
* gcc.target/i386/pr47253-8.c: Likewise.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386-features.cc   | 237 +
 gcc/config/i386/i386-passes.def|   1 +
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/i386.cc|  12 ++
 gcc/config/i386/i386.md|   9 +-
 gcc/config/i386/predicates.md  |   4 +
 gcc/testsuite/gcc.target/i386/pr47253-10.c |  15 ++
 gcc/testsuite/gcc.target/i386/pr47253-1a.c |  24 +++
 gcc/testsuite/gcc.target/i386/pr47253-1b.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr47253-2a.c |  29 +++
 gcc/testsuite/gcc.target/i386/pr47253-2b.c |  17 ++
 gcc/testsuite/gcc.target/i386/pr47253-3a.c |  32 +++
 gcc/testsuite/gcc.target/i386/pr47253-3b.c |  20 ++
 gcc/testsuite/gcc.target/i386/pr47253-3c.c |  20 ++
 gcc/testsuite/gcc.target/i386/pr47253-4a.c |  26 +++
 gcc/testsuite/gcc.target/i386/pr47253-4b.c |  18 ++
 gcc/testsuite/gcc.target/i386/pr47253-5.c  |  15 ++
 gcc/testsuite/gcc.target/i386/pr47253-6.c  |  15 ++
 gcc/testsuite/gcc.target/i386/pr47253-7a.c |  52 +
 gcc/testsuite/gcc.target/i386/pr47253-7b.c |  36 
 gcc/testsuite/gcc.target/i386/pr47253-8.c  |  74 +++
 gcc/testsuite/gcc.target/i386/pr47253-9.c  |  22 ++
 22 files changed, 694 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr47253-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr

[PATCH v3 2/2] phiopt: Use rewrite_to_defined_overflow in move_stmt [PR116938]

2025-05-07 Thread Andrew Pinski
As mentioned previously the rewrite in move_stmt should be
using gimple_needing_rewrite_undefined/rewrite_to_defined_unconditional
instead of just rewriting the VCE.
This moves move_stmt over to those APIs.

A few testcases needed to be updated due to the ABS_EXPR rewrite that happens.

Bootstrapped and tested on x86_64-linux-gnu.

gcc/ChangeLog:

* tree-ssa-phiopt.cc (move_stmt): Use rewrite_to_defined_unconditional
instead of manually doing the rewrite of the VCE.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/phi-opt-40.c: Update to expect ABSU_EXPR.
* gcc.dg/tree-ssa/phi-opt-41.c: Likewise.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-40.c |  7 +++---
 gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c |  4 ++--
 gcc/tree-ssa-phiopt.cc | 26 +++---
 3 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-40.c 
b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-40.c
index a9011ce97fb..70629165bb6 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-40.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-40.c
@@ -20,6 +20,7 @@ int f1(int x)
 
 /* { dg-final { scan-tree-dump-times "if " 1 "phiopt1" } } */
 /* { dg-final { scan-tree-dump-not "if " "phiopt2" } } */
-/* { dg-final { scan-tree-dump-times "ABS_EXPR <" 2 "phiopt1" } } */
-/* { dg-final { scan-tree-dump-times "ABS_EXPR <" 1 "phiopt2" } } */
-/* { dg-final { scan-tree-dump-times "ABSU_EXPR <" 1 "phiopt2" } } */
+/* The ABS_EXPR in f gets rewritten to ABSU_EXPR as phiopt can't prove it was 
not undefined when moving it. */
+/* { dg-final { scan-tree-dump-times "ABS_EXPR <" 1 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "ABSU_EXPR <" 1 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "ABSU_EXPR <" 2 "phiopt2" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c 
b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
index 9774e283a7b..817d4feb027 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/phi-opt-41.c
@@ -29,6 +29,6 @@ int fge(int a, unsigned char b)
   return a > 0 ? a : -a;
 }
 
-
+/* The ABS_EXPR gets rewritten to ABSU_EXPR as phiopt can't prove it was not 
undefined when moving it. */
 /* { dg-final { scan-tree-dump-not "if " "phiopt1" } } */
-/* { dg-final { scan-tree-dump-times "ABS_EXPR <" 4 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "ABSU_EXPR <" 4 "phiopt1" } } */
diff --git a/gcc/tree-ssa-phiopt.cc b/gcc/tree-ssa-phiopt.cc
index 54ecd93495a..efd43d2d77e 100644
--- a/gcc/tree-ssa-phiopt.cc
+++ b/gcc/tree-ssa-phiopt.cc
@@ -838,33 +838,13 @@ move_stmt (gimple *stmt, gimple_stmt_iterator *gsi, 
auto_bitmap &inserted_exprs)
   // Mark the name to be renamed if there is one.
   bitmap_set_bit (inserted_exprs, SSA_NAME_VERSION (name));
   gimple_stmt_iterator gsi1 = gsi_for_stmt (stmt);
-  gsi_move_before (&gsi1, gsi);
+  gsi_move_before (&gsi1, gsi, GSI_NEW_STMT);
   reset_flow_sensitive_info (name);
 
   /* Rewrite some code which might be undefined when
  unconditionalized. */
-  if (gimple_assign_single_p (stmt))
-{
-  tree rhs = gimple_assign_rhs1 (stmt);
-  /* VCE from integral types to another integral types but with
-different precisions need to be changed into casts
-to be well defined when unconditional. */
-  if (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
- && INTEGRAL_TYPE_P (TREE_TYPE (name))
- && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (rhs, 0
-   {
- if (dump_file && (dump_flags & TDF_DETAILS))
-   {
- fprintf (dump_file, "rewriting stmt with maybe undefined VCE ");
- print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
-   }
- tree new_rhs = TREE_OPERAND (rhs, 0);
- gcc_assert (is_gimple_val (new_rhs));
- gimple_assign_set_rhs_code (stmt, NOP_EXPR);
- gimple_assign_set_rhs1 (stmt, new_rhs);
- update_stmt (stmt);
-   }
-}
+  if (gimple_needing_rewrite_undefined (stmt))
+rewrite_to_defined_unconditional (gsi);
 }
 
 /* RAII style class to temporarily remove flow sensitive
-- 
2.34.1



[PATCH v3 1/2] Rewrite VCEs of integral types [PR116939]

2025-05-07 Thread Andrew Pinski
Like the patch to phiopt (r15-4033-g1f619fe25925a5f7), this adds rewriting
of VCE to gimple_with_undefined_signed_overflow/rewrite_to_defined_overflow.
In the case of moving a VCE of a bool from being conditional to unconditional,
it needs to be rewritten to use a normal cast rather than a VCE. pr120122-1.c is
an example of where LIM needs this rewriting. The precision of the outer type
needs to be less than that of the inner one.
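
A sketch of the bool case in dump-style GIMPLE (names invented for
illustration; c_1 is a wider integral value):

  /* conditional: only executed when the guard holds */
  _2 = VIEW_CONVERT_EXPR<_Bool>(c_1);

  /* after being made unconditional, rewritten as a cast */
  _2 = (_Bool) c_1;

The VCE merely reinterprets the byte, so executing it unconditionally
could leave _2 holding an invalid _Bool bit pattern; the cast normalizes
the value instead.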

This also renames gimple_with_undefined_signed_overflow to 
gimple_needing_rewrite_undefined
and rewrite_to_defined_overflow to rewrite_to_defined_unconditional as they 
will be doing
more than just handling signed overflow.

Changes since v1:
* v2: rename the functions.
* v3: Add check for precision to be smaller.

Bootstrappd and tested on x86_64-linux-gnu.

PR tree-optimization/120122
PR tree-optimization/116939

gcc/ChangeLog:

* gimple-fold.h (gimple_with_undefined_signed_overflow): Rename to ...
(gimple_needing_rewrite_undefined): This.
(rewrite_to_defined_overflow): Rename to ...
(rewrite_to_defined_unconditional): This.
* gimple-fold.cc (gimple_with_undefined_signed_overflow): Rename to ...
(gimple_needing_rewrite_undefined): This. Return true for VCE with
integral types of smaller precision.
(rewrite_to_defined_overflow): Rename to ...
(rewrite_to_defined_unconditional): This. Handle VCE rewriting to a cast.
* tree-if-conv.cc: 
s/gimple_with_undefined_signed_overflow/gimple_needing_rewrite_undefined/
s/rewrite_to_defined_overflow/rewrite_to_defined_unconditional.
* tree-scalar-evolution.cc: Likewise
* tree-ssa-ifcombine.cc: Likewise.
* tree-ssa-loop-im.cc: Likewise.
* tree-ssa-loop-split.cc: Likewise.
* tree-ssa-reassoc.cc: Likewise.

gcc/testsuite/ChangeLog:

* gcc.dg/torture/pr120122-1.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/gimple-fold.cc| 56 ++-
 gcc/gimple-fold.h |  6 +--
 gcc/testsuite/gcc.dg/torture/pr120122-1.c | 51 +
 gcc/tree-if-conv.cc   |  6 +--
 gcc/tree-scalar-evolution.cc  |  4 +-
 gcc/tree-ssa-ifcombine.cc |  4 +-
 gcc/tree-ssa-loop-im.cc   |  4 +-
 gcc/tree-ssa-loop-split.cc|  4 +-
 gcc/tree-ssa-reassoc.cc   |  4 +-
 9 files changed, 111 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/pr120122-1.c

diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index 7721795b20d..fd52b58905c 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -10592,10 +10592,12 @@ arith_code_with_undefined_signed_overflow (tree_code 
code)
 
 /* Return true if STMT has an operation that operates on a signed
integer types involves undefined behavior on overflow and the
-   operation can be expressed with unsigned arithmetic.  */
+   operation can be expressed with unsigned arithmetic.
+   Also returns true if STMT is a VCE that needs to be rewritten
+   if moved to be executed unconditionally.   */
 
 bool
-gimple_with_undefined_signed_overflow (gimple *stmt)
+gimple_needing_rewrite_undefined (gimple *stmt)
 {
   if (!is_gimple_assign (stmt))
 return false;
@@ -10606,6 +10608,16 @@ gimple_with_undefined_signed_overflow (gimple *stmt)
   if (!INTEGRAL_TYPE_P (lhs_type)
   && !POINTER_TYPE_P (lhs_type))
 return false;
+  tree rhs = gimple_assign_rhs1 (stmt);
+  /* VCE from an integral type to an integral type with
+ a smaller precision needs to be changed into a cast
+ to be well defined. */
+  if (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
+  && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (rhs, 0)))
+  && is_gimple_val (TREE_OPERAND (rhs, 0))
+  && TYPE_PRECISION (lhs_type)
+ < TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (rhs, 0
+return true;
   if (!TYPE_OVERFLOW_UNDEFINED (lhs_type))
 return false;
   if (!arith_code_with_undefined_signed_overflow
@@ -10625,19 +10637,39 @@ gimple_with_undefined_signed_overflow (gimple *stmt)
contain a modified form of STMT itself.  */
 
 static gimple_seq
-rewrite_to_defined_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
-bool in_place)
+rewrite_to_defined_unconditional (gimple_stmt_iterator *gsi, gimple *stmt,
+ bool in_place)
 {
+  gcc_assert (gimple_needing_rewrite_undefined (stmt));
   if (dump_file && (dump_flags & TDF_DETAILS))
 {
-  fprintf (dump_file, "rewriting stmt with undefined signed "
-  "overflow ");
+  fprintf (dump_file, "rewriting stmt for being unconditionally defined ");
   print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
 }
-
+  gimple_seq stmts = NULL;
+  /* VCE from an integral type to another integral type with
+ a smaller precision needs to be changed into a cast
+ to be well defined. */
+  if

Re: [PATCH v2] RISC-V: Support for zilsd and zclsd extensions.

2025-05-07 Thread Dongyan Chen

This patch supports the zilsd and zclsd[1] extensions, enabling GCC to
recognize and process them correctly at compile time.


[1] https://github.com/riscv/riscv-zilsd
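
For instance (a sketch mirroring the new tests below):

  gcc -march=rv32gc_zilsd_zclsd -mabi=ilp32d -c foo.c   # accepted
  gcc -march=rv64gc_zilsd_zclsd -mabi=ilp32d -c foo.c   # rejected: rv32 only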

Changes for v3:
- Fix the testcase errors, merge the failing testcases into one file and
rename it.

Changes for v2:
- Remove the addition of zilsd and zclsd extension in 
gcc/common/config/riscv/riscv-ext-bitmask.def
- Fix a bug with zilsd and zclsd extension dependency in 
gcc/common/config/riscv/riscv-common.cc


gcc/ChangeLog:

    * common/config/riscv/riscv-common.cc 
(riscv_subset_list::check_conflict_ext): New extension.

    * config/riscv/riscv.opt: Ditto.

gcc/testsuite/ChangeLog:

    * gcc.target/riscv/arch-49.c: New test.
    * gcc.target/riscv/arch-50.c: New test.

---
 gcc/common/config/riscv/riscv-common.cc  | 16 
 gcc/config/riscv/riscv.opt   |  4 
 gcc/testsuite/gcc.target/riscv/arch-49.c |  5 +
 gcc/testsuite/gcc.target/riscv/arch-50.c |  9 +
 4 files changed, 34 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-49.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-50.c

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc

index ca14eb96b253..8c477fa227be 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -115,6 +115,9 @@ static const riscv_implied_info_t riscv_implied_info[] =
   {"zicfiss", "zimop"},
   {"zicfilp", "zicsr"},

+  {"zclsd", "zilsd"},
+  {"zclsd", "zca"},
+
   {"zk", "zkn"},
   {"zk", "zkr"},
   {"zk", "zkt"},
@@ -371,6 +374,9 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =

   {"zicntr", ISA_SPEC_CLASS_NONE, 2, 0},
   {"zihpm",  ISA_SPEC_CLASS_NONE, 2, 0},

+  {"zilsd",  ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zclsd",  ISA_SPEC_CLASS_NONE, 1, 0},
+
   {"zk",    ISA_SPEC_CLASS_NONE, 1, 0},
   {"zkn",   ISA_SPEC_CLASS_NONE, 1, 0},
   {"zks",   ISA_SPEC_CLASS_NONE, 1, 0},
@@ -1347,6 +1353,14 @@ riscv_subset_list::check_conflict_ext ()
 error_at (m_loc, "%<-march=%s%>: zcf extension supports in rv32 only",
   m_arch);

+  if (lookup ("zilsd") && m_xlen == 64)
+    error_at (m_loc, "%<-march=%s%>: zilsd extension supports in rv32 
only",

+  m_arch);
+
+  if (lookup ("zclsd") && m_xlen == 64)
+    error_at (m_loc, "%<-march=%s%>: zclsd extension supports in rv32 
only",

+  m_arch);
+
   if (lookup ("zfinx") && lookup ("f"))
 error_at (m_loc,
   "%<-march=%s%>: z*inx conflicts with floating-point "
@@ -1687,6 +1701,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =

   RISCV_EXT_FLAG_ENTRY ("ziccif",  x_riscv_zi_subext, MASK_ZICCIF),
   RISCV_EXT_FLAG_ENTRY ("zicclsm", x_riscv_zi_subext, MASK_ZICCLSM),
   RISCV_EXT_FLAG_ENTRY ("ziccrse", x_riscv_zi_subext, MASK_ZICCRSE),
+  RISCV_EXT_FLAG_ENTRY ("zilsd",   x_riscv_zi_subext, MASK_ZILSD),

   RISCV_EXT_FLAG_ENTRY ("zicboz", x_riscv_zicmo_subext, MASK_ZICBOZ),
   RISCV_EXT_FLAG_ENTRY ("zicbom", x_riscv_zicmo_subext, MASK_ZICBOM),
@@ -1770,6 +1785,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =

   RISCV_EXT_FLAG_ENTRY ("zcd",  x_riscv_zc_subext, MASK_ZCD),
   RISCV_EXT_FLAG_ENTRY ("zcmp", x_riscv_zc_subext, MASK_ZCMP),
   RISCV_EXT_FLAG_ENTRY ("zcmt", x_riscv_zc_subext, MASK_ZCMT),
+  RISCV_EXT_FLAG_ENTRY ("zclsd", x_riscv_zc_subext, MASK_ZCLSD),

   RISCV_EXT_FLAG_ENTRY ("svade",   x_riscv_sv_subext, MASK_SVADE),
   RISCV_EXT_FLAG_ENTRY ("svadu",   x_riscv_sv_subext, MASK_SVADU),
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 80593ee139c1..ba5805e95452 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -257,6 +257,8 @@ Mask(ZICFISS) Var(riscv_zi_subext)

 Mask(ZICFILP) Var(riscv_zi_subext)

+Mask(ZILSD)   Var(riscv_zi_subext)
+
 TargetVariable
 int riscv_za_subext

@@ -463,6 +465,8 @@ Mask(ZCMP) Var(riscv_zc_subext)

 Mask(ZCMT) Var(riscv_zc_subext)

+Mask(ZCLSD) Var(riscv_zc_subext)
+
 Mask(XCVBI) Var(riscv_xcv_subext)

 TargetVariable
diff --git a/gcc/testsuite/gcc.target/riscv/arch-49.c 
b/gcc/testsuite/gcc.target/riscv/arch-49.c

new file mode 100644
index ..452c04e42f6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/arch-49.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gc_zilsd_zclsd -mabi=ilp32d" } */
+int foo()
+{
+}
diff --git a/gcc/testsuite/gcc.target/riscv/arch-50.c 
b/gcc/testsuite/gcc.target/riscv/arch-50.c

new file mode 100644
index ..4fc8608366b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/arch-50.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gc_zilsd_zclsd -mabi=ilp32d" } */
+int foo()
+{
+}
+/* { dg-error "'-march=rv64gc_zilsd_zclsd': zilsd extension supports in 
rv32 only" "" { target *-*-* } 0 } */
+/* { dg-error "'-march=rv64gc_zilsd_zclsd': zclsd extension supports in 
rv32 only" "" { t

Re: [PATCH v5 05/10] libstdc++: Implement layout_left from mdspan.

2025-05-07 Thread Luc Grosheintz



On 5/6/25 2:47 PM, Tomasz Kaminski wrote:

On Tue, May 6, 2025 at 1:39 PM Luc Grosheintz 
wrote:



On 5/6/25 11:28 AM, Tomasz Kaminski wrote:

For better reference, here is illustration of the design I was thinking
about:
https://godbolt.org/z/7aTcM8fz4
I would also consider having left_mapping_base accept padding, where
layout_left uses left_mapping_base.



Thank you for all the help! I think I'm doing what you're proposing.
However,
now I'm seeing an issue related to `explicit(cond)`.

Essentially, it seems like with GCC the explicit for inherited ctors is
ignored
while with Clang it isn't.

There are three variations of the issue: my working copy of layouts, a
simplified version I extracted from the working copy:
https://godbolt.org/z/8zfoeoc7j
Here extents are convertible if the IndexType is convertible.

and a modification of your reproducer:
https://godbolt.org/z/hG6YKosrf
Here extents are convertible if:
   (Extent == dynamic_extent || Extents == OExtent) && ...


As a temporary workaround we could use a separate overloads:
template
explicit right_mapping_base(right_mapping_base const&) {}

template
requires std::is_convertible_v
right_mapping_base(right_mapping_base const&) {}
The second overload is more constrained than the first.


I have a version that works for rank 0, 1, and N for layout_left and
layout_right. It would be nice to see how much we saved, and confirm
that we didn't miss anything.

To count the number of symbols I thought one could use `nm`. So I created
a dummy file:

```
std::layout_left::mapping> m0;

bool all_unique()
{
  return m0.is_unique();
}
```
but with a lot more duplication. Then compile it with:
gcc -O2 -c many_symbols.cc
look at `nm many_symbols.o`, and I see almost nothing: just one symbol for `m0`
and another for `all_unique`. With `gcc -O0` I see all the symbols,
but I'm not sure how relevant -O0 is.

Does this way of checking make sense, or do you have some strategy
to ensure that we're effectively reducing the number of symbols?
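
One way to force the instantiations out regardless of optimization (a
measurement-only sketch; explicitly instantiating std templates is not
something to ship) is an explicit instantiation definition, which
instantiates every member, e.g.:

  template struct std::layout_left::mapping<std::extents<int, 3>>;

and then count with something like
`nm -C --defined-only many_symbols.o | wc -l`.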






On Tue, May 6, 2025 at 10:48 AM Tomasz Kaminski 

wrote:



The constructors that are inside mapping_left, which I think represent
constructors from other extents:
template
mapping_left(const mapping_left_base& other)
: mapping_left_base(other) {}
Can be placed in mapping_left_base, and they will be inherited, as only
copy/move constructors are shadowed.


On Tue, May 6, 2025 at 9:11 AM Tomasz Kaminski 
wrote:




On Mon, May 5, 2025 at 9:20 PM Luc Grosheintz <

luc.groshei...@gmail.com>

wrote:




On 5/5/25 9:44 AM, Tomasz Kaminski wrote:

On Sat, May 3, 2025 at 2:39 PM Luc Grosheintz <

luc.groshei...@gmail.com>

wrote:




On 4/30/25 7:13 AM, Tomasz Kaminski wrote:

Hi,

As we will be landing patches for extends, this will become a

separate

patch series.
I would prefer, if you could commit per layout, and start with

layout_right

(default)
I try to provide prompt responses, so if that works better for you,

you

can

post a patch
only with this layout first, as most of the comments will apply to

all of

them.

For the general design we have constructors that allow conversion

between

rank-0
and rank-1 layouts left and right. This is done because they

essentially

represents
the same layout. I think we could benefit from that in code by

having a

base classes
for rank0 and rank1 mapping:
template
_Rank0_mapping_base
{
   static_assert(_Extents::rank() == 0);

   template
   // explicit, requires goes here
   _Rank0_mapping_base(_Rank0_mapping_base);

// All members layout_type goes her
};

template
_Rank1_mapping_base
{
   static_assert(_Extents::rank() == 1);
  // Static assert for product is much simpler here, as we need

to

check one


   template
   // explicit, requires goes here
   _Rank1_mapping_base(_Rank1_mapping_base);

  // Call operator can also be simplified
  index_type operator()(index_type i) const // conversion

happens

at

user

side

  // cosntructor from strided_layout of Rank1 goes here.

// All members layout_type goes her
};
Then we will specialize layout_left/right/stride to use

_Rank0_mapping_base

as a base for rank() == 0
and layout_left/right to use _Rank1_mapping as base for rank()1;
template
struct extents {};

struct layout
{
template
struct mapping
{
// static assert that Extents mmyst be specialization of _Extents

goes

here.

}
};

template
struct layout::mapping>
: _Rank0_mapping_base>
{
using layout_type = layout_left;
// Provides converting constructor.
using

_Rank0_mapping_base>::_Rank0_mapping_base;

// This one is implicit;
mapping(_Rank0_mapping_base> const&);
};

template
struct layout::mapping>
: _Rank1_mapping_base>

{
using layout_type = layout_left;
// Provides converting constructor.
using

_Rank0_mapping_base>::_Rank0_mapping_base;

// This one is implicit, allows construction from layout_right
mapping(_Rank1_mapping_base> const&);
};
};

template
requires sizeof..(_Ext) > = 2
struct layout::mapping>

The last one is 

Re: [PATCH] RISC-V: Add pattern for vector-scalar multiply-add/sub [PR119100]

2025-05-07 Thread Paul-Antoine Arras

On 06/05/2025 15:06, Jeff Law wrote:

diff --git gcc/config/riscv/riscv.cc gcc/config/riscv/riscv.cc
index 38f3ae7cd84..0f0cf04bdd9 100644
--- gcc/config/riscv/riscv.cc
+++ gcc/config/riscv/riscv.cc
@@ -3864,6 +3864,18 @@ riscv_rtx_costs (rtx x, machine_mode mode, int outer_code, int opno ATTRIBUTE_UN

   if (riscv_v_ext_mode_p (mode))
 {
   *total = COSTS_N_INSNS (1);
+  if ((GET_CODE (x) == PLUS || GET_CODE (x) == MINUS) && outer_code == SET)
+    {
+  rtx plus_op0 = XEXP (x, 0);
+  if (GET_CODE (plus_op0) == MULT)
+    {
+  rtx mult_op0 = XEXP (plus_op0, 0);
+  if (GET_CODE (mult_op0) == VEC_DUPLICATE)
+    {
+  *total += get_vector_costs ()->regmove->FR2VR;
+    }
+    }
+    }
   return true;
 }
So this probably needs minor updates now that Pan's code is in, though I 
suspect combining your work and his in the costing code will be trivial.


Functionally, I would suggest one change:

if (FLOAT_MODE_P (mode))
   *total += get_vector_costs ()->regmove->FR2VR;
else
   *total += get_vector_costs ()->regmove->GR2VR;

That way costing ought to work for the vector integer multiply-add/sub 
operations as well.


You'll need to double check if FLOAT_MODE_P works on a vector mode, if 
not, you may need to get the inner mode.
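
A minimal sketch of that inner-mode variant (untested; assumes
GET_MODE_INNER is the right accessor here, and it returns the mode
itself for scalars):

  machine_mode inner = GET_MODE_INNER (mode);
  if (FLOAT_MODE_P (inner))
    *total += get_vector_costs ()->regmove->FR2VR;
  else
    *total += get_vector_costs ()->regmove->GR2VR;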


Thanks Jeff. I will rebase and update my patch. One question though, I 
noticed that Pan's patch introduced a command-line parameter to tweak 
the GR2VR cost; do we need something equivalent for FR2VR?

--
PA


[committed] libstdc++: Remove use of undefined GLIBCXX_LANG_{PUSH, POP} [PR120147]

2025-05-07 Thread Jonathan Wakely
Commit r16-427-g86627faec10da5 was using the new GLIBCXX_LANG_PUSH and
GLIBCXX_LANG_POP macros from a change that I haven't pushed yet,
resulting in changes to CXXFLAGS not being restored after the
GLIBCXX_ENABLE_BACKTRACE checks.

libstdc++-v3/ChangeLog:

PR libstdc++/120147
* acinclude.m4 (GLIBCXX_ENABLE_BACKTRACE): Restore use of
AC_LANG_CPLUSPLUS.
* configure: Regenerate.
---

Lightly tested x86_64-linux and x86_64-w64-mingw. Pushed to trunk.

 libstdc++-v3/acinclude.m4 |  6 --
 libstdc++-v3/configure| 20 
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
index 0fc74d00a98..204bed5b27b 100644
--- a/libstdc++-v3/acinclude.m4
+++ b/libstdc++-v3/acinclude.m4
@@ -5290,7 +5290,8 @@ AC_DEFUN([GLIBCXX_ENABLE_BACKTRACE], [
 
   BACKTRACE_CPPFLAGS="-D_GNU_SOURCE"
 
-  GLIBCXX_LANG_PUSH
+  AC_LANG_CPLUSPLUS
+  old_CXXFLAGS="$CXXFLAGS"
 
   # libbacktrace's own configure.ac only tests atomics for int,
   # but the code actually uses atomics for size_t and pointers as well.
@@ -5356,7 +5357,8 @@ EOF
 rm -f conftest*
   fi
 
-  GLIBCXX_LANG_POP
+  CXXFLAGS="$old_CXXFLAGS"
+  AC_LANG_RESTORE
 
   if test "$glibcxx_cv_libbacktrace_atomics" = yes; then
 BACKTRACE_CPPFLAGS="$BACKTRACE_CPPFLAGS -DHAVE_ATOMIC_FUNCTIONS=1"
diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
index 3fd03b8a95d..0529ff5708f 100755
--- a/libstdc++-v3/configure
+++ b/libstdc++-v3/configure
@@ -53537,7 +53537,13 @@ fi
 
   BACKTRACE_CPPFLAGS="-D_GNU_SOURCE"
 
-  GLIBCXX_LANG_PUSH
+  ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+  old_CXXFLAGS="$CXXFLAGS"
 
   # libbacktrace's own configure.ac only tests atomics for int,
   # but the code actually uses atomics for size_t and pointers as well.
@@ -53578,7 +53584,7 @@ main ()
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+if ac_fn_cxx_try_link "$LINENO"; then :
   glibcxx_cv_libbacktrace_atomics=yes
 else
   glibcxx_cv_libbacktrace_atomics=no
@@ -53595,7 +53601,7 @@ $as_echo "$glibcxx_cv_libbacktrace_atomics" >&6; }
 CXXFLAGS='-O0 -S'
 
 cat > conftest.$ac_ext << EOF
-#line 53598 "configure"
+#line 53604 "configure"
 #include 
 int main()
 {
@@ -53633,7 +53639,13 @@ $as_echo "$glibcxx_cv_libbacktrace_atomics" >&6; }
 rm -f conftest*
   fi
 
-  GLIBCXX_LANG_POP
+  CXXFLAGS="$old_CXXFLAGS"
+  ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
 
   if test "$glibcxx_cv_libbacktrace_atomics" = yes; then
 BACKTRACE_CPPFLAGS="$BACKTRACE_CPPFLAGS -DHAVE_ATOMIC_FUNCTIONS=1"
-- 
2.49.0



[PATCH 1/8] AArch64: place branch instruction rules together

2025-05-07 Thread Karl Meakin
The rules for conditional branches were spread throughout `aarch64.md`.
Group them together so it is easier to understand how `cbranch4`
is lowered to RTL.

gcc/ChangeLog:

* config/aarch64/aarch64.md (condjump): move.
(*compare_condjump): likewise.
(aarch64_cb1): likewise.
(*cb1): likewise.
(tbranch_3): likewise.
(@aarch64_tb): likewise.
---
 gcc/config/aarch64/aarch64.md | 387 ++
 1 file changed, 201 insertions(+), 186 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c678f7afb1a..4d556d886bc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -674,6 +674,10 @@ (define_insn "aarch64_write_sysregti"
  "msrr\t%x0, %x1, %H1"
 )
 
+;; ---
+;; Unconditional jumps
+;; ---
+
 (define_insn "indirect_jump"
   [(set (pc) (match_operand:DI 0 "register_operand" "r"))]
   ""
@@ -692,6 +696,12 @@ (define_insn "jump"
   [(set_attr "type" "branch")]
 )
 
+
+
+;; ---
+;; Conditional jumps
+;; ---
+
 (define_expand "cbranch4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
[(match_operand:GPI 1 "register_operand")
@@ -731,6 +741,197 @@ (define_expand "cbranchcc4"
   ""
   "")
 
+(define_insn "condjump"
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand 1 "cc_register" "") (const_int 0)])
+  (label_ref (match_operand 2 "" ""))
+  (pc)))]
+  ""
+  {
+/* GCC's traditional style has been to use "beq" instead of "b.eq", etc.,
+   but the "." is required for SVE conditions.  */
+bool use_dot_p = GET_MODE (operands[1]) == CC_NZCmode;
+if (get_attr_length (insn) == 8)
+  return aarch64_gen_far_branch (operands, 2, "Lbcond",
+use_dot_p ? "b.%M0\\t" : "b%M0\\t");
+else
+  return use_dot_p ? "b.%m0\\t%l2" : "b%m0\\t%l2";
+  }
+  [(set_attr "type" "branch")
+   (set (attr "length")
+   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
+  (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
+ (const_int 4)
+ (const_int 8)))
+   (set (attr "far_branch")
+   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
+  (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
+ (const_int 0)
+ (const_int 1)))]
+)
+
+;; For a 24-bit immediate CST we can optimize the compare for equality
+;; and branch sequence from:
+;; mov  x0, #imm1
+;; movk x0, #imm2, lsl 16 /* x0 contains CST.  */
+;; cmp  x1, x0
+;; b .Label
+;; into the shorter:
+;; sub  x0, x1, #(CST & 0xfff000)
+;; subs x0, x0, #(CST & 0x000fff)
+;; b .Label
+(define_insn_and_split "*compare_condjump"
+  [(set (pc) (if_then_else (EQL
+ (match_operand:GPI 0 "register_operand" "r")
+ (match_operand:GPI 1 "aarch64_imm24" "n"))
+  (label_ref:P (match_operand 2 "" ""))
+  (pc)))]
+  "!aarch64_move_imm (INTVAL (operands[1]), mode)
+   && !aarch64_plus_operand (operands[1], mode)
+   && !reload_completed"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff;
+HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000;
+rtx tmp = gen_reg_rtx (mode);
+emit_insn (gen_add3 (tmp, operands[0], GEN_INT (-hi_imm)));
+emit_insn (gen_add3_compare0 (tmp, tmp, GEN_INT (-lo_imm)));
+rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
+rtx cmp_rtx = gen_rtx_fmt_ee (, mode,
+ cc_reg, const0_rtx);
+emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
+DONE;
+  }
+)
+
+(define_insn "aarch64_cb1"
+  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
+   (const_int 0))
+  (label_ref (match_operand 1 "" ""))
+  (pc)))]
+  "!aarch64_track_speculation"
+  {
+if (get_attr_length (insn) == 8)
+  return aarch64_gen_far_branch (operands, 1, "Lcb", "\\t%0, ");
+else
+  return "\\t%0, %l1";
+  }
+  [(set_attr "type" "branch")
+   (set (attr "length")
+   (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
+  (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
+ (const_int 4)
+ (const_int 8)))
+   (set (attr "far_branch")
+   (if_then_else (and (ge (minu

Re: [PATCH] [PR117978] AArch64: Fold SVE load/store with certain ptrue patterns to LDR/STR.

2025-05-07 Thread Jennifer Schmitz


> On 7 May 2025, at 13:35, Richard Sandiford  wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Jennifer Schmitz  writes:
>> @@ -3698,6 +3706,24 @@ aarch64_partial_ptrue_length (rtx_vector_builder 
>> &builder,
>>   return vl;
>> }
>> 
>> +/* Return:
>> +
>> +  * -1 if all bits of PRED are set
>> +  * N if PRED has N leading set bits followed by all clear bits
>> +  * 0 if PRED does not have any of these forms.  */
> 
> Sorry for the formatting nits, but the indentation looks off here.
> Each nonempty line should be indented by three spaces rather than two.
> 
>> [...]
>> @@ -23526,6 +23567,39 @@ aarch64_simd_valid_imm (rtx op, simd_immediate_info 
>> *info,
>>   return false;
>> }
>> 
>> +/* Try to optimize the expansion of a maskload or maskstore with
>> +  the operands in OPERANDS, given that the vector being loaded or
>> +  stored has mode MODE.  Return true on success or false if the normal
>> +  expansion should be used.  */
> 
> Same here.
> 
>> +
>> +bool
>> +aarch64_expand_maskloadstore (rtx *operands, machine_mode mode)
>> +{
>> +  /* If the predicate in operands[2] is a patterned SVE PTRUE predicate
>> +   with patterns VL1, VL2, VL4, VL8, or VL16 and at most the bottom
>> +   128 bits are loaded/stored, emit an ASIMD load/store.  */
> 
> Same here (five spaces rather than three).
> 
> OK with those nits fixed, thanks.
Thanks (and apologies for the formatting issues).
Pushed to trunk: 210d06502f22964c7214586c54f8eb54a6965bfd
Jennifer
> 
> Richard





[PATCH] ipa/120146 - deal with vanished varpool nodes in IPA PTA

2025-05-07 Thread Richard Biener
I don't understand why they vanish when still referred to, but
let's deal with that in a conservative way.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

There are questions around the cgraph code in the PR, but the
patch is somewhat obvious at least.

Richard.

PR ipa/120146
* tree-ssa-structalias.cc (create_variable_info_for): If
the symtab cannot tell us whether all refs to a variable
are explicit assume they are not.

* g++.dg/ipa/pr120146.C: New testcase.
---
 gcc/testsuite/g++.dg/ipa/pr120146.C | 12 
 gcc/tree-ssa-structalias.cc |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ipa/pr120146.C

diff --git a/gcc/testsuite/g++.dg/ipa/pr120146.C 
b/gcc/testsuite/g++.dg/ipa/pr120146.C
new file mode 100644
index 000..33644b4f7a6
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ipa/pr120146.C
@@ -0,0 +1,12 @@
+// { dg-do compile }
+// { dg-options "-O -fipa-pta" }
+
+struct basic_ios {
+  ~basic_ios();
+};
+struct basic_istream : virtual basic_ios {};
+template  struct basic_ifstream : basic_istream {
+  template  basic_ifstream(_Path, int);
+};
+extern template class basic_ifstream;
+void CompareFiles_path2() { basic_ifstream(CompareFiles_path2, 0); }
diff --git a/gcc/tree-ssa-structalias.cc b/gcc/tree-ssa-structalias.cc
index 3ad0c69930c..deca44ae0bf 100644
--- a/gcc/tree-ssa-structalias.cc
+++ b/gcc/tree-ssa-structalias.cc
@@ -6562,7 +6562,7 @@ create_variable_info_for (tree decl, const char *name, 
bool add_id)
  varpool_node *vnode = varpool_node::get (decl);
 
  /* For escaped variables initialize them from nonlocal.  */
- if (!vnode->all_refs_explicit_p ())
+ if (!vnode || !vnode->all_refs_explicit_p ())
make_copy_constraint (vi, nonlocal_id);
 
  /* While we can in theory walk references for the varpool
@@ -6581,7 +6581,7 @@ create_variable_info_for (tree decl, const char *name, 
bool add_id)
process_constraint (new_constraint (lhs, *rhsp));
  /* If this is a variable that escapes from the unit
 the initializer escapes as well.  */
- if (!vnode->all_refs_explicit_p ())
+ if (!vnode || !vnode->all_refs_explicit_p ())
{
  lhs.var = escaped_id;
  lhs.offset = 0;
-- 
2.43.0


Re: [RFC PATCH 0/5] aarch64: Support for user-defined aarch64 tuning parameters in JSON

2025-05-07 Thread David Malcolm
On Tue, 2025-05-06 at 14:00 +0530, soum...@nvidia.com wrote:
> From: Soumya AR 
> 
> Hi,
> 
> This RFC and subsequent patch series introduces support for printing
> and parsing
> of aarch64 tuning parameters in the form of JSON.
> 
> It is important to note that this mechanism is specifically intended
> for power
> users to experiment with tuning parameters. This proposal does not
> suggest the
> use of JSON tuning files in production. Additionally, the JSON format
> should not
> be considered stable and may change as GCC evolves.
> 
> [1] Introduction
> 
> Currently, the aarch64 backend in GCC (15) stores the tuning
> parameters of all
> backends under gcc/config/aarch64/tuning_models/. Since these
> parameters are 
> hardcoded for each CPU, this RFC proposes a technique to support the
> adjustment
> of these parameters at runtime. This allows easier experimentation
> with more
> aggressive parameters to find optimal numbers.
> 
> The tuning data is fed to the compiler in JSON format, which was
> primarily 
> chosen for the following reasons:
> 
> * JSON can represent hierarchical data. This is useful for
> incorporating the
> nested nature of the tuning structures.
> * JSON supports integers, strings, booleans, and arrays.
> * GCC already has support for parsing and printing JSON, removing the
> need for
> writing APIs to read and write the JSON files.
>  
> Thus, if we take the following example of some tuning parameters:
> 
> static struct cpu_addrcost_table generic_armv9_a_addrcost_table =
> {
>     {
>   1, /* hi  */
>   0, /* si  */
>   0, /* di  */
>   1, /* ti  */
>     },
>   0, /* pre_modify  */
>   0, /* post_modify  */
>   2, /* post_modify_ld3_st3  */
>   2, /* post_modify_ld4_st4  */
> };
> 
> static cpu_prefetch_tune generic_armv9a_prefetch_tune =
> {
>   0,  /* num_slots  */
>   -1, /* l1_cache_size  */
>   64, /* l1_cache_line_size  */
>   -1, /* l2_cache_size  */
>   true,   /* prefetch_dynamic_strides */
> };
> 
> static struct tune_params neoversev3_tunings =
> {
>   &generic_armv9_a_addrcost_table,
>   10, /* issue_rate  */
>   AARCH64_FUSE_NEOVERSE_BASE, /* fusible_ops  */
>   "32:16",/* function_align.  */
>   &generic_armv9a_prefetch_tune,
>   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
> };
> 
> We can represent them in JSON as:
> 
> {
>   "tune_params": {
>     "addr_cost": {
>   "addr_scale_costs": { "hi": 1, "si": 0, "di": 0, "ti": 1 },
>   "pre_modify": 0,
>   "post_modify": 0,
>   "post_modify_ld3_st3": 2,
>   "post_modify_ld4_st4": 2
>     },
>     "issue_rate": 10,
>     "fusible_ops": 1584,
>     "function_align": "32:16",
>     "prefetch": {
>   "num_slots": 0,
>   "l1_cache_size": -1,
>   "l1_cache_line_size": 64,
>   "l2_cache_size": -1,
>   "prefetch_dynamic_strides": true
>     },
>     "ldp_policy_model": "AARCH64_LDP_STP_POLICY_ALWAYS"
>   }
> }
> 
> ---
> 
> [2] Methodology 
> 
> Before the internal tuning parameters are overridden with user
> provided ones, we
> must ensure the validity of the provided data.
> 
> This is done using a "base" JSON schema, which contains information
> about the 
> tune_params data structure used by the aarch64 backend.
> 
> Example:
> 
> {
>   "tune_params": {
>     "addr_cost": {
>   "addr_scale_costs": {
>     "hi": "int",
>     "si": "int",
>     "di": "int",
>     "ti": "int"
>   },
>   "pre_modify": "int",
>   "post_modify": "int",
>   "post_modify_ld3_st3": "int",
>   "post_modify_ld4_st4": "int"
>     },
>     "issue_rate": "int",
>     "fusible_ops": "uint",
>     "function_align": "string",
>     "prefetch": {
>   "num_slots": "int",
>   "l1_cache_size": "int",
>   "l1_cache_line_size": "int",
>   "l2_cache_size": "int",
>   "prefetch_dynamic_strides": "boolean"
>     },
>     "ldp_policy_model": "string"
>   }
> }
> 
> Using this schema, we can:
>   * Verify that the correct datatypes have been used.
>   * Verify if the user provided "key" or tuning parameter
> exists.
>   * Allow user to only specify the required fields (in nested
> fashion), 
>   eliminating the need to list down every single parameter if
> they only
>   wish to experiment with some.
>   
> The schema is currently stored as a raw JSON string in
> config/aarch64/aarch64-json-schema.h.

Does the schema get used to do validation anywhere?  FWIW (and as
posted in another followup), for SARIF the DejaGnu tests validate the
generated json against a schema; see gcc/testsuite/lib/scansarif.exp;
there's also run-sarif-pytest which allows DejaGnu to run Python
scripts to verify properties of the generated json.  The latter is
probably overkill for the aarch64 tuning use-case, but is very helpful
for SARIF, which has deeply nested json, cross-references, duplication
and de-duplication, etc (so regexps aren't expressive enough fo

Re: [PATCH 0/6] Remove non-SLP path from vectorizable_reduction

2025-05-07 Thread Richard Biener
On Tue, 6 May 2025, andre.simoesdiasvie...@arm.com wrote:

> From: Andre Vieira 
> 
> Somewhat following richi's example of SLP cleanup, this patch series 
> cleans up SLP code for vectorizable_reduction and friends.
> 
>  1) fold trivially true/false conditions based on the slp_node argument
> without code block removal/reindent, etc.
>  2) do trivial dead code elimination
>  3) first steps in cleanup simple things - it's expected that the 'ncopies' 
> variable
> vanishes (a vec_num one might remain), removed 
> 'get_initial_def_for_reduction'. Not entirely sure what to do about 
> STMT_VINFO_VECTYPE.
>  4) folds loops that used ncopies and now no longer need to be loops
>  5) When trying to do 3, I could not remove the use of 'vec_stmt' for 
> vectorizable_lc_phi as that was needed to distinguish between 'analysis' and 
> 'transformation', so in this step we split up vectorizable_lc_phi and create 
> a vect_transform_lc_phi, similar to other vectorizable_* and vect_transform_*.
>  6) Remove the gimple ** argument that is no longer needed from 
> vectorizable_lc_phi and vect_transform_lc_phi
> 
> The order of 5-6 relative to the rest could be changed, but I only noticed 
> its need afterwards. Happy to re-order; and if there are any objections to 
> these all being squashed together, there may be an argument for doing 5-6 as 
> a separate squashed patch.
> 
> I tested a version of these on x86_64-pc-linux-gnu, 
> aarch64-none-linux-gnu and arm-none-gnueabihf but made some minor 
> changes since. I'll retest after discussing some further changes around 
> the STMT_VINFO_VECTYPE stuff.

Thanks for tackling the most complicated case ;)  The series is OK
if squashed to a single commit with the minor issues resolved (some
possibly resolved themselves with later patches in the series, but
IIRC not all).

Richard.

> Andre Vieira (6):
>   vect: Remove non-SLP path from vectorizable_reduction
>   vect: Remove non-SLP path from vectorizable_reduction
>   vect: Remove non-SLP path from vectorizable_reduction
>   vect: Remove non-SLP path from vectorizable_reduction
>   vect: Split vectorizable_lc_phi
>   vect: Split vectorizable_lc_phi
> 
>  gcc/tree-vect-loop.cc  | 728 -
>  gcc/tree-vect-stmts.cc |  10 +-
>  gcc/tree-vectorizer.h  |   7 +-
>  3 files changed, 209 insertions(+), 536 deletions(-)
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH v2 3/4] Rewrite VCEs of integral types [PR116939]

2025-05-07 Thread Richard Biener
On Wed, May 7, 2025 at 3:55 AM Andrew Pinski  wrote:
>
> Like the patch to phiopt (r15-4033-g1f619fe25925a5f7), this adds rewriting
> of VCE to gimple_with_undefined_signed_overflow/rewrite_to_defined_overflow.
> In the case of moving VCE of a bool from being conditional to unconditional,
> it needs to be rewritten not to use VCE but a normal cast. pr120122-1.c is
> an example of where LIM needs this rewriting.
>
> This also renames gimple_with_undefined_signed_overflow to 
> gimple_needing_rewrite_undefined
> and rewrite_to_defined_overflow to rewrite_to_defined_unconditional as they 
> will be doing
> more than just handling signed overflow.
>
> Bootstrappd and tested on x86_64-linux-gnu.
>
> PR tree-optimization/120122
> PR tree-optimization/116939
>
> gcc/ChangeLog:
>
> * gimple-fold.h (gimple_with_undefined_signed_overflow): Rename to ...
> (gimple_needing_rewrite_undefined): This.
> (rewrite_to_defined_overflow): Rename to ...
> (rewrite_to_defined_unconditional): This.
> * gimple-fold.cc (gimple_with_undefined_signed_overflow): Rename to 
> ...
> (gimple_needing_rewrite_undefined): This. Return true for VCE with 
> integral
> types.
> (rewrite_to_defined_overflow): Rename to ...
> (rewrite_to_defined_unconditional): This. Handle VCE rewriting to a 
> cast.
> * tree-if-conv.cc: 
> s/gimple_with_undefined_signed_overflow/gimple_needing_rewrite_undefined/
> s/rewrite_to_defined_overflow/rewrite_to_defined_unconditional.
> * tree-scalar-evolution.cc: Likewise
> * tree-ssa-ifcombine.cc: Likewise.
> * tree-ssa-loop-im.cc: Likewise.
> * tree-ssa-loop-split.cc: Likewise.
> * tree-ssa-reassoc.cc: Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/torture/pr120122-1.c: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/gimple-fold.cc| 51 +--
>  gcc/gimple-fold.h |  6 +--
>  gcc/testsuite/gcc.dg/torture/pr120122-1.c | 51 +++
>  gcc/tree-if-conv.cc   |  6 +--
>  gcc/tree-scalar-evolution.cc  |  4 +-
>  gcc/tree-ssa-ifcombine.cc |  4 +-
>  gcc/tree-ssa-loop-im.cc   |  4 +-
>  gcc/tree-ssa-loop-split.cc|  4 +-
>  gcc/tree-ssa-reassoc.cc   |  4 +-
>  9 files changed, 106 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/torture/pr120122-1.c
>
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index c060ef81a42..4f45aeb7ff8 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -10588,10 +10588,12 @@ arith_code_with_undefined_signed_overflow 
> (tree_code code)
>
>  /* Return true if STMT has an operation that operates on a signed
> integer types involves undefined behavior on overflow and the
> -   operation can be expressed with unsigned arithmetic.  */
> +   operation can be expressed with unsigned arithmetic.
> +   Also returns true if STMT is a VCE that needs to be rewritten
> +   if moved to be executed unconditionally.   */
>
>  bool
> -gimple_with_undefined_signed_overflow (gimple *stmt)
> +gimple_needing_rewrite_undefined (gimple *stmt)
>  {
>if (!is_gimple_assign (stmt))
>  return false;
> @@ -10602,6 +10604,14 @@ gimple_with_undefined_signed_overflow (gimple *stmt)
>if (!INTEGRAL_TYPE_P (lhs_type)
>&& !POINTER_TYPE_P (lhs_type))
>  return false;
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  /* VCE from integral types to another integral types but with
> + different precisions need to be changed into casts
> + to be well defined. */
> +  if (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
> +  && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (rhs, 0)))

So this does not perform the precision check and IMO we should never end
up with a V_C_E from a lower-precision operand to a higher precision.  So
does it work with an additional

   && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (rhs, 0)))
  > TYPE_PRECISION (lhs_type)

check?
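
For reference, a sketch of the combined condition I have in mind (my
reading of the suggestion, untested):

  if (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
      && INTEGRAL_TYPE_P (TREE_TYPE (TREE_OPERAND (rhs, 0)))
      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (rhs, 0)))
          > TYPE_PRECISION (lhs_type))
      && is_gimple_val (TREE_OPERAND (rhs, 0)))
    return true;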

> +  && is_gimple_val (TREE_OPERAND (rhs, 0)))
> +return true;
>if (!TYPE_OVERFLOW_UNDEFINED (lhs_type))
>  return false;
>if (!arith_code_with_undefined_signed_overflow
> @@ -10621,19 +10631,36 @@ gimple_with_undefined_signed_overflow (gimple *stmt)
> contain a modified form of STMT itself.  */
>
>  static gimple_seq
> -rewrite_to_defined_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
> -bool in_place)
> +rewrite_to_defined_unconditional (gimple_stmt_iterator *gsi, gimple *stmt,
> + bool in_place)
>  {
>if (dump_file && (dump_flags & TDF_DETAILS))
>  {
> -  fprintf (dump_file, "rewriting stmt with undefined signed "
> -  "overflow ");
> +  fprintf (dump_file, "rewriting stmt for being unconditionally defined ");
>print_gimple_stmt

Re: [PATCH v6 0/4] Implement extents from the mdspan header.

2025-05-07 Thread Jonathan Wakely
On Wed, 30 Apr 2025 at 13:30, Luc Grosheintz wrote:
>
> This is the sixth iteration and replaces:
> https://gcc.gnu.org/pipermail/libstdc++/2025-April/061190.html
>
> Changes since v5:
> * Removed superfluous braces.
> * Fixed std.cc.in
> * Fixed Copyright statement.
>
> Any layout related code has been removed from this patch
> series.

Thanks! I've pushed these to trunk now, congratulations on your first
GCC contributions.

I accidentally applied an older version of the patch modifying
std.cc.in so I fixed that by hand afterwards.


>
> Luc Grosheintz (4):
>   libstdc++: Setup internal FTM for mdspan.
>   libstdc++: Add header mdspan to the build-system.
>   libstdc++: Implement std::extents [PR107761].
>   libstdc++: Add tests for std::extents.
>
>  libstdc++-v3/doc/doxygen/user.cfg.in  |   1 +
>  libstdc++-v3/include/Makefile.am  |   1 +
>  libstdc++-v3/include/Makefile.in  |   1 +
>  libstdc++-v3/include/bits/version.def |   9 +
>  libstdc++-v3/include/bits/version.h   |   9 +
>  libstdc++-v3/include/precompiled/stdc++.h |   1 +
>  libstdc++-v3/include/std/mdspan   | 309 ++
>  libstdc++-v3/src/c++23/std.cc.in  |   9 +-
>  .../mdspan/extents/class_mandates_neg.cc  |   8 +
>  .../23_containers/mdspan/extents/ctor_copy.cc |  82 +
>  .../23_containers/mdspan/extents/ctor_ints.cc |  62 
>  .../mdspan/extents/ctor_shape.cc  | 160 +
>  .../mdspan/extents/custom_integer.cc  |  87 +
>  .../23_containers/mdspan/extents/misc.cc  | 224 +
>  14 files changed, 962 insertions(+), 1 deletion(-)
>  create mode 100644 libstdc++-v3/include/std/mdspan
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/class_mandates_neg.cc
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/ctor_copy.cc
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/ctor_ints.cc
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/ctor_shape.cc
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/custom_integer.cc
>  create mode 100644 
> libstdc++-v3/testsuite/23_containers/mdspan/extents/misc.cc
>
> --
> 2.49.0
>



Re: [PATCH v2 1/4] Loop-IM: Hoist (non-expensive) stmts to executed all loop when running before PRE

2025-05-07 Thread Richard Biener
On Wed, May 7, 2025 at 3:50 AM Andrew Pinski  wrote:
>
> While fixing up how rewrite_to_defined_overflow works, gcc.dg/Wrestrict-22.c 
> started
> to fail. This is because `d p+ 2` would be moved by LIM and then be rewritten 
> without using
> pointer plus. The rewriting part is correct behavior. It only recently 
> started to be
> moved out, due to r16-190-g6901d56fea2132.
> Which has the following comment:
> ```
> When we run before PRE and PRE is active hoist all expressions
> since PRE would do so anyway and we can preserve range info
> but PRE cannot.
> ```
> This is not true if hoisting past the always executed point; so, instead of 
> hoisting
> these statements all the way out of the max loops, take into account the 
> always executed
> loop too.
>
> Bootstrapped and tested on x86_64-linux-gnu.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
> * tree-ssa-loop-im.cc (compute_invariantness): Hoist to the always 
> executed point
> if ignoring the cost.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/tree-ssa-loop-im.cc | 22 +-
>  1 file changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
> index a3ca5af3e3e..b7f9f9befa5 100644
> --- a/gcc/tree-ssa-loop-im.cc
> +++ b/gcc/tree-ssa-loop-im.cc
> @@ -1241,12 +1241,24 @@ compute_invariantness (basic_block bb)
>lim_data->cost);
> }
>
> -  if (lim_data->cost >= LIM_EXPENSIVE
> - /* When we run before PRE and PRE is active hoist all expressions
> -since PRE would do so anyway and we can preserve range info
> -but PRE cannot.  */
> - || (flag_tree_pre && !in_loop_pipeline))
> +  if (lim_data->cost >= LIM_EXPENSIVE)
> set_profitable_level (stmt);
> +  /* When we run before PRE and PRE is active hoist all expressions
> +to the always executed loop since PRE would do so anyway
> +and we can preserve range info while PRE cannot.  */
> +  else if (flag_tree_pre && !in_loop_pipeline
> +  && outermost)
> +   {
> + class loop *mloop = lim_data->max_loop;
> + if (loop_depth (outermost) > loop_depth (mloop))
> +   {
> + mloop = outermost;
> + if (dump_file && (dump_flags & TDF_DETAILS))
> +   fprintf (dump_file, "  constraining to loop depth %d\n\n\n",
> +loop_depth (mloop));
> +   }
> + set_level (stmt, bb->loop_father, mloop);
> +   }
>  }
>  }
>
> --
> 2.34.1
>


[PATCH 01/13] arm: clarify the logic of SECONDARY_(INPUT/OUTPUT)_RELOAD_CLASS

2025-05-07 Thread Richard Earnshaw
The flattened logic of these functions and the complexity of the
numerous clauses make it very difficult to understand what's written
in these macros.  Additionally, SECONDARY_INPUT_RELOAD_CLASS was not
laid out with the correct formatting.

Add some parentheses and re-indent to make the logic clearer.

No functional change.
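
To illustrate the shape of the change, a contrived sketch (not the
actual macros; a_p/b_p/c_p and R1-R4 are placeholders) of
re-parenthesizing such a conditional-operator chain:

  /* Flattened: hard to see which '?' pairs with which ':'.  */
  #define F(C) ((a_p (C)) ? R1 : (b_p (C)) ? R2 : (c_p (C)) ? R3 : R4)

  /* Parenthesized and re-indented: each alternative is explicit.  */
  #define G(C)                 \
    ((a_p (C))                 \
     ? R1                      \
     : ((b_p (C))              \
        ? R2                   \
        : ((c_p (C))           \
           ? R3                \
           : R4)))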

gcc:
* config/arm/arm.h (SECONDARY_OUTPUT_RELOAD_CLASS): Add parentheses
and re-indent.
(SECONDARY_INPUT_RELOAD_CLASS): Likewise.
---
 gcc/config/arm/arm.h | 55 +++-
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 8472b756127..9c3a644873b 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -1460,34 +1460,37 @@ extern const char *fp_sysreg_names[NB_FP_SYSREGS];
 /* Return the register class of a scratch register needed to copy IN into
or out of a register in CLASS in MODE.  If it can be done directly,
NO_REGS is returned.  */
-#define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X)  \
-  /* Restrict which direct reloads are allowed for VFP/iWMMXt regs.  */ \
-  ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
-   ? coproc_secondary_reload_class (MODE, X, FALSE)\
-   : (TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \
-   ? coproc_secondary_reload_class (MODE, X, TRUE) \
-   : TARGET_32BIT  \
-   ? (((MODE) == HImode && ! arm_arch4 && true_regnum (X) == -1) \
-? GENERAL_REGS : NO_REGS)  \
-   : THUMB_SECONDARY_OUTPUT_RELOAD_CLASS (CLASS, MODE, X))
+#define SECONDARY_OUTPUT_RELOAD_CLASS(CLASS, MODE, X)  \
+  /* Restrict which direct reloads are allowed for VFP/iWMMXt regs.  */ \
+  ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
+   ? coproc_secondary_reload_class (MODE, X, FALSE)\
+   : ((TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \
+  ? coproc_secondary_reload_class (MODE, X, TRUE)  \
+  : (TARGET_32BIT  \
+? (((MODE) == HImode && ! arm_arch4 && true_regnum (X) == -1)  \
+   ? GENERAL_REGS  \
+   : NO_REGS)  \
+: THUMB_SECONDARY_OUTPUT_RELOAD_CLASS (CLASS, MODE, X))))
 
 /* If we need to load shorts byte-at-a-time, then we need a scratch.  */
-#define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X)   \
-  /* Restrict which direct reloads are allowed for VFP/iWMMXt regs.  */ \
-  ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
-? coproc_secondary_reload_class (MODE, X, FALSE) : \
-(TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) ?\
-coproc_secondary_reload_class (MODE, X, TRUE) :\
-   (TARGET_32BIT ? \
-(((CLASS) == IWMMXT_REGS || (CLASS) == IWMMXT_GR_REGS) \
- && CONSTANT_P (X))\
-? GENERAL_REGS :   \
-(((MODE) == HImode && ! arm_arch4  \
-  && (MEM_P (X)\
- || ((REG_P (X) || GET_CODE (X) == SUBREG) \
- && true_regnum (X) == -1)))   \
- ? GENERAL_REGS : NO_REGS) \
-: THUMB_SECONDARY_INPUT_RELOAD_CLASS (CLASS, MODE, X)))
+#define SECONDARY_INPUT_RELOAD_CLASS(CLASS, MODE, X)   \
+  /* Restrict which direct reloads are allowed for VFP/iWMMXt regs.  */ \
+  ((TARGET_HARD_FLOAT && IS_VFP_CLASS (CLASS)) \
+   ? coproc_secondary_reload_class (MODE, X, FALSE)\
+   : ((TARGET_IWMMXT && (CLASS) == IWMMXT_REGS) \
+  ? coproc_secondary_reload_class (MODE, X, TRUE)  \
+  : (TARGET_32BIT  \
+? ((((CLASS) == IWMMXT_REGS || (CLASS) == IWMMXT_GR_REGS)  \
+&& CONSTANT_P (X)) \
+   ? GENERAL_REGS  \
+   : (((MODE) == HImode\
+   && ! arm_arch4  \
+   && (MEM_P (X)   \
+   || ((REG_P (X) || GET_CODE (X) == SUBREG)   \
+   && true_regnum (X) == -1))) \
+  ? GENERAL_REGS   \
+  : NO_REGS))  \
+: THUMB_SECONDARY_INPUT_RELOAD_CLASS (CLASS, MODE, X))))
 
 /* Return the maxi

[PATCH 04/13] arm: remove iWMMX builtins support.

2025-05-07 Thread Richard Earnshaw
This is the first step in removing the various builtins for iwmmxt:
it removes the builtin expansion code.  It leaves a lot of code
elsewhere, but we'll clean that up in subsequent patches.

I'm not sure why safe_vector_operand would unconditionally try to
expand to an iwmmxt instruction if passed (const_int 0).  Clearly
that's meaningless on other architectures, but perhaps this can't
happen elsewhere.  Anyway, for now, just mark this as unreachable so
that we'll know about it if it ever happens.
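
A minimal sketch of the change described above (the function name
comes from the ChangeLog below; the exact body is my assumption, not
necessarily the committed code):

  static rtx
  safe_vector_operand (rtx x, machine_mode mode)
  {
    /* A (const_int 0) operand used to be expanded through an iWMMXt
       wzero builtin; that path no longer exists, so mark it as
       unreachable.  */
    if (x == const0_rtx)
      __builtin_unreachable ();
    return x;
  }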

gcc/ChangeLog:

* config/arm/arm-builtins.cc (enum arm_builtins): Delete iWMMX
builtin values.
(bdesc_2arg): Likewise.
(bdesc_1arg): Likewise.
(arm_init_iwmmxt_builtins): Delete.
(arm_init_builtins): Don't call arm_init_iwmmxt_builtins.
(safe_vector_operand): Use __builtin_unreachable instead of emitting
an iwmmxt builtin.
(arm_general_expand_builtin): Remove iWMMX builtins support.
---
 gcc/config/arm/arm-builtins.cc | 1276 +---
 1 file changed, 2 insertions(+), 1274 deletions(-)

diff --git a/gcc/config/arm/arm-builtins.cc b/gcc/config/arm/arm-builtins.cc
index c56ab5db985..0ddc6669509 100644
--- a/gcc/config/arm/arm-builtins.cc
+++ b/gcc/config/arm/arm-builtins.cc
@@ -816,252 +816,6 @@ static arm_builtin_cde_datum cde_builtin_data[] =
 
 enum arm_builtins
 {
-  ARM_BUILTIN_GETWCGR0,
-  ARM_BUILTIN_GETWCGR1,
-  ARM_BUILTIN_GETWCGR2,
-  ARM_BUILTIN_GETWCGR3,
-
-  ARM_BUILTIN_SETWCGR0,
-  ARM_BUILTIN_SETWCGR1,
-  ARM_BUILTIN_SETWCGR2,
-  ARM_BUILTIN_SETWCGR3,
-
-  ARM_BUILTIN_WZERO,
-
-  ARM_BUILTIN_WAVG2BR,
-  ARM_BUILTIN_WAVG2HR,
-  ARM_BUILTIN_WAVG2B,
-  ARM_BUILTIN_WAVG2H,
-
-  ARM_BUILTIN_WACCB,
-  ARM_BUILTIN_WACCH,
-  ARM_BUILTIN_WACCW,
-
-  ARM_BUILTIN_WMACS,
-  ARM_BUILTIN_WMACSZ,
-  ARM_BUILTIN_WMACU,
-  ARM_BUILTIN_WMACUZ,
-
-  ARM_BUILTIN_WSADB,
-  ARM_BUILTIN_WSADBZ,
-  ARM_BUILTIN_WSADH,
-  ARM_BUILTIN_WSADHZ,
-
-  ARM_BUILTIN_WALIGNI,
-  ARM_BUILTIN_WALIGNR0,
-  ARM_BUILTIN_WALIGNR1,
-  ARM_BUILTIN_WALIGNR2,
-  ARM_BUILTIN_WALIGNR3,
-
-  ARM_BUILTIN_TMIA,
-  ARM_BUILTIN_TMIAPH,
-  ARM_BUILTIN_TMIABB,
-  ARM_BUILTIN_TMIABT,
-  ARM_BUILTIN_TMIATB,
-  ARM_BUILTIN_TMIATT,
-
-  ARM_BUILTIN_TMOVMSKB,
-  ARM_BUILTIN_TMOVMSKH,
-  ARM_BUILTIN_TMOVMSKW,
-
-  ARM_BUILTIN_TBCSTB,
-  ARM_BUILTIN_TBCSTH,
-  ARM_BUILTIN_TBCSTW,
-
-  ARM_BUILTIN_WMADDS,
-  ARM_BUILTIN_WMADDU,
-
-  ARM_BUILTIN_WPACKHSS,
-  ARM_BUILTIN_WPACKWSS,
-  ARM_BUILTIN_WPACKDSS,
-  ARM_BUILTIN_WPACKHUS,
-  ARM_BUILTIN_WPACKWUS,
-  ARM_BUILTIN_WPACKDUS,
-
-  ARM_BUILTIN_WADDB,
-  ARM_BUILTIN_WADDH,
-  ARM_BUILTIN_WADDW,
-  ARM_BUILTIN_WADDSSB,
-  ARM_BUILTIN_WADDSSH,
-  ARM_BUILTIN_WADDSSW,
-  ARM_BUILTIN_WADDUSB,
-  ARM_BUILTIN_WADDUSH,
-  ARM_BUILTIN_WADDUSW,
-  ARM_BUILTIN_WSUBB,
-  ARM_BUILTIN_WSUBH,
-  ARM_BUILTIN_WSUBW,
-  ARM_BUILTIN_WSUBSSB,
-  ARM_BUILTIN_WSUBSSH,
-  ARM_BUILTIN_WSUBSSW,
-  ARM_BUILTIN_WSUBUSB,
-  ARM_BUILTIN_WSUBUSH,
-  ARM_BUILTIN_WSUBUSW,
-
-  ARM_BUILTIN_WAND,
-  ARM_BUILTIN_WANDN,
-  ARM_BUILTIN_WOR,
-  ARM_BUILTIN_WXOR,
-
-  ARM_BUILTIN_WCMPEQB,
-  ARM_BUILTIN_WCMPEQH,
-  ARM_BUILTIN_WCMPEQW,
-  ARM_BUILTIN_WCMPGTUB,
-  ARM_BUILTIN_WCMPGTUH,
-  ARM_BUILTIN_WCMPGTUW,
-  ARM_BUILTIN_WCMPGTSB,
-  ARM_BUILTIN_WCMPGTSH,
-  ARM_BUILTIN_WCMPGTSW,
-
-  ARM_BUILTIN_TEXTRMSB,
-  ARM_BUILTIN_TEXTRMSH,
-  ARM_BUILTIN_TEXTRMSW,
-  ARM_BUILTIN_TEXTRMUB,
-  ARM_BUILTIN_TEXTRMUH,
-  ARM_BUILTIN_TEXTRMUW,
-  ARM_BUILTIN_TINSRB,
-  ARM_BUILTIN_TINSRH,
-  ARM_BUILTIN_TINSRW,
-
-  ARM_BUILTIN_WMAXSW,
-  ARM_BUILTIN_WMAXSH,
-  ARM_BUILTIN_WMAXSB,
-  ARM_BUILTIN_WMAXUW,
-  ARM_BUILTIN_WMAXUH,
-  ARM_BUILTIN_WMAXUB,
-  ARM_BUILTIN_WMINSW,
-  ARM_BUILTIN_WMINSH,
-  ARM_BUILTIN_WMINSB,
-  ARM_BUILTIN_WMINUW,
-  ARM_BUILTIN_WMINUH,
-  ARM_BUILTIN_WMINUB,
-
-  ARM_BUILTIN_WMULUM,
-  ARM_BUILTIN_WMULSM,
-  ARM_BUILTIN_WMULUL,
-
-  ARM_BUILTIN_PSADBH,
-  ARM_BUILTIN_WSHUFH,
-
-  ARM_BUILTIN_WSLLH,
-  ARM_BUILTIN_WSLLW,
-  ARM_BUILTIN_WSLLD,
-  ARM_BUILTIN_WSRAH,
-  ARM_BUILTIN_WSRAW,
-  ARM_BUILTIN_WSRAD,
-  ARM_BUILTIN_WSRLH,
-  ARM_BUILTIN_WSRLW,
-  ARM_BUILTIN_WSRLD,
-  ARM_BUILTIN_WRORH,
-  ARM_BUILTIN_WRORW,
-  ARM_BUILTIN_WRORD,
-  ARM_BUILTIN_WSLLHI,
-  ARM_BUILTIN_WSLLWI,
-  ARM_BUILTIN_WSLLDI,
-  ARM_BUILTIN_WSRAHI,
-  ARM_BUILTIN_WSRAWI,
-  ARM_BUILTIN_WSRADI,
-  ARM_BUILTIN_WSRLHI,
-  ARM_BUILTIN_WSRLWI,
-  ARM_BUILTIN_WSRLDI,
-  ARM_BUILTIN_WRORHI,
-  ARM_BUILTIN_WRORWI,
-  ARM_BUILTIN_WRORDI,
-
-  ARM_BUILTIN_WUNPCKIHB,
-  ARM_BUILTIN_WUNPCKIHH,
-  ARM_BUILTIN_WUNPCKIHW,
-  ARM_BUILTIN_WUNPCKILB,
-  ARM_BUILTIN_WUNPCKILH,
-  ARM_BUILTIN_WUNPCKILW,
-
-  ARM_BUILTIN_WUNPCKEHSB,
-  ARM_BUILTIN_WUNPCKEHSH,
-  ARM_BUILTIN_WUNPCKEHSW,
-  ARM_BUILTIN_WUNPCKEHUB,
-  ARM_BUILTIN_WUNPCKEHUH,
-  ARM_BUILTIN_WUNPCKEHUW,
-  ARM_BUILTIN_WUNPCKELSB,
-  ARM_BUILTIN_WUNPCKELSH,
-  ARM_BUILTIN_WUNPCKELSW,
-  ARM_BUILTIN_WUNPCKELUB,
-  ARM_BUILTIN_WUNPCKELUH,
-  ARM_BUILTIN_WUNPCKELUW,
-
-  ARM_BUILTIN_WABSB,
-  ARM_BUILTIN_WABSH,
-  ARM_BUILTIN_W

[PATCH 10/13] arm: cleanup iterators.md after removing iwmmxt

2025-05-07 Thread Richard Earnshaw
Mostly this is just removing references to iWMMXT in comments, but it also
removes some now-unused iterators and attributes.

gcc/ChangeLog:

* config/arm/iterators.md (VMMX, VMMX2): Remove mode iterators.
(MMX_char): Remove mode iterator attribute.
---
 gcc/config/arm/iterators.md | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 743fe48e6cc..0c163ed4782 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -59,30 +59,25 @@ (define_mode_iterator CCSI [(CC_Z "TARGET_32BIT") (SI 
"TARGET_THUMB1")])
 ;; A list of modes which the VFP unit can handle
 (define_mode_iterator SDF [(SF "") (DF "TARGET_VFP_DOUBLE")])
 
-;; Integer element sizes implemented by IWMMXT.
-(define_mode_iterator VMMX [V2SI V4HI V8QI])
-
-(define_mode_iterator VMMX2 [V4HI V2SI])
-
 ;; Integer element sizes for shifts.
 (define_mode_iterator VSHFT [V4HI V2SI DI])
 
-;; Integer and float modes supported by Neon and IWMMXT.
+;; Integer and float modes supported by Neon.
 (define_mode_iterator VALL [V2DI V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF])
 
-;; Integer and float modes supported by Neon, IWMMXT and MVE.
+;; Integer and float modes supported by Neon and MVE.
 (define_mode_iterator VNIM1 [V16QI V8HI V4SI V4SF V2DI])
 
-;; Integer and float modes supported by Neon and IWMMXT but not MVE.
+;; Integer and float modes supported by Neon but not MVE.
 (define_mode_iterator VNINOTM1 [V2SI V4HI V8QI V2SF])
 
-;; Integer and float modes supported by Neon and IWMMXT, except V2DI.
+;; Integer and float modes supported by Neon, except V2DI.
 (define_mode_iterator VALLW [V2SI V4HI V8QI V2SF V4SI V8HI V16QI V4SF])
 
-;; Integer modes supported by Neon and IWMMXT
+;; Integer modes supported by Neon
 (define_mode_iterator VINT [V2DI V2SI V4HI V8QI V4SI V8HI V16QI])
 
-;; Integer modes supported by Neon and IWMMXT, except V2DI
+;; Integer modes supported by Neon, except V2DI
 (define_mode_iterator VINTW [V2SI V4HI V8QI V4SI V8HI V16QI])
 
 ;; Double-width vector modes, on which we support arithmetic (no HF!)
@@ -1644,9 +1639,6 @@ (define_int_iterator CDE_VCX [UNSPEC_VCDE UNSPEC_VCDEA])
 ;; distinguishes between 16-bit Thumb and 32-bit Thumb/ARM.
 (define_mode_attr arch [(CC_Z "32") (SI "t1")])
 
-;; Determine element size suffix from vector mode.
-(define_mode_attr MMX_char [(V8QI "b") (V4HI "h") (V2SI "w") (DI "d")])
-
 ;; vtbl suffix for NEON vector modes.
 (define_mode_attr VTAB_n [(TI "2") (EI "3") (OI "4")])
 
-- 
2.43.0



[PATCH 00/13] arm: Remove iWMMXT code generation

2025-05-07 Thread Richard Earnshaw


The header file for the Arm implementation of mmintrin.h was changed in GCC-15
to disable access to the intrinsics.  This patch removes the internal code
as well.

We still allow -mcpu/-march options for the iWMMXt CPUs, but they are now treated
in exactly the same way as XScale, generating code for an Armv5te architecture.

Richard Earnshaw (13):
  arm: clarify the logic of SECONDARY_(INPUT/OUTPUT)_RELOAD_CLASS
  arm: testsuite: remove iwmmxt tests
  arm: treat -mcpu/arch=iwmmxt{,2} like XScale
  arm: remove iWMMX builtins support.
  arm: Remove iwmmxt patterns.
  arm: remove IWMMXT checks from MD files.
  arm: remove support for the iwmmxt ABI variant.
  arm: Remove iwmmxt support from arm.cc
  arm: remove iwmmxt-related attributes from machine description
  arm: cleanup iterators.md after removing iwmmxt
  arm: remove dead predefines when using WMMX
  arm: remove most remaining iwmmxt code.
  arm: remove iwmmxt registers from allocator tables

 gcc/config.gcc |2 +-
 gcc/config/arm/aout.h  |5 -
 gcc/config/arm/arm-builtins.cc | 1276 +
 gcc/config/arm/arm-c.cc|7 -
 gcc/config/arm/arm-cpus.in |   28 +-
 gcc/config/arm/arm-generic.md  |4 +-
 gcc/config/arm/arm-opts.h  |1 -
 gcc/config/arm/arm-protos.h|8 -
 gcc/config/arm/arm-tables.opt  |6 -
 gcc/config/arm/arm-tune.md |   53 +-
 gcc/config/arm/arm.cc  |  401 +-
 gcc/config/arm/arm.h   |  169 +--
 gcc/config/arm/arm.md  |   43 +-
 gcc/config/arm/arm.opt |3 -
 gcc/config/arm/constraints.md  |   18 +-
 gcc/config/arm/iterators.md|   20 +-
 gcc/config/arm/iwmmxt.md   | 1766 
 gcc/config/arm/iwmmxt2.md  |  903 
 gcc/config/arm/marvell-f-iwmmxt.md |  189 ---
 gcc/config/arm/predicates.md   |8 +-
 gcc/config/arm/t-arm   |3 -
 gcc/config/arm/thumb2.md   |2 +-
 gcc/config/arm/types.md|  123 --
 gcc/config/arm/unspecs.md  |   29 -
 gcc/config/arm/vec-common.md   |   31 +-
 gcc/doc/invoke.texi|2 +-
 gcc/doc/sourcebuild.texi   |4 -
 gcc/testsuite/gcc.target/arm/ivopts.c  |3 +-
 gcc/testsuite/gcc.target/arm/mmx-1.c   |   26 -
 gcc/testsuite/gcc.target/arm/mmx-2.c   |  166 ---
 gcc/testsuite/gcc.target/arm/pr64208.c |   25 -
 gcc/testsuite/gcc.target/arm/pr79145.c |   16 -
 gcc/testsuite/gcc.target/arm/pr99724.c |   31 -
 gcc/testsuite/gcc.target/arm/pr99786.c |   30 -
 gcc/testsuite/lib/target-supports.exp  |   13 -
 35 files changed, 141 insertions(+), 5273 deletions(-)
 delete mode 100644 gcc/config/arm/iwmmxt.md
 delete mode 100644 gcc/config/arm/iwmmxt2.md
 delete mode 100644 gcc/config/arm/marvell-f-iwmmxt.md
 delete mode 100644 gcc/testsuite/gcc.target/arm/mmx-1.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/mmx-2.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr64208.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr79145.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr99724.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr99786.c

-- 
2.43.0



[PATCH 02/13] arm: testsuite: remove iwmmxt tests

2025-05-07 Thread Richard Earnshaw
These tests were specific to iWMMXt, but we're about to remove
that code, so they are now redundant.

gcc/testsuite/ChangeLog:

* gcc.target/arm/mmx-1.c: Removed.
* gcc.target/arm/mmx-2.c: Removed.
* gcc.target/arm/pr64208.c: Removed.
* gcc.target/arm/pr79145.c: Removed.
* gcc.target/arm/pr99724.c: Removed.
* gcc.target/arm/pr99786.c: Removed.
---
 gcc/testsuite/gcc.target/arm/mmx-1.c   |  26 
 gcc/testsuite/gcc.target/arm/mmx-2.c   | 166 -
 gcc/testsuite/gcc.target/arm/pr64208.c |  25 
 gcc/testsuite/gcc.target/arm/pr79145.c |  16 ---
 gcc/testsuite/gcc.target/arm/pr99724.c |  31 -
 gcc/testsuite/gcc.target/arm/pr99786.c |  30 -
 6 files changed, 294 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/arm/mmx-1.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/mmx-2.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr64208.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr79145.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr99724.c
 delete mode 100644 gcc/testsuite/gcc.target/arm/pr99786.c

diff --git a/gcc/testsuite/gcc.target/arm/mmx-1.c 
b/gcc/testsuite/gcc.target/arm/mmx-1.c
deleted file mode 100644
index 8060dbd40af..000
--- a/gcc/testsuite/gcc.target/arm/mmx-1.c
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Verify that if IP is saved to ensure stack alignment, we don't load
-   it into sp.  */
-/* { dg-do compile } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mcpu=*" } { 
"-mcpu=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mabi=*" } { 
"-mabi=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-march=*" } { 
"-march=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to ARM mode" { arm*-*-* } { "-mthumb" } { "" 
} } */
-/* { dg-options "-O -mno-apcs-frame -mcpu=iwmmxt -mabi=iwmmxt" } */
-/* { dg-require-effective-target arm32 } */
-/* { dg-require-effective-target arm_iwmmxt_ok } */
-/* { dg-final { scan-assembler "push.*ip,\[ ]*pc" } } */
-/* { dg-skip-if "r9 is reserved in FDPIC" { arm*-*-uclinuxfdpiceabi } "*" "" } 
*/
-
-/* This function uses all the call-saved registers, namely r4, r5, r6,
-   r7, r8, r9, sl, fp.  Since we also save lr, that leaves an odd
-   number of registers, and the compiler will push ip to align the
-   stack.  Make sure that we restore ip into ip, not into sp as is
-   done when using a frame pointer.  The -mno-apcs-frame option
-   permits the frame pointer to be used as an ordinary register.  */
-
-void
-foo(void)
-{
-  __asm volatile ("" : : :
- "r4", "r5", "r6", "r7", "r8", "r9", "sl", "fp", "lr");
-}
diff --git a/gcc/testsuite/gcc.target/arm/mmx-2.c 
b/gcc/testsuite/gcc.target/arm/mmx-2.c
deleted file mode 100644
index 0540f659d1a..000
--- a/gcc/testsuite/gcc.target/arm/mmx-2.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/* { dg-do compile } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mcpu=*" } { 
"-mcpu=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-mabi=*" } { 
"-mabi=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to the iWMMXt" { arm*-*-* } { "-march=*" } { 
"-march=iwmmxt" } } */
-/* { dg-skip-if "Test is specific to ARM mode" { arm*-*-* } { "-mthumb" } { "" 
} } */
-/* { dg-require-effective-target arm32 } */
-/* { dg-require-effective-target arm_iwmmxt_ok } */
-/* { dg-options "-mcpu=iwmmxt -flax-vector-conversions -std=gnu99" } */
-
-/* Internal data types for implementing the intrinsics.  */
-typedef int __v2si __attribute__ ((vector_size (8)));
-typedef short __v4hi __attribute__ ((vector_size (8)));
-typedef signed char __v8qi __attribute__ ((vector_size (8)));
-
-void
-foo(void)
-{
-  volatile int isink;
-  volatile long long llsink;
-  volatile __v8qi v8sink;
-  volatile __v4hi v4sink;
-  volatile __v2si v2sink;
-
-  isink = __builtin_arm_getwcgr0 ();
-  __builtin_arm_setwcgr0 (isink);
-  isink = __builtin_arm_getwcgr1 ();
-  __builtin_arm_setwcgr1 (isink);
-  isink = __builtin_arm_getwcgr2 ();
-  __builtin_arm_setwcgr2 (isink);
-  isink = __builtin_arm_getwcgr3 ();
-  __builtin_arm_setwcgr3 (isink);
-
-  isink = __builtin_arm_textrmsb (v8sink, 0);
-  isink = __builtin_arm_textrmsh (v4sink, 0);
-  isink = __builtin_arm_textrmsw (v2sink, 0);
-  isink = __builtin_arm_textrmub (v8sink, 0);
-  isink = __builtin_arm_textrmuh (v4sink, 0);
-  isink = __builtin_arm_textrmuw (v2sink, 0);
-  v8sink = __builtin_arm_tinsrb (v8sink, isink, 0);
-  v4sink = __builtin_arm_tinsrh (v4sink, isink, 0);
-  v2sink = __builtin_arm_tinsrw (v2sink, isink, 0);
-  llsink = __builtin_arm_tmia (llsink, isink, isink);
-  llsink = __builtin_arm_tmiabb (llsink, isink, isink);
-  llsink = __builtin_arm_tmiabt (llsink, isink, isink);
-  llsink = __builtin_arm_tmiaph (llsink, isink, isink);
-  llsink = __builtin_arm_tmiatb (llsink, isink, isink);
-  llsink = __builtin_arm_tmiatt (llsink, isink, isink);
-  isink = __b

[PATCH 13/13] arm: remove iwmmxt registers from allocator tables

2025-05-07 Thread Richard Earnshaw
These registers can no longer be allocated, so remove them from the
various tables.

gcc/ChangeLog:

* config/arm/aout.h (REGISTER_NAMES): Remove iwmmxt registers.
* config/arm/arm.h (FIRST_IWMMXT_REGNUM): Delete.
(LAST_IWMMXT_REGNUM): Delete.
(FIRST_IWMMXT_GR_REGNUM): Delete.
(LAST_IWMMXT_GR_REGNUM): Delete.
(IS_IWMMXT_REGNUM):  Delete.
(IS_IWMMXT_GR_REGNUM): Delete.
(FRAME_POINTER_REGNUM): Define relative to CC_REGNUM.
(ARG_POINTER_REGNUM): Define relative to FRAME_POINTER_REGNUM.
(FIRST_PSEUDO_REGISTER): Adjust.
(WREG): Delete.
(WGREG): Delete.
(REG_ALLOC_ORDER): Remove iWMMX registers.
(enum reg_class): Remove iWMMX register classes.
(REG_CLASS_NAMES): Likewise.
(REG_CLASS_CONTENTS):  Remove iWMMX registers.
* config/arm/arm.md (CC_REGNUM): Adjust value.
(VFPCC_REGNUM): Likewise.
(APSRQ_REGNUM): Likewise.
(APSRGE_REGNUM): Likewise.
(VPR_REGNUM): Likewise.
(RA_AUTH_CODE): Likewise.
---
 gcc/config/arm/aout.h |  5 ---
 gcc/config/arm/arm.h  | 83 +--
 gcc/config/arm/arm.md | 12 +++
 3 files changed, 30 insertions(+), 70 deletions(-)

diff --git a/gcc/config/arm/aout.h b/gcc/config/arm/aout.h
index cdce361e078..a9b0dfaec38 100644
--- a/gcc/config/arm/aout.h
+++ b/gcc/config/arm/aout.h
@@ -69,11 +69,6 @@
   "d20", "?20", "d21", "?21", "d22", "?22", "d23", "?23",  \
   "d24", "?24", "d25", "?25", "d26", "?26", "d27", "?27",  \
   "d28", "?28", "d29", "?29", "d30", "?30", "d31", "?31",  \
-  "wr0",   "wr1",   "wr2",   "wr3",\
-  "wr4",   "wr5",   "wr6",   "wr7",\
-  "wr8",   "wr9",   "wr10",  "wr11",   \
-  "wr12",  "wr13",  "wr14",  "wr15",   \
-  "wcgr0", "wcgr1", "wcgr2", "wcgr3",  \
   "cc", "vfpcc", "sfp", "afp", "apsrq", "apsrge", "p0",\
   "ra_auth_code"   \
 }
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 1990a276781..be5e915ee19 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -842,10 +842,6 @@ extern const int arm_arch_cde_coproc_bits[];
   1,1,1,1,1,1,1,1, \
   1,1,1,1,1,1,1,1, \
   1,1,1,1,1,1,1,1, \
-  /* IWMMXT regs.  */  \
-  1,1,1,1,1,1,1,1, \
-  1,1,1,1,1,1,1,1, \
-  1,1,1,1, \
   /* Specials.  */ \
   1,1,1,1,1,1,1,1  \
 }
@@ -872,10 +868,6 @@ extern const int arm_arch_cde_coproc_bits[];
   1,1,1,1,1,1,1,1, \
   1,1,1,1,1,1,1,1, \
   1,1,1,1,1,1,1,1, \
-  /* IWMMXT regs.  */  \
-  1,1,1,1,1,1,1,1, \
-  1,1,1,1,1,1,1,1, \
-  1,1,1,1, \
   /* Specials.  */ \
   1,1,1,1,1,1,1,1  \
 }
@@ -997,23 +989,11 @@ extern const int arm_arch_cde_coproc_bits[];
 /* Register to use for pushing function arguments.  */
 #define STACK_POINTER_REGNUM   SP_REGNUM
 
-#define FIRST_IWMMXT_REGNUM(LAST_HI_VFP_REGNUM + 1)
-#define LAST_IWMMXT_REGNUM (FIRST_IWMMXT_REGNUM + 15)
-
-/* Need to sync with WCGR in iwmmxt.md.  */
-#define FIRST_IWMMXT_GR_REGNUM (LAST_IWMMXT_REGNUM + 1)
-#define LAST_IWMMXT_GR_REGNUM  (FIRST_IWMMXT_GR_REGNUM + 3)
-
-#define IS_IWMMXT_REGNUM(REGNUM) \
-  (((REGNUM) >= FIRST_IWMMXT_REGNUM) && ((REGNUM) <= LAST_IWMMXT_REGNUM))
-#define IS_IWMMXT_GR_REGNUM(REGNUM) \
-  (((REGNUM) >= FIRST_IWMMXT_GR_REGNUM) && ((REGNUM) <= LAST_IWMMXT_GR_REGNUM))
-
 /* Base register for access to local variables of the function.  */
-#define FRAME_POINTER_REGNUM   102
+#define FRAME_POINTER_REGNUM   (CC_REGNUM + 2)
 
 /* Base register for access to arguments of the function.  */
-#define ARG_POINTER_REGNUM 103
+#define ARG_POINTER_REGNUM (FRAME_POINTER_REGNUM + 1)
 
 #define FIRST_VFP_REGNUM   16
 #define D7_VFP_REGNUM  (FIRST_VFP_REGNUM + 15)
@@ -1054,9 +1034,8 @@ extern const int arm_arch_cde_coproc_bits[];
 
 /* The number of hard registers is 16 ARM + 1 CC + 1 SFP + 1 AFP
+ 1 APSRQ + 1 APSRGE + 1 VPR + 1 Pseudo register to save PAC.  */
-/* Intel Wireless MMX Technology registers add 16 + 4 more.  */
 /* VFP (VFP3) adds 32 (64) + 1 VFPCC.  */
-#define FIRST_PSEUDO_REGISTER   108
+#define FIRST_PSEUDO_REGISTER   88
 
 #define DWARF_PAC_REGNUM 143
 
@@ -1222,8 +1201,6 @@ extern int arm_regs_in_sequence[];
function.  */
 
 #define VREG(X)  (FIRST_VFP_REGNUM + (X))
-#define WREG(X)  (FIRST_IWMMXT_REGNUM + (X))
-#define WGREG(X) (FIRST_IWMMXT_GR_REGNUM + (X))
 
 #define REG_ALLOC_ORDER\
 {  \
@@ -1249,12 +1226,6 @@ extern int arm_regs_in_sequence[];
   VREG(20), VREG(21), VREG(22), VREG(23),  \
   VREG(24), VREG(25), VREG(26), VREG(27),  

[PATCH 05/13] arm: Remove iwmmxt patterns.

2025-05-07 Thread Richard Earnshaw
This patch deletes the patterns relating to iwmmxt and iwmmxt2 and
updates the relevant dependencies.

gcc/ChangeLog:

* config/arm/arm.md: Don't include iwmmxt.md.
* config/arm/t-arm (MD_INCLUDES): Remove iwmmxt*.md.
* config/arm/iwmmxt.md: Removed.
* config/arm/iwmmxt2.md: Removed.
* config/arm/unspecs.md: Remove comment referring to
iwmmxt2.md.
* config/arm/unspecs.md (enum unspec): Remove iWMMXt unspec
values.
(enum unspecv): Likewise.
* config/arm/predicates.md (imm_or_reg_operand): Delete.
---
 gcc/config/arm/arm.md|2 -
 gcc/config/arm/iwmmxt.md | 1766 --
 gcc/config/arm/iwmmxt2.md|  903 -
 gcc/config/arm/predicates.md |8 +-
 gcc/config/arm/t-arm |2 -
 gcc/config/arm/unspecs.md|   29 -
 6 files changed, 1 insertion(+), 2709 deletions(-)
 delete mode 100644 gcc/config/arm/iwmmxt.md
 delete mode 100644 gcc/config/arm/iwmmxt2.md

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 597ef6725bb..af0564c36a9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -13125,8 +13125,6 @@ (define_insn "bti_nop"
 
 ;; Vector bits common to IWMMXT, Neon and MVE
 (include "vec-common.md")
-;; Load the Intel Wireless Multimedia Extension patterns
-(include "iwmmxt.md")
 ;; Load the VFP co-processor patterns
 (include "vfp.md")
 ;; Thumb-1 patterns
diff --git a/gcc/config/arm/iwmmxt.md b/gcc/config/arm/iwmmxt.md
deleted file mode 100644
index 0aa5dcd6709..000
--- a/gcc/config/arm/iwmmxt.md
+++ /dev/null
@@ -1,1766 +0,0 @@
-;; Patterns for the Intel Wireless MMX technology architecture.
-;; Copyright (C) 2003-2025 Free Software Foundation, Inc.
-;; Contributed by Red Hat.
-
-;; This file is part of GCC.
-
-;; GCC is free software; you can redistribute it and/or modify it under
-;; the terms of the GNU General Public License as published by the Free
-;; Software Foundation; either version 3, or (at your option) any later
-;; version.
-
-;; GCC is distributed in the hope that it will be useful, but WITHOUT
-;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-;; License for more details.
-
-;; You should have received a copy of the GNU General Public License
-;; along with GCC; see the file COPYING3.  If not see
-;; .
-
-;; Register numbers. Need to sync with FIRST_IWMMXT_GR_REGNUM in arm.h
-(define_constants
-  [(WCGR0   96)
-   (WCGR1   97)
-   (WCGR2   98)
-   (WCGR3   99)
-  ]
-)
-
-(define_insn "tbcstv8qi"
-  [(set (match_operand:V8QI   0 "register_operand" "=y")
-(vec_duplicate:V8QI (match_operand:QI 1 "s_register_operand" "r")))]
-  "TARGET_REALLY_IWMMXT"
-  "tbcstb%?\\t%0, %1"
-  [(set_attr "predicable" "yes")
-   (set_attr "type" "wmmx_tbcst")]
-)
-
-(define_insn "tbcstv4hi"
-  [(set (match_operand:V4HI   0 "register_operand" "=y")
-(vec_duplicate:V4HI (match_operand:HI 1 "s_register_operand" "r")))]
-  "TARGET_REALLY_IWMMXT"
-  "tbcsth%?\\t%0, %1"
-  [(set_attr "predicable" "yes")
-   (set_attr "type" "wmmx_tbcst")]
-)
-
-(define_insn "tbcstv2si"
-  [(set (match_operand:V2SI   0 "register_operand" "=y")
-(vec_duplicate:V2SI (match_operand:SI 1 "s_register_operand" "r")))]
-  "TARGET_REALLY_IWMMXT"
-  "tbcstw%?\\t%0, %1"
-  [(set_attr "predicable" "yes")
-   (set_attr "type" "wmmx_tbcst")]
-)
-
-(define_insn "iwmmxt_iordi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-(ior:DI (match_operand:DI 1 "register_operand" "%y")
-   (match_operand:DI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wor%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")
-   (set_attr "length" "4")
-   (set_attr "type" "wmmx_wor")]
-)
-
-(define_insn "iwmmxt_xordi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-(xor:DI (match_operand:DI 1 "register_operand" "%y")
-   (match_operand:DI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wxor%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")
-   (set_attr "length" "4")
-   (set_attr "type" "wmmx_wxor")]
-)
-
-(define_insn "iwmmxt_anddi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-(and:DI (match_operand:DI 1 "register_operand" "%y")
-   (match_operand:DI 2 "register_operand"  "y")))]
-  "TARGET_REALLY_IWMMXT"
-  "wand%?\\t%0, %1, %2"
-  [(set_attr "predicable" "yes")
-   (set_attr "length" "4")
-   (set_attr "type" "wmmx_wand")]
-)
-
-(define_insn "iwmmxt_nanddi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-(and:DI (match_operand:DI 1 "register_operand"  "y")
-   (not:DI (match_operand:DI 2 "register_operand"  "y"))))]
-  "TARGET_REALLY_IWMMXT"
-  "wandn%?\\t%0, %1, %2"
-  [(set_attr "predicabl

[PATCH 07/13] arm: remove support for the iwmmxt ABI variant.

2025-05-07 Thread Richard Earnshaw
The iwmmxt ABI is a variant of the ABI that supported passing certain
parameters and results in iwmmxt registers.  But since we no longer
support the instructions that can read and write these registers, the
ABI variant can no longer be used.

gcc/ChangeLog:

* config.gcc (arm, --with-abi): Remove iwmmxt abi option.
* config/arm/arm.opt (enum ARM_ABI_IWMMXT): Remove.
* config/arm/arm.h (TARGET_IWMMXT_ABI): Delete.
(enum arm_pcs): Remove ARM_PCS_AAPCS_IWMMXT.
(FUNCTION_ARG_REGNO_P): Remove IWMMXT ABI support.
(CUMULATIVE_ARGS): Remove iwmmxt_nregs.
* config/arm/arm.cc (arm_options_perform_arch_sanity_checks):
Remove IWMMXT ABI checks.
(arm_libcall_value_1): Likewise.
(arm_function_value_regno_p): Likewise.
(arm_apply_result_size): Remove adjustment for IWMMXT ABI.
(arm_function_arg): Remove IWMMXT ABI support.
(arm_arg_partial_bytes): Likewise.
(arm_function_arg_advance): Likewise.
(arm_init_cumulative_args): Don't initialize iwmmxt_nregs.
* doc/invoke.texi (arm -mabi): Remove mention of the iwmmxt
ABI option.
* config/arm/arm-opts.h (enum arm_abi_type): Remove ARM_ABI_IWMMXT.
---
 gcc/config.gcc|  2 +-
 gcc/config/arm/arm-opts.h |  1 -
 gcc/config/arm/arm.cc | 44 +++
 gcc/config/arm/arm.h  |  8 +--
 gcc/config/arm/arm.opt|  3 ---
 gcc/doc/invoke.texi   |  2 +-
 6 files changed, 6 insertions(+), 54 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index afbf82fd2b8..c9fe996f2f7 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -4459,7 +4459,7 @@ case "${target}" in
 
case "$with_abi" in
"" \
-   | apcs-gnu | atpcs | aapcs | iwmmxt | aapcs-linux )
+   | apcs-gnu | atpcs | aapcs | aapcs-linux )
#OK
;;
*)
diff --git a/gcc/config/arm/arm-opts.h b/gcc/config/arm/arm-opts.h
index 06a1939d087..5c543bf5246 100644
--- a/gcc/config/arm/arm-opts.h
+++ b/gcc/config/arm/arm-opts.h
@@ -46,7 +46,6 @@ enum arm_abi_type
   ARM_ABI_APCS,
   ARM_ABI_ATPCS,
   ARM_ABI_AAPCS,
-  ARM_ABI_IWMMXT,
   ARM_ABI_AAPCS_LINUX
 };
 
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 670f487bcce..30beae03ffe 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -4000,9 +4000,6 @@ arm_options_perform_arch_sanity_checks (void)
   if (TARGET_IWMMXT && !ARM_DOUBLEWORD_ALIGN)
 error ("iwmmxt requires an AAPCS compatible ABI for proper operation");
 
-  if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT)
-error ("iwmmxt abi requires an iwmmxt capable cpu");
-
   /* BPABI targets use linker tricks to allow interworking on cores
  without thumb support.  */
   if (TARGET_INTERWORK
@@ -4043,9 +4040,7 @@ arm_options_perform_arch_sanity_checks (void)
 
   if (TARGET_AAPCS_BASED)
 {
-  if (arm_abi == ARM_ABI_IWMMXT)
-   arm_pcs_default = ARM_PCS_AAPCS_IWMMXT;
-  else if (TARGET_HARD_FLOAT_ABI)
+  if (TARGET_HARD_FLOAT_ABI)
{
  arm_pcs_default = ARM_PCS_AAPCS_VFP;
  if (!bitmap_bit_p (arm_active_target.isa, isa_bit_vfpv2)
@@ -6048,9 +6043,6 @@ arm_libcall_value_1 (machine_mode mode)
 {
   if (TARGET_AAPCS_BASED)
 return aapcs_libcall_value (mode);
-  else if (TARGET_IWMMXT_ABI
-  && arm_vector_mode_supported_p (mode))
-return gen_rtx_REG (mode, FIRST_IWMMXT_REGNUM);
   else
 return gen_rtx_REG (mode, ARG_REGISTER (1));
 }
@@ -6083,9 +6075,7 @@ arm_function_value_regno_p (const unsigned int regno)
   || (TARGET_32BIT
  && TARGET_AAPCS_BASED
  && TARGET_HARD_FLOAT
- && regno == FIRST_VFP_REGNUM)
-  || (TARGET_IWMMXT_ABI
- && regno == FIRST_IWMMXT_REGNUM))
+ && regno == FIRST_VFP_REGNUM))
 return true;
 
   return false;
@@ -6102,8 +6092,6 @@ arm_apply_result_size (void)
 {
   if (TARGET_HARD_FLOAT_ABI)
size += 32;
-  if (TARGET_IWMMXT_ABI)
-   size += 8;
 }
 
   return size;
@@ -6265,7 +6253,6 @@ const struct pcs_attribute_arg
 #if 0
 /* We could recognize these, but changes would be needed elsewhere
  * to implement them.  */
-{"aapcs-iwmmxt", ARM_PCS_AAPCS_IWMMXT},
 {"atpcs", ARM_PCS_ATPCS},
 {"apcs", ARM_PCS_APCS},
 #endif
@@ -7195,7 +7182,6 @@ arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype,
 
   /* On the ARM, the offset starts at 0.  */
   pcum->nregs = 0;
-  pcum->iwmmxt_nregs = 0;
   pcum->can_split = true;
 
   /* Varargs vectors are treated the same as long long.
@@ -7308,22 +7294,6 @@ arm_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
   return pcum->aapcs_reg;
 }
 
-  /* Varargs vectors are treated the same as long long.
- named_count avoids having to change the way arm handles 'named' */
-  if (TARGET_IWMMXT_ABI
-  && arm_vector_mode_supported_p (arg.mode)
-  

[PATCH 08/13] arm: Remove iwmmxt support from arm.cc

2025-05-07 Thread Richard Earnshaw
TARGET_IWMMXT, TARGET_IWMMXT2 and their _REALLY_ equivalents are never
true now, so the code using them can be simplified.

gcc/ChangeLog:

* config/arm/arm.cc (arm_option_check_internal): Remove
IWMMXT check.
(arm_options_perform_arch_sanity_checks): Likewise.
(use_return_insn): Likewise.
(arm_init_cumulative_args): Likewise.
(arm_legitimate_index_p): Likewise.
(thumb2_legitimate_index_p): Likewise.
(arm_compute_save_core_reg_mask): Likewise.
(output_return_instruction): Likewise.
(arm_compute_frame_layout): Likewise.
(arm_save_coproc_regs): Likewise.
(arm_hard_regno_mode_ok): Likewise.
(arm_expand_epilogue_apcs_frame): Likewise.
(arm_expand_epilogue): Likewise.
(arm_vector_mode_supported_p): Likewise.
(arm_preferred_simd_mode): Likewise.
(arm_conditional_register_usage): Likewise.
---
 gcc/config/arm/arm.cc | 183 +-
 1 file changed, 2 insertions(+), 181 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 30beae03ffe..11fd6dfb3ed 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -2970,11 +2970,6 @@ arm_option_check_internal (struct gcc_options *opts)
 {
   int flags = opts->x_target_flags;
 
-  /* iWMMXt and NEON are incompatible.  */
-  if (TARGET_IWMMXT
-  && bitmap_bit_p (arm_active_target.isa, isa_bit_neon))
-error ("iWMMXt and NEON are incompatible");
-
   /* Make sure that the processor choice does not conflict with any of the
  other command line choices.  */
   if (TARGET_ARM_P (flags)
@@ -2997,10 +2992,6 @@ arm_option_check_internal (struct gcc_options *opts)
 warning (0, "%<-g%> with %<-mno-apcs-frame%> may not give sensible "
 "debugging");
 
-  /* iWMMXt unsupported under Thumb mode.  */
-  if (TARGET_THUMB_P (flags) && TARGET_IWMMXT)
-error ("iWMMXt unsupported under Thumb mode");
-
   if (TARGET_HARD_TP && TARGET_THUMB1_P (flags))
 error ("cannot use %<-mtp=cp15%> with 16-bit Thumb");
 
@@ -3997,9 +3988,6 @@ arm_options_perform_arch_sanity_checks (void)
   if (arm_arch5t)
 target_flags &= ~MASK_INTERWORK;
 
-  if (TARGET_IWMMXT && !ARM_DOUBLEWORD_ALIGN)
-error ("iwmmxt requires an AAPCS compatible ABI for proper operation");
-
   /* BPABI targets use linker tricks to allow interworking on cores
  without thumb support.  */
   if (TARGET_INTERWORK
@@ -4550,11 +4538,6 @@ use_return_insn (int iscond, rtx sibling)
   if (reg_needs_saving_p (regno))
return 0;
 
-  if (TARGET_REALLY_IWMMXT)
-for (regno = FIRST_IWMMXT_REGNUM; regno <= LAST_IWMMXT_REGNUM; regno++)
-  if (reg_needs_saving_p (regno))
-   return 0;
-
   return 1;
 }
 
@@ -7188,19 +7171,6 @@ arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype,
  named_count avoids having to change the way arm handles 'named' */
   pcum->named_count = 0;
   pcum->nargs = 0;
-
-  if (TARGET_REALLY_IWMMXT && fntype)
-{
-  tree fn_arg;
-
-  for (fn_arg = TYPE_ARG_TYPES (fntype);
-  fn_arg;
-  fn_arg = TREE_CHAIN (fn_arg))
-   pcum->named_count += 1;
-
-  if (! pcum->named_count)
-   pcum->named_count = INT_MAX;
-}
 }
 
 /* Return 2 if double word alignment is required for argument passing,
@@ -8868,12 +8838,6 @@ arm_legitimate_index_p (machine_mode mode, rtx index, RTX_CODE outer,
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
 
-  if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
-return (code == CONST_INT
-   && INTVAL (index) < 1024
-   && INTVAL (index) > -1024
-   && (INTVAL (index) & 3) == 0);
-
   if (GET_MODE_SIZE (mode) <= 4
   && ! (arm_arch4
&& (mode == HImode
@@ -8953,17 +8917,6 @@ thumb2_legitimate_index_p (machine_mode mode, rtx index, int strict_p)
&& INTVAL (index) > -256
&& (INTVAL (index) & 3) == 0);
 
-  if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
-{
-  /* For DImode assume values will usually live in core regs
-and only allow LDRD addressing modes.  */
-  if (!TARGET_LDRD || mode != DImode)
-   return (code == CONST_INT
-   && INTVAL (index) < 1024
-   && INTVAL (index) > -1024
-   && (INTVAL (index) & 3) == 0);
-}
-
   /* For quad modes, we restrict the constant offset to be slightly less
  than what the instruction format permits.  We do this because for
  quad mode moves, we will actually decompose them into two separate
@@ -21412,34 +21365,6 @@ arm_compute_save_core_reg_mask (void)
   if (cfun->machine->lr_save_eliminated)
 save_reg_mask &= ~ (1 << LR_REGNUM);
 
-  if (TARGET_REALLY_IWMMXT
-  && ((bit_count (save_reg_mask)
-  + ARM_NUM_INTS (crtl->args.pretend_args_size +
-  arm_compute_static_chain_stack_bytes())
-  ) % 2) != 0)
-{
-  /* The

[PATCH 03/13] arm: treat -mcpu/arch=iwmmxt{,2} like XScale

2025-05-07 Thread Richard Earnshaw
Treat options that select iwmmxt variants as we would for xscale.  We
leave the feature bits in for now, since they are still needed
elsewhere, but they are never enabled.

Also remove the remaining testsuite framework support for iwmmxt,
since this will never trigger now.

gcc/

* config/arm/arm-cpus.in (arch iwmmxt): treat in the same
way as we would treat XScale.
(arch iwmmxt2): Likewise.
(cpu xscale): Add aliases for iwmmxt and iwmmxt2.
(cpu iwmmxt): Delete.
(cpu iwmmxt2): Delete.
* config/arm/arm-generic.md (load_ldsched_xscale): Remove references
to iwmmxt.
(load_ldsched): Likewise.
* config/arm/arm-tables.opt: Regenerated.
* config/arm/arm-tune.md: Regenerated.
* doc/sourcebuild.texi (arm_iwmmxt_ok): Delete.

gcc/testsuite/ChangeLog:

* gcc.target/arm/ivopts.c: Remove test for iwmmxt
* lib/target-supports.exp
(check_effective_target_arm_iwmmxt_ok): Delete.
---
 gcc/config/arm/arm-cpus.in| 22 +++
 gcc/config/arm/arm-generic.md |  4 +-
 gcc/config/arm/arm-tables.opt |  6 ---
 gcc/config/arm/arm-tune.md| 53 +--
 gcc/doc/sourcebuild.texi  |  4 --
 gcc/testsuite/gcc.target/arm/ivopts.c |  3 +-
 gcc/testsuite/lib/target-supports.exp | 13 ---
 7 files changed, 35 insertions(+), 70 deletions(-)

diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index 1939d55b9fd..b34c441ec76 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -778,18 +778,19 @@ begin arch armv9-a
  option bf16 add bf16 FP_ARMv8 DOTPROD
 end arch armv9-a
 
+# We no-longer support the iwmmxt{,2} extensions, so treat these like xscale.
 begin arch iwmmxt
- tune for iwmmxt
+ tune for xscale
  tune flags LDSCHED STRONG XSCALE
  base 5TE
- isa ARMv5te xscale iwmmxt
+ isa ARMv5te xscale
 end arch iwmmxt
 
 begin arch iwmmxt2
- tune for iwmmxt2
+ tune for xscale
  tune flags LDSCHED STRONG XSCALE
  base 5TE
- isa ARMv5te xscale iwmmxt iwmmxt2
+ isa ARMv5te xscale
 end arch iwmmxt2
 
 # CPU entries
@@ -924,23 +925,12 @@ end cpu arm10e
 
 begin cpu xscale
  tune flags LDSCHED XSCALE
+ alias iwmmxt iwmmxt2
  architecture armv5te
  isa xscale
  costs xscale
 end cpu xscale
 
-begin cpu iwmmxt
- tune flags LDSCHED XSCALE
- architecture iwmmxt
- costs xscale
-end cpu iwmmxt
-
-begin cpu iwmmxt2
- tune flags LDSCHED XSCALE
- architecture iwmmxt2
- costs xscale
-end cpu iwmmxt2
-
 begin cpu fa606te
  tune flags LDSCHED
  architecture armv5te
diff --git a/gcc/config/arm/arm-generic.md b/gcc/config/arm/arm-generic.md
index c2700568c00..a8af0e6f255 100644
--- a/gcc/config/arm/arm-generic.md
+++ b/gcc/config/arm/arm-generic.md
@@ -96,14 +96,14 @@ (define_insn_reservation "load_ldsched_xscale" 3
   (and (eq_attr "generic_sched" "yes")
(and (eq_attr "ldsched" "yes") 
(and (eq_attr "type" "load_byte,load_4")
-(eq_attr "tune" "xscale,iwmmxt,iwmmxt2"
+(eq_attr "tune" "xscale"
   "core")
 
 (define_insn_reservation "load_ldsched" 2
   (and (eq_attr "generic_sched" "yes")
(and (eq_attr "ldsched" "yes") 
(and (eq_attr "type" "load_byte,load_4")
-(eq_attr "tune" "!xscale,iwmmxt,iwmmxt2"
+(eq_attr "tune" "!xscale"
   "core")
 
 (define_insn_reservation "load_or_store" 2
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index db7767a2d6c..544de84df80 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -66,12 +66,6 @@ Enum(processor_type) String(arm10e) Value( TARGET_CPU_arm10e)
 EnumValue
 Enum(processor_type) String(xscale) Value( TARGET_CPU_xscale)
 
-EnumValue
-Enum(processor_type) String(iwmmxt) Value( TARGET_CPU_iwmmxt)
-
-EnumValue
-Enum(processor_type) String(iwmmxt2) Value( TARGET_CPU_iwmmxt2)
-
 EnumValue
 Enum(processor_type) String(fa606te) Value( TARGET_CPU_fa606te)
 
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index a04d1eeb62d..20b5f932344 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -25,31 +25,30 @@ (define_attr "tune"
fa526,fa626,arm7tdmi,
arm710t,arm9,arm9tdmi,
arm920t,arm10tdmi,arm9e,
-   arm10e,xscale,iwmmxt,
-   iwmmxt2,fa606te,fa626te,
-   fmp626,fa726te,arm926ejs,
-   arm1026ejs,arm1136js,arm1136jfs,
-   arm1176jzs,arm1176jzfs,mpcorenovfp,
-   mpcore,arm1156t2s,arm1156t2fs,
-   cortexm1,cortexm0,cortexm0plus,
-   cortexm1smallmultiply,cortexm0smallmultiply,cortexm0plussmallmultiply,
-   genericv7a,cortexa5,cortexa7,
-   cortexa8,cortexa9,cortexa12,
-   cortexa15,cortexa17,cortexr4,
-   cortexr4f,cortexr5,cortexr7,
-   cortexr8,cortexm7,cortexm4,
-   cortexm3,marvell_pj4,cortexa15cortexa7,
-   cortexa17cortexa7,cortexa32,cortexa35,
-   cortexa53,cortexa57,cortexa72,
-   cortexa73,exynosm

[PATCH 11/13] arm: remove dead predefines when using WMMX

2025-05-07 Thread Richard Earnshaw
Since we no longer enable iWMMXT, these predefines are no longer enabled
when preprocessing C.  Remove them.

gcc/ChangeLog:

* config/arm/arm-c.cc (arm_cpu_builtins): Remove predefines
for __IWMMXT__, __IWMMXT2__ and __ARM_WMMX.
---
 gcc/config/arm/arm-c.cc | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/gcc/config/arm/arm-c.cc b/gcc/config/arm/arm-c.cc
index 15e4080904f..d257e62b563 100644
--- a/gcc/config/arm/arm-c.cc
+++ b/gcc/config/arm/arm-c.cc
@@ -373,13 +373,6 @@ arm_cpu_builtins (struct cpp_reader* pfile)
   builtin_define (arm_arch_name);
   if (arm_arch_xscale)
 builtin_define ("__XSCALE__");
-  if (arm_arch_iwmmxt)
-{
-  builtin_define ("__IWMMXT__");
-  builtin_define ("__ARM_WMMX");
-}
-  if (arm_arch_iwmmxt2)
-builtin_define ("__IWMMXT2__");
   /* ARMv6KZ was originally identified as the misspelled __ARM_ARCH_6ZK__.  To
  preserve the existing behavior, the misspelled feature macro must still be
  defined.  */
-- 
2.43.0



[PATCH 06/13] arm: remove IWMMXT checks from MD files.

2025-05-07 Thread Richard Earnshaw
Remove the various checks for TARGET_IWMMXT{,2} and
TARGET_REALLY_IWMMXT{,2} from the remaining machine description files.
These flags can never be true now.

gcc/ChangeLog:

* config/arm/arm.md (attr arch): Remove iwmmxt and iwmmxt2.
Remove checks based on TARGET_REALLY_IWMMXT2 from all split
patterns.
(arm_movdi): Likewise.
(*arm_movt): Likewise.
(arch_enabled): Remove test for iwmmxt2.
* config/arm/constraints.md (y, z): Remove register constraints.
(Uy): Remove memory constraint.
* config/arm/thumb2.md (thumb2_pop_single): Remove check for
IWMMXT.
* config/arm/vec-common.md (mov): Remove check for IWMMXT.
(mul3): Likewise.
(xor3): Likewise.
(2): Likewise.
(@movmisalign): Likewise.
(@mve_q_): Likewise.
(vashl3): Likewise.
(vashr3): Likewise.
(vlshr3): Likewise.
(uavg3_ceil): Likewise.
---
 gcc/config/arm/arm.md | 15 ---
 gcc/config/arm/constraints.md | 18 +++---
 gcc/config/arm/thumb2.md  |  2 +-
 gcc/config/arm/vec-common.md  | 31 ---
 4 files changed, 20 insertions(+), 46 deletions(-)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index af0564c36a9..ce1b987b241 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -149,7 +149,7 @@ (define_attr "length" ""
 ; This attribute is used to compute attribute "enabled",
 ; use type "any" to enable an alternative in all cases.
 (define_attr "arch" "any, a, t, 32, t1, t2, v6,nov6, v6t2, \
-v8mb, fix_vlldm, iwmmxt, iwmmxt2, armv6_or_vfpv3, \
+v8mb, fix_vlldm, armv6_or_vfpv3, \
 neon, mve"
   (const_string "any"))
 
@@ -197,10 +197,6 @@ (define_attr "arch_enabled" "no,yes"
  (match_test "fix_vlldm"))
 (const_string "yes")
 
-(and (eq_attr "arch" "iwmmxt2")
- (match_test "TARGET_REALLY_IWMMXT2"))
-(const_string "yes")
-
 (and (eq_attr "arch" "armv6_or_vfpv3")
  (match_test "arm_arch6 || TARGET_VFP3"))
 (const_string "yes")
@@ -2893,14 +2889,12 @@ (define_expand "one_cmpldi2"
 ;; Split DImode and, ior, xor operations.  Simply perform the logical
 ;; operation on the upper and lower halves of the registers.
 ;; This is needed for atomic operations in arm_split_atomic_op.
-;; Avoid splitting IWMMXT instructions.
 (define_split
   [(set (match_operand:DI 0 "s_register_operand" "")
(match_operator:DI 6 "logical_binary_operator"
  [(match_operand:DI 1 "s_register_operand" "")
   (match_operand:DI 2 "s_register_operand" "")]))]
-  "TARGET_32BIT && reload_completed
-   && ! IS_IWMMXT_REGNUM (REGNO (operands[0]))"
+  "TARGET_32BIT && reload_completed"
   [(set (match_dup 0) (match_op_dup:SI 6 [(match_dup 1) (match_dup 2)]))
(set (match_dup 3) (match_op_dup:SI 6 [(match_dup 4) (match_dup 5)]))]
   "
@@ -6345,7 +6339,6 @@ (define_insn "*arm_movdi"
   "TARGET_32BIT
&& !(TARGET_HARD_FLOAT)
&& !(TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT)
-   && !TARGET_IWMMXT
&& (   register_operand (operands[0], DImode)
|| register_operand (operands[1], DImode))"
   "*
@@ -6554,7 +6547,7 @@ (define_insn "*arm_movt"
 (define_insn "*arm_movsi_insn"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m")
(match_operand:SI 1 "general_operand"  "rk, I,K,j,mi,rk"))]
-  "TARGET_ARM && !TARGET_IWMMXT && !TARGET_HARD_FLOAT
+  "TARGET_ARM && !TARGET_HARD_FLOAT
&& (   register_operand (operands[0], SImode)
|| register_operand (operands[1], SImode))"
   "@
@@ -13123,7 +13116,7 @@ (define_insn "bti_nop"
   [(set_attr "conds" "unconditional")
(set_attr "type" "nop")])
 
-;; Vector bits common to IWMMXT, Neon and MVE
+;; Vector bits common to Neon and MVE
 (include "vec-common.md")
 ;; Load the VFP co-processor patterns
 (include "vfp.md")
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 9f1a37aa5d4..24743a82356 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -19,11 +19,12 @@
 ;; .
 
 ;; The following register constraints have been used:
-;; - in ARM/Thumb-2 state: t, w, x, y, z
+;; - in ARM/Thumb-2 state: t, w, x
 ;; - in Thumb state: h, b
 ;; - in both states: l, c, k, q, Cs, Ts, US
 ;; In ARM state, 'l' is an alias for 'r'
 ;; 'f' and 'v' were previously used for FPA and MAVERICK registers.
+;; 'y' and 'z' were previously used for iWMMX registers (removed after gcc-15)
 
 ;; The following normal constraints have been used:
 ;; in ARM/Thumb-2 state: G, I, j, J, K, L, M
@@ -39,7 +40,7 @@
 ;; in all states: Pg
 
 ;; The following memory constraints have been used:
-;; in ARM/Thumb-2 state: Uh, Ut, Uv, Uy, Un, Um, Us, Uo, Up, Uf, Ux, Ul, Uz
+;; in ARM/Thumb-2 state: Uh, Ut, Uv, Un, Um, Us, Uo, Up, Uf, Ux, Ul, Uz
 ;; in ARM state: Uq
 ;; in Thumb state

[PATCH 12/13] arm: remove most remaining iwmmxt code.

2025-05-07 Thread Richard Earnshaw
Remove most of the remaining code for iWMMXT support, except for the
register allocation table entries.

gcc/ChangeLog:

* config/arm/arm-cpus.in (feature iwmmxt, feature iwmmxt2):  Delete.
* config/arm/arm-protos.h (arm_output_iwmmxt_shift_immediate): Delete.
(arm_output_iwmmxt_tinsr): Delete.
(arm_arch_iwmmxt): Delete.
(arm_arch_iwmmxt2): Delete.
* config/arm/arm.h (TARGET_IWMMXT): Delete.
(TARGET_IWMMXT2): Delete.
(TARGET_REALLY_IWMMXT): Delete.
(TARGET_REALLY_IWMMXT2): Delete.
(VALID_IWMMXT_REG_MODE): Delete.
(ARM_HAVE_V8QI_ARITH): Remove iWMMXT.
(ARM_HAVE_V4HI_ARITH): Likewise.
(ARM_HAVE_V2SI_ARITH): Likewise.
(ARM_HAVE_V8QI_LDST): Likewise.
(ARM_HAVE_V4HI_LDST): Likewise.
(ARM_HAVE_V2SI_LDST): Likewise.
(SECONDARY_OUTPUT_RELOAD_CLASS):  Remove iWMMXT cases.
(SECONDARY_INPUT_RELOAD_CLASS): Likewise.
* config/arm/arm.cc (arm_arch_iwmmxt): Delete.
(arm_arch_iwmmxt2): Delete.
(arm_option_reconfigure_globals): Don't initialize them.
(arm_register_move_cost): Remove costs for iwmmxt.
(struct minipool_node):  Update comment.
(output_move_double): Likewise
(output_return_instruction): Likewise.
(arm_print_operand, cases 'U' and 'w'): Report an error if
used.
(arm_regno_class): Remove iWMMXT cases.
(arm_debugger_regno): Remove iWMMXT cases.
(arm_output_iwmmxt_shift_immediate): Delete.
(arm_output_iwmmxt_tinsr): Delete.
---
 gcc/config/arm/arm-cpus.in  |   6 --
 gcc/config/arm/arm-protos.h |   8 --
 gcc/config/arm/arm.cc   | 174 ++--
 gcc/config/arm/arm.h|  69 +-
 4 files changed, 32 insertions(+), 225 deletions(-)

diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in
index b34c441ec76..7f5a8c670b6 100644
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -102,12 +102,6 @@ define feature armv8
 # ARMv8 CRC32 instructions.
 define feature crc32
 
-# XScale v2 (Wireless MMX).
-define feature iwmmxt
-
-# XScale Wireless MMX2.
-define feature iwmmxt2
-
 # Architecture rel 8.1.
 define feature armv8_1
 
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 254c7310794..ff7e7658f91 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -190,8 +190,6 @@ extern void arm_output_multireg_pop (rtx *, bool, rtx, bool, bool);
 extern void arm_set_return_address (rtx, rtx);
 extern int arm_eliminable_register (rtx);
 extern const char *arm_output_shift(rtx *, int);
-extern const char *arm_output_iwmmxt_shift_immediate (const char *, rtx *, bool);
-extern const char *arm_output_iwmmxt_tinsr (rtx *);
 extern unsigned int arm_sync_loop_insns (rtx , rtx *);
 extern int arm_attr_length_push_multi(rtx, rtx);
 extern int arm_attr_length_pop_multi(rtx *, bool, bool);
@@ -475,12 +473,6 @@ extern int arm_ld_sched;
 /* Nonzero if this chip is a StrongARM.  */
 extern int arm_tune_strongarm;
 
-/* Nonzero if this chip supports Intel Wireless MMX technology.  */
-extern int arm_arch_iwmmxt;
-
-/* Nonzero if this chip supports Intel Wireless MMX2 technology.  */
-extern int arm_arch_iwmmxt2;
-
 /* Nonzero if this chip is an XScale.  */
 extern int arm_arch_xscale;
 
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 11fd6dfb3ed..4d7c42bd5b8 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -948,12 +948,6 @@ int arm_ld_sched = 0;
 /* Nonzero if this chip is a StrongARM.  */
 int arm_tune_strongarm = 0;
 
-/* Nonzero if this chip supports Intel Wireless MMX technology.  */
-int arm_arch_iwmmxt = 0;
-
-/* Nonzero if this chip supports Intel Wireless MMX2 technology.  */
-int arm_arch_iwmmxt2 = 0;
-
 /* Nonzero if this chip is an XScale.  */
 int arm_arch_xscale = 0;
 
@@ -3919,8 +3913,6 @@ arm_option_reconfigure_globals (void)
   arm_arch_thumb1 = bitmap_bit_p (arm_active_target.isa, isa_bit_thumb);
   arm_arch_thumb2 = bitmap_bit_p (arm_active_target.isa, isa_bit_thumb2);
   arm_arch_xscale = bitmap_bit_p (arm_active_target.isa, isa_bit_xscale);
-  arm_arch_iwmmxt = bitmap_bit_p (arm_active_target.isa, isa_bit_iwmmxt);
-  arm_arch_iwmmxt2 = bitmap_bit_p (arm_active_target.isa, isa_bit_iwmmxt2);
   arm_arch_thumb_hwdiv = bitmap_bit_p (arm_active_target.isa, isa_bit_tdiv);
   arm_arch_arm_hwdiv = bitmap_bit_p (arm_active_target.isa, isa_bit_adiv);
   arm_arch_crc = bitmap_bit_p (arm_active_target.isa, isa_bit_crc32);
@@ -12378,11 +12370,6 @@ arm_register_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   if ((IS_VFP_CLASS (from) && !IS_VFP_CLASS (to))
  || (!IS_VFP_CLASS (from) && IS_VFP_CLASS (to)))
return 15;
-  else if ((from == IWMMXT_REGS && to != IWMMXT_REGS)
-  || (from != IWMMXT_REGS && to == IWMMXT_REGS))
-   return 4;
-  else if (from == IWMMXT_GR_REGS || to == IWMMXT_GR_REGS)
-  

[PATCH] tree-optimization/120143 - ICE with failed early break store move

2025-05-07 Thread Richard Biener
The early break vectorization store moving was incorrectly trying
to move the pattern stmt instead of the original one, which failed
to register and then confused virtual SSA form due to the update
triggered by a degenerate virtual PHI.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

PR tree-optimization/120143
* tree-vect-data-refs.cc (vect_analyze_early_break_dependences):
Move/update the original stmts, not the pattern stmts which
lack virtual operands and are not in the IL.

* gcc.dg/vect/vect-early-break_135-pr120143.c: New testcase.
---
 .../vect/vect-early-break_135-pr120143.c   | 18 ++
 gcc/tree-vect-data-refs.cc |  1 -
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_135-pr120143.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_135-pr120143.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_135-pr120143.c
new file mode 100644
index 000..1ee30a821e2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_135-pr120143.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-additional-options "-O3 -fwhole-program" } */
+
+short a;
+extern _Bool b[][23];
+short g = 6;
+int v[4];
+int x[3];
+void c(short g, int v[], int x[]) {
+  for (;;)
+for (unsigned y = 0; y < 023; y++) {
+  b[y][y] = v[y];
+  for (_Bool aa = 0; aa < (_Bool)g; aa = x[y])
+a = a > 0;
+}
+}
+int main() { c(g, v, x); }
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 231a3cab4f8..9fd1ef29650 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -734,7 +734,6 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
 
  stmt_vec_info stmt_vinfo
= vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (stmt));
- stmt = STMT_VINFO_STMT (stmt_vinfo);
  auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
  if (!dr_ref)
continue;
-- 
2.43.0


Re: [RFC PATCH 0/5] aarch64: Support for user-defined aarch64 tuning parameters in JSON

2025-05-07 Thread Kyrylo Tkachov
Hi Richard,

> On 6 May 2025, at 12:34, Richard Sandiford  wrote:
> 
>  writes:
>> From: Soumya AR 
>> 
>> Hi,
>> 
>> This RFC and subsequent patch series introduces support for printing and 
>> parsing
>> of aarch64 tuning parameters in the form of JSON.
> 
> Thanks for doing this.  It looks really useful.  My main question is:
> rather than write the parsing and printing routines by hand, could we
> generate the structure definitions, the parsing code, and the printing
> code from the schema?
> 
> The schema would need to provide more information about the structures
> compared to the current one.  The approach would also presumably need
> build/*.o versions of the json routines.  But it seems like something
> that we might want to do elsewhere, so would be worth building a bit
> of infrastructure around.  And it would reduce the maintenance burden
> associated with adding a new field or changing an existing one.
> 

Thanks for your thoughts. I suspected that we may need something like that 
eventually.
Hypothetically in the future we’d like to be able to batch up the various 
generic --params in a JSON input file as well to help superoptimiser tools for 
performance exploration purposes.
It looks like the parsing and printing code would be easy to autogenerate.
The structure definitions in aarch64-protos.h may be trickier. As long as they 
are effectively containers of primitive data it should be okay, though 
currently some extend others (like the issue info structs).
Haven’t thought it through completely yet.

Thanks,
Kyrill

> Much more minor, but: in patch 1, I'm all in favour of removing the
> "const"s from the field declarations, such as:
> 
> struct scale_addr_mode_cost
> {
> -  const int hi;
> -  const int si;
> -  const int di;
> -  const int ti;
> +  int hi;
> +  int si;
> +  int di;
> +  int ti;
> };
> 
> IMO those consts should never have been there.  But can we keep the
> predefined tables themselves as const, without things like:
> 
> -const struct cpu_cost_table tsv110_extra_costs =
> +struct cpu_cost_table tsv110_extra_costs =
> 
> ?  If we make any changes to the contents of these tables, it should
> IMO be done via temporaries instead.
> 
> Thanks,
> Richard



[V2 PATCH] Fix name mismatch for fortran.

2025-05-07 Thread liuhongt
From: "hongtao.liu" 

> The check you added seems correct to me. Do we need to keep the 
> afdo_string_table->get_index (IDENTIFIER_POINTER (
>             DECL_ASSEMBLER_NAME (edge->callee->decl))) != s->name ()
> check? Should your check replace it rather than be an additional check?
I verified that "replace" generates the same binaries as "an additional check".
So I updated the patch and committed it.

The function name in afdo_string_table is step3d_t_tile,
but DECL_ASSEMBLER_NAME (edge->callee->decl) gets
__step3d_t_mod_MOD_step3d_t_tile.  It looks like the prefix is not in the
debug string table.
The patch uses
afdo_string_table->get_index_by_decl (edge->callee->decl) instead.

gcc/ChangeLog:

PR gcov-profile/118508
* auto-profile.cc
(autofdo_source_profile::get_callsite_total_count): Fix name
mismatch for fortran.
---
 gcc/auto-profile.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index aa4d1634f01..106e4216dda 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -837,8 +837,8 @@ autofdo_source_profile::get_callsite_total_count (
 
   function_instance *s = get_function_instance_by_inline_stack (stack);
   if (s == NULL
-  || afdo_string_table->get_index (IDENTIFIER_POINTER (
- DECL_ASSEMBLER_NAME (edge->callee->decl))) != s->name ())
+  ||(afdo_string_table->get_index_by_decl (edge->callee->decl)
+!= s->name()))
 return 0;
 
   return s->total_count ();
-- 
2.34.1



Ignore me ..

2025-05-07 Thread Umesh Kalappa



[PATCH] Fix off-by-one when collecting range expression

2025-05-07 Thread Andreas Schwab
Fixes this error during build of fixincludes:

In function ‘byte_regex_compile’,
inlined from ‘xregcomp’ at ../libiberty/../../libiberty/regex.c:7973:11:
../libiberty/../../libiberty/regex.c:3477:29: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
 3477 | str[c1] = '\0';
  | ^
../libiberty/../../libiberty/regex.c: In function ‘xregcomp’:
../libiberty/../../libiberty/regex.c:3454:35: note: at offset 128 into destination object ‘str’ of size 128
 3454 | unsigned char str[128]; /* Should be large enough.  */
  |   ^
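
The pattern being warned about reduces to this (a simplified sketch, not
the exact regex.c code; the function and parameter names are illustrative):

#include <stddef.h>

/* Simplified from byte_regex_compile: collect a [:name:] class into STR.
   With the old bound `c1 < sizeof (str)', c1 could reach 128, so the
   final NUL store `str[c1] = '\0'' wrote one byte past the end of the
   buffer.  Checking against sizeof (str) - 1 reserves room for the NUL.  */
static void
collect_class_name (const char *p, const char *pend, unsigned char *str,
		    size_t size)
{
  size_t c1 = 0;
  while (p < pend && *p != ']')
    {
      unsigned char c = (unsigned char) *p++;
      if (c1 < size - 1)
	str[c1++] = c;
      else
	break;	/* an invalid class name in any case */
    }
  str[c1] = '\0';	/* now always in bounds */
}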


* regex.c (regex_compile): Don't write beyond array bounds when
collecting range expression.
---
 libiberty/regex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libiberty/regex.c b/libiberty/regex.c
index bc36f43d450..8337deaef5a 100644
--- a/libiberty/regex.c
+++ b/libiberty/regex.c
@@ -3468,7 +3468,7 @@ PREFIX(regex_compile) (const char *ARG_PREFIX(pattern),
PATFETCH (c);
if ((c == '.' && *p == ']') || p == pend)
  break;
-   if (c1 < sizeof (str))
+   if (c1 < sizeof (str) - 1)
  str[c1++] = c;
else
  /* This is in any case an invalid class name.  */
-- 
2.49.0


-- 
Andreas Schwab, SUSE Labs, sch...@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."


Re: [PATCH] asf: Fix calling of emit_move_insn on registers of different modes [PR119884]

2025-05-07 Thread Richard Sandiford
Konstantinos Eleftheriou  writes:
> Hi Richard,
>
> Thanks for the feedback! We have sent a new version that uses
> lowpart_subreg 
> (https://gcc.gnu.org/pipermail/gcc-patches/2025-May/682835.html).
> We had tried that before, but we were mishandling the case where there
> are multiple stores with the same offset as the load.

Thanks, I'll have a look.

> As for `it->offset`, that's actually the offset difference between the
> store and the load (we're trying to find the store with the same
> offset as the load), so the endianness should be irrelevant in that
> case.

But I thought the code was allowing multiple stores to be forwarded to
a single (wider) load.  E.g. 4 individual byte stores at address X, X+1,
X+2 and X+3 could be forwarded to a 4-byte load at address X.  And the code
I mentioned is handling the least significant byte by zero-extending it.

For big-endian targets, the least significant byte should come from
address X+3 rather than address X.  The byte at address X (i.e. the
byte with the equal offset) should instead go in the most significant
byte, typically using a shift left.
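
A minimal sketch of the situation (illustrative C, not code from the
patch):

#include <stdint.h>
#include <string.h>

uint32_t
forwarded_load (uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3)
{
  uint8_t buf[4];
  buf[0] = b0;	/* byte store at X (offset 0) */
  buf[1] = b1;	/* byte store at X+1 */
  buf[2] = b2;	/* byte store at X+2 */
  buf[3] = b3;	/* byte store at X+3 */

  uint32_t x;
  memcpy (&x, buf, sizeof x);	/* the wider load being forwarded to */

  /* Little-endian: b0 (offset 0) is the LSB of x, so zero-extending it
     gives the right starting value.
     Big-endian: b0 (offset 0) is the MSB of x and b3 is the LSB, so b0
     needs a shift left into the top byte instead.  */
  return x;
}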

Richard

>
> Konstantinos
>
> On Tue, Apr 29, 2025 at 8:48 PM Richard Sandiford
>  wrote:
>>
>> Konstantinos Eleftheriou  writes:
>> > During the base register initialization, when we are eliminating the load
>> > instruction, we were calling `emit_move_insn` on registers of the same
>> > size but of different mode in some cases, causing an ICE.
>> >
>> > This patch fixes this, by adding a check for the modes to match before
>> > calling `emit_move_insn`.
>> >
>> > Bootstrapped/regtested on AArch64 and x86_64.
>> >
>> >   PR rtl-optimization/119884
>> >
>> > gcc/ChangeLog:
>> >
>> >   * avoid-store-forwarding.cc (process_store_forwarding):
>> >   Added check to ensure that the register modes match
>> >   before calling `emit_move_insn`.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> >   * gcc.target/i386/pr119884.c: New test.
>> > ---
>> >  gcc/avoid-store-forwarding.cc|  3 ++-
>> >  gcc/testsuite/gcc.target/i386/pr119884.c | 13 +
>> >  2 files changed, 15 insertions(+), 1 deletion(-)
>> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr119884.c
>> >
>> > diff --git a/gcc/avoid-store-forwarding.cc b/gcc/avoid-store-forwarding.cc
>> > index ded8d7e596e0..aec05c22ac37 100644
>> > --- a/gcc/avoid-store-forwarding.cc
>> > +++ b/gcc/avoid-store-forwarding.cc
>> > @@ -244,7 +244,8 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn,
>> >   GET_MODE_BITSIZE (GET_MODE (it->mov_reg))))
>> >   base_reg = gen_rtx_ZERO_EXTEND (dest_mode, it->mov_reg);
>> >
>> > -   if (base_reg)
>> > +   /* Generate a move instruction, only when the modes match.  */
>> > +   if (base_reg && dest_mode == GET_MODE (base_reg))
>> >   {
>> > rtx_insn *move0 = emit_move_insn (dest, base_reg);
>> > if (recog_memoized (move0) >= 0)
>>
>> Is this a complete fix though?  It looks like:
>>
>>   rtx base_reg = it->mov_reg;
>>   if (known_gt (GET_MODE_BITSIZE (dest_mode),
>> GET_MODE_BITSIZE (GET_MODE (it->mov_reg))))
>> base_reg = gen_rtx_ZERO_EXTEND (dest_mode, it->mov_reg);
>>
>> implicitly assumes that dest_mode and GET_MODE (it->mov_reg) are both
>> integer modes.
>>
>> Given that this code only triggers if we're going to eliminate the load,
>> and thus presumably define all the other bytes of DEST_MODE later,
>> couldn't we just use lowpart_subreg?  That way we can cope with different
>> like-sized modes and with "extensions" from one mode to another.  Of course,
>> there might be cases where doing this for different modes would trigger
>> unwanted cross-file register transfers, but that seems like a general
>> risk with forwarding.
>>
>> Not related to this PR, but on the same piece of code:
>>
>>   /* If we're eliminating the load then find the store with zero offset
>>  and use it as the base register to avoid a bit insert if possible.  
>> */
>>   if (load_elim && it->offset == 0)
>>
>> Doesn't this offset check need to take endianness into account?
>> Byte 0 would normally be the msb for big-endian targets.
>>
>> Thanks,
>> Richard
>>
>> > diff --git a/gcc/testsuite/gcc.target/i386/pr119884.c 
>> > b/gcc/testsuite/gcc.target/i386/pr119884.c
>> > new file mode 100644
>> > index ..34d5b689244d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/i386/pr119884.c
>> > @@ -0,0 +1,13 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O2 -fno-dse -favoid-store-forwarding" } */
>> > +
>> > +typedef __attribute__((__vector_size__(64))) char V;
>> > +char c;
>> > +V v;
>> > +
>> > +char
>> > +foo()
>> > +{
>> > +  v *= c;
>> > +  return v[0];
>> > +}
>> > \ No newline at end of file


Unreviewed COBOL patches

2025-05-07 Thread Rainer Orth
Four COBOL patches have remained unreviewed for a month.  They are
required to get the cobol1 and libgcobol to build on Solaris:

cobol: Don't require GLOB_BRACE etc. [PR119217]
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680675.html

cobol: Initialize regmatch_t portably [PR119217]
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680676.html

It's unclear how to proceed with this one: my simple patch or Jakub's
proposal.

cobol: Allow for undefined NAME_MAX [PR119217]
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680682.html

This one is unclear, too: for one, it turned out that the use of
NAME_MAX isn't related to filenames at all (which suggests just
replacing the macro with its Linux value, 255), but also Richi's
observation that cbl_function_t.name is never set at all (which suggests
removing it completely).

libgcobol: Heed --enable-libgcobol
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680684.html

This allows --enable-libgcobol to enable building the runtime lib even
if the target isn't listed as supported in configure.tgt.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [RFC PATCH 3/5] json: Add get_map() method to JSON object class

2025-05-07 Thread Kyrylo Tkachov



> On 6 May 2025, at 10:30, Soumya AR  wrote:
> 
> From: Soumya AR 
> 
> This patch adds a get_map () method to the JSON object class to provide access
> to the underlying hash map that stores the JSON key-value pairs.
> 
> It also reorganizes the private and public sections of the class to expose the
> map_t typedef, which is the return type of get_map().
> 
> This change is needed to allow traversal of key-value pairs when parsing
> user-provided JSON tuning data.
> 
> This patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> 

I think this needs rebasing on top of the recent changes from David here.
Thanks,
Kyrill

> Signed-off-by: Soumya AR 
> 
> gcc/ChangeLog:
> 
> * json.h (class object): Add get_map () method.
> ---
> gcc/json.h | 21 -
> 1 file changed, 12 insertions(+), 9 deletions(-)
> 
> diff --git a/gcc/json.h b/gcc/json.h
> index e369244cf8b..6fc2ac62e2c 100644
> --- a/gcc/json.h
> +++ b/gcc/json.h
> @@ -94,7 +94,16 @@ class value
> 
> class object : public value
> {
> - public:
> +private:
> +  typedef hash_map<char *, value *,
> +   simple_hashmap_traits<nofree_string_hash, value *>>
> +map_t;
> +  map_t m_map;
> +
> +  /* Keep track of order in which keys were inserted.  */
> +  auto_vec<const char *> m_keys;
> +
> +public:
>   ~object ();
> 
>   enum kind get_kind () const final override { return JSON_OBJECT; }
> @@ -120,20 +129,14 @@ class object : public value
> 
>   value *get (const char *key) const;
> 
> +  const map_t &get_map () const { return m_map; }
> +
>   void set_string (const char *key, const char *utf8_value);
>   void set_integer (const char *key, long v);
>   void set_float (const char *key, double v);
> 
>   /* Set to literal true/false.  */
>   void set_bool (const char *key, bool v);
> -
> - private:
> -  typedef hash_map <char *, value *,
> -simple_hashmap_traits<nofree_string_hash, value *> > map_t;
> -  map_t m_map;
> -
> -  /* Keep track of order in which keys were inserted.  */
> -  auto_vec<const char *> m_keys;
> };
> 
> /* Subclass of value for arrays.  */
> -- 
> 2.44.0
> 
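
For context, a hypothetical caller of the new accessor might look like
this (an illustrative sketch only, not part of the patch; iteration order
over the underlying hash map is unspecified, so the m_keys ordering should
be used where order matters):

static void
walk_tuning_object (const json::object &obj)
{
  for (auto kv : obj.get_map ())
    {
      const char *key = kv.first;
      json::value *val = kv.second;
      if (val->get_kind () == JSON_OBJECT)
	{
	  /* Recurse into a nested tuning structure.  */
	}
      else
	{
	  /* Handle a scalar parameter named KEY.  */
	}
    }
}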



Re: [PATCH 4/8] AArch64: add constants for branch displacements

2025-05-07 Thread Richard Sandiford
Karl Meakin  writes:
> Extract the hardcoded values for the minimum PC-relative displacements
> into named constants and document them.
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (BRANCH_LEN_P_128MiB): New constant.
>   (BRANCH_LEN_N_128MiB): likewise.
>   (BRANCH_LEN_P_1MiB): likewise.
>   (BRANCH_LEN_N_1MiB): likewise.
>   (BRANCH_LEN_P_32KiB): likewise.
>   (BRANCH_LEN_N_32KiB): likewise.
>   (BRANCH_LEN_P_1KiB): likewise.
>   (BRANCH_LEN_N_1KiB): likewise.
> ---
>  gcc/config/aarch64/aarch64.md | 68 ++-
>  1 file changed, 52 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 23775ec58ca..ca5bd96a754 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -696,7 +696,27 @@ (define_insn "jump"
>[(set_attr "type" "branch")]
>  )
>  
> +;; Maximum PC-relative positive/negative displacements for various branching
> +;; instructions.
> +(define_constants
> +  [
> +;; +/- 128MiB.  Used by B, BL.
> +(BRANCH_LEN_P_128Mib  134217724)
> +(BRANCH_LEN_N_128Mib -134217728)

It'd be good to use the same capitalisation style in the comments,
changelog, and identifiers, so MiB rather than Mib and KiB rather than Kib.

Otherwise +1 to Kyrill's ok.  Very minor, but...

> +
> +;; +/- 1MiB.  Used by B., CBZ, CBNZ.
> +(BRANCH_LEN_P_1Mib  1048572)
> +(BRANCH_LEN_N_1Mib -1048576)
>  
> +;; +/- 32KiB.  Used by TBZ, TBNZ.
> +(BRANCH_LEN_P_32Kib  32764)
> +(BRANCH_LEN_N_32Kib -32768)
> +
> +;; +/- 1KiB.  Used by CBB, CBH, CB.
> +(BRANCH_LEN_P_1Kib  1020)
> +(BRANCH_LEN_N_1Kib -1024)

...from a staging perspective, it might be better to put this in patch 8,
with the code that uses it.

Thanks,
Richard

> +  ]
> +)
>  
>  ;; ---
>  ;; Conditional jumps
> @@ -760,13 +780,17 @@ (define_insn "aarch64_bcond"
>}
>[(set_attr "type" "branch")
> (set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -(lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_N_1Mib))
> +(lt (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_P_1Mib)))
> (const_int 4)
> (const_int 8)))
> (set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -(lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_N_1Mib))
> +(lt (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_P_1Mib)))
> (const_int 0)
> (const_int 1)))]
>  )
> @@ -823,13 +847,17 @@ (define_insn "aarch64_cb1"
>}
>[(set_attr "type" "branch")
> (set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
> -(lt (minus (match_dup 1) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +(const_int BRANCH_LEN_N_1Mib))
> +(lt (minus (match_dup 1) (pc))
> +(const_int BRANCH_LEN_P_1Mib)))
> (const_int 4)
> (const_int 8)))
> (set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -(lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_N_1Mib))
> +(lt (minus (match_dup 2) (pc))
> +(const_int BRANCH_LEN_P_1Mib)))
> (const_int 0)
> (const_int 1)))]
>  )
> @@ -864,13 +892,17 @@ (define_insn "*cb1"
>}
>[(set_attr "type" "branch")
> (set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
> -(lt (minus (match_dup 1) (pc)) (const_int 32764)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +(const_int BRANCH_LEN_N_32Kib))
> +(lt (minus (match_dup 1) (pc))
> +(const_int BRANCH_LEN_P_32Kib)))
> (const_int 4)
> (const_int 8)))
> (set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
> -(lt (minus (match_dup 1) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +  

Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Sandiford
Kyrylo Tkachov  writes:
>> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>> 
>> Commit the test file `cmpbr.c` before rules for generating the new
>> instructions are added, so that the changes in codegen are more obvious
>> in the next commit.
>
> I guess that’s an LLVM best practice.
> In GCC since we have the check-function-bodies mechanism we usually prefer to 
> include the relevant test together with the patch that adds the optimization.
> But this is not wrong either.
>
>
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> * gcc.target/aarch64/cmpbr.c: New test.
>> ---
>> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
>> 1 file changed, 1378 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c 
>> b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>> new file mode 100644
>> index 000..728d6ead91c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>> @@ -0,0 +1,1378 @@
>> +/* Test that the instructions added by FEAT_CMPBR are emitted */
>> +/* { dg-do compile } */
>> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
> As you’ll be adding new instructions to the compiler it’d be good to have it 
> a dg-do assemble test where possible.

Agreed FWIW, but:

> For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use it 
> like so to fallback to dg-do compile when the assembler is too old:
> /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */

...dg-do assemble for this one :)

Thanks,
Richard

> /* { dg-do compile { target { ! aarch64_asm_cmpbr_ok } } } */
> Look in lib/target-supports.exp for “aarch64_asm” for how to define it.
>
> Ok otherwise.
> Thanks,
> Kyrill
>
>> +
>> +#include <stdint.h>
>> +
>> +typedef uint8_t u8;
>> +typedef int8_t i8;
>> +
>> +typedef uint16_t u16;
>> +typedef int16_t i16;
>> +
>> +typedef uint32_t u32;
>> +typedef int32_t i32;
>> +
>> +typedef uint64_t u64;
>> +typedef int64_t i64;
>> +
>> +int taken();
>> +int not_taken();
>> +
>> +#define COMPARE(ty, name, op, rhs)  
>>\
>> +  int ty##_x0_##name##_##rhs(ty x0, ty x1) {
>>\
>> +return (x0 op rhs) ? taken() : not_taken(); 
>>\
>> +  }
>> +
>> +#define COMPARE_ALL(unsigned_ty, signed_ty, rhs)
>>\
>> +  COMPARE(unsigned_ty, eq, ==, rhs);
>>\
>> +  COMPARE(unsigned_ty, ne, !=, rhs);
>>\
>> +
>>\
>> +  COMPARE(unsigned_ty, ult, <, rhs);
>>\
>> +  COMPARE(unsigned_ty, ule, <=, rhs);   
>>\
>> +  COMPARE(unsigned_ty, ugt, >, rhs);
>>\
>> +  COMPARE(unsigned_ty, uge, >=, rhs);   
>>\
>> +
>>\
>> +  COMPARE(signed_ty, slt, <, rhs);  
>>\
>> +  COMPARE(signed_ty, sle, <=, rhs); 
>>\
>> +  COMPARE(signed_ty, sgt, >, rhs);  
>>\
>> +  COMPARE(signed_ty, sge, >=, rhs);
>> +
>> +//  CBB (register) 
>> +COMPARE_ALL(u8, i8, x1);
>> +
>> +//  CBH (register) 
>> +COMPARE_ALL(u16, i16, x1);
>> +
>> +//  CB (register) 
>> +COMPARE_ALL(u32, i32, x1);
>> +COMPARE_ALL(u64, i64, x1);
>> +
>> +//  CB (immediate) 
>> +COMPARE_ALL(u32, i32, 42);
>> +COMPARE_ALL(u64, i64, 42);
>> +
>> +//  Special cases 
>> +// CBB and CBH cannot have immediate operands. Instead we have to do a 
>> MOV+CB
>> +COMPARE_ALL(u8, i8, 42);
>> +COMPARE_ALL(u16, i16, 42);
>> +
>> +// 65 is out of the range for immediate operands (0 to 63).
>> +// * For 8/16-bit types, use a MOV+CB as above.
>> +// * For 32/64-bit types, use a CMP+B instead, because
>> +//   B has a longer range than CB.
>> +COMPARE_ALL(u8, i8, 65);
>> +COMPARE_ALL(u16, i16, 65);
>> +COMPARE_ALL(u32, i32, 65);
>> +COMPARE_ALL(u64, i64, 65);
>> +
>> +// Comparisons against zero can use the wzr/xzr register.
>> +COMPARE_ALL(u8, i8, 0);
>> +COMPARE_ALL(u16, i16, 0);
>> +COMPARE_ALL(u32, i32, 0);
>> +COMPARE_ALL(u64, i64, 0);
>> +
>> +/*
>> +** u8_x0_eq_x1:
>> +** and w1, w1, 255
>> +** cmp w1, w0, uxtb
>> +** beq .L4
>> +** b not_taken
>> +** b taken
>> +*/
>> +
>> +/*
>> +** u8_x0_ne_x1:
>> +** and w1, w1, 255
>> +** cmp w1, w0, uxtb
>> +** beq .L6
>> +** b taken
>> +** b not_taken
>> +*/
>> +
>> +/*
>> +** u8_x0_ult_x1:
>> +** and w1, w1, 255
>> +** cmp w1, w0, uxtb
>> +** bls .L8
>> +** b taken
>> +** b not_taken
>> +*/
>> +
>> +/*
>> +** u8_x0_ule_x1:
>> +** and w1, w1, 255
>> +** cmp w1, w0, uxtb
>> +** bcc

Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Earnshaw

On 07/05/2025 13:57, Richard Sandiford wrote:

> Kyrylo Tkachov  writes:
>>> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>>>
>>> Commit the test file `cmpbr.c` before rules for generating the new
>>> instructions are added, so that the changes in codegen are more obvious
>>> in the next commit.
>>
>> I guess that’s an LLVM best practice.
>> In GCC since we have the check-function-bodies mechanism we usually prefer to
>> include the relevant test together with the patch that adds the optimization.
>> But this is not wrong either.
>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * gcc.target/aarch64/cmpbr.c: New test.
>>> ---
>>> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
>>> 1 file changed, 1378 insertions(+)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>>
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>> new file mode 100644
>>> index 000..728d6ead91c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>> @@ -0,0 +1,1378 @@
>>> +/* Test that the instructions added by FEAT_CMPBR are emitted */
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>>
>> As you’ll be adding new instructions to the compiler it’d be good to have it a
>> dg-do assemble test where possible.
>
> Agreed FWIW, but:
>
>> For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use it
>> like so to fallback to dg-do compile when the assembler is too old:
>> /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */
>
> ...dg-do assemble for this one :)


I don't think that works. If the first dg-do fails the test is just skipped.

You need to replicate the test with separate dg-do directives, IIRC.

R.




i386: implement costs for float<->int conversions in ix86_vector_costs::add_stmt_cost

2025-05-07 Thread Jan Hubicka
Hi,
This patch adds pattern matching for float<->int conversions, both as normal
statements and as promote_demote.  While updating promote_demote I noticed that
in cleanups I had turned "stmt_cost =" into "int stmt_cost = ", which turned
the existing FP costing into a no-op.  I also added a comment on how demotes are
done when turning e.g. a 32-bit value into an 8-bit one (which is the case in
pr119919.c).
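
As a worked example of the new demote costing (reading the hunk below):
demoting 32-bit to 8-bit elements has inner_size (32) greater than
outer_size (8), so the statement is costed as

  stmt_cost * 2 + sse_op

i.e. two conversions plus one packing operation.  The 4x size gap is not
multiplied further here, because the vectorizer splits such demotes into a
chain of vec_promote_demote statements (32 -> 16, then 16 -> 8), each of
which is costed by this same formula.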

The patch disables vectorization in pr119919.c on generic tuning, but keeps
it on both zen and skylake+.  The underlying problem is the bad cost of
open-coded scatter, which is tracked by PR119902, so I simply added
-mtune=znver1 so that the testcase keeps testing vectorization.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Add 
FLOAT_EXPR;
FIX_TRUNC_EXPR and vec_promote_demote costs.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr119919.c: Add -mtune=znver1

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index bef95ea18c8..fd36ea802c0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25767,6 +25767,26 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
  (ix86_tune_cost, GET_MODE_BITSIZE (mode));
  break;
 
+   case FLOAT_EXPR:
+   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtsi2ss;
+   else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87.  */
+ stmt_cost = ix86_cost->fadd;
+   else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+   break;
+
+   case FIX_TRUNC_EXPR:
+   if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+ stmt_cost = ix86_cost->cvtss2si;
+   else if (X87_FLOAT_MODE_P (mode))
+ /* TODO: We do not have cost tables for x87.  */
+ stmt_cost = ix86_cost->fadd;
+   else
+ stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+   break;
+
case COND_EXPR:
  {
/* SSE2 conditinal move sequence is:
@@ -25930,8 +25950,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
break;
   }
 
-  if (kind == vec_promote_demote
-  && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt
+  if (kind == vec_promote_demote)
 {
   int outer_size
= tree_to_uhwi
@@ -25941,16 +25960,25 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
= tree_to_uhwi
(TYPE_SIZE
(TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt;
-  int stmt_cost = vec_fp_conversion_cost
-   (ix86_tune_cost, GET_MODE_BITSIZE (mode));
-  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
-up doing two conversions and packing them.  */
+  bool inner_fp = FLOAT_TYPE_P
+   (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)));
+
+  if (fp && inner_fp)
+   stmt_cost = vec_fp_conversion_cost
+ (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+  else if (fp && !inner_fp)
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+  else if (!fp && inner_fp)
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+  else
+   stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+  /* VEC_PACK_TRUNC_EXPR and similar demote operations: If outer size is
+greater than inner size we will end up doing two conversions and
+packing them.  We always pack pairs; if the size difference is greater
+it is split into multiple demote operations.  */
   if (inner_size > outer_size)
-   {
- int n = inner_size / outer_size;
- stmt_cost = stmt_cost * n
- + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
-   }
+   stmt_cost = stmt_cost * 2
+   + ix86_vec_cost (mode, ix86_cost->sse_op);
 }
 
   /* If we do elementwise loads into a vector then we are bound by
diff --git a/gcc/testsuite/gcc.target/i386/pr119919.c b/gcc/testsuite/gcc.target/i386/pr119919.c
index ed646561bd1..e39819f682d 100644
--- a/gcc/testsuite/gcc.target/i386/pr119919.c
+++ b/gcc/testsuite/gcc.target/i386/pr119919.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -fdump-tree-vect-details" } */
+/* { dg-options "-O2 -msse2 -fdump-tree-vect-details -mtune=znver1" } */
 int a[9*9];
 bool b[9];
 void test()


RE: [PATCH] cobol: Don't require GLOB_BRACE etc. [PR119217]

2025-05-07 Thread Robert Dubner
> -Original Message-
> From: Rainer Orth 
> Sent: Friday, April 11, 2025 04:43
> To: gcc-patches@gcc.gnu.org
> Cc: Robert Dubner ; James K. Lowden
> 
> Subject: [PATCH] cobol: Don't require GLOB_BRACE etc. [PR119217]
>
> cdf-copy.cc doesn't compile on Solaris:
>
> /vol/gcc/src/hg/master/local/gcc/cobol/cdf-copy.cc: In member function
> ‘int copybook_elem_t::open_file(const char*, bool)’:
> /vol/gcc/src/hg/master/local/gcc/cobol/cdf-copy.cc:317:34: error:
> ‘GLOB_BRACE’ was not declared in this scope; did you mean ‘GLOB_ERR’?
>   317 |   static int flags = GLOB_MARK | GLOB_BRACE | GLOB_TILDE;
>   |  ^~
>   |  GLOB_ERR
> /vol/gcc/src/hg/master/local/gcc/cobol/cdf-copy.cc:317:47: error:
> ‘GLOB_TILDE’ was not declared in this scope
>   317 |   static int flags = GLOB_MARK | GLOB_BRACE | GLOB_TILDE;
>   |   ^~
>
> GLOB_BRACE and GLOB_TILDE are BSD extensions not in POSIX.1, thus
> missing on Solaris probably due to its System V heritage.
>
> This patch introduces fallback definitions to avoid this.
>
> Bootstrapped without regressions on amd64-pc-solaris2.11,
> sparcv9-sun-solaris2.11, and x86_64-pc-linux-gnu.
>
> Ok for trunk?

OK for trunk.

>
>   Rainer
>
> --
> --
> ---
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> 2025-04-08  Rainer Orth  
>
>   gcc/cobol:
>   PR cobol/119217
>   * cdf-copy.cc (GLOB_BRACE): Define fallback.
>   (GLOB_TILDE): Likewise.



Re: [RFC PATCH 0/5] aarch64: Support for user-defined aarch64 tuning parameters in JSON

2025-05-07 Thread David Malcolm
On Tue, 2025-05-06 at 11:34 +0100, Richard Sandiford wrote:
>  writes:
> > From: Soumya AR 
> > 
> > Hi,
> > 
> > This RFC and subsequent patch series introduces support for
> > printing and parsing
> > of aarch64 tuning parameters in the form of JSON.
> 
> Thanks for doing this.  It looks really useful.  My main question is:
> rather than write the parsing and printing routines by hand, could we
> generate the structure definitions, the parsing code, and the
> printing
> code from the schema?
> 
> The schema would need to provide more information about the
> structures
> compared to the current one.  The approach would also presumably need
> build/*.o versions of the json routines.  But it seems like something
> that we might want to do elsewhere, so would be worth building a bit
> of infrastructure around.  And it would reduce the maintenance burden
> associated with adding a new field or changing an existing one.

FWIW I have a lot of similar written-by-hand JSON-handling code for
SARIF; see gcc/libsarifreplay.cc for decoding from json and
gcc/diagnostic-format-sarif.cc for generating json.  It would be nice
to have some tooling for this based on schema files - but there are
lots of awkward cases to cope with - SARIF is non-trivial [1]

Other approaches which I've dabbled with are:
* what I called strongly-typed json: as well as json::object and
json::array, which can take arbitrary json::value property values or
elements, have a way to support objects whose known properties are
stored in a special, strongly-typed way; similarly for arrays.  A
minimal sketch of this idea follows below.
* an adapter for json-like types, so that given a tree-like structure
in memory you can go direct to json output without needing to build an
in-memory tree; similarly for parsing (IIRC LLVM has something like
this).
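
Here is a minimal, compilable sketch of the strongly-typed idea.  All
names (typed_result, write) are invented for illustration and are not
GCC's actual json API:

#include <iostream>
#include <optional>
#include <string>

// A "strongly-typed" json object: known properties are stored as typed
// fields rather than as generic json::value entries, and serialization
// walks the fields directly instead of a generic tree.
struct typed_result
{
  std::string level;                   // required property
  std::optional<std::string> rule_id;  // optional property

  void write (std::ostream &out) const
  {
    out << "{\"level\": \"" << level << '"';
    if (rule_id)
      out << ", \"ruleId\": \"" << *rule_id << '"';
    out << '}';
  }
};

int main ()
{
  typed_result r{"error", "GCC0001"};
  r.write (std::cout);  // prints {"level": "error", "ruleId": "GCC0001"}
}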

Note that for SARIF the DejaGnu tests validate the generated json
against a schema; see gcc/testsuite/lib/scansarif.exp; there's also
run-sarif-pytest which allows DejaGnu to run Python scripts to verify
properties of the generated json.  This is probably overkill for the
aarch64 tuning use-case, but is very helpful for SARIF, which is deeply
nested json, has cross-references, etc.

Dave

[1] Another wart here is that libsarifreplay.cc is built on top of
libgdiagnostics.h rather than diagnostic-core.h.  One nice things I
have is reporting chapter-and-verse of the specification when the
schema is violated, along with both logical location (JSON pointer) and
physical location, e.g.:

In JSON property '/runs/0/results/0/level':
bad-level.sarif:12:20: error: unrecognized value for 'level': 'mostly harmless' 
[SARIF v2.1.0 §3.27.10]
   12 |   "level": "mostly harmless",
  |^




Re: [PATCH 2/8] AArch64: reformat branch instruction rules

2025-05-07 Thread Richard Sandiford
Karl Meakin  writes:
> Make the formatting of the RTL templates in the rules for branch
> instructions more consistent with each other.

One source of variation is the 80-character limit.  It's a bit of a soft
limit for rtl, but it is still good to keep to it where that's easy.
So...

>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (cbranch4): reformat.
>   (cbranchcc4): likewise.
>   (condjump): likewise.
>   (*compare_condjump): likewise.
>   (aarch64_cb1): likewise.
>   (*cb1): likewise.
>   (tbranch_3): likewise.
>   (@aarch64_tb): likewise.
> ---
>  gcc/config/aarch64/aarch64.md | 82 ++-
>  1 file changed, 42 insertions(+), 40 deletions(-)
>
> [...]
> @@ -717,34 +717,34 @@ (define_expand "cbranch4"
>  )
>  
>  (define_expand "cbranch4"
> -  [(set (pc) (if_then_else
> - (match_operator 0 "aarch64_comparison_operator"
> -  [(match_operand:GPF_F16 1 "register_operand")
> -   (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
> - (label_ref (match_operand 3 "" ""))
> - (pc)))]
> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
> + [(match_operand:GPF_F16 1 "register_operand")
> +  (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
> +(label_ref (match_operand 3))
> +(pc)))]

...for this one, the reformatting is making a line longer than 80 characters.
I think it'd be better to keep the original.

> @@ -807,9 +807,10 @@ (define_insn_and_split "*compare_condjump"
>  )
>  
>  (define_insn "aarch64_cb1"
> -  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
> - (const_int 0))
> -(label_ref (match_operand 1 "" ""))
> +  [(set (pc) (if_then_else (EQL
> +  (match_operand:GPI 0 "register_operand" "r")
> +  (const_int 0))
> +(label_ref (match_operand 1))
>  (pc)))]

I suppose this is personal preference, but it's IMO better not to break
after something short like EQL, if the break-free form would also fit
in 80 characters.  So I'd slightly prefer going the other way, and
avoiding a break after the condition if the break isn't needed for
line-length purposes.

Thanks,
Richard


Re: [PATCH 3/8] AArch64: rename branch instruction rules

2025-05-07 Thread Richard Sandiford
Kyrylo Tkachov  writes:
>> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>> 
>> Give the `define_insn` rules used in lowering `cbranch4` to RTL
>> more descriptive and consistent names: from now on, each rule is named
>> after the AArch64 instruction that it generates. Also add comments to
>> document each rule.
>> 
>> gcc/ChangeLog:
>> 
>> * config/aarch64/aarch64.md (condjump): rename to ...
>> (aarch64_bcond): ...here.
>> (*compare_condjump): rename to ...
>> (*aarch64_bcond_wide_imm): ...here.
>> (restore_stack_nonlocal): handle rename.
>> (stack_protect_combined_test): likewise.
>> * config/aarch64/aarch64-simd.md (cbranch4): likewise.
>> * config/aarch64/aarch64-sme.md (aarch64_restore_za): likewise.
>> * config/aarch64/aarch64.cc (aarch64_gen_test_and_branch): likewise.
>> ---
>> gcc/config/aarch64/aarch64-simd.md |  2 +-
>> gcc/config/aarch64/aarch64-sme.md  |  3 ++-
>> gcc/config/aarch64/aarch64.cc  |  2 +-
>> gcc/config/aarch64/aarch64.md  | 15 +--
>> 4 files changed, 13 insertions(+), 9 deletions(-)
>> 
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index e2afe87e513..197a5f65f34 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -3946,7 +3946,7 @@ (define_expand "cbranch4"
>> 
>>   rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
>>   rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
>> -  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
>> +  emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[3]));
>>   DONE;
>> })
>> 
>> diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
>> index c49affd0dd3..6a7c31acf0a 100644
>> --- a/gcc/config/aarch64/aarch64-sme.md
>> +++ b/gcc/config/aarch64/aarch64-sme.md
>> @@ -389,7 +389,8 @@ (define_insn_and_split "aarch64_restore_za"
>> auto label = gen_label_rtx ();
>> auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM);
>> emit_insn (gen_aarch64_read_tpidr2 (tpidr2));
>> -auto jump = emit_likely_jump_insn (gen_aarch64_cbnedi1 (tpidr2, label));
>> +auto jump = emit_likely_jump_insn (
>> + gen_aarch64_cbnedi1 (tpidr2, label));
>
> IMO it’d be cleaner to break this before the “=“ instead. This doesn’t look 
> like a renaming, just a reformatting btw.

Yeah, since the line fits in its original form, it's probably better
to drop this hunk.

> Ok otherwise.
> Thanks,
> Kyrill
>
>
>> JUMP_LABEL (jump) = label;
>> 
>> aarch64_restore_za (operands[0]);
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index fff8d9da49d..c0afdeb87ee 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -2879,7 +2879,7 @@ aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
>>   emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
>>   rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
>>   rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
>> -  return gen_condjump (x, cc_reg, label);
>> +  return gen_aarch64_bcond (x, cc_reg, label);
>> }
>>   return gen_aarch64_tb (code, mode, mode,
>> x, gen_int_mode (bitnum, mode), label);
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 45b2283c5c0..23775ec58ca 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -740,7 +740,8 @@ (define_expand "cbranchcc4"
>>   ""
>> )
>> 
>> -(define_insn "condjump"
>> +;; Emit `B`, assuming that the condition is already in the CC register.
>> +(define_insn "aarch64_bcond"
>>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>>[(match_operand 1 "cc_register")
>> (const_int 0)])
>> @@ -780,7 +781,7 @@ (define_insn "condjump"
>> ;; sub x0, x1, #(CST & 0xfff000)
>> ;; subs x0, x0, #(CST & 0x000fff)
>> ;; b .Label
>> -(define_insn_and_split "*compare_condjump"
>> +(define_insn_and_split "*aarch64_bcond_wide_imm"
>>   [(set (pc) (if_then_else (EQL
>> (match_operand:GPI 0 "register_operand" "r")
>> (match_operand:GPI 1 "aarch64_imm24" "n"))
>> @@ -801,11 +802,12 @@ (define_insn_and_split "*compare_condjump"
>> rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>> rtx cmp_rtx = gen_rtx_fmt_ee (, mode,
>>  cc_reg, const0_rtx);
>> -emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
>> +emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2]));
>> DONE;
>>   }
>> )
>> 
>> +;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ`
>> (define_insn "aarch64_cb1"
>>   [(set (pc) (if_then_else (EQL
>> (match_operand:GPI 0 "register_operand" "r")
>> @@ -832,6 +834,7 @@ (define_insn "aarch64_cb1"
>>  (const_int 1)))]
>> )
>> 
>> +;; For an LT/GE comparison against zero, emit `TBZ`/`TBNZ`
>> (define_insn "*cb1"
>>   [(set (pc) (if_then_else (LTGE
>> (match_operand:ALLI 0 "register_operand" "r")
>> @@ -1325,13 +1328,13 @@ (de

RE: Unreviewed COBOL patches

2025-05-07 Thread Robert Dubner
Thank you for the reminder, and accept my apologies for the delays.

Jim and I have been distracted by an intense effort to rewrite
exception/declarative processing.  There has also been a serious family
health issue that caused us significant delays as well.

I finished a sub-project yesterday, and I will look into these four
patches today.

> -Original Message-
> From: Rainer Orth 
> Sent: Wednesday, May 7, 2025 04:38
> To: gcc-patches@gcc.gnu.org
> Cc: Robert Dubner ; James K. Lowden
> ; Jakub Jelinek ; Richard
Biener
> 
> Subject: Unreviewed COBOL patches
> 
> Four COBOL patches have remained unreviewed for a month.  They are
> required to get the cobol1 and libgcobol to build on Solaris:
> 
>   cobol: Don't require GLOB_BRACE etc. [PR119217]
> https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680675.html
> 
>   cobol: Initialize regmatch_t portably [PR119217]
> https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680676.html
> 
> It's unclear how to proceed with this one: my simple patch or Jakub's
> proposal.
> 
>   cobol: Allow for undefined NAME_MAX [PR119217]
> https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680682.html
> 
> This one is unclear, too: for one, it turned out that the use of
> NAME_MAX isn't related to filenames at all (which suggests just
> replacing the macro with its Linux value, 255), but also Richi's
> observation that cbl_funtion_t.name is never set at all (which suggests
> removing it completely).
> 
>   libgcobol: Heed --enable-libgcobol
> https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680684.html
> 
> This allows --enable-libgcobol to enable building the runtime lib even
> if the target isn't listed as supported in configure.tgt.
> 
>   Rainer
> 
> -- 
> Rainer Orth, Center for Biotechnology, Bielefeld University


RE: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]

2025-05-07 Thread Robert Dubner



> -Original Message-
> From: Rainer Orth 
> Sent: Friday, April 11, 2025 05:17
> To: gcc-patches@gcc.gnu.org
> Cc: Robert Dubner ; James K. Lowden
> 
> Subject: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]
>
> All users of symbols.h fail to compile on Solaris:
>
> /vol/gcc/src/hg/master/local/gcc/cobol/symbols.h: At global scope:
> /vol/gcc/src/hg/master/local/gcc/cobol/symbols.h:1365:13: error:
> ‘NAME_MAX’ was not declared in this scope
>  1365 |   char name[NAME_MAX];
>   | ^~~~
>
> NAME_MAX being undefined is allowed by POSIX.1, actually: it's listed
> for  under "Pathname Variable Values":
>
> A definition of one of the symbolic constants in the following list
> shall be omitted from the  header on specific implementations
> where the corresponding value is equal to or greater than the stated
> minimum, but where the value can vary depending on the file to which it
> is applied. The actual value supported for a specific pathname shall be
> provided by the pathconf() function.
>
> As a hack, this patch provides a fallback definition to allow the build
> to finish.
>
> Bootstrapped without regressions on amd64-pc-solaris2.11,
> sparcv9-sun-solaris2.11, and x86_64-pc-linux-gnu.
>
> Ok for trunk?

I agree that this is an expedient way of getting things to compile on
systems that lack NAME_MAX.  I will be following through to see what the
implications of eliminating it completely would be.

It is OK for trunk.


>
>   Rainer
>
> -- 
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> 2025-04-08  Rainer Orth  
>
>   gcc/cobol:
>   PR cobol/119217
>   * symbols.h (NAME_MAX): Define fallback.
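
For reference, a minimal sketch of such a fallback.  The value 255 is
the Linux limit mentioned in the "Unreviewed COBOL patches" thread; the
committed hunk may differ:

#include <limits.h>

/* POSIX.1 allows <limits.h> to omit NAME_MAX when the value can vary
   per file (e.g. on Solaris); assume the Linux value as a fallback.  */
#ifndef NAME_MAX
#define NAME_MAX 255
#endif

char name[NAME_MAX];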



RE: [PATCH] cobol: Initialize regmatch_t portably [PR119217]

2025-05-07 Thread Robert Dubner
> -Original Message-
> From: Rainer Orth 
> Sent: Friday, April 11, 2025 04:50
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] cobol: Initialize regmatch_t portably [PR119217]
>
> The dts.h initialization of regmatch_t currently breaks Solaris
> compilation:
>
> In file included from /vol/gcc/src/hg/master/local/gcc/cobol/lexio.h:208,
>  from /vol/gcc/src/hg/master/local/gcc/cobol/lexio.cc:36:
> /vol/gcc/src/hg/master/local/gcc/cobol/dts.h: In constructor
> ‘dts::csub_match::csub_match(const char*)’:
> /vol/gcc/src/hg/master/local/gcc/cobol/dts.h:36:35: error: invalid
> conversion from ‘int’ to ‘const char*’ [-fpermissive]
>36 |   static regmatch_t empty = { -1, -1 };
>   |   ^~
>   |   |
>   |   int
>
>
> The problem is that Solaris regmatch_t has additional members before
> rm_so and rm_eo, as is always allowed by POSIX.1
>
> typedef struct {
> const char  *rm_sp, *rm_ep; /* Start pointer, end pointer */
> regoff_trm_so, rm_eo;   /* Start offset, end offset */
> int rm_ss, rm_es;   /* Used internally */
> } regmatch_t;
>
> so the initialization doesn't do what it's supposed to do.
>
> Fixed by initializing the rm_so and rm_eo members explicitly.
>
> Bootstrapped without regressions on amd64-pc-solaris2.11,
> sparcv9-sun-solaris2.11, and x86_64-pc-linux-gnu.
>
> Ok for trunk?

OK for trunk.

>
>   Rainer
>
> -- 
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
>
> 2025-04-08  Rainer Orth  
>
>   gcc/cobol:
>   PR cobol/119217
>   * dts.h (csub_match): Initialize rm_so, rm_eo fields explicitly.
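
For reference, a minimal sketch of the portable initialization.  The
helper function below is invented for illustration; the actual patch
fixes the dts.h constructor:

#include <regex.h>

// POSIX.1 allows regmatch_t to contain additional members before
// rm_so/rm_eo (as on Solaris), so positional initialization such as
// { -1, -1 } is not portable.  Assign the named members explicitly.
static regmatch_t
empty_match ()
{
  regmatch_t m {};
  m.rm_so = -1;  // start offset: no match
  m.rm_eo = -1;  // end offset: no match
  return m;
}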



Re: [PATCH 3/6] vect: Remove non-SLP path from vectorizable_reduction

2025-05-07 Thread Richard Biener
On Tue, 6 May 2025, andre.simoesdiasvie...@arm.com wrote:

> 
> This removes the non-SLP paths from vectorizable_reduction and in the
> process eliminates uses of 'ncopies' and removes
> 'get_initial_def_for_reduction'.  I have not done anything about
> STMT_VINFO_VECTYPE as I'm not sure about its uses in
> vectorizable_reduction in relation to the vectype_in/out.

That's fine I guess.  Further cleanup is always possible.

-  unsigned int group_size = 1, k;
+  unsigned int group_size, k;

I guess it's clearer to also move the declaration to where the
unconditional init now happens?

-  for (j = 0; j < ncopies; j++)
+  for (j = 0; j < 1; j++)
{
  tree new_def = copy_ssa_name (def);
  phi = create_phi_node (new_def, exit_bb);
- if (j)
+ if (0)
def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);

guess the next in the series will elide the loop and replace the
other use of 'j'.

@@ -7102,7 +7027,6 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
   tree reduc_var = gimple_phi_result (reduc_def_stmt);

   int vec_num = vec_oprnds0.length ();
-  gcc_assert (vec_num == 1);

not sure if that's the one I complained about earlier.

-  vec_initial_defs.create (ncopies);
-  for (i = 0; i < ncopies; ++i)
+  vec_initial_defs.create (1);
+  for (i = 0; i < 1; ++i)
vec_initial_defs.quick_push (vec_initial_def);

likewise.

-   the BB of the related stmt is inside this loop.  */
+   the BB of the related stmt is inside this loop.
+   TODO: Do we still need this now that
+   'get_initial_def_for_reduction' has been removed?  */

I think so, there's a "copy" for SLP called 
get_initial_defs_for_reduction, so possibly just change the reference
in the comment.

Richard.



>  gcc/tree-vect-loop.cc  | 127 +++--
>  gcc/tree-vect-stmts.cc |   4 +-
>  gcc/tree-vectorizer.h  |   5 +-
>  3 files changed, 25 insertions(+), 111 deletions(-)
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Sandiford
Richard Earnshaw  writes:
> On 07/05/2025 13:57, Richard Sandiford wrote:
>> Kyrylo Tkachov  writes:
 On 7 May 2025, at 12:27, Karl Meakin  wrote:

 Commit the test file `cmpbr.c` before rules for generating the new
 instructions are added, so that the changes in codegen are more obvious
 in the next commit.
>>>
>>> I guess that’s an LLVM best practice.
>>> In GCC since we have the check-function-bodies mechanism we usually prefer 
>>> to include the relevant test together with the patch that adds the 
>>> optimization.
>>> But this is not wrong either.
>>>
>>>

 gcc/testsuite/ChangeLog:

 * gcc.target/aarch64/cmpbr.c: New test.
 ---
 gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
 1 file changed, 1378 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c

 diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
 new file mode 100644
 index 000..728d6ead91c
 --- /dev/null
 +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
 @@ -0,0 +1,1378 @@
 +/* Test that the instructions added by FEAT_CMPBR are emitted */
 +/* { dg-do compile } */
 +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
 +/* { dg-final { check-function-bodies "**" "" "" } } */
>>>
>>> As you’ll be adding new instructions to the compiler it’d be good to
>>> have it as a dg-do assemble test where possible.
>> 
>> Agreed FWIW, but:
>> 
>>> For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use it 
>>> like so to fallback to dg-do compile when the assembler is too old:
>>> /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */
>> 
>> ...dg-do assemble for this one :)
>
> I don't think that works. If the first dg-do fails the test is just skipped.
>
> You need to replicate the test with separate dg-do directives, IIRC.

Hmm, can you remember the circumstances when you saw that?
We've been using the construct that Kyrill suggested with apparent
success in things like aarch64-sve2-acle-asm.exp.  E.g.:

/* { dg-do assemble { target aarch64_asm_sve2p1_ok } } */
/* { dg-do compile { target { ! aarch64_asm_sve2p1_ok } } } */
/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */

If I run this normally, it picks the assemble route (tests run with -c).
If I force aarch64_asm_sve2p1_ok to false by mangling the test
instruction, the test picks the compile route (tests run with -S).

Thanks,
Richard


Re: [PATCH 8/8] AArch64: rules for CMPBR instructions

2025-05-07 Thread Richard Sandiford
Richard Sandiford  writes:
>> @@ -758,6 +781,58 @@ (define_expand "cbranchcc4"
>>""
>>  )
>>  
>> +;; Emit a `CB (register)` or `CB (immediate)` instruction.
>> +(define_insn "aarch64_cb"
>> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>> +[(match_operand:GPI 1 "register_operand")
>> + (match_operand:GPI 2 "aarch64_cb_operand")])
>> +   (label_ref (match_operand 3))
>> +   (pc)))]
>> +  "TARGET_CMPBR"
>> +  "cb%m0\\t%1, %2, %l3";
>> +  [(set_attr "type" "branch")
>> +   (set (attr "length")
>> +(if_then_else (and (ge (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_N_1Kib))
>> +   (lt (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_P_1Kib)))
>> +  (const_int 4)
>> +  (const_int 8)))
>> +   (set (attr "far_branch")
>> +(if_then_else (and (ge (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_N_1Kib))
>> +   (lt (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_P_1Kib)))
>> +  (const_string "no")
>> +  (const_string "yes")))]
>> +)
>> +
>> +;; Emit a `CBB (register)` or `CBH (register)` instruction.
>> +(define_insn "aarch64_cb"
>> +  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>> +[(match_operand:SHORT 1 "register_operand")
>> + (match_operand:SHORT 2 "aarch64_cb_short_operand")])
>> +   (label_ref (match_operand 3))
>> +   (pc)))]
>> +  "TARGET_CMPBR"
>> +  "cb%m0\\t%1, %2, %l3";
>> +  [(set_attr "type" "branch")
>> +   (set (attr "length")
>> +(if_then_else (and (ge (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_N_1Kib))
>> +   (lt (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_P_1Kib)))
>> +  (const_int 4)
>> +  (const_int 8)))
>> +   (set (attr "far_branch")
>> +(if_then_else (and (ge (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_N_1Kib))
>> +   (lt (minus (match_dup 3) (pc))
>> +   (const_int BRANCH_LEN_P_1Kib)))
>> +  (const_string "no")
>> +  (const_string "yes")))]
>> +)
>> +
>
> [...]
>
> A slight wrinkle is that the CB immediate instruction requires CBLT
> rather than CBLE, etc.  IIRC, GCC canonicalises in the opposite
> direction, preferring LEU over LTU, etc.
>
> So I think we might need a custom version of aarch64_comparison_operator
> that checks whether the immediate is in the range [0, 63] for the "native"
> comparisons and an appropriate variant for the "non-native" comparisons
> (LE, GE, LEU, GEU).  The output asm section would then need to adjust
> the instruction accordingly before printing it out.

Sorry, I realised later that a match_operator along the lines I suggested
wouldn't work well in combination with the constraints.  We'd need to use
one constraint for the native operations and another constraint for the
emulated ones.

So perhaps the define_insns will need to use a code iterator instead
of a match_operator.  With a code iterator, the constraint string can
use a code attribute to choose the appropriate range.
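
For reference, a hypothetical sketch of the code-iterator shape; the
iterator name, attribute name and the "constraint" strings below are all
invented, not a proposal for the actual names:

;; One insn per comparison code; a code attribute selects the immediate
;; constraint, so the emulated codes (LE/GE/LEU/GEU) can demand the
;; adjusted range while the native codes use [0, 63] directly.
(define_code_iterator CB_CMP [eq ne gt lt gtu ltu ge le geu leu])
(define_code_attr cb_imm_constraint [(eq "native") (ne "native")
                                     (gt "native") (lt "native")
                                     (gtu "native") (ltu "native")
                                     (ge "emulated") (le "emulated")
                                     (geu "emulated") (leu "emulated")])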

Thanks,
Richard


[PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Karl Meakin
Commit the test file `cmpbr.c` before rules for generating the new
instructions are added, so that the changes in codegen are more obvious
in the next commit.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/cmpbr.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
 1 file changed, 1378 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c

diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
new file mode 100644
index 000..728d6ead91c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
@@ -0,0 +1,1378 @@
+/* Test that the instructions added by FEAT_CMPBR are emitted */
+/* { dg-do compile } */
+/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef int8_t i8;
+
+typedef uint16_t u16;
+typedef int16_t i16;
+
+typedef uint32_t u32;
+typedef int32_t i32;
+
+typedef uint64_t u64;
+typedef int64_t i64;
+
+int taken();
+int not_taken();
+
+#define COMPARE(ty, name, op, rhs)                                            \
+  int ty##_x0_##name##_##rhs(ty x0, ty x1) {                                  \
+    return (x0 op rhs) ? taken() : not_taken();                               \
+  }
+
+#define COMPARE_ALL(unsigned_ty, signed_ty, rhs)                              \
+  COMPARE(unsigned_ty, eq, ==, rhs);                                          \
+  COMPARE(unsigned_ty, ne, !=, rhs);                                          \
+                                                                              \
+  COMPARE(unsigned_ty, ult, <, rhs);                                          \
+  COMPARE(unsigned_ty, ule, <=, rhs);                                         \
+  COMPARE(unsigned_ty, ugt, >, rhs);                                          \
+  COMPARE(unsigned_ty, uge, >=, rhs);                                         \
+                                                                              \
+  COMPARE(signed_ty, slt, <, rhs);                                            \
+  COMPARE(signed_ty, sle, <=, rhs);                                           \
+  COMPARE(signed_ty, sgt, >, rhs);                                            \
+  COMPARE(signed_ty, sge, >=, rhs);
+
+//  CBB (register) 
+COMPARE_ALL(u8, i8, x1);
+
+//  CBH (register) 
+COMPARE_ALL(u16, i16, x1);
+
+//  CB (register) 
+COMPARE_ALL(u32, i32, x1);
+COMPARE_ALL(u64, i64, x1);
+
+//  CB (immediate) 
+COMPARE_ALL(u32, i32, 42);
+COMPARE_ALL(u64, i64, 42);
+
+//  Special cases 
+// CBB and CBH cannot have immediate operands. Instead we have to do a MOV+CB
+COMPARE_ALL(u8, i8, 42);
+COMPARE_ALL(u16, i16, 42);
+
+// 65 is out of the range for immediate operands (0 to 63).
+// * For 8/16-bit types, use a MOV+CB as above.
+// * For 32/64-bit types, use a CMP+B instead, because
+//   B has a longer range than CB.
+COMPARE_ALL(u8, i8, 65);
+COMPARE_ALL(u16, i16, 65);
+COMPARE_ALL(u32, i32, 65);
+COMPARE_ALL(u64, i64, 65);
+
+// Comparisons against zero can use the wzr/xzr register.
+COMPARE_ALL(u8, i8, 0);
+COMPARE_ALL(u16, i16, 0);
+COMPARE_ALL(u32, i32, 0);
+COMPARE_ALL(u64, i64, 0);
+
+/*
+** u8_x0_eq_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** beq .L4
+** b   not_taken
+** b   taken
+*/
+
+/*
+** u8_x0_ne_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** beq .L6
+** b   taken
+** b   not_taken
+*/
+
+/*
+** u8_x0_ult_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** bls .L8
+** b   taken
+** b   not_taken
+*/
+
+/*
+** u8_x0_ule_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** bcc .L10
+** b   taken
+** b   not_taken
+*/
+
+/*
+** u8_x0_ugt_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** bcs .L12
+** b   taken
+** b   not_taken
+*/
+
+/*
+** u8_x0_uge_x1:
+** and w1, w1, 255
+** cmp w1, w0, uxtb
+** bhi .L14
+** b   taken
+** b   not_taken
+*/
+
+/*
+** i8_x0_slt_x1:
+** sxtb w1, w1
+** cmp w1, w0, sxtb
+** ble .L16
+** b   taken
+** b   not_taken
+*/
+
+/*
+** i8_x0_sle_x1:
+** sxtb w1, w1
+** cmp w1, w0, sxtb
+** blt .L18
+** b   taken
+** b   not_taken
+*/
+
+/*
+** i8_x0_sgt_x1:
+** sxtb w1, w1
+** cmp w1, w0, sxtb
+** bge .L20
+** b   taken
+** b   not_taken
+*/
+
+/*
+** i8_x0_sge_x1:
+** sxtb w1, w1
+** cmp w1, w0, sxtb
+** bgt .L22
+** b   taken
+** b   not_taken
+*/
+
+/*
+** u16_x0_eq_x1:
+** and w1, w1, 65535
+** cmp w1, w0, uxth
+** beq .L25
+** b   not_taken
+** b   taken
+*/
+
+/*
+** u16_x0_ne_x1:
+** and 

[PATCH 2/8] AArch64: reformat branch instruction rules

2025-05-07 Thread Karl Meakin
Make the formatting of the RTL templates in the rules for branch
instructions more consistent with each other.

gcc/ChangeLog:

* config/aarch64/aarch64.md (cbranch4): reformat.
(cbranchcc4): likewise.
(condjump): likewise.
(*compare_condjump): likewise.
(aarch64_cb1): likewise.
(*cb1): likewise.
(tbranch_3): likewise.
(@aarch64_tb): likewise.
---
 gcc/config/aarch64/aarch64.md | 82 ++-
 1 file changed, 42 insertions(+), 40 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 4d556d886bc..45b2283c5c0 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -706,7 +706,7 @@ (define_expand "cbranch4"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
[(match_operand:GPI 1 "register_operand")
 (match_operand:GPI 2 "aarch64_plus_operand")])
-  (label_ref (match_operand 3 "" ""))
+  (label_ref (match_operand 3))
   (pc)))]
   ""
   "
@@ -717,34 +717,34 @@ (define_expand "cbranch4"
 )
 
 (define_expand "cbranch4"
-  [(set (pc) (if_then_else
-   (match_operator 0 "aarch64_comparison_operator"
-[(match_operand:GPF_F16 1 "register_operand")
- (match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
-   (label_ref (match_operand 3 "" ""))
-   (pc)))]
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand:GPF_F16 1 "register_operand")
+(match_operand:GPF_F16 2 "aarch64_fp_compare_operand")])
+  (label_ref (match_operand 3))
+  (pc)))]
   ""
-  "
+  {
   operands[1] = aarch64_gen_compare_reg (GET_CODE (operands[0]), operands[1],
 operands[2]);
   operands[2] = const0_rtx;
-  "
+  }
 )
 
 (define_expand "cbranchcc4"
-  [(set (pc) (if_then_else
- (match_operator 0 "aarch64_comparison_operator"
-  [(match_operand 1 "cc_register")
-   (match_operand 2 "const0_operand")])
- (label_ref (match_operand 3 "" ""))
- (pc)))]
+  [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
+   [(match_operand 1 "cc_register")
+(match_operand 2 "const0_operand")])
+  (label_ref (match_operand 3))
+  (pc)))]
   ""
-  "")
+  ""
+)
 
 (define_insn "condjump"
   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
-   [(match_operand 1 "cc_register" "") (const_int 0)])
-  (label_ref (match_operand 2 "" ""))
+   [(match_operand 1 "cc_register")
+(const_int 0)])
+  (label_ref (match_operand 2))
   (pc)))]
   ""
   {
@@ -782,9 +782,9 @@ (define_insn "condjump"
 ;; b .Label
 (define_insn_and_split "*compare_condjump"
   [(set (pc) (if_then_else (EQL
- (match_operand:GPI 0 "register_operand" "r")
- (match_operand:GPI 1 "aarch64_imm24" "n"))
-  (label_ref:P (match_operand 2 "" ""))
+(match_operand:GPI 0 "register_operand" "r")
+(match_operand:GPI 1 "aarch64_imm24" "n"))
+  (label_ref:P (match_operand 2))
   (pc)))]
   "!aarch64_move_imm (INTVAL (operands[1]), mode)
&& !aarch64_plus_operand (operands[1], mode)
@@ -807,9 +807,10 @@ (define_insn_and_split "*compare_condjump"
 )
 
 (define_insn "aarch64_cb1"
-  [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
-   (const_int 0))
-  (label_ref (match_operand 1 "" ""))
+  [(set (pc) (if_then_else (EQL
+(match_operand:GPI 0 "register_operand" "r")
+(const_int 0))
+  (label_ref (match_operand 1))
   (pc)))]
   "!aarch64_track_speculation"
   {
@@ -832,9 +833,10 @@ (define_insn "aarch64_cb1"
 )
 
 (define_insn "*cb1"
-  [(set (pc) (if_then_else (LTGE (match_operand:ALLI 0 "register_operand" "r")
-(const_int 0))
-  (label_ref (match_operand 1 "" ""))
+  [(set (pc) (if_then_else (LTGE
+(match_operand:ALLI 0 "register_operand" "r")
+(const_int 0))
+  (label_ref (match_operand 1))
   (pc)))
(clobber (reg:CC CC_REGNUM))]
   "!aarch64_track_speculation"
@@ -875,11 +877,11 @@ (define_insn "*c

[PATCH 6/8] AArch64: recognize `+cmpbr` option

2025-05-07 Thread Karl Meakin
Add the `+cmpbr` option to enable the FEAT_CMPBR architectural
extension.

gcc/ChangeLog:

* config/aarch64/aarch64-option-extensions.def (cmpbr): new
option.
* config/aarch64/aarch64.h (TARGET_CMPBR): new macro.
* doc/invoke.texi (cmpbr): new option.
---
 gcc/config/aarch64/aarch64-option-extensions.def | 2 ++
 gcc/config/aarch64/aarch64.h | 3 +++
 gcc/doc/invoke.texi  | 3 +++
 3 files changed, 8 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index dbbb021f05a..1c3e69799f5 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -249,6 +249,8 @@ AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "mops")
 
 AARCH64_OPT_EXTENSION("cssc", CSSC, (), (), (), "cssc")
 
+AARCH64_OPT_EXTENSION("cmpbr", CMPBR, (), (), (), "cmpbr")
+
 AARCH64_OPT_EXTENSION("lse128", LSE128, (LSE), (), (), "lse128")
 
 AARCH64_OPT_EXTENSION("d128", D128, (LSE128), (), (), "d128")
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e8bd8c73c12..d5c4a42e96d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -410,6 +410,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 /* CSSC instructions are enabled through +cssc.  */
 #define TARGET_CSSC AARCH64_HAVE_ISA (CSSC)
 
+/* CB instructions are enabled through +cmpbr.  */
+#define TARGET_CMPBR AARCH64_HAVE_ISA (CMPBR)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 32bc45725de..3f05e5e0e34 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22252,6 +22252,9 @@ Enable the FlagM2 flag conversion instructions.
 Enable the Pointer Authentication Extension.
 @item cssc
 Enable the Common Short Sequence Compression instructions.
+@item cmpbr
+Enable the shorter compare and branch instructions, @code{cbb}, @code{cbh} and
+@code{cb}.
 @item sme
 Enable the Scalable Matrix Extension.  This is only supported when SVE2 is also
 enabled.
-- 
2.45.2



[PATCH 4/8] AArch64: add constants for branch displacements

2025-05-07 Thread Karl Meakin
Extract the hardcoded values for the minimum PC-relative displacements
into named constants and document them.

gcc/ChangeLog:

* config/aarch64/aarch64.md (BRANCH_LEN_P_128MiB): New constant.
(BRANCH_LEN_N_128MiB): likewise.
(BRANCH_LEN_P_1MiB): likewise.
(BRANCH_LEN_N_1MiB): likewise.
(BRANCH_LEN_P_32KiB): likewise.
(BRANCH_LEN_N_32KiB): likewise.
(BRANCH_LEN_P_1KiB): likewise.
(BRANCH_LEN_N_1KiB): likewise.
---
 gcc/config/aarch64/aarch64.md | 68 ++-
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 23775ec58ca..ca5bd96a754 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -696,7 +696,27 @@ (define_insn "jump"
   [(set_attr "type" "branch")]
 )
 
+;; Maximum PC-relative positive/negative displacements for various branching
+;; instructions.
+(define_constants
+  [
+;; +/- 128MiB.  Used by B, BL.
+(BRANCH_LEN_P_128Mib  134217724)
+(BRANCH_LEN_N_128Mib -134217728)
+
+;; +/- 1MiB.  Used by B., CBZ, CBNZ.
+(BRANCH_LEN_P_1Mib  1048572)
+(BRANCH_LEN_N_1Mib -1048576)
 
+;; +/- 32KiB.  Used by TBZ, TBNZ.
+(BRANCH_LEN_P_32Kib  32764)
+(BRANCH_LEN_N_32Kib -32768)
+
+;; +/- 1KiB.  Used by CBB, CBH, CB.
+(BRANCH_LEN_P_1Kib  1020)
+(BRANCH_LEN_N_1Kib -1024)
+  ]
+)
 
 ;; ---
 ;; Conditional jumps
@@ -760,13 +780,17 @@ (define_insn "aarch64_bcond"
   }
   [(set_attr "type" "branch")
(set (attr "length")
-   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
-  (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
+   (if_then_else (and (ge (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_N_1Mib))
+  (lt (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_P_1Mib)))
  (const_int 4)
  (const_int 8)))
(set (attr "far_branch")
-   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
-  (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
+   (if_then_else (and (ge (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_N_1Mib))
+  (lt (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_P_1Mib)))
  (const_int 0)
  (const_int 1)))]
 )
@@ -823,13 +847,17 @@ (define_insn "aarch64_cb1"
   }
   [(set_attr "type" "branch")
(set (attr "length")
-   (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
-  (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
+   (if_then_else (and (ge (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_N_1Mib))
+  (lt (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_P_1Mib)))
  (const_int 4)
  (const_int 8)))
(set (attr "far_branch")
-   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
-  (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
+   (if_then_else (and (ge (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_N_1Mib))
+  (lt (minus (match_dup 2) (pc))
+  (const_int BRANCH_LEN_P_1Mib)))
  (const_int 0)
  (const_int 1)))]
 )
@@ -864,13 +892,17 @@ (define_insn "*cb1"
   }
   [(set_attr "type" "branch")
(set (attr "length")
-   (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
-  (lt (minus (match_dup 1) (pc)) (const_int 32764)))
+   (if_then_else (and (ge (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_N_32Kib))
+  (lt (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_P_32Kib)))
  (const_int 4)
  (const_int 8)))
(set (attr "far_branch")
-   (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
-  (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
+   (if_then_else (and (ge (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_N_1Mib))
+  (lt (minus (match_dup 1) (pc))
+  (const_int BRANCH_LEN_P_1Mib)))
  (const_int 0)
  (const_int 1)))]
 )
@@ -925,13 +957,17 @@ (define_insn "@aarch64_tb"
   }
   [(set_attr "type" "branch")
(set (attr "length")
-   (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
- 

[COMMITTED,PATCH] s390: Add cstoreti4 expander

2025-05-07 Thread Stefan Schulze Frielinghaus
For target VXE3 just emit a 128-bit comparison followed by a conditional
load.  For targets prior to VXE3, emulate the 128-bit comparison and make
use of a conditional load, too.

gcc/ChangeLog:

* config/s390/s390-protos.h (s390_expand_cstoreti4): New
function.
* config/s390/s390.cc (s390_expand_cstoreti4): New function.
* config/s390/s390.md (CC_SUZ): New mode iterator.
(l): New mode attribute.
(cc_tolower): New mode attribute.
* config/s390/vector.md (cstoreti4): New expander.
(*vec_cmpv2di_lane0_): New insn.
(*vec_cmpti_): New insn.

gcc/testsuite/ChangeLog:

* gcc.target/s390/vector/cstoreti-1.c: New test.
* gcc.target/s390/vector/cstoreti-2.c: New test.
---
 gcc/config/s390/s390-protos.h |   1 +
 gcc/config/s390/s390.cc   |  82 ++-
 gcc/config/s390/s390.md   |   4 +
 gcc/config/s390/vector.md |  30 +
 .../gcc.target/s390/vector/cstoreti-1.c   | 127 ++
 .../gcc.target/s390/vector/cstoreti-2.c   |  25 
 6 files changed, 266 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/cstoreti-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/vector/cstoreti-2.c
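
For reference, the kind of source the new expander handles is a 128-bit
comparison whose boolean result is stored rather than branched on, e.g.
(a hypothetical example, not necessarily the committed test):

/* cstoreti4 expands the materialization of a TImode comparison
   result as 0/1.  */
int
cmp_gt (__int128 a, __int128 b)
{
  return a > b;
}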

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index e8c7f830849..d760a7e20ff 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -114,6 +114,7 @@ extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
 extern void s390_expand_vec_strlen (rtx, rtx, rtx);
 extern void s390_expand_vec_movstr (rtx, rtx, rtx);
 extern bool s390_expand_addcc (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
+extern void s390_expand_cstoreti4 (rtx, rtx, rtx, rtx);
 extern bool s390_expand_insv (rtx, rtx, rtx, rtx);
 extern void s390_expand_cs (machine_mode, rtx, rtx, rtx, rtx, rtx, bool);
 extern void s390_expand_atomic_exchange_tdsi (rtx, rtx, rtx);
diff --git a/gcc/config/s390/s390.cc b/gcc/config/s390/s390.cc
index e3edf859513..2d44cecfeed 100644
--- a/gcc/config/s390/s390.cc
+++ b/gcc/config/s390/s390.cc
@@ -7210,6 +7210,82 @@ s390_expand_mask_and_shift (rtx val, machine_mode mode, rtx count)
  NULL_RTX, 1, OPTAB_DIRECT);
 }
 
+/* Expand optab cstoreti4.  */
+
+void
+s390_expand_cstoreti4 (rtx dst, rtx cmp, rtx op1, rtx op2)
+{
+  rtx_code code = GET_CODE (cmp);
+
+  if (TARGET_VXE3)
+{
+  rtx cond = s390_emit_compare (GET_MODE (cmp), code, op1, op2);
+  emit_insn (gen_movsicc (dst, cond, const1_rtx, const0_rtx));
+  return;
+}
+
+  /* Prior to VXE3 emulate the comparison.  For an (in)equality test exploit
+     VECTOR COMPARE EQUAL.  For a relational test, first compare the high part
+     via VECTOR ELEMENT COMPARE (LOGICAL).  If the high part does not equal,
+     then consume the CC immediately by a subsequent LOAD ON CONDITION.
+     Otherwise, if the high part equals, then perform a subsequent VECTOR
+     COMPARE HIGH LOGICAL followed by a LOAD ON CONDITION.  */
+
+  op1 = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op1, TImode, 0));
+  op2 = force_reg (V2DImode, simplify_gen_subreg (V2DImode, op2, TImode, 0));
+
+  if (code == EQ || code == NE)
+{
+  s390_expand_vec_compare_cc (dst, code, op1, op2, code == EQ);
+  return;
+}
+
+  /* Normalize code into either GE(U) or GT(U).  */
+  if (code == LT || code == LE || code == LTU || code == LEU)
+{
+  std::swap (op1, op2);
+  code = swap_condition (code);
+}
+
+  /* For (un)signed comparisons
+ - high(op1) >= high(op2) instruction VECG op1, op2 sets CC1
+   if the relation does _not_ hold.
+ - high(op1) >  high(op2) instruction VECG op2, op1 sets CC1
+   if the relation holds.  */
+  if (code == GT || code == GTU)
+std::swap (op1, op2);
+  machine_mode cc_mode = (code == GEU || code == GTU) ? CCUmode : CCSmode;
+  rtx lane0 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+  emit_insn (
+    gen_rtx_SET (gen_rtx_REG (cc_mode, CC_REGNUM),
+                 gen_rtx_COMPARE (cc_mode,
+                                  gen_rtx_VEC_SELECT (DImode, op1, lane0),
+                                  gen_rtx_VEC_SELECT (DImode, op2, lane0))));
+  rtx ccs_reg = gen_rtx_REG (CCSmode, CC_REGNUM);
+  rtx lab = gen_label_rtx ();
+  s390_emit_jump (lab, gen_rtx_NE (VOIDmode, ccs_reg, const0_rtx));
+  /* At this point we have that high(op1) == high(op2).  Thus, test the low
+ part, now.  For unsigned comparisons
+ - low(op1) >= low(op2) instruction VCHLGS op2, op1 sets CC1
+   if the relation does _not_ hold.
+ - low(op1) >  low(op2) instruction VCHLGS op1, op2 sets CC1
+   if the relation holds.  */
+  std::swap (op1, op2);
+  emit_insn (gen_rtx_PARALLEL (
+VOIDmode,
+gen_rtvec (2,
+  gen_rtx_SET (gen_rtx_REG (CCVIHUmode, CC_REGNUM),
+   gen_rtx_COMPARE (CCVIHUmode, op1, op2)),

Re: [PATCH] RISC-V: Add pattern for vector-scalar multiply-add/sub [PR119100]

2025-05-07 Thread Robin Dapp
Thanks Jeff. I will rebase and update my patch. One question though, I 
noticed that Pan's patch introduced a command-line parameter to tweak the 
GR2VR cost; do we need something equivalent for FR2VR?


Yes, we need it in order to be able to test both paths, i.e. combining and not 
combining.  Also make sure to test with multiple types and situations as in 
Pan's patch.


--
Regards
Robin



[PATCH 5/8] AArch64: make `far_branch` attribute a boolean

2025-05-07 Thread Karl Meakin
The `far_branch` attribute only ever takes the values 0 or 1, so make it
a `no/yes` valued string attribute instead.

gcc/ChangeLog:

* config/aarch64/aarch64.md (far_branch): replace 0/1 with
no/yes.
(aarch64_bcond): handle rename.
(aarch64_cb1): likewise.
(*cb1): likewise.
(@aarch64_tb): likewise.
---
 gcc/config/aarch64/aarch64.md | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index ca5bd96a754..256df0dcc04 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -561,9 +561,7 @@ (define_attr "enabled" "no,yes"
 ;; Attribute that specifies whether we are dealing with a branch to a
 ;; label that is far away, i.e. further away than the maximum/minimum
 ;; representable in a signed 21-bits number.
-;; 0 :=: no
-;; 1 :=: yes
-(define_attr "far_branch" "" (const_int 0))
+(define_attr "far_branch" "no,yes" (const_string "no"))
 
 ;; Attribute that specifies whether the alternative uses MOVPRFX.
 (define_attr "movprfx" "no,yes" (const_string "no"))
@@ -791,8 +789,8 @@ (define_insn "aarch64_bcond"
   (const_int BRANCH_LEN_N_1Mib))
   (lt (minus (match_dup 2) (pc))
   (const_int BRANCH_LEN_P_1Mib)))
- (const_int 0)
- (const_int 1)))]
+ (const_string "no")
+ (const_string "yes")))]
 )
 
 ;; For a 24-bit immediate CST we can optimize the compare for equality
@@ -858,8 +856,8 @@ (define_insn "aarch64_cb1"
   (const_int BRANCH_LEN_N_1Mib))
   (lt (minus (match_dup 2) (pc))
   (const_int BRANCH_LEN_P_1Mib)))
- (const_int 0)
- (const_int 1)))]
+ (const_string "no")
+ (const_string "yes")))]
 )
 
 ;; For an LT/GE comparison against zero, emit `TBZ`/`TBNZ`
@@ -874,7 +872,7 @@ (define_insn "*cb1"
   {
 if (get_attr_length (insn) == 8)
   {
-   if (get_attr_far_branch (insn) == 1)
+   if (get_attr_far_branch (insn) == FAR_BRANCH_YES)
  return aarch64_gen_far_branch (operands, 1, "Ltb",
 "\\t%0, , ");
else
@@ -903,8 +901,8 @@ (define_insn "*cb1"
   (const_int BRANCH_LEN_N_1Mib))
   (lt (minus (match_dup 1) (pc))
   (const_int BRANCH_LEN_P_1Mib)))
- (const_int 0)
- (const_int 1)))]
+ (const_string "no")
+ (const_string "yes")))]
 )
 
 ;; ---
@@ -968,8 +966,8 @@ (define_insn "@aarch64_tb"
   (const_int BRANCH_LEN_N_1Mib))
   (lt (minus (match_dup 2) (pc))
   (const_int BRANCH_LEN_P_1Mib)))
- (const_int 0)
- (const_int 1)))]
+ (const_string "no")
+ (const_string "yes")))]
 
 )
 
-- 
2.45.2



Re: [PATCH 2/6] vect: Remove non-SLP path from vectorizable_reduction

2025-05-07 Thread Richard Biener
On Tue, 6 May 2025, andre.simoesdiasvie...@arm.com wrote:

> 
> Prunes code from the trivial true/false conditions.

-  gcc_assert (vec_num == 1 || 1);
+  gcc_assert (vec_num == 1);

that looks like a now stronger assert, pruning would have removed
the assert instead?

   if (ncopies > 1
-  && (0
- || (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
- && SLP_TREE_LANES (slp_node) == 1))
+  && (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
+ && SLP_TREE_LANES (slp_node) == 1)

the extra parens around the "inner" && are no longer needed; the
conditions can be brought in-line with the outer &&.


> ---
>  gcc/tree-vect-loop.cc | 540 --
>  1 file changed, 155 insertions(+), 385 deletions(-)
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


[committed] libstdc++: Fix module std export for std::extents

2025-05-07 Thread Jonathan Wakely
libstdc++-v3/ChangeLog:

* src/c++23/std.cc.in: Fix export for std::extents.
---

Lightly tested x86_64-linux. Pushed to trunk.

 libstdc++-v3/src/c++23/std.cc.in | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libstdc++-v3/src/c++23/std.cc.in b/libstdc++-v3/src/c++23/std.cc.in
index 0df27cd7e7da..73316d168c94 100644
--- a/libstdc++-v3/src/c++23/std.cc.in
+++ b/libstdc++-v3/src/c++23/std.cc.in
@@ -1834,10 +1834,13 @@ export namespace std
 }
 
 // <mdspan>
+#if __glibcxx_mdspan
+export namespace std
 {
   using std::extents;
   // FIXME layout_*, default_accessor and mdspan
 }
+#endif
 
 // 20.2 
 export namespace std
-- 
2.49.0



[PATCH] libstdc++: Use _Padding_sink in __formatter_chrono to produce padded output.

2025-05-07 Thread Tomasz Kamiński
Formatting code is extracted to an _M_format_to function that produces
output to the specified iterator. This function is now invoked either with
__fc.out() directly (if width is not specified) or with _Padding_sink::out().

This avoids formatting to a temporary string if no padding is requested,
and minimizes allocations otherwise. For more details see the commit
message of r16-142-g01e5ef3e8b91288f5d387a27708f9f8979a50edf.

This should not increase the number of instantiations, as the implementation
only produces basic_format_context with _Sink_iter as iterator, which is also
the _Padding_sink iterator.

libstdc++-v3/ChangeLog:

* include/bits/chrono_io.h (__formatter_chrono::_M_format_to):
Extracted from _M_format.
(__formatter_chrono::_M_format): Use _Padding_sink and delegate
to _M_format_to.
---
I have checked that there are no other calls to out() in this file,
so _M_format_to uses only __out, and not the iterator from __fc.
Testing on x86_64-linux. OK for trunk?

 libstdc++-v3/include/bits/chrono_io.h | 55 ++-
 1 file changed, 20 insertions(+), 35 deletions(-)

diff --git a/libstdc++-v3/include/bits/chrono_io.h b/libstdc++-v3/include/bits/chrono_io.h
index 620227a9f35..ace8b9f2629 100644
--- a/libstdc++-v3/include/bits/chrono_io.h
+++ b/libstdc++-v3/include/bits/chrono_io.h
@@ -503,9 +503,7 @@ namespace __format
_M_format(const _Tp& __t, _FormatContext& __fc,
  bool __is_neg = false) const
{
- auto __first = _M_spec._M_chrono_specs.begin();
- const auto __last = _M_spec._M_chrono_specs.end();
- if (__first == __last)
+ if (_M_spec._M_chrono_specs.empty())
return _M_format_to_ostream(__t, __fc, __is_neg);
 
 #if defined _GLIBCXX_USE_NL_LANGINFO_L && __CHAR_BIT__ == 8
@@ -525,29 +523,29 @@ namespace __format
__fc._M_loc =  __with_encoding_conversion(__loc);
}
 #endif
-
- _Sink_iter<_CharT> __out;
- __format::_Str_sink<_CharT> __sink;
- bool __write_direct = false;
- if constexpr (is_same_v>)
-   {
- if (_M_spec._M_width_kind == __format::_WP_none)
-   {
- __out = __fc.out();
- __write_direct = true;
-   }
- else
-   __out = __sink.out();
-   }
- else
-   __out = __sink.out();
-
  // formatter passes the correct value of __is_neg
  // for durations but for hh_mm_ss we decide it here.
  if constexpr (__is_specialization_of<_Tp, chrono::hh_mm_ss>)
__is_neg = __t.is_negative();
 
+ const size_t __padwidth = _M_spec._M_get_width(__fc);
+ if (__padwidth == 0)
+   return _M_format_to(__t, __fc.out(), __fc, __is_neg);
+
+ using _Out = typename _FormatContext::iterator;
+ _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth);
+ _M_format_to(__t, __sink.out(), __fc, __is_neg);
+ return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
+   }
+
+  template<typename _Tp, typename _Out, typename _FormatContext>
+   _Out
+   _M_format_to(const _Tp& __t, _Out __out, _FormatContext& __fc,
+bool __is_neg) const
+   {
+ auto __first = _M_spec._M_chrono_specs.begin();
+ const auto __last = _M_spec._M_chrono_specs.end();
+
  auto __print_sign = [&__is_neg, &__out] {
if constexpr (chrono::__is_duration_v<_Tp>
|| __is_specialization_of<_Tp, chrono::hh_mm_ss>)
@@ -699,20 +697,7 @@ namespace __format
}
}
  while (__first != __last);
-
- if constexpr (is_same_v>)
-   if (__write_direct)
- return __out;
-
- auto __str = __sink.view();
- size_t __width;
- if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
-   __width = __unicode::__field_width(__str);
- else
-   __width = __str.size();
- return __format::__write_padded_as_spec(__str, __width,
- __fc, _M_spec);
+ return std::move(__out);
}
 
   _ChronoSpec<_CharT> _M_spec;
-- 
2.49.0



Re: [PATCH 4/8] AArch64: add constants for branch displacements

2025-05-07 Thread Kyrylo Tkachov



> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Extract the hardcoded values for the minimum PC-relative displacements
> into named constants and document them.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (BRANCH_LEN_P_128MiB): New constant.
> (BRANCH_LEN_N_128MiB): likewise.
> (BRANCH_LEN_P_1MiB): likewise.
> (BRANCH_LEN_N_1MiB): likewise.
> (BRANCH_LEN_P_32KiB): likewise.
> (BRANCH_LEN_N_32KiB): likewise.
> (BRANCH_LEN_P_1KiB): likewise.
> (BRANCH_LEN_N_1KiB): likewise.

Ok.
Thanks,
Kyrill


> ---
> gcc/config/aarch64/aarch64.md | 68 ++-
> 1 file changed, 52 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 23775ec58ca..ca5bd96a754 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -696,7 +696,27 @@ (define_insn "jump"
>   [(set_attr "type" "branch")]
> )
> 
> +;; Maximum PC-relative positive/negative displacements for various branching
> +;; instructions.
> +(define_constants
> +  [
> +;; +/- 128MiB.  Used by B, BL.
> +(BRANCH_LEN_P_128Mib  134217724)
> +(BRANCH_LEN_N_128Mib -134217728)
> +
> +;; +/- 1MiB.  Used by B., CBZ, CBNZ.
> +(BRANCH_LEN_P_1Mib  1048572)
> +(BRANCH_LEN_N_1Mib -1048576)
> 
> +;; +/- 32KiB.  Used by TBZ, TBNZ.
> +(BRANCH_LEN_P_32Kib  32764)
> +(BRANCH_LEN_N_32Kib -32768)
> +
> +;; +/- 1KiB.  Used by CBB, CBH, CB.
> +(BRANCH_LEN_P_1Kib  1020)
> +(BRANCH_LEN_N_1Kib -1024)
> +  ]
> +)
> 
> ;; ---
> ;; Conditional jumps
> @@ -760,13 +780,17 @@ (define_insn "aarch64_bcond"
>   }
>   [(set_attr "type" "branch")
>(set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>  (const_int 4)
>  (const_int 8)))
>(set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>  (const_int 0)
>  (const_int 1)))]
> )
> @@ -823,13 +847,17 @@ (define_insn "aarch64_cb1"
>   }
>   [(set_attr "type" "branch")
>(set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>  (const_int 4)
>  (const_int 8)))
>(set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>  (const_int 0)
>  (const_int 1)))]
> )
> @@ -864,13 +892,17 @@ (define_insn "*cb1"
>   }
>   [(set_attr "type" "branch")
>(set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
> -   (lt (minus (match_dup 1) (pc)) (const_int 32764)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_N_32Kib))
> +   (lt (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_P_32Kib)))
>  (const_int 4)
>  (const_int 8)))
>(set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 1) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 1) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>  (const_int 0)
>  (const_int 1)))]
> )
> @@ -925,13 +957,17 @@ (define_insn "@aarch64_tb"
>   }
>   [(set_attr "type" "branch")
>(set (attr "length")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
> -   (lt (minus (match_dup 2) (pc)) (const_int 32764)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_N_32Kib))
> +   (lt (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_P_32Kib)))
>  (const_int 4)
>  (const_int 8)))
>(set (attr "far_branch")
> - (if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -1048576))
> -   (lt (minus (match_dup 2) (pc)) (const_int 1048572)))
> + (if_then_else (and (ge (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_N_1Mib))
> +   (lt (minus (match_dup 2) (pc))
> +   (const_int BRANCH_LEN_P_1Mib)))
>   

RE: [PATCH] libgcobol: Heed --enable-libgcobol

2025-05-07 Thread Robert Dubner



> -Original Message-
> From: Rainer Orth 
> Sent: Friday, April 11, 2025 05:26
> To: gcc-patches@gcc.gnu.org
> Cc: Robert Dubner ; James K. Lowden
> 
> Subject: [PATCH] libgcobol: Heed --enable-libgcobol
>
> If some target isn't listed as supported in configure.tgt,
> --enable-libgcobol cannot override that.  However, that's what should
> happen just like an explicit --enable-languages=cobol forces the
> frontend to be built.
>
> This patch, shamelessly adapted from libphobos, does just that.
>
> Tested on amd64-pc-solaris2.11, sparcv9-sun-solaris2.11, and
> x86_64-pc-linux-gnu.
>
> Ok for trunk?

I was unable to apply this patch.  "git apply ..." results in

:~/repos/gcc-cobol$ git apply libgcobol-enable-libgcobol.patch
error: patch failed: libgcobol/configure:788
error: libgcobol/configure: patch does not apply
error: patch failed: libgcobol/configure.ac:40
error: libgcobol/configure.ac: patch does not apply

I don't understand the problem, but I don't know much about how diff and 
apply work.

I have no way of checking the Solaris part of it; I was just trying to do
"due diligence", and check that it didn't adversely affect x86_64-linux-gnu.

But since I can't do that, all I can say is, I see no reason for you not to 
apply a patch you know works.

Bob D.


>
>   Rainer
>
> -- 
> Rainer Orth, Center for Biotechnology, Bielefeld University
>
> 2025-04-08  Rainer Orth  
>
>   libgcobol:
>   * configure.ac: Handle --enable-libgcobol.
>   Let it override LIBGCOBOL_SUPPORTED.
>   * configure: Regenerate.



[PATCH 09/13] arm: remove iwmmxt-related attributes from machine description

2025-05-07 Thread Richard Earnshaw
Since we no longer have any iwmmxt instructions, the iwmmxt-related
attributes can never be set.  Consequently, the marvell-f-iwmmxt
scheduler is redundant as none of the pipes are ever used now.

gcc/ChangeLog:

* config/arm/arm.md (core_cycles): Remove iwmmxt attributes.
* config/arm/types.md (autodetect_type): Likewise.
* config/arm/marvell-f-iwmmxt.md: Removed.
* config/arm/t-arm: Remove marvell-f-iwmmxt.md
---
 gcc/config/arm/arm.md  |  14 +--
 gcc/config/arm/marvell-f-iwmmxt.md | 189 -
 gcc/config/arm/t-arm   |   1 -
 gcc/config/arm/types.md| 123 ---
 4 files changed, 1 insertion(+), 326 deletions(-)
 delete mode 100644 gcc/config/arm/marvell-f-iwmmxt.md

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index ce1b987b241..7cbff8d3b60 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -358,18 +358,7 @@ (define_attr "core_cycles" "single,multi"
 alus_ext, alus_imm, alus_sreg,\
 alus_shift_imm, alus_shift_reg, bfm, csel, rev, logic_imm, logic_reg,\
 logic_shift_imm, logic_shift_reg, logics_imm, logics_reg,\
-logics_shift_imm, logics_shift_reg, extend, shift_imm, float, fcsel,\
-wmmx_wor, wmmx_wxor, wmmx_wand, wmmx_wandn, wmmx_wmov, wmmx_tmcrr,\
-wmmx_tmrrc, wmmx_wldr, wmmx_wstr, wmmx_tmcr, wmmx_tmrc, wmmx_wadd,\
-wmmx_wsub, wmmx_wmul, wmmx_wmac, wmmx_wavg2, wmmx_tinsr, wmmx_textrm,\
-wmmx_wshufh, wmmx_wcmpeq, wmmx_wcmpgt, wmmx_wmax, wmmx_wmin, wmmx_wpack,\
-wmmx_wunpckih, wmmx_wunpckil, wmmx_wunpckeh, wmmx_wunpckel, wmmx_wror,\
-wmmx_wsra, wmmx_wsrl, wmmx_wsll, wmmx_wmadd, wmmx_tmia, wmmx_tmiaph,\
-wmmx_tmiaxy, wmmx_tbcst, wmmx_tmovmsk, wmmx_wacc, wmmx_waligni,\
-wmmx_walignr, wmmx_tandc, wmmx_textrc, wmmx_torc, wmmx_torvsc, wmmx_wsad,\
-wmmx_wabs, wmmx_wabsdiff, wmmx_waddsubhx, wmmx_wsubaddhx, wmmx_wavg4,\
-wmmx_wmulw, wmmx_wqmulm, wmmx_wqmulwm, wmmx_waddbhus, wmmx_wqmiaxy,\
-wmmx_wmiaxy, wmmx_wmiawxy, wmmx_wmerge")
+logics_shift_imm, logics_shift_reg, extend, shift_imm, float, fcsel")
(const_string "single")
(const_string "multi")))
 
@@ -431,7 +420,6 @@ (define_attr "generic_vfp" "yes,no"
  (const_string "yes")
  (const_string "no"
 
-(include "marvell-f-iwmmxt.md")
 (include "arm-generic.md")
 (include "arm926ejs.md")
 (include "arm1020e.md")
diff --git a/gcc/config/arm/marvell-f-iwmmxt.md 
b/gcc/config/arm/marvell-f-iwmmxt.md
deleted file mode 100644
index c9c7b00f6cb..000
--- a/gcc/config/arm/marvell-f-iwmmxt.md
+++ /dev/null
@@ -1,189 +0,0 @@
-;; Marvell WMMX2 pipeline description
-;; Copyright (C) 2011-2025 Free Software Foundation, Inc.
-;; Written by Marvell, Inc.
-
-;; This file is part of GCC.
-
-;; GCC is free software; you can redistribute it and/or modify it
-;; under the terms of the GNU General Public License as published
-;; by the Free Software Foundation; either version 3, or (at your
-;; option) any later version.
-
-;; GCC is distributed in the hope that it will be useful, but WITHOUT
-;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-;; License for more details.
-
-;; You should have received a copy of the GNU General Public License
-;; along with GCC; see the file COPYING3.  If not see
-;; .
-
-
-(define_automaton "marvell_f_iwmmxt")
-
-
-;; Pipelines
-
-
-;; This is a 7-stage pipelines:
-;;
-;;MD | MI | ME1 | ME2 | ME3 | ME4 | MW
-;;
-;; There are various bypasses modelled to a greater or lesser extent.
-;;
-;; Latencies in this file correspond to the number of cycles after
-;; the issue stage that it takes for the result of the instruction to
-;; be computed, or for its side-effects to occur.
-
-(define_cpu_unit "mf_iwmmxt_MD" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_MI" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_ME1" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_ME2" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_ME3" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_ME4" "marvell_f_iwmmxt")
-(define_cpu_unit "mf_iwmmxt_MW" "marvell_f_iwmmxt")
-
-(define_reservation "mf_iwmmxt_ME"
-  "mf_iwmmxt_ME1,mf_iwmmxt_ME2,mf_iwmmxt_ME3,mf_iwmmxt_ME4"
-)
-
-(define_reservation "mf_iwmmxt_pipeline"
-  "mf_iwmmxt_MD, mf_iwmmxt_MI, mf_iwmmxt_ME, mf_iwmmxt_MW"
-)
-
-;; An attribute to indicate whether our reservations are applicable.
-(define_attr "marvell_f_iwmmxt" "yes,no"
-  (const (if_then_else (symbol_ref "arm_arch_iwmmxt")
-   (const_string "yes") (const_string "no"
-
-
-;; instruction classes
-;;

Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Earnshaw
On 07/05/2025 17:28, Richard Earnshaw (lists) wrote:
> On 07/05/2025 16:54, Richard Sandiford wrote:
>> Richard Earnshaw  writes:
>>> On 07/05/2025 13:57, Richard Sandiford wrote:
 Kyrylo Tkachov  writes:
>> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>>
>> Commit the test file `cmpbr.c` before rules for generating the new
>> instructions are added, so that the changes in codegen are more obvious
>> in the next commit.
>
> I guess that’s an LLVM best practice.
> In GCC since we have the check-function-bodies mechanism we usually 
> prefer to include the relevant test together with the patch that adds the 
> optimization.
> But this is not wrong either.
>
>
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/aarch64/cmpbr.c: New test.
>> ---
>> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
>> 1 file changed, 1378 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>
>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c 
>> b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>> new file mode 100644
>> index 000..728d6ead91c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>> @@ -0,0 +1,1378 @@
>> +/* Test that the instructions added by FEAT_CMPBR are emitted */
>> +/* { dg-do compile } */
>> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>
> As you’ll be adding new instructions to the compiler it’d be good to have 
> it a dg-do assemble test where possible.

 Agreed FWIW, but:

> For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use 
> it like so to fallback to dg-do compile when the assembler is too old:
> /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */

 ...dg-do assemble for this one :)
>>>
>>> I don't think that works. If the first dg-do fails the test is just skipped.
>>>
>>> You need to replicate the test with separate dg-do directives, IIRC.
>>
>> Hmm, can you remember the circumstances when you saw that?
>> We've been using the construct that Kyrill suggested with apparent
>> success in things like aarch64-sve2-acle-asm.exp.  E.g.:
> 
> Well, the implementation of dg-do contains the comment:
> 
> # Note: A previous occurrence of `dg-do' with target/xfail selectors
> # is a user mistake.  We clobber previous values here.
>  
> So one might interpret that as meaning multiple dg-do's are not intended to 
> be supported.
> 
> But I might have misremembered the exact scenario I was facing.  I think it 
> might have been that a test failed to fall back to the dg-do-default if a 
> specific dg-do didn't match.  The scenario I remember was something like 
> dg-do-default = compile, then the test was trying to change that to execute 
> if HW was available; but that meant that if it wasn't we didn't fall back to 
> checking the assembler output.
> 

The comment at the head of the function says:

# Multiple instances are supported (since we don't support target and xfail
# selectors on one line), though it doesn't make much sense to change the
# compile/assemble/link/run field.  Nor does it make any sense to have
# multiple lines of target selectors (use one line).

So maybe the code is intended to support multiple reasons for skipping the
test (but why not use require-effective-target for that?).

I'm not sure now what's going on...

R.

> R.
> 
>>
>> /* { dg-do assemble { target aarch64_asm_sve2p1_ok } } */
>> /* { dg-do compile { target { ! aarch64_asm_sve2p1_ok } } } */
>> /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>>
>> If I run this normally, it picks the assemble route (tests run with -c).
>> If I force aarch64_asm_sve2p1_ok to false by mangling the test
>> instruction, the test picks the compile route (tests with run -S).
>>
>> Thanks,
>> Richard
> 



[PATCH v2] Evaluate the object size by the size of the pointee type when the type is a structure with flexible array member which is annotated with counted_by.

2025-05-07 Thread Qing Zhao
Hi, 

This is the 2nd version of the patch for:

Evaluate the object size by the size of the pointee type when the type
is a structure with flexible array member which is annotated with
counted_by. 

Per the following discussion: (Questions on replacing a structure
pointer reference to a call to .ACCESS_WITH_SIZE in C FE)

https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680540.html
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681229.html

The summary of the above discussion: 
   A. It's not safe in general to replace a structure pointer
reference with a call to .ACCESS_WITH_SIZE in the C FE,
since data-flow analysis is needed to make sure that the access
to the size member is valid, i.e., the structure is accessible
and initialized, etc.

   B. It should be safe to generate the reference to the field member
when we evaluate the BDOS builtin, as in the 1st version of the
patch.  Doing this in tree-object-size should also cover
-fsanitize=object-size.

   C. When generating the reference to the field member in tree-object-size,
we should guard this reference with a check that the pointer to
the structure is valid.

Compared to the 1st version, the major changes based on the above are:

   1. Update the comments per Sid's suggestions.
   2. Reorganize the code to make it easier to understand:
Add a new routine "is_pointee_fam_struct_with_counted_by" to make
the checking easier;
   3. Add one more cond_expr to guard the size_expr as:

  (ptr == NULL) ? SIZE_UNKNOWN : SIZE_EXPR

   4. In order to gimplify the above COND_EXPR (the current
force_gimple_operand does not work for the new control flow), add
one more new routine "insert_cond_and_size" to construct the
whole control flow graph for the above cond_expr.  Please refer to
  https://gcc.gnu.org/pipermail/gcc-patches/2025-April/682021.html
for this discussion.

   5. Add new test cases for the ptr == NULL case.

The patch has been bootstrapped and regression tested on both x86 and aarch64.

Okay for trunk?

thanks.

Qing


In tree-object-size.cc, if the size is UNKNOWN after evaluating the
use-def chain, we can evaluate the SIZE of the pointee TYPE only when
this TYPE is a structure type with a flexible array member that is
annotated with a counted_by attribute.  Since a structure with a FAM
cannot be an element of an array, the pointer must point to a single
object of this structure with the FAM.

This is only available for C now.
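
As a source-level sketch of the intended behavior (the struct and
function names below are hypothetical; the guarded expression in the
comment is the conceptual one from item 3 above, not the exact
generated code):

struct annotated {
  unsigned count;
  int data[] __attribute__ ((counted_by (count)));
};

unsigned long
whole_size (struct annotated *p)
{
  /* Conceptually, with this patch:
       p == NULL ? SIZE_UNKNOWN
		 : sizeof (struct annotated) + p->count * sizeof (int)  */
  return __builtin_dynamic_object_size (p, 0);
}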

gcc/c/ChangeLog:

* c-lang.cc (LANG_HOOKS_BUILD_COUNTED_BY_REF):
Define to below function.
* c-tree.h (c_build_counted_by_ref): New extern function.
* c-typeck.cc (build_counted_by_ref): Rename to ...
(c_build_counted_by_ref): ...this.
(handle_counted_by_for_component_ref): Call the renamed function.

gcc/ChangeLog:

* langhooks-def.h (LANG_HOOKS_BUILD_COUNTED_BY_REF):
New language hook.
* langhooks.h (struct lang_hooks_for_types): Add
build_counted_by_ref.
* tree-object-size.cc (struct object_size_info): Add a new field
insert_cf.
(insert_cond_and_size): New function.
(gimplify_size_expressions): Handle new field insert_cf.
(compute_builtin_object_size): Init the new field to false;
(is_pointee_fam_struct_with_counted_by): New function.
(record_with_fam_object_size): New function.
(collect_object_sizes_for): Call record_with_fam_object_size.
(dynamic_object_sizes_execute_one): Special handling for insert_cf.

gcc/testsuite/ChangeLog:

* gcc.dg/flex-array-counted-by-3.c: Update test for whole object size;
* gcc.dg/flex-array-counted-by-4.c: Likewise.
* gcc.dg/flex-array-counted-by-5.c: Likewise.
* gcc.dg/flex-array-counted-by-10.c: New test.
---
 gcc/c/c-lang.cc   |   3 +
 gcc/c/c-tree.h|   1 +
 gcc/c/c-typeck.cc |   6 +-
 gcc/langhooks-def.h   |   4 +-
 gcc/langhooks.h   |   5 +
 .../gcc.dg/flex-array-counted-by-10.c |  41 +++
 .../gcc.dg/flex-array-counted-by-3.c  |   5 +
 .../gcc.dg/flex-array-counted-by-4.c  |  34 +-
 .../gcc.dg/flex-array-counted-by-5.c  |   4 +
 gcc/tree-object-size.cc   | 302 +-
 10 files changed, 383 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/flex-array-counted-by-10.c

diff --git a/gcc/c/c-lang.cc b/gcc/c/c-lang.cc
index c69077b2a93..e9ec9e6e64a 100644
--- a/gcc/c/c-lang.cc
+++ b/gcc/c/c-lang.cc
@@ -51,6 +51,9 @@ enum c_language_kind c_language = clk_c;
 #undef LANG_HOOKS_GET_SARIF_SOURCE_LANGUAGE
 #define LANG_HOOKS_GET_SARIF_SOURCE_LANGUAGE c_get_sarif_source_language
 
+#undef LANG_HO

Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Earnshaw (lists)
On 07/05/2025 16:54, Richard Sandiford wrote:
> Richard Earnshaw  writes:
>> On 07/05/2025 13:57, Richard Sandiford wrote:
>>> Kyrylo Tkachov  writes:
> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>
> Commit the test file `cmpbr.c` before rules for generating the new
> instructions are added, so that the changes in codegen are more obvious
> in the next commit.

 I guess that’s an LLVM best practice.
 In GCC since we have the check-function-bodies mechanism we usually prefer 
 to include the relevant test together with the patch that adds the 
 optimization.
 But this is not wrong either.


>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/cmpbr.c: New test.
> ---
> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
> 1 file changed, 1378 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c 
> b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> new file mode 100644
> index 000..728d6ead91c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
> @@ -0,0 +1,1378 @@
> +/* Test that the instructions added by FEAT_CMPBR are emitted */
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */

 As you’ll be adding new instructions to the compiler it’d be good to have 
 it a dg-do assemble test where possible.
>>>
>>> Agreed FWIW, but:
>>>
 For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use 
 it like so to fallback to dg-do compile when the assembler is too old:
 /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */
>>>
>>> ...dg-do assemble for this one :)
>>
>> I don't think that works. If the first dg-do fails the test is just skipped.
>>
>> You need to replicate the test with separate dg-do directives, IIRC.
> 
> Hmm, can you remember the circumstances when you saw that?
> We've been using the construct that Kyrill suggested with apparent
> success in things like aarch64-sve2-acle-asm.exp.  E.g.:

Well, the implementation of dg-do contains the comment:

# Note: A previous occurrence of `dg-do' with target/xfail selectors
# is a user mistake.  We clobber previous values here.
 
So one might interpret that as meaning multiple dg-do's are not intended to be 
supported.

But I might have misremembered the exact scenario I was facing.  I think it 
might have been that a test failed to fall back to the dg-do-default if a 
specific dg-do didn't match.  The scenario I remember was something like 
dg-do-default = compile, then the test was trying to change that to execute if 
HW was available; but that meant that if it wasn't we didn't fall back to 
checking the assembler output.

R.

> 
> /* { dg-do assemble { target aarch64_asm_sve2p1_ok } } */
> /* { dg-do compile { target { ! aarch64_asm_sve2p1_ok } } } */
> /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> 
> If I run this normally, it picks the assemble route (tests run with -c).
> If I force aarch64_asm_sve2p1_ok to false by mangling the test
> instruction, the test picks the compile route (tests with run -S).
> 
> Thanks,
> Richard



Re: [PATCH] libfortran: Readd 15 accidentally removed libgfortran symbols [PR120152]

2025-05-07 Thread Thomas Koenig

Hi Jakub,


The r15-4124-gc0002a675a92e76d change seems to have accidentally
dropped 5 sourcefiles from i_maxloc1_c, which resulted in dropping
15 GFORTRAN_8 symbols on x86_64 and 6 on i686.

The following patch adds it back, so that we export those symbols
again, fixing the ABI problem.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk
and 15.2?


OK. Thanks a lot for fixing this!

Best regards

Thomas



[to-be-committed][RISC-V][PR target/120137] Don't create out-of-range permutation constants

2025-05-07 Thread Jeff Law


To make hashing sensible we canonicalize constant vectors in the hash 
table so that their first entry always has the value zero.  That 
normalization can result in a value that can't be represented in the 
element mode.


So before entering anything into the hash table we need to verify the 
normalized entries will fit into the element's mode.
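
For illustration, here is a minimal standalone sketch of that check
(the helper names sext/canonicalize are my own; GCC itself uses
sext_hwi and the element mode's precision; assumes 1 <= precision <= 64):

#include <stdbool.h>
#include <stdint.h>

/* Sign-extend VAL from PRECISION bits, like GCC's sext_hwi.  */
static int64_t
sext (int64_t val, int precision)
{
  uint64_t mask = precision >= 64 ? ~UINT64_C (0)
				  : (UINT64_C (1) << precision) - 1;
  uint64_t sign = UINT64_C (1) << (precision - 1);
  uint64_t v = (uint64_t) val & mask;
  return (int64_t) ((v ^ sign) - sign);
}

/* Normalize ELTS so the first entry is zero; fail if any adjusted
   element no longer fits in PRECISION bits.  */
static bool
canonicalize (int64_t *elts, int n, int precision)
{
  int64_t bias = elts[0];
  for (int i = 0; i < n; i++)
    if (elts[i] - bias != sext (elts[i] - bias, precision))
      return false;			/* adjusted element would not fit */
  for (int i = 0; i < n; i++)
    elts[i] -= bias;			/* first entry becomes zero */
  return true;
}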


This fixes both 120137 and its duplicate 120154.  This has been tested 
in my tester.  I'm just waiting for the pre-commit tester to render its 
verdict.


Jeff


PR target/120137
gcc/
* config/riscv/riscv-vect-permconst.cc (process_bb): Verify each
canonicalized element fits into the vector element mode.

gcc/testsuite/

* gcc.target/riscv/pr120137.c: New test.
* gcc.target/riscv/pr120154.c: New test.

diff --git a/gcc/config/riscv/riscv-vect-permconst.cc 
b/gcc/config/riscv/riscv-vect-permconst.cc
index feecc7ed6da..05819b3a30c 100644
--- a/gcc/config/riscv/riscv-vect-permconst.cc
+++ b/gcc/config/riscv/riscv-vect-permconst.cc
@@ -203,6 +203,25 @@ vector_permconst::process_bb (basic_block bb)
   if (bias < 0 || bias > 16384 / 8)
continue;
 
+  /* We need to verify that each element would be a valid value
+in the inner mode after applying the bias.  */
+  machine_mode inner = GET_MODE_INNER (GET_MODE (cvec));
+  HOST_WIDE_INT precision = GET_MODE_PRECISION (inner).to_constant ();
+  int i;
+  for (i = 0; i < CONST_VECTOR_NUNITS (cvec).to_constant (); i++)
+   {
+ HOST_WIDE_INT val = INTVAL (CONST_VECTOR_ELT (cvec, i)) - bias;
+ if (val != sext_hwi (val, precision))
+   break;
+   }
+
+  /* If the loop terminated early, then we found a case where the
+adjusted constant would not fit, so we can't record the constant
+for this case (it's unlikely to be useful anyway).  */
+  if (i != CONST_VECTOR_NUNITS (cvec).to_constant ())
+   continue;
+  
+
   /* At this point we have a load of a constant integer vector from the
 constant pool.  That constant integer vector is hopefully a
 permutation constant.  We need to make a copy of the vector and
@@ -211,7 +230,7 @@ vector_permconst::process_bb (basic_block bb)
 XXX This violates structure sharing conventions.  */
   rtvec_def *nvec = gen_rtvec (CONST_VECTOR_NUNITS (cvec).to_constant ());
 
-  for (int i = 0; i < CONST_VECTOR_NUNITS (cvec).to_constant (); i++)
+  for (i = 0; i < CONST_VECTOR_NUNITS (cvec).to_constant (); i++)
nvec->elem[i] = GEN_INT (INTVAL (CONST_VECTOR_ELT (cvec, i)) - bias);
 
   rtx copy = gen_rtx_CONST_VECTOR (GET_MODE (cvec), nvec);
diff --git a/gcc/testsuite/gcc.target/riscv/pr120137.c 
b/gcc/testsuite/gcc.target/riscv/pr120137.c
new file mode 100644
index 000..c55a1c1b5bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr120137.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mrvv-vector-bits=zvl -mabi=lp64" } */
+
+char b[13][13];
+void c() {
+  for (int d = 0; d < 13; ++d)
+for (int e = 0; e < 13; ++e)
+  b[d][e] = e == 0 ? -98 : 38;
+}
+
+
+
diff --git a/gcc/testsuite/gcc.target/riscv/pr120154.c 
b/gcc/testsuite/gcc.target/riscv/pr120154.c
new file mode 100644
index 000..fd849ca154a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr120154.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gv -mabi=lp64" } */
+
+
+
+typedef __attribute__((__vector_size__(4))) char V;
+
+V g;
+
+V
+bar(V a, V b)
+{
+  V s = a + b + g;
+  return s;
+}
+
+V
+foo()
+{
+  return bar((V){20}, (V){23, 150});
+}
+


Re: [PATCH] libfortran: Add 5 missing UNSIGNED symbols [PR120153]

2025-05-07 Thread Thomas Koenig

Hi Jakub,


While looking at PR120152, I have noticed that libgfortran.so doesn't
export 5 *m16* symbols I would have expected that should be exported.
This is caused by 2 issues: one filename was not added in r15-4124
to i_maxloc1_c (I guess because generated/maxloc1_16_i16.c was kept in the
position after generated/maxloc1_8_m16.c and the i -> m difference wasn't
spotted), and there was some garbage prefix on the HAVE_GFC_UINTEGER_16 macro.

The first two hunks of this patch fix that.
Though, as GCC 15.1 has been released already, we can't add these symbols
to GFORTRAN_15 symbol version as they've never been there, so the patch
adds them to a new GFORTRAN_15.2 symbol version instead.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 15.2?


OK. And thanks, again!

Best regards

Thomas



Re: [PATCH][www] Mark reload as to be removed for GCC 16

2025-05-07 Thread Richard Biener
On Tue, 6 May 2025, Richard Sandiford wrote:

> Richard Biener  writes:
> > The following amends gcc-15/changes.html with a note that reload
> > is going to be removed for GCC 16.
> >
> > OK for www?
> >
> > * htdocs/gcc-15/changes.html: Mark GCC 15 as last release
> > supporting reload.
> 
> My reading of the threads was that no-one is objecting to this timeline,
> just giving status updates about how it will affect the ports that they
> care about.  And in any case, the transition period has already been
> quite a bit longer than originally planned.
> 
> On that basis: LGTM, thanks.

Pushed now.  So IMO we're set to remove reload for GCC 16, given feedback
I defer to Segher for the actual patches achieving this (but I'll help
with missing approvals when necessary).

Richard.

> Richard
> 
> > ---
> >  htdocs/gcc-15/changes.html | 7 +++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/htdocs/gcc-15/changes.html b/htdocs/gcc-15/changes.html
> > index d851a744..4b1fd284 100644
> > --- a/htdocs/gcc-15/changes.html
> > +++ b/htdocs/gcc-15/changes.html
> > @@ -31,6 +31,13 @@ You may also want to check out our
> >In the AArch64 port, support for ILP32 (-mabi=ilp32) has
> >  been deprecated and will be removed in a future release.
> >
> > +  This is the last release supporting the old reload local
> > +register allocation code.  It will be removed for GCC 16, causing
> > +targets that do not support the new LRA local register
> > +allocation code to be removed.  See the list of supported
> > +https://gcc.gnu.org/backends.html";>targets for which
> > +ports are going to be affected (look for missing a, the
> > +ports that do not use LRA by default).
> >{0} initializer in C or C++ for unions no longer
> >  guarantees clearing of the whole union (except for static storage
> >  duration initialization), it just initializes the first
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


Re: [PATCH 3/8] AArch64: rename branch instruction rules

2025-05-07 Thread Kyrylo Tkachov


> On 7 May 2025, at 12:27, Karl Meakin  wrote:
> 
> Give the `define_insn` rules used in lowering `cbranch4` to RTL
> more descriptive and consistent names: from now on, each rule is named
> after the AArch64 instruction that it generates. Also add comments to
> document each rule.
> 
> gcc/ChangeLog:
> 
> * config/aarch64/aarch64.md (condjump): rename to ...
> (aarch64_bcond): ...here.
> (*compare_condjump): rename to ...
> (*aarch64_bcond_wide_imm): ...here.
> (restore_stack_nonlocal): handle rename.
> (stack_protect_combined_test): likewise.
> * config/aarch64/aarch64-simd.md (cbranch4): likewise.
> * config/aarch64/aarch64-sme.md (aarch64_restore_za): likewise.
> * config/aarch64/aarch64.cc (aarch64_gen_test_and_branch): likewise.
> ---
> gcc/config/aarch64/aarch64-simd.md |  2 +-
> gcc/config/aarch64/aarch64-sme.md  |  3 ++-
> gcc/config/aarch64/aarch64.cc  |  2 +-
> gcc/config/aarch64/aarch64.md  | 15 +--
> 4 files changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index e2afe87e513..197a5f65f34 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3946,7 +3946,7 @@ (define_expand "cbranch4"
> 
>   rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
>   rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
> -  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
> +  emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[3]));
>   DONE;
> })
> 
> diff --git a/gcc/config/aarch64/aarch64-sme.md 
> b/gcc/config/aarch64/aarch64-sme.md
> index c49affd0dd3..6a7c31acf0a 100644
> --- a/gcc/config/aarch64/aarch64-sme.md
> +++ b/gcc/config/aarch64/aarch64-sme.md
> @@ -389,7 +389,8 @@ (define_insn_and_split "aarch64_restore_za"
> auto label = gen_label_rtx ();
> auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM);
> emit_insn (gen_aarch64_read_tpidr2 (tpidr2));
> -auto jump = emit_likely_jump_insn (gen_aarch64_cbnedi1 (tpidr2, label));
> +auto jump = emit_likely_jump_insn (
> + gen_aarch64_cbnedi1 (tpidr2, label));

IMO it’d be cleaner to break this before the “=“ instead. This doesn’t look 
like a renaming, just a reformatting btw.
Ok otherwise.
Thanks,
Kyrill
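
I.e., something like this, breaking before the operator per GNU style
(illustrative fragment only):

auto jump
  = emit_likely_jump_insn (gen_aarch64_cbnedi1 (tpidr2, label));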


> JUMP_LABEL (jump) = label;
> 
> aarch64_restore_za (operands[0]);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index fff8d9da49d..c0afdeb87ee 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -2879,7 +2879,7 @@ aarch64_gen_test_and_branch (rtx_code code, rtx x, int 
> bitnum,
>   emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
>   rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
>   rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
> -  return gen_condjump (x, cc_reg, label);
> +  return gen_aarch64_bcond (x, cc_reg, label);
> }
>   return gen_aarch64_tb (code, mode, mode,
> x, gen_int_mode (bitnum, mode), label);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 45b2283c5c0..23775ec58ca 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -740,7 +740,8 @@ (define_expand "cbranchcc4"
>   ""
> )
> 
> -(define_insn "condjump"
> +;; Emit `B`, assuming that the condition is already in the CC register.
> +(define_insn "aarch64_bcond"
>   [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator"
>[(match_operand 1 "cc_register")
> (const_int 0)])
> @@ -780,7 +781,7 @@ (define_insn "condjump"
> ;; sub x0, x1, #(CST & 0xfff000)
> ;; subs x0, x0, #(CST & 0x000fff)
> ;; b .Label
> -(define_insn_and_split "*compare_condjump"
> +(define_insn_and_split "*aarch64_bcond_wide_imm"
>   [(set (pc) (if_then_else (EQL
> (match_operand:GPI 0 "register_operand" "r")
> (match_operand:GPI 1 "aarch64_imm24" "n"))
> @@ -801,11 +802,12 @@ (define_insn_and_split "*compare_condjump"
> rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
> rtx cmp_rtx = gen_rtx_fmt_ee (, mode,
>  cc_reg, const0_rtx);
> -emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2]));
> +emit_jump_insn (gen_aarch64_bcond (cmp_rtx, cc_reg, operands[2]));
> DONE;
>   }
> )
> 
> +;; For an EQ/NE comparison against zero, emit `CBZ`/`CBNZ`
> (define_insn "aarch64_cb1"
>   [(set (pc) (if_then_else (EQL
> (match_operand:GPI 0 "register_operand" "r")
> @@ -832,6 +834,7 @@ (define_insn "aarch64_cb1"
>  (const_int 1)))]
> )
> 
> +;; For an LT/GE comparison against zero, emit `TBZ`/`TBNZ`
> (define_insn "*cb1"
>   [(set (pc) (if_then_else (LTGE
> (match_operand:ALLI 0 "register_operand" "r")
> @@ -1325,13 +1328,13 @@ (define_expand "restore_stack_nonlocal"
>   emit_insn (gen_subdi3_compare1 (gcs_now, gcs_old, gcs_now));
>   rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM);
>   rtx cmp_rtx = gen_rtx_fmt_ee (EQ, DImode, cc_reg, const0_rtx);
> -  

Re: [PATCH 7/8] AArch64: precommit test for CMPBR instructions

2025-05-07 Thread Richard Sandiford
Richard Earnshaw  writes:
> On 07/05/2025 17:28, Richard Earnshaw (lists) wrote:
>> On 07/05/2025 16:54, Richard Sandiford wrote:
>>> Richard Earnshaw  writes:
 On 07/05/2025 13:57, Richard Sandiford wrote:
> Kyrylo Tkachov  writes:
>>> On 7 May 2025, at 12:27, Karl Meakin  wrote:
>>>
>>> Commit the test file `cmpbr.c` before rules for generating the new
>>> instructions are added, so that the changes in codegen are more obvious
>>> in the next commit.
>>
>> I guess that’s an LLVM best practice.
>> In GCC since we have the check-function-bodies mechanism we usually 
>> prefer to include the relevant test together with the patch that adds 
>> the optimization.
>> But this is not wrong either.
>>
>>
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * gcc.target/aarch64/cmpbr.c: New test.
>>> ---
>>> gcc/testsuite/gcc.target/aarch64/cmpbr.c | 1378 ++
>>> 1 file changed, 1378 insertions(+)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>>
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpbr.c 
>>> b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>> new file mode 100644
>>> index 000..728d6ead91c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/cmpbr.c
>>> @@ -0,0 +1,1378 @@
>>> +/* Test that the instructions added by FEAT_CMPBR are emitted */
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>>
>> As you’ll be adding new instructions to the compiler it’d be good to 
>> have it a dg-do assemble test where possible.
>
> Agreed FWIW, but:
>
>> For that you’ll need to create a new aarch64_asm_cmpbr_ok target and use 
>> it like so to fallback to dg-do compile when the assembler is too old:
>> /* { dg-do compile { target aarch64_asm_cmpbr_ok } } */
>
> ...dg-do assemble for this one :)

 I don't think that works. If the first dg-do fails the test is just 
 skipped.

 You need to replicate the test with separate dg-do directives, IIRC.
>>>
>>> Hmm, can you remember the circumstances when you saw that?
>>> We've been using the construct that Kyrill suggested with apparent
>>> success in things like aarch64-sve2-acle-asm.exp.  E.g.:
>> 
>> Well, the implementation of dg-do contains the comment:
>> 
>> # Note: A previous occurrence of `dg-do' with target/xfail selectors
>> # is a user mistake.  We clobber previous values here.
>>  
>> So one might interpret that as meaning multiple dg-do's are not intended to 
>> be supported.
>> 
>> But I might have misremembered the exact scenario I was facing.  I think it 
>> might have been that a test failed to fall back to the dg-do-default if a 
>> specific dg-do didn't match.  The scenario I remember was something like 
>> dg-do-default = compile, then the test was trying to change that to execute 
>> if HW was available; but that meant that if it wasn't we didn't fall back to 
>> checking the assembler output.
>> 
>
> The comment at the head of the function says:
>
> # Multiple instances are supported (since we don't support target and xfail
> # selectors on one line), though it doesn't make much sense to change the
> # compile/assemble/link/run field.  Nor does it make any sense to have
> # multiple lines of target selectors (use one line).
>
> So maybe the code is intended to support multiple reasons for skipping the 
> test (but why not use require-effective-target
> for that).
>
> I'm not sure now what's going on...

Richard and I discussed this more off-list, and it turns out that the
above construct started to work after:

https://git.savannah.gnu.org/gitweb/?p=dejagnu.git;a=commit;h=569f8718b534a2cd9511a7d640352eb0126ff492

which was first released in 1.6 (9 years ago).  Before that, the "what"
(compile/assemble/etc.) in the last dg-do won, regardless of whether
the dg-do was selected or deselected.

I suppose the question is whether we can reasonably assume that people
are using dejagnu 1.6+ or whether we need to support older dejagnus.

It looks like Alex added dg-do-if (in e6f5fadec5f6a719145ed2ed513209ec3e8eeb2f)
to support older dejagnu, so that's an option.  I.e.:

/* { dg-do compile } */
/* { dg-do-if assemble { target aarch64_asm_cmpbr_ok } } */

Which if it works (haven't tried!) also avoids having to specify the
selector twice.  Not sure whether it's worth going back and changing
all existing aarch64 tests to this style though.
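
Concretely, a cmpbr test header in that style might look like the
following (untested, per the caveat above, and assuming the proposed
aarch64_asm_cmpbr_ok effective target from earlier in the thread):

/* { dg-do compile } */
/* { dg-do-if assemble { target aarch64_asm_cmpbr_ok } } */
/* { dg-options "-march=armv9.5-a+cmpbr -O2" } */
/* { dg-final { check-function-bodies "**" "" "" } } */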

TIL :)  Thanks Richard for bringing it up.

Richard


RE: [PATCH] Canonicalize vec_merge in simplify_ternary_operation

2025-05-07 Thread quic_pzheng
> Pengxuan Zheng  writes:
> > Similar to the canonicalization done in combine, we canonicalize
> > vec_merge with swap_commutative_operands_p in
> simplify_ternary_operation too.
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-protos.h (aarch64_exact_log2_inverse):
> New.
> > * config/aarch64/aarch64-simd.md
> (aarch64_simd_vec_set_zero):
> > Update pattern accordingly.
> > * config/aarch64/aarch64.cc (aarch64_exact_log2_inverse): New.
> > * simplify-rtx.cc (simplify_context::simplify_ternary_operation):
> > Canonicalize vec_merge.
> 
> OK for GCC 16, thanks.  aarch64_exact_log2_inverse isn't really
target-specific,
> but I can't think of a target-independent set of interfaces that it would
> naturally fit.
> 
> Richard

Thanks, pushed the patch as r16-459-g9b13bea07706a.

Pengxuan
> 
> >
> > Signed-off-by: Pengxuan Zheng 
> > ---
> >  gcc/config/aarch64/aarch64-protos.h |  1 +
> > gcc/config/aarch64/aarch64-simd.md  | 10 ++
> >  gcc/config/aarch64/aarch64.cc   | 10 ++
> >  gcc/simplify-rtx.cc |  7 +++
> >  4 files changed, 24 insertions(+), 4 deletions(-)
> >
> > diff --git a/gcc/config/aarch64/aarch64-protos.h
> > b/gcc/config/aarch64/aarch64-protos.h
> > index 4235f4a0ca5..2391b99cacd 100644
> > --- a/gcc/config/aarch64/aarch64-protos.h
> > +++ b/gcc/config/aarch64/aarch64-protos.h
> > @@ -1051,6 +1051,7 @@ void aarch64_subvti_scratch_regs (rtx, rtx, rtx *,
> >   rtx *, rtx *, rtx *);
> >  void aarch64_expand_subvti (rtx, rtx, rtx,
> > rtx, rtx, rtx, rtx, bool);
> > +int aarch64_exact_log2_inverse (unsigned int, rtx);
> >
> >
> >  /* Initialize builtins for SIMD intrinsics.  */ diff --git
> > a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index e2afe87e513..1099e742cbf 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -1193,12 +1193,14 @@ (define_insn
> "@aarch64_simd_vec_set"
> >  (define_insn "aarch64_simd_vec_set_zero"
> >[(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > (vec_merge:VALL_F16
> > -   (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "")
> > -   (match_operand:VALL_F16 3 "register_operand" "0")
> > +   (match_operand:VALL_F16 1 "register_operand" "0")
> > +   (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
> > (match_operand:SI 2 "immediate_operand" "i")))]
> > -  "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
> > +  "TARGET_SIMD && aarch64_exact_log2_inverse (, operands[2])
> >= 0"
> >{
> > -int elt = ENDIAN_LANE_N (, exact_log2 (INTVAL
(operands[2])));
> > +int elt = ENDIAN_LANE_N (,
> > +aarch64_exact_log2_inverse (,
> > +operands[2]));
> >  operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
> >  return "ins\\t%0.[%p2], zr";
> >}
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index f5f23f6ff4b..103a00915e5 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23682,6 +23682,16 @@ aarch64_strided_registers_p (rtx *operands,
> unsigned int num_operands,
> >return true;
> >  }
> >
> > +/* Return the base 2 logarithm of the bit inverse of OP masked by the
> lowest
> > +   NELTS bits, if OP is a power of 2.  Otherwise, returns -1.  */
> > +
> > +int
> > +aarch64_exact_log2_inverse (unsigned int nelts, rtx op) {
> > +  return exact_log2 ((~INTVAL (op))
> > +& ((HOST_WIDE_INT_1U << nelts) - 1)); }
> > +
> >  /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
> > HIGH (exclusive).  */
> >  void
> > diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
> > c478bd060fc..22002d1e1ab 100644
> > --- a/gcc/simplify-rtx.cc
> > +++ b/gcc/simplify-rtx.cc
> > @@ -7307,6 +7307,13 @@ simplify_context::simplify_ternary_operation
> (rtx_code code, machine_mode mode,
> >   return gen_rtx_CONST_VECTOR (mode, v);
> > }
> >
> > + if (swap_commutative_operands_p (op0, op1)
> > + /* Two operands have same precedence, then first bit of mask
> > +select first operand.  */
> > + || (!swap_commutative_operands_p (op1, op0) && !(sel & 1)))
> > +   return simplify_gen_ternary (code, mode, mode, op1, op0,
> > +GEN_INT (~sel & mask));
> > +
> >   /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c
n)
> >  if no element from a appears in the result.  */
> >   if (GET_CODE (op0) == VEC_MERGE)



Re: Patch Submission: Optimize Size of true and false Macros in C

2025-05-07 Thread SAKSHAM JOSHI
Ok. I got it. Thank you sir.

On Thu, May 8, 2025, 4:26 AM Joseph Myers  wrote:

> On Tue, 6 May 2025, SAKSHAM JOSHI wrote:
>
> > I am submitting a patch for the GCC compiler's "stdbool.h" to optimize
> the
> > size of the true and false macros in the C programming language.
> Currently,
> > the size of the true and false macros is 4 bytes, whereas the _Bool
> > datatype is 1 byte in size. This patch proposes a change that will set
> the
> > size of the true and false macros to 1 byte, aligning them more closely
> > with the _Bool datatype.
>
> If you want the macros to have type bool, use C23 mode, which is the
> default.  Older standards require the macros to expand exactly to
> particular integer constants.
>
> --
> Joseph S. Myers
> josmy...@redhat.com
>
>
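
For reference, the type difference Joseph describes is directly
observable (a small demonstration; the sizes assume a typical target
with 4-byte int):

#include <stdbool.h>
#include <stdio.h>

int
main (void)
{
  /* 4 with -std=c17 (true expands to the int constant 1);
     1 with -std=c23 (true is a keyword of type bool).  */
  printf ("%zu\n", sizeof (true));
  return 0;
}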


Re: [PATCH] libfortran: Fix up maxval/maxloc for UNSIGNED [PR120158]

2025-05-07 Thread Thomas Koenig

Hi Jakub,


Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and
15.2?


OK. Thanks for catching and fixing this!

Best regards

Thomas König



Re: PING: [PATCH] Add TARGET_STORE_BY_PIECES_ICODE

2025-05-07 Thread Richard Biener
On Thu, 8 May 2025, H.J. Lu wrote:

> On Mon, Apr 28, 2025 at 8:57 PM H.J. Lu  wrote:
> >
> > On x86, both stores with 32-bit immediate and register are supported:
> >
> >0: 48 c7 40 10 00 00 00 00 movq   $0x0,0x10(%rax)
> >8: 48 89 50 10  movq   %rdx,0x10(%rax)
> >
> > But store with 32-bit immediate is 4 byte longer.
> >
> > Add UNSPEC_STORE_BY_PIECES to x86 backend for register store to avoid
> > store with 32-bit immediate for shorter encoding and add a target hook to
> > select the store instruction used by the store by_pieces infrastructure
> > so that a target can choose a specific instruction for shorter encoding.
> > When optimizing on x86, we choose register store:
> >
> > 1. If length-changing prefix (LCP) stall is avoided with 16-bit register
> > store. Or
> > 2. If more than 2 stores with 32-bit immediate will be used.
> >
> > gcc/
> >
> > * expr.c (store_by_pieces_d::prepare_mode): Call
> > targetm.store_by_pieces_icode to get store by_pieces insn code.
> > * target.def (store_by_pieces_icode): New hook.
> > * targhooks.cc (default_store_by_pieces_icode): New.
> > targhooks.h (default_store_by_pieces_icode): Likewise.
> > * config/i386/i386.cc (ix86_store_by_pieces_icode): New.
> > (TARGET_STORE_BY_PIECES_ICODE): Likewise.
> > * config/i386/i386.md (UNSPEC_STORE_BY_PIECES): New.
> > (store_by_pieces_mov): Likewise.
> > (store_by_pieces_mov_1): Likewise.
> > * config/i386/x86-tune.def (X86_TUNE_USE_REGISTER_STORE_BY_PIECES):
> > Likewise.
> > * doc/tm.texi: Regenerated.
> > * doc/tm.texi.in: Add TARGET_STORE_BY_PIECES_ICODE.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/memset-strategy-10.c: New test.
> > * gcc.target/i386/memset-strategy-11.c: Likewise.
> > * gcc.target/i386/memset-strategy-12.c: Likewise.
> > * gcc.target/i386/memset-strategy-13.c: Likewise.
> > * gcc.target/i386/memset-strategy-14.c: Likewise.
> > * gcc.target/i386/memset-strategy-15.c: Likewise.
> > * gcc.target/i386/memset-strategy-16.c: Likewise.
> > * gcc.target/i386/memset-strategy-17.c: Likewise.
> > * gcc.target/i386/memset-strategy-18.c: Likewise.
> > * gcc.target/i386/memset-strategy-19.c: Likewise.
> > * gcc.target/i386/memset-strategy-20.c: Likewise.
> > * gcc.target/i386/memset-strategy-21.c: Likewise.
> > * gcc.target/i386/pr72839.c: Scan for register store.
> >
> > OK for master?
> >
> > Thanks.
> >
> > --
> > H.J.
> 
> PING:
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2025-April/682007.html

IMO it's better to have the underlying issue - the lack of "CSE"
of immediates - addressed, either in generic code or
in a machine-dependent pass, since this comes up not only in the
store-by-pieces context.

Didn't you do such a machine pass recently?

Using an UNSPEC in RTL for this will very likely pessimize
optimization there.

I wonder if we should consider only allowing (large) immediates
after reload?

Richard.
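
For reference, the size arithmetic behind the heuristic, using the
encodings quoted at the top of the thread (a sketch; exact byte counts
depend on the addressing mode): three "movq $0x0, disp8(%rax)"
immediate stores take 3 * 8 = 24 bytes of encoding, while zeroing a
register once ("xorl %edx,%edx", 2 bytes) and issuing three
"movq %rdx, disp8(%rax)" stores (4 bytes each) takes 2 + 3 * 4 = 14.
A function like this is a candidate for the store-by-pieces path
at -O2:

struct s { long a, b, c; };

void
clear (struct s *p)
{
  /* Small fixed-size memset, typically expanded by store_by_pieces.  */
  __builtin_memset (p, 0, sizeof *p);
}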
