Hi Saurabh,

Thanks for the feedback, I've given some thoughts on some and
agreed to a few of your other suggestions.

Thanks,
Richard

On Fri, Oct 31, 2025 at 10:12:31AM +0000, Saurabh Jha wrote:
> 
> 
> On 10/31/2025 12:10 AM, [email protected] wrote:
> > From: Richard Ball <[email protected]>
> > 
> > This patch adds support for the atomic_store_with_stshh intrinsic
> > in aarch64. This intrinsic is part of FEAT_PCDPHINT.
> > 
> > gcc/ChangeLog:
> > 
> >     * config/aarch64/aarch64-builtins.cc
> >     (enum aarch64_builtins): Add new flags.
> >     (aarch64_init_pcdphint_builtins): Create new Builtin functions.
> Nit: was capitalising `Builtin` intentional?>
> (aarch64_general_init_builtins): Call init for PCDPHINT.
> >     (aarch64_expand_stshh_builtin): Expander for new intrinsic.
> >     (aarch64_general_expand_builtin): Call new expander.
> >     * config/aarch64/aarch64-c.cc
> >     (aarch64_update_cpp_builtins): New feature.
> >     * config/aarch64/aarch64-option-extensions.def (AARCH64_OPT_EXTENSION): 
> > Likewise.
> >     * config/aarch64/aarch64.h (TARGET_PCDPHINT): Likewise.
> >     * config/aarch64/arm_acle.h
> >     (__atomic_store_with_stshh): Generic to call builtins.
> >     * config/aarch64/atomics.md
> >     (@aarch64_atomic_store_stshh<mode>): New pattern for intrinsic.
> >     * config/aarch64/iterators.md: New UNSPEC.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >     * gcc.target/aarch64/atomic_store_with_stshh.c.c: New test.
> > ---
> >   gcc/config/aarch64/aarch64-builtins.cc        | 128 ++++++++++++++++++
> >   gcc/config/aarch64/aarch64-c.cc               |   1 +
> >   .../aarch64/aarch64-option-extensions.def     |   2 +
> >   gcc/config/aarch64/aarch64.h                  |   3 +
> >   gcc/config/aarch64/arm_acle.h                 |  31 +++++
> >   gcc/config/aarch64/atomics.md                 |  23 ++++
> >   gcc/config/aarch64/iterators.md               |   1 +
> >   .../aarch64/atomic_store_with_stshh.c.c       | 119 ++++++++++++++++
> >   8 files changed, 308 insertions(+)
> >   create mode 100644 
> > gcc/testsuite/gcc.target/aarch64/atomic_store_with_stshh.c.c
> > 
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> > b/gcc/config/aarch64/aarch64-builtins.cc
> > index 408099a50e8..e8277185010 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -901,6 +901,14 @@ enum aarch64_builtins
> >     AARCH64_BUILTIN_GCSPR,
> >     AARCH64_BUILTIN_GCSPOPM,
> >     AARCH64_BUILTIN_GCSSS,
> > +  /* Armv9.6-A builtins.  */
> > +  AARCH64_BUILTIN_STSHH_QI,
> > +  AARCH64_BUILTIN_STSHH_HI,
> > +  AARCH64_BUILTIN_STSHH_SI,
> > +  AARCH64_BUILTIN_STSHH_DI,
> > +  AARCH64_BUILTIN_STSHH_PTR,
> > +  AARCH64_BUILTIN_STSHH_SF,
> > +  AARCH64_BUILTIN_STSHH_DF,
> >     AARCH64_BUILTIN_MAX
> >   };
> > @@ -2466,6 +2474,70 @@ aarch64_init_gcs_builtins (void)
> >                                AARCH64_BUILTIN_GCSSS);
> >   }
> > +/* Add builtins for FEAT_PCDPHINT.  */
> > +
> > +static void
> > +aarch64_init_pcdphint_builtins (void)
> > +{
> > +  tree ftype;
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               unsigned_char_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_QI]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_qi", ftype,
> > +                              AARCH64_BUILTIN_STSHH_QI);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               short_unsigned_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_HI]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_hi", ftype,
> > +                              AARCH64_BUILTIN_STSHH_HI);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> 
> How about naming these function types differently, like unsigned_char_ftype,
> short_unsigned_ftype, or unsigned_ftype? Likewise for below ftype variables.
> 
> Or you can also define them inline.
>

These variables are defined outside this scope. They are used across
all the different function type lists so creating a new one for this
is not really an option. The ftype variables could be given different
names although it is within keeping of the rest of the file not to.

 
> I am just thinking from readability perspective here.
>  > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_SI]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_si", ftype,
> > +                              AARCH64_BUILTIN_STSHH_SI);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               long_long_unsigned_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_DI]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_di", ftype,
> > +                              AARCH64_BUILTIN_STSHH_DI);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               ptr_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_PTR]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_ptr", ftype,
> > +                              AARCH64_BUILTIN_STSHH_PTR);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               float_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_SF]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_sf", ftype,
> > +                              AARCH64_BUILTIN_STSHH_SF);
> > +
> > +  ftype = build_function_type_list (void_type_node, ptr_type_node,
> > +                               double_type_node,
> > +                               unsigned_type_node,
> > +                               unsigned_type_node, NULL_TREE);
> > +  aarch64_builtin_decls[AARCH64_BUILTIN_STSHH_DF]
> > +    = aarch64_general_add_builtin ("__builtin_aarch64_stshh_df", ftype,
> > +                              AARCH64_BUILTIN_STSHH_DF);
> > +}
> > +
> >   /* Initialize all builtins in the AARCH64_BUILTIN_GENERAL group.  */
> >   void
> > @@ -2514,6 +2586,7 @@ aarch64_general_init_builtins (void)
> >                                AARCH64_BUILTIN_CHKFEAT);
> >     aarch64_init_gcs_builtins ();
> > +  aarch64_init_pcdphint_builtins ();
> >     if (in_lto_p)
> >       handle_arm_acle_h ();
> > @@ -3932,6 +4005,52 @@ aarch64_expand_tbl_tbx (vec<expand_operand> &ops, 
> > int unspec)
> >     return result;
> >   }
> > +static rtx
> > +aarch64_expand_stshh_builtin (tree exp, int fcode)
> > +{
> > +  expand_operand ops[4];
> > +  enum insn_code icode;
> > +  machine_mode mode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 1)));
> > +
> > +  rtx val = expand_normal (CALL_EXPR_ARG (exp, 1));
> > +  rtx mem_order = expand_normal (CALL_EXPR_ARG (exp, 2));
> > +  rtx ret = expand_normal (CALL_EXPR_ARG (exp, 3));
> > +
> > +  if (fcode == AARCH64_BUILTIN_STSHH_SF)
> 
> Would a switch be more helpful here?
>

I was on the fence, I'll be honest. It's only 3 alternatives, though I suppose
given the possibilty of future additions a switch might be better. I'll update 
in the
next revision.
 
> > +    {
> > +      rtx ival = gen_reg_rtx (SImode);
> > +      emit_move_insn (ival, gen_lowpart (SImode, val));
> > +      val = ival;
> > +      mode = SImode;
> > +    }
> > +  else if (fcode == AARCH64_BUILTIN_STSHH_DF)
> > +    {
> > +      rtx ival = gen_reg_rtx (DImode);
> > +      emit_move_insn (ival, gen_lowpart (DImode, val));
> > +      val = ival;
> > +      mode = DImode;
> > +    }
> > +  else
> > +    {
> > +      if (!register_operand (val, mode))
> > +   val = force_reg (mode, val);
> > +    }
> > +
> > +  rtx addr = expand_normal (CALL_EXPR_ARG (exp, 0));
> > +  addr = convert_memory_address (Pmode, addr);
> > +  rtx mem  = gen_rtx_MEM (mode, addr);
> > +  set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
> > +
> > +  icode = code_for_aarch64_atomic_store_stshh (mode);
> How about you declare "expand_operand ops[]" here, closer to use?

Works for me.

> 
> > +  create_output_operand (&ops[0], mem, mode);
> > +  create_input_operand (&ops[1], val, mode);
> > +  create_input_operand (&ops[2], mem_order, SImode);
> > +  create_input_operand (&ops[3], ret, SImode);
> > +
> > +  expand_insn (icode, 4, ops);
> > +  return ops[0].value;
> > +}
> > +
> >   /* Expand CALL_EXPR EXP, given that it is a call to the function described
> >      by BUILTIN_DATA, and return the function's return value.  Put the 
> > result
> >      in TARGET if convenient.  */
> > @@ -4422,6 +4541,15 @@ aarch64_general_expand_builtin (unsigned int fcode, 
> > tree exp, rtx target,
> >       case AARCH64_BUILTIN_GCSPOPM:
> >       case AARCH64_BUILTIN_GCSSS:
> >         return aarch64_expand_gcs_builtin (exp, target, fcode, ignore);
> > +
> > +    case AARCH64_BUILTIN_STSHH_QI:
> > +    case AARCH64_BUILTIN_STSHH_HI:
> > +    case AARCH64_BUILTIN_STSHH_SI:
> > +    case AARCH64_BUILTIN_STSHH_DI:
> > +    case AARCH64_BUILTIN_STSHH_PTR:
> > +    case AARCH64_BUILTIN_STSHH_SF:
> > +    case AARCH64_BUILTIN_STSHH_DF:
> > +      return aarch64_expand_stshh_builtin (exp, fcode);
> >       }
> >     if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <= 
> > AARCH64_SIMD_BUILTIN_MAX)
> > diff --git a/gcc/config/aarch64/aarch64-c.cc 
> > b/gcc/config/aarch64/aarch64-c.cc
> > index c3957c762ef..71ea66d1762 100644
> > --- a/gcc/config/aarch64/aarch64-c.cc
> > +++ b/gcc/config/aarch64/aarch64-c.cc
> > @@ -295,6 +295,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
> >     aarch64_def_or_undef (AARCH64_HAVE_ISA (SME2p1),
> >                     "__ARM_FEATURE_SME2p1", pfile);
> >     aarch64_def_or_undef (TARGET_FAMINMAX, "__ARM_FEATURE_FAMINMAX", pfile);
> > +  aarch64_def_or_undef (TARGET_PCDPHINT, "__ARM_FEATURE_PCDPHINT", pfile);
> >     // Function multi-versioning defines
> >     aarch64_def_or_undef (targetm.has_ifunc_p (),
> > diff --git a/gcc/config/aarch64/aarch64-option-extensions.def 
> > b/gcc/config/aarch64/aarch64-option-extensions.def
> > index a70375c053f..9e39aeacc18 100644
> > --- a/gcc/config/aarch64/aarch64-option-extensions.def
> > +++ b/gcc/config/aarch64/aarch64-option-extensions.def
> > @@ -281,6 +281,8 @@ AARCH64_OPT_EXTENSION ("sme-lutv2", SME_LUTv2, (SME2), 
> > (), (), "sme-lutv2")
> >   AARCH64_OPT_EXTENSION("cpa", CPA, (), (), (), "")
> > +AARCH64_OPT_EXTENSION ("pcdphint", PCDPHINT, (), (), (), "")
> > +
> >   #undef AARCH64_OPT_FMV_EXTENSION
> >   #undef AARCH64_OPT_EXTENSION
> >   #undef AARCH64_FMV_FEATURE
> > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> > index 2cd929d83f9..24597d49a61 100644
> > --- a/gcc/config/aarch64/aarch64.h
> > +++ b/gcc/config/aarch64/aarch64.h
> > @@ -392,6 +392,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE 
> > ATTRIBUTE_UNUSED
> >      but are incompatible with -mtrack-speculation. */
> >   #define TARGET_CMPBR (AARCH64_HAVE_ISA (CMPBR) && 
> > !aarch64_track_speculation)
> > +/* PCDPHINT instructions are enabled through +pcdphint.  */
> > +#define TARGET_PCDPHINT AARCH64_HAVE_ISA (PCDPHINT)
> > +
> >   /* Make sure this is always defined so we don't have to check for ifdefs
> >      but rather use normal ifs.  */
> >   #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
> > diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> > index 507b6e72bcb..d2a5be0e8f7 100644
> > --- a/gcc/config/aarch64/arm_acle.h
> > +++ b/gcc/config/aarch64/arm_acle.h
> > @@ -132,6 +132,37 @@ __sqrtf (float __x)
> >     return __builtin_aarch64_sqrtsf (__x);
> >   }
> > +#define __atomic_store_with_stshh(dst, value, memory_order, ret)   \
> 
> Why is this defined as a macro rather than a function?

Having this as a function would require types for the parameters, as
value can take any number of types that becomes difficult. The
_Generic layout as I have done it here has been used a few times in GCC
and seems to normally be a macro. 


> 
> Thanks,
> Saurabh
> 
> > +  _Generic ((value),                                                       
> > \
> > +    char:            __builtin_aarch64_stshh_qi,   \
> > +    unsigned char:   __builtin_aarch64_stshh_qi,   \
> > +    signed char:     __builtin_aarch64_stshh_qi,   \
> > +    unsigned short:       __builtin_aarch64_stshh_hi,      \
> > +    short:           __builtin_aarch64_stshh_hi,   \
> > +    unsigned int:    __builtin_aarch64_stshh_si,   \
> > +    int:             __builtin_aarch64_stshh_si,   \
> > +    unsigned long:   __builtin_aarch64_stshh_di,   \
> > +    long:            __builtin_aarch64_stshh_di,   \
> > +    unsigned long long:   __builtin_aarch64_stshh_di,      \
> > +    long long:               __builtin_aarch64_stshh_di,   \
> > +    float:           __builtin_aarch64_stshh_sf,   \
> > +    double:                  __builtin_aarch64_stshh_df,   \
> > +    char*:           __builtin_aarch64_stshh_ptr,  \
> > +    unsigned char*:       __builtin_aarch64_stshh_ptr,  \
> > +    signed char*:    __builtin_aarch64_stshh_ptr,  \
> > +    unsigned short*:      __builtin_aarch64_stshh_ptr,  \
> > +    short*:                  __builtin_aarch64_stshh_ptr,  \
> > +    unsigned int*:   __builtin_aarch64_stshh_ptr,  \
> > +    int*:            __builtin_aarch64_stshh_ptr,  \
> > +    unsigned long*:       __builtin_aarch64_stshh_ptr,  \
> > +    long*:           __builtin_aarch64_stshh_ptr,  \
> > +    unsigned long long*:  __builtin_aarch64_stshh_ptr,  \
> > +    long long*:              __builtin_aarch64_stshh_ptr,  \
> > +    float*:                  __builtin_aarch64_stshh_ptr,  \
> > +    double*:                 __builtin_aarch64_stshh_ptr,  \
> > +    default:                 __builtin_aarch64_stshh_di    \
> > +  )((dst), (value), (memory_order), (ret))
> > +
> >   #pragma GCC push_options
> >   #pragma GCC target ("+nothing+jscvt")
> >   __extension__ extern __inline int32_t
> > diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> > index ea4a9367fc8..4a245bbebff 100644
> > --- a/gcc/config/aarch64/atomics.md
> > +++ b/gcc/config/aarch64/atomics.md
> > @@ -751,6 +751,29 @@
> >     [(set_attr "arch" "*,rcpc8_4")]
> >   )
> > +(define_insn "@aarch64_atomic_store_stshh<mode>"
> > +  [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "=Q,Ust")
> > +    (unspec_volatile:ALLI
> > +       [(match_operand:ALLI 1 "register_operand" "r,r")
> > +       (match_operand:SI 2 "const_int_operand")                    ;; model
> > +       (match_operand:SI 3 "const_int_operand")]           ;; ret_policy
> > +      UNSPECV_STSHH))]
> > +  ""
> > +  {
> > +    if (INTVAL (operands[3]) == 0)
> > +      output_asm_insn ("stshh keep", operands);
> > +    else
> > +      output_asm_insn ("stshh strm", operands);
> > +    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
> > +    if (is_mm_relaxed (model))
> > +      return "str<atomic_sfx>\t%<w>1, %0";
> > +    else if (which_alternative == 0)
> > +      return "stlr<atomic_sfx>\t%<w>1, %0";
> > +    else
> > +      return "stlur<atomic_sfx>\t%<w>1, %0";
> > +  }
> > +)
> > +
> >   (define_insn "@aarch64_load_exclusive<mode>"
> >     [(set (match_operand:SI 0 "register_operand" "=r")
> >       (zero_extend:SI
> > diff --git a/gcc/config/aarch64/iterators.md 
> > b/gcc/config/aarch64/iterators.md
> > index 517b2808b5f..97b4ece1444 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -1302,6 +1302,7 @@
> >       UNSPECV_LDA                   ; Represent an atomic load or 
> > load-acquire.
> >       UNSPECV_LDAP          ; Represent an atomic acquire load with RCpc 
> > semantics.
> >       UNSPECV_STL                   ; Represent an atomic store or 
> > store-release.
> > +    UNSPECV_STSHH          ; Represent an atomic store with an stshh hint.
> >       UNSPECV_ATOMIC_CMPSW  ; Represent an atomic compare swap.
> >       UNSPECV_ATOMIC_EXCHG  ; Represent an atomic exchange.
> >       UNSPECV_ATOMIC_CAS            ; Represent an atomic CAS.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_store_with_stshh.c.c 
> > b/gcc/testsuite/gcc.target/aarch64/atomic_store_with_stshh.c.c
> > new file mode 100644
> > index 00000000000..e5857df22d7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/atomic_store_with_stshh.c.c
> > @@ -0,0 +1,119 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8-a -save-temps" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +
> > +#include <arm_acle.h>
> > +
> > +/*
> > +** testFun1:
> > +** ...
> > +** stshh keep
> > +** strb    w[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun1 ()
> > +{
> > +  char item1 = 0;
> > +  char* ptr1 = &item1;
> > +  char test1 = 1;
> > +
> > +  __atomic_store_with_stshh (ptr1, test1, __ATOMIC_RELAXED, 0);
> > +}
> > +
> > +/*
> > +** testFun2:
> > +** ...
> > +** stshh keep
> > +** stlrh   w[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun2 ()
> > +{
> > +  short item2 = 10;
> > +  short* ptr2 = &item2;
> > +  short test2 = 11;
> > +  __atomic_store_with_stshh (ptr2, test2, __ATOMIC_RELEASE, 0);
> > +}
> > +
> > +/*
> > +** testFun3:
> > +** ...
> > +** stshh strm
> > +** stlr    w[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun3 ()
> > +{
> > +  unsigned int item3 = 10;
> > +  unsigned int* ptr3 = &item3;
> > +  unsigned int test3 = 11;
> > +  __atomic_store_with_stshh (ptr3, test3, __ATOMIC_SEQ_CST, 1);
> > +}
> > +
> > +/*
> > +** testFun4:
> > +** ...
> > +** stshh strm
> > +** str     x[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun4 ()
> > +{
> > +  long item4 = 10;
> > +  long* ptr4 = &item4;
> > +  long test4 = 11;
> > +  __atomic_store_with_stshh (ptr4, test4, __ATOMIC_RELAXED, 1);
> > +}
> > +
> > +/*
> > +** testFun5:
> > +** ...
> > +** stshh keep
> > +** stlr    x[0-9]+, \[sp\]
> > +** ...
> > +*/
> > +void
> > +testFun5 ()
> > +{
> > +  long item5 = 10;
> > +  long* ptr5 = &item5;
> > +  long test5item = 11;
> > +  long* test5 = &test5item;
> > +  __atomic_store_with_stshh (ptr5, test5, __ATOMIC_SEQ_CST, 0);
> > +}
> > +
> > +/*
> > +** testFun6:
> > +** ...
> > +** stshh keep
> > +** stlr    w[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun6 ()
> > +{
> > +  float item6 = 10;
> > +  float* ptr6 = &item6;
> > +  float test6 = 11;
> > +  __atomic_store_with_stshh (ptr6, test6, __ATOMIC_SEQ_CST, 0);
> > +}
> > +
> > +/*
> > +** testFun7:
> > +** ...
> > +** stshh strm
> > +** str     x[0-9]+, \[x[0-9]+\]
> > +** ...
> > +*/
> > +void
> > +testFun7 ()
> > +{
> > +  double item7 = 10;
> > +  double* ptr7 = &item7;
> > +  double test7 = 11;
> > +  __atomic_store_with_stshh (ptr7, test7, __ATOMIC_RELAXED, 1);
> > +}
> > \ No newline at end of file
> 

Reply via email to