[PATCH V2] [PATCH 3/5] [RISC-V] Generate Zicond instruction for select pattern with condition eq or neq to 0

2023-07-29 Thread Xiao Zeng
This patch recognizes Zicond patterns when the select pattern
has a condition of eq or neq to 0 (using eq as an example), namely:

1 rd = (rs2 == 0) ? non-imm : 0
2 rd = (rs2 == 0) ? non-imm : non-imm
3 rd = (rs2 == 0) ? reg : non-imm
4 rd = (rs2 == 0) ? reg : reg

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_expand_conditional_move): Recognize
Zicond patterns.
* config/riscv/riscv.md: Recognize Zicond patterns through movcc.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zicond-primitiveSemantics_return_0_imm.c: New test.
* gcc.target/riscv/zicond-primitiveSemantics_return_imm_imm.c: New test.
* gcc.target/riscv/zicond-primitiveSemantics_return_imm_reg.c: New test.
* gcc.target/riscv/zicond-primitiveSemantics_return_reg_reg.c: New test.
---
 gcc/config/riscv/riscv.cc | 144 ++
 gcc/config/riscv/riscv.md |   4 +-
 .../zicond-primitiveSemantics_return_0_imm.c  |  65 
 ...zicond-primitiveSemantics_return_imm_imm.c |  73 +
 ...zicond-primitiveSemantics_return_imm_reg.c |  65 
 ...zicond-primitiveSemantics_return_reg_reg.c |  65 
 6 files changed, 414 insertions(+), 2 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/zicond-primitiveSemantics_return_0_imm.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/zicond-primitiveSemantics_return_imm_imm.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/zicond-primitiveSemantics_return_imm_reg.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/zicond-primitiveSemantics_return_reg_reg.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 941ea25e1f2..6ac39f63dd7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3516,6 +3516,150 @@ riscv_expand_conditional_move (rtx dest, rtx op, rtx 
cons, rtx alt)
  cond, cons, alt)));
   return true;
 }
+  else if (TARGET_ZICOND
+   && (code == EQ || code == NE)
+   && GET_MODE_CLASS (mode) == MODE_INT)
+{
+  need_eq_ne_p = true;
+  /* 0 + imm  */
+  if (GET_CODE (cons) == CONST_INT && cons == const0_rtx
+  && GET_CODE (alt) == CONST_INT && alt != const0_rtx)
+{
+  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
+  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+  alt = force_reg (mode, alt);
+  emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cond,
+  cons, alt)));
+  return true;
+}
+  /* imm + imm  */
+  else if (GET_CODE (cons) == CONST_INT && cons != const0_rtx
+   && GET_CODE (alt) == CONST_INT && alt != const0_rtx)
+{
+  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
+  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+  rtx reg = gen_reg_rtx (mode);
+  rtx temp = GEN_INT (INTVAL (alt) - INTVAL (cons));
+  emit_insn (gen_rtx_SET (reg, temp));
+  emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cond,
+  CONST0_RTX 
(mode),
+  reg)));
+  riscv_emit_binary (PLUS, dest, dest, cons);
+  return true;
+}
+  /* imm + reg  */
+  else if (GET_CODE (cons) == CONST_INT && cons != const0_rtx
+   && GET_CODE (alt) == REG)
+{
+  /* Optimize for register value of 0.  */
+  if (op0 == alt && op1 == const0_rtx)
+{
+  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+  cons = force_reg (mode, cons);
+  emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cond,
+  cons, alt)));
+  return true;
+}
+  /* Handle the special situation of: -2048 == INTVAL (cons)
+ to avoid failure due to an unrecognized insn. Let the costing
+ model determine if the conditional move sequence is better
+ than the branching sequence.  */
+  if (-2048 == INTVAL (cons))
+{
+  rtx reg = gen_reg_rtx (mode);
+  emit_insn (gen_rtx_SET (reg, cons));
+  return riscv_expand_conditional_move (dest, op, reg, alt);
+}
+  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
+  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
+  rtx temp = GEN_INT (-1 * INTVAL (cons));
+  riscv_emit_binary (PLUS, alt, alt, temp);
+  emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cond,
+  CONST0_RTX 
(mode),
+ 

Re: Re: [PATCH 3/5] [RISC-V] Generate Zicond instruction for select pattern with condition eq or neq to 0

2023-07-29 Thread Xiao Zeng
On Sat, Jul 29, 2023 at 04:59:00 AM  Jeff Law  wrote:
>
>
>
>On 7/19/23 04:11, Xiao Zeng wrote:
>
>> +  else if (TARGET_ZICOND
>> +   && (code == EQ || code == NE)
>> +   && GET_MODE_CLASS (mode) == MODE_INT)
>> +    {
>> +  need_eq_ne_p = true;
>> +  /* 0 + imm  */
>> +  if (GET_CODE (cons) == CONST_INT && cons == const0_rtx
>> +  && GET_CODE (alt) == CONST_INT && alt != const0_rtx)
>> +    {
>> +  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
>> +  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
>> +  alt = force_reg (mode, alt);
>> +  emit_insn (gen_rtx_SET (dest,
>> +  gen_rtx_IF_THEN_ELSE (mode, cond,
>> +    cons, alt)));
>> +  return true;
>> +    }
>> +  /* imm + imm  */
>> +  else if (GET_CODE (cons) == CONST_INT && cons != const0_rtx
>> +   && GET_CODE (alt) == CONST_INT && alt != const0_rtx)
>> +    {
>> +  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
>> +  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
>> +  alt = force_reg (mode, alt);
>> +  rtx temp1 = gen_reg_rtx (mode);
>> +  rtx temp2 = GEN_INT(-1 * INTVAL (cons));
>> +  riscv_emit_binary(PLUS, temp1, alt, temp2);
>So in this sequence you're just computing a constant since both ALT and
>CONS are constants.  It's better to just form the constant directly,
>then force that into a register because it'll make the costing more
>correct, particularly if the resulting constant needs more than one
>instruction to synthesize. 

Fixed

>
>And a nit.  There should always be a space between a function name and
>its argument list. 

Fixed

>
>
>
>> +  emit_insn (gen_rtx_SET (dest,
>> +  gen_rtx_IF_THEN_ELSE (mode, cond,
>> +    const0_rtx, alt)));
>> +  riscv_emit_binary(PLUS, dest, dest, cons);
>> +  return true;
>I don't see how this can be correct from a code generation standpoint.
>You compute ALT-CONS into TEMP1 earlier.  But you never use TEMP1 after
>that.  I think you meant to use TEMP1 instead of ALT as the false arm of
>the IF-THEN-ELSE you constructed. 

Fixed

>
>In general you should be using CONST0_RTX (mode) rather than const0_rtx.
> 

Fixed

>> +    }
>> +  /* imm + reg  */
>> +  else if (GET_CODE (cons) == CONST_INT && cons != const0_rtx
>> +   && GET_CODE (alt) == REG)
>> +    {
>> +  /* Optimize for register value of 0.  */
>> +  if (op0 == alt && op1 == const0_rtx)
>> +    {
>> +  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
>> +  cons = force_reg (mode, cons);
>> +  emit_insn (gen_rtx_SET (dest,
>> +  gen_rtx_IF_THEN_ELSE (mode, cond,
>> +    cons, alt)));
>> +  return true;
>> +    }
>> +  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
>> +  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
>> +  rtx temp1 = gen_reg_rtx (mode);
>> +  rtx temp2 = GEN_INT(-1 * INTVAL (cons));
>> +  riscv_emit_binary(PLUS, temp1, alt, temp2);
>Here you have to be careful if CONS is -2048.  You negate it resulting
>in +2048 which can't be used in an addi.  This will cause the entire
>sequence to fail due to an unrecognized insn.  It would be better to
>handle that scenario directly so the generated sequence is still valid.
>
>By generating recognizable code in that case we let the costing model
>determine if the conditional move sequence is better than the branching
>sequence. 

Thank you for pointing out this special situation, it has been fixed

>
>
>> +  emit_insn (gen_rtx_SET (dest,
>> +  gen_rtx_IF_THEN_ELSE (mode, cond,
>> +    const0_rtx, alt)));
>I think we have the same problem with the use of ALT here rather than
>TEMP1 that we had in the previous case. 

Fixed

>
>
>
>> +  /* reg + imm  */
>> +  else if (GET_CODE (cons) == REG
>> +   && GET_CODE (alt) == CONST_INT && alt != const0_rtx)
>> +    {
>> +  /* Optimize for register value of 0.  */
>> +  if (op0 == cons && op1 == const0_rtx)
>> +    {
>> +  rtx cond = gen_rtx_fmt_ee (code, GET_MODE (op0), op0, op1);
>> +  alt = force_reg (mode, alt);
>> +  emit_insn (gen_rtx_SET (dest,
>> +  gen_rtx_IF_THEN_ELSE (mode, cond,
>> +    cons, alt)));
>> +  return true;
>> +    }
>> +  riscv_emit_int_compare (&code, &op0, &op1, need_eq_ne_p);
>> +

Re: Re: [PATCH 3/5] [RISC-V] Generate Zicond instruction for select pattern with condition eq or neq to 0

2023-07-29 Thread Xiao Zeng
On Fri, Jul 28, 2023 at 11:09:00 PM  Jeff Law  wrote:
>
>
>
>On 7/25/23 11:55, Andreas Schwab wrote:
>> On Jul 19 2023, Xiao Zeng wrote:
>>
>>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>>> index 38d8eb2fcf5..7e6b24bd232 100644
>>> --- a/gcc/config/riscv/riscv.cc
>>> +++ b/gcc/config/riscv/riscv.cc
>>> @@ -2448,6 +2448,17 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
>>> outer_code, int opno ATTRIBUTE_UN
>>>     *total = COSTS_N_INSNS (1);
>>>     return true;
>>>   }
>>> +  else if (TARGET_ZICOND && outer_code == SET &&
>>> +   ((GET_CODE (XEXP (x, 1)) == REG && XEXP (x, 2) == 
>>> const0_rtx) ||
>>> +   (GET_CODE (XEXP (x, 2)) == REG && XEXP (x, 1) == 
>>> const0_rtx) ||
>>> +   (GET_CODE (XEXP (x, 1)) == REG && GET_CODE (XEXP (x, 2)) &&
>>> +    XEXP (x, 1) == XEXP (XEXP (x, 0), 0)) ||
>>> +   (GET_CODE (XEXP (x, 1)) == REG && GET_CODE (XEXP (x, 2)) &&
>>> +    XEXP (x, 2) == XEXP (XEXP (x, 0), 0
>>
>> Line breaks before the operator, not after.
>Also note that && GET_CODE (XEXP (x, 2)) && that appears twice. 

This is an error that I will fix in patch[cost] and provide a detailed 
explanation.

>
>That just verifies the code isn't RTX_UNKNOWN which I suspect isn't what
>the author intended.  It probably needs to be adjusted for SUBREGs and
>the pointer equality issues with REGs after reload.
>
>I'll take care of these goofs since the costing ought to be able to move
>forward independently of the improvements Xiao made to generating
>conditional move sequences.
>
>Jeff 

After V2-patch[3/5] is accepted, a patch[cost] will be submitted to provide a
detailed explanation of this issue. Of course, as Jeff mentioned, some issues
will also be fixed.

Thanks
Xiao Zeng

Re: Re: [PATCH 0/5] Recognize Zicond extension

2023-07-29 Thread Xiao Zeng
On Fri, Jul 28, 2023 at 11:03:00 PM  Jeff Law  wrote:
>
>
>
>On 7/28/23 00:34, Xiao Zeng wrote:
>

 Does that work for you?
>>> I'm going to look at 3/5 today pretty closely.  Exposing zicond to
>>> movcc is something we had implemented inside Ventana and I want to
>>> compare/contrast your work with ours.
>>
>> What a coincidence!
>Zicond is a direct descendant of xventanacondops.  The only notable
>difference is in their encodings. 
It explains the matter.

>
>>
>>>
>>> What I like about yours is it keeps all the logic in riscv.cc rather
>>> than scattering it across riscv.cc and riscv.md.
>>
>> Yes, when I use enough test cases, I cannot find a concise way to optimize
>> all test cases. When I enumerated all possible cases in the movcc
>> function of the RISC-V backend, I found a method that satisfied me, which
>> is the method in patch [3/5].
>I got pulled away to another task yesterday, so didn't get as far as I
>wanted.   The biggest insight from yesterday was determining that some of
>the cases you're handling in riscv_expand_conditional_move were things
>we were doing inside ifcvt.cc.
>
>The difference is likely because the initial work on zicond here was
>primarily driven by changes to ifcvt.  It was only after evaluating that
>initial implementation that we started the effort to use zicond at
>RTL expansion time.
>
>I could make a case for either approach, but the more I ponder them the
>more I'm inclined to go with something like yours.  

>We want to capture
>the cases implementable as a conditional move as early as possible in
>the RTL pipeline rather than relying on ifcvt to catch it later.  It
>also avoids polluting ifcvt with transformations that are only likely
>needed on risc-v. 
That's why I did this optimization in riscv.cc riscv_expand_conditional_move.

>
>
>>>
>>
>> If it's just for the Zicond instruction set, is it necessary to make 
>> judgments
>> outside of eq/ne? After all, it does not support comparison actions other
>> than eq/ne. Of course, it is also possible to use a special technique to use
>> Zicond in non eq/ne comparisons.
>It's not necessary, but it's definitely helpful to cover the other
>conditions.  In fact, we can even cover a variety of fp conditions by
>utilizing the sCC type insns. 
It would be great if we could do this.

>
>
>So what I'm looking at for patch #3 is to split out the costing bits
>into its own patch which can go forward immediately.  
As you expected, V2-patch[3/5] has arrived,
and its address is: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625781.html

>THen continue
>evaluating the best way to handle unifying the expander/canonicalization
>code.
That's nice.
  
>Your testcases in patch #3 are particularly helpful to make sure
>we're not missing cases. 
Yes, I have always believed that test cases can be redundant, but they cannot
be omitted. As we all know, the compiler will always make some magical changes
without our knowledge, which may not be what we expect. And test cases
can help us stay away from this risk.

>
>Jeff 

Thanks
Xiao Zeng

Re: [patch] libgomp: cuda.h and omp_target_memcpy_rect cleanup (was: [patch] OpenMP: Call cuMemcpy2D/cuMemcpy3D for nvptx for omp_target_memcpy_rect)

2023-07-29 Thread Tobias Burnus

Now committed as r14-2865-g8b9e559fe7ca5715c74115322af99dbf9137a399

Tobias

On 28.07.23 13:51, Tobias Burnus wrote:

thanks for proof reading and the suggestions! – Do you have comments on the
attached patch?

* * *

Crossref: For further optimizations, see also

https://gcc.gnu.org/PR101581 — [OpenMP] omp_target_memcpy – support
inter-device memcpy
https://gcc.gnu.org/PR110813 — [OpenMP] omp_target_memcpy_rect (+
strided 'target update'): Improve GCN performance and contiguous
subranges

and just added based on Thomas' comment:

https://gcc.gnu.org/PR107424 — [OpenMP] Check whether device locking is
really needed for bare memcopy to/from devices (omp_target_memcpy...)

* * *

On 27.07.23 23:00, Thomas Schwinge wrote:

+++ b/include/cuda/cuda.h

I note that you're not actually using everything you're adding here.
(..., but I understand you're simply adding everything that relates to
these 'cuMemcpy[...]' routines -- OK as far as I'm concerned.)


Yes. That was on purpose to make it easier to pick something when needed
– especially as we might want to use some of those later on.

For symmetry, I now also added cuMemcpyPeer + ...Async, which also
remain unused. (But could be used as part of the PRs linked above.)


+  const void *dstHost;

That last one isn't 'const'.  ;-)

Fixed - three times.

A 'cuda.h' that I looked at calls that last one 'reserved0', with
comment
"Must be NULL".

Seems to be unused in real world code and in the documentation. But
let's use this name as it might be exposed in the wild.

--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
+extern int GOMP_OFFLOAD_memcpy2d (int, int, size_t, size_t,
+   void*, size_t, size_t, size_t,
+   const void*, size_t, size_t, size_t);
+extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t,
void *,
+   size_t, size_t, size_t, size_t, size_t,
+   const void *, size_t, size_t,
size_t, size_t,
+   size_t);

Oh, wow.  ;-)


Maybe this is not the best ABI. We can consider modifying it before the
GCC 14 release. (And in principle also afterwards, given that libgomp
and its plugins should™ be compiled and installed alongside.)

I think once we know how to implement GCN, we will see whether it was
done smartly or whether other arguments should be used or whether the
two functions should be combined.

[Regarding the reserve0/reserve1 values for cuMemcpy3D and whether it
should be NULL or not; quoting the usage in plugin-nvptx.c:]


I note that this doesn't adhere to the two "Must be NULL" remarks from
above -- but I'm confused, because, for example, on
capabilities
+& GOMP_OFFLOAD_CAP_SHARED_MEM)))

Are these 'GOMP_OFFLOAD_CAP_SHARED_MEM' actually reachable, given that
'omp_target_memcpy_check' (via 'omp_target_memcpy_rect_check') clears
out
the device to 'NULL' for 'GOMP_OFFLOAD_CAP_SHARED_MEM'?


I have now undone this change – I did not dig deep enough into the
function calls.



+  else if (dst_devicep == NULL && src_devicep == NULL)
+ {
+   memcpy ((char *) dst + dst_off, (const char *) src + src_off,
+   length);
+   ret = 1;
+ }
else if (src_devicep == dst_devicep)
   ret = src_devicep->dev2dev_func (src_devicep->target_id,
(char *) dst + dst_off,
(const char *) src + src_off,
length);

..., but also left the intra-device case here -- which should now be
dead
code here?


Why? Unless I missed something, the old, the current, and the proposed
(= old) code do still run this code.

I have not added an assert to confirm, but in any case, it is tested for
in my recently added testcase - thus, we could add a 'printf' to confirm.


+   else if (*tmp_size < length)
+ {
+   *tmp_size = length;
+   *tmp = realloc (*tmp, length);
+   if (*tmp == NULL)
+ return ENOMEM;

If 'realloc' returns 'NULL', we should 'free' the original '*tmp'?

Do we really need the property here that if the re-allocation can't
be done in-place, 'realloc' copies the original content to the new?  In
other words, should we just unconditionally 'free' and re-'malloc' here,
instead of 'realloc'?

I have now done so – but I am not really sure what's faster on average.
If it can be enlarged, 'realloc' is f

RE: [PATCH] Change fma_reassoc_width tuning for ampere1

2023-07-29 Thread Di Zhao OS via Gcc-patches
Cherry-picked this to gcc-13.

Thanks,
Di Zhao

> -----Original Message-----
> From: Richard Sandiford 
> Sent: Monday, June 26, 2023 10:28 PM
> To: Philipp Tomsich 
> Cc: Di Zhao OS via Gcc-patches ; Di Zhao OS
> 
> Subject: Re: [PATCH] Change fma_reassoc_width tuning for ampere1
> 
> Philipp Tomsich  writes:
> > Richard,
> >
> > OK for backport to GCC-13?
> 
> Yeah, OK for GCC 13 too.
> 
> Thanks,
> Richard
> 
> > Thanks,
> > Philipp.
> >
> > On Thu, 22 Jun 2023 at 16:18, Richard Sandiford via Gcc-patches
> >  wrote:
> >>
> >> Di Zhao OS via Gcc-patches  writes:
> >> > This patch enables reassociation of floating-point additions on ampere1.
> >> > This brings about 1% overall benefit on spec2017 fprate cases. (There
> >> > are minor regressions in 510.parest_r and 508.namd_r, analyzed here:
> >> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110279 .)
> >> >
> >> > Bootstrapped and tested on aarch64-unknown-linux-gnu. Is this OK for
> trunk?
> >> >
> >> > Thanks,
> >> > Di Zhao
> >> >
> >> > gcc/ChangeLog:
> >> >
> >> > * config/aarch64/aarch64.cc: Change fma_reassoc_width for
> ampere1
> >>
> >> Thanks, pushed to trunk.
> >>
> >> Richard
> >>
> >> > ---
> >> > diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc
> >> > index d16565b5581..301c9f6c0cd 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -1927,7 +1927,7 @@ static const struct tune_params ampere1_tunings =
> >> >"32:12",   /* loop_align.  */
> >> >2, /* int_reassoc_width.  */
> >> >4, /* fp_reassoc_width.  */
> >> > -  1, /* fma_reassoc_width.  */
> >> > +  4, /* fma_reassoc_width.  */
> >> >2, /* vec_reassoc_width.  */
> >> >2, /* min_div_recip_mul_sf.  */
> >> >2, /* min_div_recip_mul_df.  */


Re: [PATCH v5 4/5] c++modules: report imported CMI files as dependencies

2023-07-29 Thread Ben Boeckel via Gcc-patches
On Thu, Jul 27, 2023 at 18:13:48 -0700, Jason Merrill wrote:
> On 7/23/23 20:26, Ben Boeckel wrote:
> > Sure, *CMake* knows them, but the *build tool* needs to be told
> > (typically `make` or `ninja`) because it is what is actually executing
> > the build graph. The way this is communicated is via `-MF` files and
> > that's what I'm providing in this patch. Note that `ninja` does not
> > allow rules to specify such dependencies for other rules than the one it
> > is reading the file for.
> 
> But since the direct imports need to be rebuilt themselves if the 
> transitive imports change, the build graph should be the same whether or 
> not the transitive imports are repeated?  Either way, if a transitive 
> import changes you need to rebuild the direct import and then the importer.

I suppose I have seen enough bad build systems that don't do everything
correctly that I'm interested in creating "pits of success" rather than
"well, you didn't get thing X 100% correct, so you're screwed here too".

The case that I think is most likely here is that someone has a
"superbuild" with 3 projects A, B, and C where C uses B and B uses A. At
the top-level the superbuild exposes just "make projectA
projectB projectC"-granularity (rather than a combined build graph; they
may use different build systems) and then users go into some projectC
directly and forget to update projectB after updating projectA (known to
all use the same compiler/flags so that CMI sharing is possible). The
build is still broken, but ideally they get notified in some useful way
when rebuilding the TU rather than…whatever ends up catching the problem
incidentally.

> I guess it shouldn't hurt to have the transitive imports in the -MF 
> file, as long as they aren't also in the p1689 file, so I'm not 
> particularly opposed to this change, but I don't see how it makes a 
> practical difference.

Correct. The P1689 shouldn't even know about transitive imports (well,
maybe from header units?) as it just records "I saw an `import`
statement" and should never look up CMI files (indeed, we would need
another scanning step to know what CMI files to create for the P1689
scan if they were necessary…).

--Ben


Re: [PATCH v3 0/4] diagnostics: libcpp: Overhaul locations for _Pragma tokens

2023-07-29 Thread Lewis Hyatt via Gcc-patches
On Fri, Jul 28, 2023 at 6:22 PM David Malcolm  wrote:
>
> On Fri, 2023-07-21 at 19:08 -0400, Lewis Hyatt wrote:
> > Hello-
> >
> > This is an update to the v2 patch series last sent in January:
> > https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609473.html
> >
> > While I did not receive any feedback on the v2 patches yet, they did
> > need some
> > rebasing on top of other recent commits to input.cc, so I thought it
> > would be
> > helpful to send them again now. The patches have not otherwise
> > changed from
> > v2, and the above-linked message explains how all the patches fit in
> > with the
> > original v1 series sent last November.
> >
> > Dave, I would appreciate it very much if you could please let me know
> > what you
> > think of this approach? I feel like the diagnostics we currently
> > output for _Pragmas are worth improving. As a reminder, say for this
> > example:
> >
> > =
> >  #define S "GCC diagnostic ignored \"oops"
> >  _Pragma(S)
> > =
> >
> > We currently output:
> >
> > =
> > file.cpp:2:24: warning: missing terminating " character
> > 2 | _Pragma(S)
> >   |^
> > =
> >
> > While after these patches, we would output:
> >
> > ==
> > :1:24: warning: missing terminating " character
> > 1 | GCC diagnostic ignored "oops
> >   |^
> > file.cpp:2:1: note: in <_Pragma directive>
> > 2 | _Pragma(S)
> >   | ^~~
> > ==
> >
> > Thanks!
>
> Hi Lewis; sorry for not responding to the v2 patches.
>
> I've started looking at the v3 patches in detail, but I have some high-
> level questions about memory usage:
>
> Am I right in thinking that the effect of this patch is that for every
> _Pragma in the source we will create a new line_map_ordinary, and a new
> buffer for the stringified content of that _Pragma, and that these
> allocations will persist for the rest of the compilation?  (plus a
> little extra allocation within the "location_t" space from 0 to
> 0x7fffffff).
>
> It sounds like this will probably be a rounding error that won't be
> noticable in profiling, but did you attempt any such measurement of the
> memory usage before/after this patch on some real-world projects?
>
> Thanks
> Dave
>

Thanks for looking at the patches, I appreciate it whenever you have
time to get to them.

This is a fair point about the memory usage, basically it means that
each instance of a _Pragma has comparable memory footprint to a macro
definition. (In addition to the overheads you mentioned, it also
creates a macro map to generate a virtual location for the tokens, so
that it's able to output the "in expansion of _Pragma" note. That part
can be disabled with -ftrack-macro-expansion=0 at least.)

I had the sense that _Pragma isn't used often enough for that to be a
problem, but agreed it is worth checking. (I really hope this memory
usage isn't an issue since there are also numerous PRs complaining
about 32-bit limitations in location tracking, that make it tempting
to explore 64-bit line maps or some other option someday too.)

I tried one thing now, wxWidgets uses a lot of diagnostic pragmas
wrapped up inside macros that use _Pragma. (See
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578). The testsuite
contains a file allheaders.cpp which includes the whole library, so I
tried compiling this into a pch, which I believe measures the entire
memory footprint including the ordinary and macro line maps and the
_Pragma strings. The resulting PCH sizes were:

279000173 bytes before the changes
279491345 bytes after the changes

So 0.1% bigger. Happy to check other projects too, do you have any
standard gotos? Maybe firefox or something I take it.

I see your other response on patch #1, I am thinking about that and
will reply later. Thanks again!

-Lewis


New Swedish PO file for 'gcc' (version 13.2.0)

2023-07-29 Thread Translation Project Robot
Hello, gentle maintainer.

This is a message from the Translation Project robot.

A revised PO file for textual domain 'gcc' has been submitted
by the Swedish team of translators.  The file is available at:

https://translationproject.org/latest/gcc/sv.po

(This file, 'gcc-13.2.0.sv.po', has just now been sent to you in
a separate email.)

All other PO files for your package are available in:

https://translationproject.org/latest/gcc/

Please consider including all of these in your next release, whether
official or a pretest.

Whenever you have a new distribution with a new version number ready,
containing a newer POT file, please send the URL of that distribution
tarball to the address below.  The tarball may be just a pretest or a
snapshot, it does not even have to compile.  It is just used by the
translators when they need some extra translation context.

The following HTML page has been updated:

https://translationproject.org/domain/gcc.html

If any question arises, please contact the translation coordinator.

Thank you for all your work,

The Translation Project robot, in the
name of your translation coordinator.




Re: [PATCH v3 0/4] diagnostics: libcpp: Overhaul locations for _Pragma tokens

2023-07-29 Thread David Malcolm via Gcc-patches
On Sat, 2023-07-29 at 10:27 -0400, Lewis Hyatt wrote:
> On Fri, Jul 28, 2023 at 6:22 PM David Malcolm 
> wrote:
> > 
> > On Fri, 2023-07-21 at 19:08 -0400, Lewis Hyatt wrote:
> > > Hello-
> > > 
> > > This is an update to the v2 patch series last sent in January:
> > > https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609473.html
> > > 
> > > While I did not receive any feedback on the v2 patches yet, they
> > > did
> > > need some
> > > rebasing on top of other recent commits to input.cc, so I thought
> > > it
> > > would be
> > > helpful to send them again now. The patches have not otherwise
> > > changed from
> > > v2, and the above-linked message explains how all the patches fit
> > > in
> > > with the
> > > original v1 series sent last November.
> > > 
> > > Dave, I would appreciate it very much if you could please let me
> > > know
> > > what you
> > > think of this approach? I feel like the diagnostics we currently
> > > output for _Pragmas are worth improving. As a reminder, say for
> > > this
> > > example:
> > > 
> > > =
> > >  #define S "GCC diagnostic ignored \"oops"
> > >  _Pragma(S)
> > > =
> > > 
> > > We currently output:
> > > 
> > > =
> > > file.cpp:2:24: warning: missing terminating " character
> > >     2 | _Pragma(S)
> > >   |    ^
> > > =
> > > 
> > > While after these patches, we would output:
> > > 
> > > ==
> > > :1:24: warning: missing terminating " character
> > >     1 | GCC diagnostic ignored "oops
> > >   |    ^
> > > file.cpp:2:1: note: in <_Pragma directive>
> > >     2 | _Pragma(S)
> > >   | ^~~
> > > ==
> > > 
> > > Thanks!
> > 
> > Hi Lewis; sorry for not responding to the v2 patches.
> > 
> > I've started looking at the v3 patches in detail, but I have some
> > high-
> > level questions about memory usage:
> > 
> > Am I right in thinking that the effect of this patch is that for
> > every
> > _Pragma in the source we will create a new line_map_ordinary, and a
> > new
> > buffer for the stringified content of that _Pragma, and that these
> > allocations will persist for the rest of the compilation?  (plus a
> > little extra allocation within the "location_t" space from 0 to
> > > 0x7fffffff).
> > 
> > It sounds like this will probably be a rounding error that won't be
> > > noticeable in profiling, but did you attempt any such measurement of
> > the
> > memory usage before/after this patch on some real-world projects?
> > 
> > Thanks
> > Dave
> > 
> 
> Thanks for looking at the patches, I appreciate it whenever you have
> time to get to them.
> 
> This is a fair point about the memory usage, basically it means that
> each instance of a _Pragma has comparable memory footprint to a macro
> definition. (In addition to the overheads you mentioned, it also
> creates a macro map to generate a virtual location for the tokens, so
> that it's able to output the "in expansion of _Pragma" note. That
> part
> can be disabled with -ftrack-macro-expansion=0 at least.)
> 
> I had the sense that _Pragma isn't used often enough for that to be a
> problem, but agreed it is worth checking. (I really hope this memory
> usage isn't an issue since there are also numerous PRs complaining
> about 32-bit limitations in location tracking, that make it tempting
> to explore 64-bit line maps or some other option someday too.)
> 
> I tried one thing now, wxWidgets uses a lot of diagnostic pragmas
> wrapped up inside macros that use _Pragma. (See
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578). The testsuite
> contains a file allheaders.cpp which includes the whole library, so I
> tried compiling this into a pch, which I believe measures the entire
> memory footprint including the ordinary and macro line maps and the
> _Pragma strings. The resulting PCH sizes were:
> 
> 279000173 bytes before the changes
> 279491345 bytes after the changes
> 
> So 0.1% bigger. Happy to check other projects too, do you have any
> standard gotos? Maybe firefox or something I take it.

Thanks for doing that test; I think that slight increase on a heavy
user of _Pragma is acceptable.
> 
> I see your other response on patch #1, I am thinking about that and
> will reply later. Thanks again!

Thanks.  Hope that my patch #1 response makes sense and that I'm not
missing something about the way this works.

Dave



[Committed] Use QImode for offsets in zero_extract/sign_extract in i386.md (take #2)

2023-07-29 Thread Roger Sayle

This patch reattempts to change the ZERO_EXTRACTs and SIGN_EXTRACTs
in i386.md to consistently use QImode for bit offsets (i.e. third and fourth
operands), matching the use of QImode for bit counts in shifts and rotates.

This iteration corrects the "ne:QI" vs "eq:QI" mistake in the previous
version, which was responsible for PR 110787 and PR 110790 and so was
rapidly reverted last weekend.  New test cases have been added to check
the correct behaviour.

This patch has been tested on x86_64-pc-linux-gnu with and without
--enable-languages="all", with make bootstrap and make -k check, both
with and without --target_board=unix{-m32} with no new failures.
Committed to mainline as an obvious fix to the previously approved
patch.  Sorry again for the temporary inconvenience, and thanks to
Rainer Orth for identifying/confirming the problematic patch.


2023-07-29  Roger Sayle  

gcc/ChangeLog
PR target/110790
* config/i386/i386.md (extv): Use QImode for offsets.
(extzv): Likewise.
(insv): Likewise.
(*testqi_ext_3): Likewise.
(*btr_2): Likewise.
(define_split): Likewise.
(*btsq_imm): Likewise.
(*btrq_imm): Likewise.
(*btcq_imm): Likewise.
(define_peephole2 x3): Likewise.
(*bt): Likewise.
(*bt_mask): New define_insn_and_split.
(*jcc_bt): Use QImode for offsets.
(*jcc_bt_1): Delete obsolete pattern.
(*jcc_bt_mask): Use QImode offsets.
(*jcc_bt_mask_1): Likewise.
(define_split): Likewise.
(*bt_setcqi): Likewise.
(*bt_setncqi): Likewise.
(*bt_setnc): Likewise.
(*bt_setncqi_2): Likewise.
(*bt_setc_mask): New define_insn_and_split.
(bmi2_bzhi_3): Use QImode offsets.
(*bmi2_bzhi_3): Likewise.
(*bmi2_bzhi_3_1): Likewise.
(*bmi2_bzhi_3_1_ccz): Likewise.
(@tbm_bextri_): Likewise.

gcc/testsuite/ChangeLog
PR target/110790
* gcc.target/i386/pr110790-1.c: New test case.
* gcc.target/i386/pr110790-2.c: Likewise.


diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4db210c..efac228 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3312,8 +3312,8 @@
 (define_expand "extv"
   [(set (match_operand:SWI24 0 "register_operand")
(sign_extract:SWI24 (match_operand:SWI24 1 "register_operand")
-   (match_operand:SI 2 "const_int_operand")
-   (match_operand:SI 3 "const_int_operand")))]
+   (match_operand:QI 2 "const_int_operand")
+   (match_operand:QI 3 "const_int_operand")))]
   ""
 {
   /* Handle extractions from %ah et al.  */
@@ -3340,8 +3340,8 @@
 (define_expand "extzv"
   [(set (match_operand:SWI248 0 "register_operand")
(zero_extract:SWI248 (match_operand:SWI248 1 "register_operand")
-(match_operand:SI 2 "const_int_operand")
-(match_operand:SI 3 "const_int_operand")))]
+(match_operand:QI 2 "const_int_operand")
+(match_operand:QI 3 "const_int_operand")))]
   ""
 {
   if (ix86_expand_pextr (operands))
@@ -3428,8 +3428,8 @@
 
 (define_expand "insv"
   [(set (zero_extract:SWI248 (match_operand:SWI248 0 "register_operand")
-(match_operand:SI 1 "const_int_operand")
-(match_operand:SI 2 "const_int_operand"))
+(match_operand:QI 1 "const_int_operand")
+(match_operand:QI 2 "const_int_operand"))
 (match_operand:SWI248 3 "register_operand"))]
   ""
 {
@@ -10788,8 +10788,8 @@
 (match_operator 1 "compare_operator"
  [(zero_extract:SWI248
 (match_operand 2 "int_nonimmediate_operand" "rm")
-(match_operand 3 "const_int_operand")
-(match_operand 4 "const_int_operand"))
+(match_operand:QI 3 "const_int_operand")
+(match_operand:QI 4 "const_int_operand"))
   (const_int 0)]))]
   "/* Ensure that resulting mask is zero or sign extended operand.  */
INTVAL (operands[4]) >= 0
@@ -15965,7 +15965,7 @@
   [(set (zero_extract:HI
  (match_operand:SWI12 0 "nonimmediate_operand")
  (const_int 1)
- (zero_extend:SI (match_operand:QI 1 "register_operand")))
+ (match_operand:QI 1 "register_operand"))
(const_int 0))
(clobber (reg:CC FLAGS_REG))]
   "TARGET_USE_BT && ix86_pre_reload_split ()"
@@ -15989,7 +15989,7 @@
   [(set (zero_extract:HI
  (match_operand:SWI12 0 "register_operand")
  (const_int 1)
- (zero_extend:SI (match_operand:QI 1 "register_operand")))
+ (match_operand:QI 1 "register_operand"))
(const_int 0))
(clobber (reg:CC FLAGS_REG))]
   "TARGET_USE_BT && ix86_pre_reload_split ()"
@@ -16016,7 +16016,7 @@
 (define_insn "*btsq_imm"
   [(set (zero_extract:DI (match_ope

[PATCHv2] tree-optimization: [PR100864] `(a&!b) | b` is not optimized to `a | b` for comparisons

2023-07-29 Thread Andrew Pinski via Gcc-patches
This is a new version of the patch.
Instead of matching the inverted comparison directly inside match.pd,
this version adds a new function (bitwise_not_equal_p) to do it.
It is very similar to bitwise_equal_p, which was added in
r14-2751-g2a3556376c69a1fb, but instead it checks `expr1 == ~expr2`.
A follow-on patch will use this function in other patterns where we
try to match `@0` and `(bit_not @0)`.

Note I am not a fan of the name of the function, as it gives off `a != b` vibes
rather than `a == ~b` vibes. If anyone can think of a better name, that would be
nice.

OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

PR tree-optimization/100864

gcc/ChangeLog:

* generic-match-head.cc (bitwise_not_equal_p): New function.
* gimple-match-head.cc (bitwise_not_equal_p): New macro.
(gimple_bitwise_not_equal_p): New function.
* match.pd ((~x | y) & x): Use bitwise_not_equal_p
instead of direct matching bit_not.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/bitops-3.c: New test.
---
 gcc/generic-match-head.cc| 42 ++
 gcc/gimple-match-head.cc | 71 
 gcc/match.pd |  5 +-
 gcc/testsuite/gcc.dg/tree-ssa/bitops-3.c | 67 ++
 4 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/bitops-3.c

diff --git a/gcc/generic-match-head.cc b/gcc/generic-match-head.cc
index a71c0727b0b..3b922d0c063 100644
--- a/gcc/generic-match-head.cc
+++ b/gcc/generic-match-head.cc
@@ -121,3 +121,45 @@ bitwise_equal_p (tree expr1, tree expr2)
 return wi::to_wide (expr1) == wi::to_wide (expr2);
   return operand_equal_p (expr1, expr2, 0);
 }
+
+/* Return true if EXPR1 and EXPR2 have the bitwise opposite value,
+   but not necessarily same type.
+   The types can differ through nop conversions.  */
+
+static inline bool
+bitwise_not_equal_p (tree expr1, tree expr2)
+{
+  STRIP_NOPS (expr1);
+  STRIP_NOPS (expr2);
+  if (expr1 == expr2)
+return false;
+  if (!tree_nop_conversion_p (TREE_TYPE (expr1), TREE_TYPE (expr2)))
+return false;
+  if (TREE_CODE (expr1) == INTEGER_CST && TREE_CODE (expr2) == INTEGER_CST)
+return wi::to_wide (expr1) == ~wi::to_wide (expr2);
+  if (operand_equal_p (expr1, expr2, 0))
+return false;
+  if (TREE_CODE (expr1) == BIT_NOT_EXPR
+  && bitwise_equal_p (TREE_OPERAND (expr1, 0), expr2))
+return true;
+  if (TREE_CODE (expr2) == BIT_NOT_EXPR
+  && bitwise_equal_p (expr1, TREE_OPERAND (expr2, 0)))
+return true;
+  if (COMPARISON_CLASS_P (expr1)
+  && COMPARISON_CLASS_P (expr2))
+{
+  tree op10 = TREE_OPERAND (expr1, 0);
+  tree op20 = TREE_OPERAND (expr2, 0);
+  if (!operand_equal_p (op10, op20))
+   return false;
+  tree op11 = TREE_OPERAND (expr1, 1);
+  tree op21 = TREE_OPERAND (expr2, 1);
+  if (!operand_equal_p (op11, op21))
+   return false;
+  if (invert_tree_comparison (TREE_CODE (expr1),
+ HONOR_NANS (op10))
+ == TREE_CODE (expr2))
+   return true;
+}
+  return false;
+}
diff --git a/gcc/gimple-match-head.cc b/gcc/gimple-match-head.cc
index 5d6d26d009b..1e0dd48dd14 100644
--- a/gcc/gimple-match-head.cc
+++ b/gcc/gimple-match-head.cc
@@ -263,3 +263,74 @@ gimple_bitwise_equal_p (tree expr1, tree expr2, tree 
(*valueize) (tree))
 return true;
   return false;
 }
+
+/* Return true if EXPR1 and EXPR2 have the bitwise opposite value,
+   but not necessarily same type.
+   The types can differ through nop conversions.  */
+#define bitwise_not_equal_p(expr1, expr2) \
+  gimple_bitwise_not_equal_p (expr1, expr2, valueize)
+
+/* Helper function for bitwise_not_equal_p macro.  */
+
+static inline bool
+gimple_bitwise_not_equal_p (tree expr1, tree expr2, tree (*valueize) (tree))
+{
+  if (expr1 == expr2)
+return false;
+  if (!tree_nop_conversion_p (TREE_TYPE (expr1), TREE_TYPE (expr2)))
+return false;
+  if (TREE_CODE (expr1) == INTEGER_CST && TREE_CODE (expr2) == INTEGER_CST)
+return wi::to_wide (expr1) == ~wi::to_wide (expr2);
+  if (operand_equal_p (expr1, expr2, 0))
+return false;
+
+  tree other;
+  if (gimple_nop_convert (expr1, &other, valueize)
+  && gimple_bitwise_not_equal_p (other, expr2, valueize))
+return true;
+
+  if (gimple_nop_convert (expr2, &other, valueize)
+  && gimple_bitwise_not_equal_p (expr1, other, valueize))
+return true;
+
+  if (TREE_CODE (expr1) != SSA_NAME
+  || TREE_CODE (expr2) != SSA_NAME)
+return false;
+
+  gimple *d1 = get_def (valueize, expr1);
+  gassign *a1 = safe_dyn_cast <gassign *> (d1);
+  gimple *d2 = get_def (valueize, expr2);
+  gassign *a2 = safe_dyn_cast <gassign *> (d2);
+  if (a1
+  && gimple_assign_rhs_code (a1) == BIT_NOT_EXPR
+  && gimple_bitwise_equal_p (do_valueize (valueize,
+ gimple_assign_rhs1 (a1)),
+expr2, valueiz

[PATCH] RISC-V: Enable basic VLS auto-vectorization

2023-07-29 Thread Juzhe-Zhong
Consider the following case:
void
foo (int8_t *in, int8_t *out, int8_t x)
{
  for (int i = 0; i < 16; i++)
in[i] = x;
}

Compile option: --param=riscv-autovec-preference=scalable -fno-builtin

Before this patch:

foo:
li  a5,16
csrr    a4,vlenb
vsetvli a3,zero,e8,m1,ta,ma
vmv.v.x v1,a2
bleu    a5,a4,.L2
mv  a5,a4
.L2:
vsetvli zero,a5,e8,m1,ta,ma
vse8.v  v1,0(a0)
ret

After this patch:

foo:
vsetivli    zero,16,e8,mf8,ta,ma
vmv.v.x v1,a2
vse8.v  v1,0(a0)
ret

gcc/ChangeLog:

* config/riscv/autovec-vls.md (@vec_duplicate): New pattern.
* config/riscv/riscv-v.cc (autovectorize_vector_modes): Enable VLS 
auto-vectorization.
* config/riscv/riscv.cc (riscv_estimated_poly_value): Fix incorrect 
poly estimation.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/v-1.c: Adapt test.
* gcc.target/riscv/rvv/autovec/zve32f_zvl128b-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/zve64d_zvl128b-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/zve64f_zvl128b-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/dup-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-3.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-4.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-5.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-6.c: New test.
* gcc.target/riscv/rvv/autovec/vls/dup-7.c: New test.

---
 gcc/config/riscv/autovec-vls.md   |  19 ++
 gcc/config/riscv/riscv-v.cc   |  21 ++-
 gcc/config/riscv/riscv.cc |  16 +-
 .../gcc.target/riscv/rvv/autovec/v-1.c|   2 +-
 .../gcc.target/riscv/rvv/autovec/vls/dup-1.c  | 168 ++
 .../gcc.target/riscv/rvv/autovec/vls/dup-2.c  | 153 
 .../gcc.target/riscv/rvv/autovec/vls/dup-3.c  | 153 
 .../gcc.target/riscv/rvv/autovec/vls/dup-4.c  | 137 ++
 .../gcc.target/riscv/rvv/autovec/vls/dup-5.c  | 137 ++
 .../gcc.target/riscv/rvv/autovec/vls/dup-6.c  | 122 +
 .../gcc.target/riscv/rvv/autovec/vls/dup-7.c  | 122 +
 .../riscv/rvv/autovec/zve32f_zvl128b-1.c  |   2 +-
 .../riscv/rvv/autovec/zve64d_zvl128b-1.c  |   2 +-
 .../riscv/rvv/autovec/zve64f_zvl128b-1.c  |   2 +-
 14 files changed, 1043 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-6.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/dup-7.c

diff --git a/gcc/config/riscv/autovec-vls.md b/gcc/config/riscv/autovec-vls.md
index 9ece317ca4e..1a64dfdd91e 100644
--- a/gcc/config/riscv/autovec-vls.md
+++ b/gcc/config/riscv/autovec-vls.md
@@ -139,3 +139,22 @@
   "vmv%m1r.v\t%0,%1"
   [(set_attr "type" "vmov")
(set_attr "mode" "")])
+
+;; -
+;;  Duplicate Operations
+;; -
+
+(define_insn_and_split "@vec_duplicate<mode>"
+  [(set (match_operand:VLS 0 "register_operand")
+(vec_duplicate:VLS
+  (match_operand:<VEL> 1 "reg_or_int_operand")))]
+  "TARGET_VECTOR && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+riscv_vector::emit_vlmax_insn (code_for_pred_broadcast (<MODE>mode),
+   riscv_vector::RVV_UNOP, operands);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 9e89f970a4c..c10e51b362e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -2533,7 +2533,6 @@ autovectorize_vector_modes (vector_modes *modes, bool)
 {
   if (autovec_use_vlmax_p ())
 {
-  /* TODO: We will support RVV VLS auto-vectorization mode in the future. 
*/
   poly_uint64 full_size
= BYTES_PER_RISCV_VECTOR * ((int) riscv_autovec_lmul);
 
@@ -2561,7 +2560,25 @@ autovectorize_vector_modes (vector_modes *modes, bool)
modes->safe_push (mode);
}
 }
-  return 0;
+  unsigned int flag = 0;
+  if (TARGET_VECTOR_VLS)
+{
+  /* Enable VECT_COMPARE_COSTS between VLA modes and VLS modes for scalable
+auto-vectorization.  */
+  flag |= VECT_COMPARE_COSTS;
+  /* Push all VLS modes according to TARGET_MIN_VLEN.  */
+  unsigned int i = 0;
+  unsigned int base_size = TARGET_MIN_VLEN * riscv_autovec_lmul / 8;
+  unsigned int size = base_size;
+  machine_mode mode;
+  while (size > 0 && get_v