[PATCH] Add arm_cortex_m7_tune.

2014-10-21 Thread Hale Wang
Hi,

This patch is used to tune the gcc for Cortex-M7.

The performance of Dhrystone can be improved by 1%.
The performance of Coremark can be improved by 2.3%.

Patch also attached for convenience.  

Is it ok for trunk?

Thanks and Best Regards,
Hale Wang

gcc/ChangeLog
2014-10-11  Hale Wang  

* config/arm/arm.c: Add cortex-m7 tune.
* config/arm/arm-cores.def: Use cortex-m7 tune.

diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 56ec7fd..3b34173 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -149,7 +149,7 @@ ARM_CORE("cortex-r4",
cortexr4, cortexr4,  7R,  FL_LDSCHED, cortex)
ARM_CORE("cortex-r4f",cortexr4f, cortexr4f,
7R,  FL_LDSCHED, cortex)
ARM_CORE("cortex-r5",  cortexr5, cortexr5,
7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
ARM_CORE("cortex-r7",  cortexr7, cortexr7,
7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
-ARM_CORE("cortex-m7",   cortexm7, cortexm7,
7EM, FL_LDSCHED, v7m)
+ARM_CORE("cortex-m7",  cortexm7, cortexm7,
7EM, FL_LDSCHED, cortex_m7)
ARM_CORE("cortex-m4",   cortexm4, cortexm4,
7EM, FL_LDSCHED, v7m)
ARM_CORE("cortex-m3",   cortexm3, cortexm3,
7M,  FL_LDSCHED, v7m)
ARM_CORE("marvell-pj4", marvell_pj4, marvell_pj4,
7A,  FL_LDSCHED, 9e)
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 93b989d..834b13a 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -2003,6 +2003,27 @@ const struct tune_params arm_v7m_tune =
   8
/* Maximum insns to inline memset.  */
};

+/* Cortex-M7 tuning.  */
+
+const struct tune_params arm_cortex_m7_tune =
+{
+  arm_9e_rtx_costs,
+  &v7m_extra_costs,
+  NULL,
/* Sched adj cost.  */
+  0,
/* Constant limit.  */
+  0,
/* Max cond insns.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  true,
/* Prefer constant pool.  */
+  arm_cortex_m_branch_cost,
+  false,
/* Prefer LDRD/STRD.  */
+  {true, true},
/* Prefer non short circuit.  */
+  &arm_default_vec_cost,/* Vectorizer costs.  */
+  false,/* Prefer Neon for 64-bits
bitops.  */
+  false, false, /* Prefer 32-bit encodings.
*/
+  false,
/* Prefer Neon for stringops.  */
+  8
/* Maximum insns to inline memset.  */
+};
+
/* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus.
*/
const struct tune_params arm_v6m_tune =





RE: [PATCH] Add arm_cortex_m7_tune.

2014-10-21 Thread Hale Wang
Attach the patch.

> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Hale Wang
> Sent: Tuesday, October 21, 2014 5:49 PM
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] Add arm_cortex_m7_tune.
> 
> Hi,
> 
> This patch is used to tune the gcc for Cortex-M7.
> 
> The performance of Dhrystone can be improved by 1%.
> The performance of Coremark can be improved by 2.3%.
> 
> Patch also attached for convenience.
> 
> Is it ok for trunk?
> 
> Thanks and Best Regards,
> Hale Wang
> 
> gcc/ChangeLog
> 2014-10-11  Hale Wang  
> 
> * config/arm/arm.c: Add cortex-m7 tune.
> * config/arm/arm-cores.def: Use cortex-m7 tune.
> 
> diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
> index 56ec7fd..3b34173 100644
> --- a/gcc/config/arm/arm-cores.def
> +++ b/gcc/config/arm/arm-cores.def
> @@ -149,7 +149,7 @@ ARM_CORE("cortex-r4",
> cortexr4, cortexr4,  7R,  FL_LDSCHED, cortex)
> ARM_CORE("cortex-r4f",cortexr4f, cortexr4f,
> 7R,  FL_LDSCHED, cortex)
> ARM_CORE("cortex-r5",  cortexr5, cortexr5,
> 7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
> ARM_CORE("cortex-r7",  cortexr7, cortexr7,
> 7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
> -ARM_CORE("cortex-m7",   cortexm7, cortexm7,
> 7EM, FL_LDSCHED, v7m)
> +ARM_CORE("cortex-m7",  cortexm7, cortexm7,
> 7EM, FL_LDSCHED, cortex_m7)
> ARM_CORE("cortex-m4",   cortexm4, cortexm4,
> 7EM, FL_LDSCHED, v7m)
> ARM_CORE("cortex-m3",   cortexm3, cortexm3,
> 7M,  FL_LDSCHED, v7m)
> ARM_CORE("marvell-pj4", marvell_pj4, marvell_pj4,
> 7A,  FL_LDSCHED, 9e)
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index
> 93b989d..834b13a 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -2003,6 +2003,27 @@ const struct tune_params arm_v7m_tune =
>8
> /* Maximum insns to inline memset.  */
> };
> 
> +/* Cortex-M7 tuning.  */
> +
> +const struct tune_params arm_cortex_m7_tune = {
> +  arm_9e_rtx_costs,
> +  &v7m_extra_costs,
> +  NULL,
> /* Sched adj cost.  */
> +  0,
> /* Constant limit.  */
> +  0,
> /* Max cond insns.  */
> +  ARM_PREFETCH_NOT_BENEFICIAL,
> +  true,
> /* Prefer constant pool.  */
> +  arm_cortex_m_branch_cost,
> +  false,
> /* Prefer LDRD/STRD.  */
> +  {true, true},
> /* Prefer non short circuit.  */
> +  &arm_default_vec_cost,/* Vectorizer costs.  */
> +  false,/* Prefer Neon for
64-bits
> bitops.  */
> +  false, false, /* Prefer 32-bit
encodings.
> */
> +  false,
> /* Prefer Neon for stringops.  */
> +  8
> /* Maximum insns to inline memset.  */
> +};
> +
> /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
> arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and cortex-m0plus.
> */
> const struct tune_params arm_v6m_tune =
> 
> 
> 


cortex-m7-tune-2.patch
Description: Binary data


Small multiplier support in Cortex-M0/1/+

2014-10-21 Thread Hale Wang
Hi,

Some configurations of the Cortex-M0 and Cortex-M1 come with a high latency
multiplier. This patch adds support for such configurations.

Small multiplier means using add/sub/shift instructions to replace the mul
instruction on MCUs that have no fast multiplier.

The following strategies are adopted in this patch:
1. Define new CPUs as
-mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,cortex-m1.small-
multiply to support small multiplier.
2. -Os means size is preferred. A threshold of 5 is set, which means that
splitting is prevented if it would end up with more than 5 instructions. When
not optimizing with -Os, there is no such limit.

Some test cases are also added in the testsuite to verify this function.

Is it ok for trunk?

Thanks and Best Regards,
Hale Wang

gcc/ChangeLog:

2014-08-29  Hale Wang  

* config/arm/arm-cores.def: Add support for
-mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
cortex-m1.small-multiply.
* config/arm/arm-tables.opt: Regenerate.
* config/arm/arm-tune.md: Regenerate.
* config/arm/arm.c: Update the rtx-costs for MUL.
* config/arm/bpabi.h: Handle
-mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
cortex-m1.small-multiply.
* doc/invoke.texi: Document
-mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
cortex-m1.small-multiply.
* testsuite/gcc.target/arm/small-multiply-m0-1.c: New test case.
* testsuite/gcc.target/arm/small-multiply-m0-2.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m0-3.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m0plus-1.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m0plus-2.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m0plus-3.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m1-1.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m1-2.c: Likewise.
* testsuite/gcc.target/arm/small-multiply-m1-3.c: Likewise.

===
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index a830a83..af4b373 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -137,6 +137,11 @@ ARM_CORE("cortex-m1",  cortexm1, cortexm1,
6M, FL_LDSCHED, v6m)
 ARM_CORE("cortex-m0",  cortexm0, cortexm0, 6M,
FL_LDSCHED, v6m)
 ARM_CORE("cortex-m0plus",  cortexm0plus, cortexm0plus, 6M,
FL_LDSCHED, v6m)
 
+/* V6M Architecture Processors for small-multiply implementations.  */
+ARM_CORE("cortex-m1.small-multiply",   cortexm1smallmultiply, cortexm1,
6M, FL_LDSCHED | FL_SMALLMUL, v6m)
+ARM_CORE("cortex-m0.small-multiply",   cortexm0smallmultiply, cortexm0,
6M, FL_LDSCHED | FL_SMALLMUL, v6m)
+ARM_CORE("cortex-m0plus.small-multiply",cortexm0plussmallmultiply,
cortexm0plus,6M, FL_LDSCHED | FL_SMALLMUL, v6m)
+
 /* V7 Architecture Processors */
 ARM_CORE("generic-armv7-a",genericv7a, genericv7a, 7A,
FL_LDSCHED, cortex)
 ARM_CORE("cortex-a5",  cortexa5, cortexa5, 7A,
FL_LDSCHED, cortex_a5)
diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
index bc046a0..bd65bd2 100644
--- a/gcc/config/arm/arm-tables.opt
+++ b/gcc/config/arm/arm-tables.opt
@@ -241,6 +241,15 @@ EnumValue
 Enum(processor_type) String(cortex-m0plus) Value(cortexm0plus)
 
 EnumValue
+Enum(processor_type) String(cortex-m1.small-multiply)
Value(cortexm1smallmultiply)
+
+EnumValue
+Enum(processor_type) String(cortex-m0.small-multiply)
Value(cortexm0smallmultiply)
+
+EnumValue
+Enum(processor_type) String(cortex-m0plus.small-multiply)
Value(cortexm0plussmallmultiply)
+
+EnumValue
 Enum(processor_type) String(generic-armv7-a) Value(genericv7a)
 
 EnumValue
diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
index 954cab8..8b5c778 100644
--- a/gcc/config/arm/arm-tune.md
+++ b/gcc/config/arm/arm-tune.md
@@ -25,6 +25,7 @@
arm1176jzs,arm1176jzfs,mpcorenovfp,
mpcore,arm1156t2s,arm1156t2fs,
cortexm1,cortexm0,cortexm0plus,
+
cortexm1smallmultiply,cortexm0smallmultiply,cortexm0plussmallmultiply,
genericv7a,cortexa5,cortexa7,
cortexa8,cortexa9,cortexa12,
cortexa15,cortexr4,cortexr4f,
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 93b989d..5062c85 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -751,6 +751,8 @@ static int thumb_call_reg_needed;
 #define FL_ARCH8  (1 << 24)   /* Architecture 8.  */
 #define FL_CRC32  (1 << 25)  /* ARMv8 CRC32 instructions.
*/
 
+#define FL_SMALLMUL   (1 << 26)   /* Small multiply supported.  */
+
 #define FL_IWMMXT (1 << 29)  /* XScale v2 or "Intel
Wireless MMX technology".  */
 #define FL_IWMMXT2(1 << 30)   /* "I

[PATCH] PR61123 : Fix the ABI mis-matching error caused by LTO

2014-06-18 Thread Hale Wang
Hi,

With LTO, -fno-short-enums is ignored, resulting in ABI mis-matching in
linking.

Refer to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61123 for details.

This patch adds fshort-enums and fshort-wchar to the LTO group.

To check it, a new procedure object-readelf is added in
testsuite/lib/lto.exp and new lto tests are added in gcc.target/arm/lto.

Bootstrapped on x86-64 with no make check regressions.

Patch also attached for convenience.  Is it OK for trunk?

Thanks and Best Regards,
Hale Wang

c-family/ChangeLog
2014-06-18 Hale Wang 

    PR lto/61123
    *c.opt (fshort-enums): Add to LTO.
    *c.opt (fshort-wchar): Likewise.

testsuite/ChangeLog
2014-06-18 Hale Wang 

    * gcc.target/arm/lto/: New folder to verify the LTO
option for ARM specific.
    * gcc.target/arm/lto/pr61123-enum-size_0.c: New test
case.
    * gcc.target/arm/lto/pr61123-enum-size_1.c: Likewise.
    * gcc.target/arm/lto/lto.exp: New exp file used to test
LTO option for ARM specific.
    * lib/lto.exp (object-readelf): New procedure used to
catch the enum size in the final executable.

Index: gcc/c-family/c.opt
===
--- gcc/c-family/c.opt (revision 211394)
+++ gcc/c-family/c.opt  (working copy)
@@ -1189,11 +1189,11 @@
Use the same size for double as for float

 fshort-enums
-C ObjC C++ ObjC++ Optimization Var(flag_short_enums)
+C ObjC C++ ObjC++ LTO Optimization Var(flag_short_enums)
Use the narrowest integer type possible for enumeration types

 fshort-wchar
-C ObjC C++ ObjC++ Optimization Var(flag_short_wchar)
+C ObjC C++ ObjC++ LTO Optimization Var(flag_short_wchar)
Force the underlying type for \"wchar_t\" to be \"unsigned short\"

 fsigned-bitfields
Index: gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_0.c
===
--- gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_0.c   (revision
0)
+++ gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_0.c    (revision 0)
@@ -0,0 +1,22 @@
+/* { dg-lto-do link } */
+/* { dg-lto-options { { -fno-short-enums -Wl,-Ur,--no-enum-size-warning -Os
-nostdlib -flto } } } */
+
+#include 
+
+enum enum_size_attribute
+{
+  small_size, int_size
+};
+
+struct debug_ABI_enum_size
+{
+  enum enum_size_attribute es;
+};
+
+int
+foo1 (struct debug_ABI_enum_size *x)
+{
+  return sizeof (x->es);
+}
+
+/* { dg-final { object-readelf Tag_ABI_enum_size int { target arm_eabi } }
} */
Index: gcc/testsuite/gcc.target/arm/lto/lto.exp
===
--- gcc/testsuite/gcc.target/arm/lto/lto.exp    (revision 0)
+++ gcc/testsuite/gcc.target/arm/lto/lto.exp (revision 0)
@@ -0,0 +1,59 @@
+# Copyright (C) 2009-2014 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+#
+# Contributed by Diego Novillo 
+
+
+# Test link-time optimization across multiple files.
+#
+# Programs are broken into multiple files.  Each one is compiled
+# separately with LTO information.  The final executable is generated
+# by collecting all the generated object files using regular LTO or WHOPR.
+
+if $tracelevel then {
+    strace $tracelevel
+}
+
+# Load procedures from common libraries.
+load_lib standard.exp
+load_lib gcc.exp
+
+# Load the language-independent compabibility support procedures.
+load_lib lto.exp
+
+# If LTO has not been enabled, bail.
+if { ![check_effective_target_lto] } {
+    return
+}
+
+gcc_init
+lto_init no-mathlib
+
+# Define an identifier for use with this suite to avoid name conflicts
+# with other lto tests running at the same time.
+set sid "c_lto"
+
+# Main loop.
+foreach src [lsort [find $srcdir/$subdir *_0.c]] {
+    # If we're only testing specific files and this isn't one of them, skip
it.
+    if ![runtest_file_p $runtests $src] then {
+ continue
+    }
+
+    lto-execute $src $sid
+}
+
+lto_finish
Index: gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_1.c
===
--- gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_1.c   (revision
0)
+++ gcc/testsuite/gcc.target/arm/lto/pr61123-enum-size_1.c    (revision 0)
@@ -0,0 +1,5 @@
+int
+foo2 (int 

[Committed] [PATCH] PR61123 : Fix the ABI mis-matching error caused by LTO

2014-06-19 Thread Hale Wang


> -Original Message-
> From: Mike Stump [mailto:mikest...@comcast.net]
> Sent: 2014年6月19日 1:42
> To: Richard Biener
> Cc: Hale Wang; Mike Stump; GCC Patches
> Subject: Re: [PATCH] PR61123 : Fix the ABI mis-matching error caused by
LTO
> 
> On Jun 18, 2014, at 3:22 AM, Richard Biener 
> wrote:
> >> Space after the *.
> >>
> >> I think you don't need to copy the LTO harness but you can simply use
> >> dg.exp and sth similar to gcc.dg/20081223-1.c (there is an effective
> >> target 'lto' to guard for lto support).
> >>
> >> So simply place the testcase in gcc.target/arm/ (make sure to put a
> >> dg-do compile on the 2nd file and use dg-additional-sources).
> >>
> >> If that doesn't work I'd say put the testcase in gcc.dg/lto/ instead
> >> and do a dg-skip-if for non-arm targets.
> >>
> >> Ok with one of those changes.
> >
> > Oh, I see you need a new object-readelf ... I defer to a testsuite
> > maintainer for this part.
> 
> The testsuite bits are Ok.  My guidance on the test suite would be this,
all lto
> test cases in .*lto directories.  20 or fewer test cases for a given
target, in the
> main lto directory, more than 50, in the arm/lto directory.  When one is
> tracking down bugs and trying to clean test suite results if they break,
it is
> nice to be able to skip in mass all lto bugs first, and resolve all
non-lto issues
> and then come back to the lto issues last, in hopes that they are all then
> resolved.  Also, if one is redoing lto bits, and a test case with lto in
the name
> pops up as a regression, and you’re not an lto person, you can stop
thinking
> about it and just pass to the lto person, it is a slightly different
mindset.  :-)

Thanks! Patch was committed @r211832 with minimal format changes due to
"TAB". The final change log and  patch is:

2014-06-20 Hale Wang 

PR lto/61123
* c.opt (fshort-enums): Add to LTO.
* c.opt (fshort-wchar): Likewise.

testsuite/ChangeLog
2014-06-20 Hale Wang 

* gcc.target/arm/lto/: New folder to verify the LTO option.
* gcc.target/arm/lto/pr61123-enum-size_0.c: New test case.
* gcc.target/arm/lto/pr61123-enum-size_1.c: Likewise.
* gcc.target/arm/lto/lto.exp: New exp file used to test LTO option.
* lib/lto.exp (object-readelf): New procedure.


Index: gcc/c-family/c.opt
===
--- gcc/c-family/c.opt  (revision 211394)
+++ gcc/c-family/c.opt  (working copy)
@@ -1189,11 +1189,11 @@
 Use the same size for double as for float
 
 fshort-enums
-C ObjC C++ ObjC++ Optimization Var(flag_short_enums)
+C ObjC C++ ObjC++ LTO Optimization Var(flag_short_enums)
 Use the narrowest integer type possible for enumeration types
 
 fshort-wchar
-C ObjC C++ ObjC++ Optimization Var(flag_short_wchar)
+C ObjC C++ ObjC++ LTO Optimization Var(flag_short_wchar)
 Force the underlying type for \"wchar_t\" to be \"unsigned short\"
 
 fsigned-bitfields
Index: gcc/testsuite/lib/lto.exp
===
--- gcc/testsuite/lib/lto.exp   (revision 211394)
+++ gcc/testsuite/lib/lto.exp   (working copy)
@@ -650,3 +650,82 @@
fail "scan-symbol $args"
 }
 }
+
+# Call pass if object readelf is ok, otherwise fail.
+# example: /* { dg-final { object-readelf Tag_ABI_enum_size int} } */
+proc object-readelf { args } {
+global readelf
+global base_dir
+upvar 2 execname execname
+
+if { [llength $args] < 2 } {
+   error "object-readelf: too few arguments"
+   return
+}
+if { [llength $args] > 3 } {
+   error "object-readelf: too many arguments"
+   return
+}
+if { [llength $args] >= 3 } {
+   switch [dg-process-target [lindex $args 2]] {
+   "S" { }
+   "N" { return }
+   "F" { setup_xfail "*-*-*" }
+   "P" { }
+   }
+}
+
+# Find size like we find g++ in g++.exp.
+if ![info exists readelf]  {
+   set readelf [findfile $base_dir/../../../binutils/readelf \
+   $base_dir/../../../binutils/readelf \
+   [findfile $base_dir/../../readelf $base_dir/../../readelf \
+   [findfile $base_dir/readelf $base_dir/readelf \
+   [transform readelf
+   verbose -log "readelf is $readelf"
+}
+
+set what [lindex $args 0]
+set with [lindex $args 1]
+
+if ![file_on_host exists $execname] {
+   verbose -log "$execname does not exist"
+   unresolved "object-readelf $what "
+   return
+}
+
+set output [remote_exec host "$readelf -A" "$execname"]
+se

RE: [Committed] [PATCH] PR61123 : Fix the ABI mis-matching error caused by LTO

2014-06-22 Thread Hale Wang


> -Original Message-
> From: Jakub Jelinek [mailto:ja...@redhat.com]
> Sent: 2014年6月21日 5:24
> To: Hale Wang
> Cc: 'Mike Stump'; Richard Biener; Mike Stump; GCC Patches
> Subject: Re: [Committed] [PATCH] PR61123 : Fix the ABI mis-matching error
> caused by LTO
> 
> On Fri, Jun 20, 2014 at 01:14:52PM +0800, Hale Wang wrote:
> > 2014-06-20 Hale Wang 
> >
> > * gcc.target/arm/lto/: New folder to verify the LTO option.
> > * gcc.target/arm/lto/pr61123-enum-size_0.c: New test case.
> > * gcc.target/arm/lto/pr61123-enum-size_1.c: Likewise.
> > * gcc.target/arm/lto/lto.exp: New exp file used to test LTO
option.
> > * lib/lto.exp (object-readelf): New procedure.
> 
> This FAILs on non-arm targets.
> 
> I've committed following fix as obvious.

Thank you very much.

> On the other side, if you don't plan to add too many arm LTO tests,
> supposedly putting it into gcc.dg/lto and just using arm*-*-* target
selector
> might be better.

We plan to add more cases to check LTO ABI issues, like short-wchar. All the
ABI would be ARM special. So all the test cases would be ARM special.

Thanks and Best Regards,
Hale Wang 
 
> 2014-06-20  Jakub Jelinek  
> 
>   * gcc.target/arm/lto/lto.exp: Exit immediately if not arm*-*-*
target.
> 
> --- gcc/testsuite/gcc.target/arm/lto/lto.exp.jj   2014-06-20
> 08:02:50.0 +0200
> +++ gcc/testsuite/gcc.target/arm/lto/lto.exp  2014-06-20
> 23:19:33.850043692 +0200
> @@ -16,6 +16,10 @@
>  #
>  # Contributed by Diego Novillo 
> 
> +# Exit immediately if this isn't an ARM target.
> +if ![istarget arm*-*-*] then {
> +  return
> +}
> 
>  # Test link-time optimization across multiple files.
>  #
> 
> 
>   Jakub





[PATCH] [gcc, combine] Backport to GCC 5.0 branch "PR46164: Don't combine the insns if a volatile register is contained".

2015-05-13 Thread Hale Wang
> -Original Message-
> From: Terry Guo [mailto:flame...@gmail.com]
> Sent: Wednesday, April 22, 2015 3:24 PM
> To: Segher Boessenkool
> Cc: l...@redhat.com; Hale Wang; GCC Patches; Richard Sandiford
> Subject: Re: Ping^3 : [PATCH] [gcc, combine] PR46164: Don't combine the
> insns if a volatile register is contained.
> 
> On Wed, Apr 22, 2015 at 10:30 AM, Segher Boessenkool
>  wrote:
> > On Wed, Apr 22, 2015 at 10:21:43AM +0800, Terry Guo wrote:
> >> gcc/ChangeLog:
> >> 2015-04-22 Hale Wang 
> >> Terry Guo  
> >>
> >>PR rtl-optimization/64818
> >>* combine.c (can_combine_p): Don't combine user-specified register
> if
> >>it is in an asm input.
> >>
> >> gcc/testsuite/ChangeLog:
> >> 2015-04-22 Hale Wang 
> >> Terry Guo  
> >>
> >>PR rtl-optimization/64818
> >>* gcc.target/arm/pr64818.c: New.
> >
> > This is okay for trunk, if it has been bootstrapped and regression tested.
> >
> > Thanks,
> >
> >
> > Segher
> 
> Thanks Segher. The patch is tested with bootstrap and regression test for
> x86_64. No problem found. Committed as revision 222306.
> 

This patch applies cleanly on GCC 5.0 branch. Bootstrap and regression test are 
OK for X86_64.

Can we backport this patch to GCC 5.0 branch?

Best Regards,
Hale

> BR,
> Terry





RE: [PATCH] [gcc, combine] Backport to GCC 5.0 branch "PR46164: Don't combine the insns if a volatile register is contained".

2015-05-18 Thread Hale Wang

> -Original Message-
> From: Segher Boessenkool [mailto:seg...@kernel.crashing.org]
> Sent: Thursday, May 14, 2015 9:29 PM
> To: Hale Wang
> Cc: l...@redhat.com; GCC Patches; Richard Sandiford; 'Terry Guo'
> Subject: Re: [PATCH] [gcc, combine] Backport to GCC 5.0 branch "PR46164:
> Don't combine the insns if a volatile register is contained".
> 
> On Thu, May 14, 2015 at 01:56:54PM +0800, Hale Wang wrote:
> > > >> gcc/ChangeLog:
> > > >> 2015-04-22 Hale Wang 
> > > >> Terry Guo  
> > > >>
> > > >>PR rtl-optimization/64818
> > > >>* combine.c (can_combine_p): Don't combine user-specified
> > > >> register
> > > if
> > > >>it is in an asm input.
> > > >>
> > > >> gcc/testsuite/ChangeLog:
> > > >> 2015-04-22 Hale Wang 
> > > >> Terry Guo  
> > > >>
> > > >>PR rtl-optimization/64818
> > > >>* gcc.target/arm/pr64818.c: New.
> 
> > This patch applies cleanly on GCC 5.0 branch. Bootstrap and regression
test
> are OK for X86_64.
> >
> > Can we backport this patch to GCC 5.0 branch?
> 
> It should be perfectly safe, and it's a pretty nasty bug.  But it is
technically not
> a regression (or is it?), so I'll defer to the release managers.
> 

Yes, I agree it is not a regression.
> 
> Segher





RE: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision conversion functions.

2015-04-22 Thread Hale Wang
> -Original Message-
> From: Ramana Radhakrishnan [mailto:ramana@googlemail.com]
> Sent: Wednesday, April 22, 2015 3:50 PM
> To: Joseph Myers
> Cc: Hale Wang; GCC Patches
> Subject: Re: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision
> conversion functions.
> 
> On Mon, Apr 13, 2015 at 12:25 PM, Joseph Myers
>  wrote:
> > On Mon, 13 Apr 2015, Hale Wang wrote:
> >
> >> Yes, you are right. It's my fault to add the "only" here. Thank you
> >> to point out this.
> >> Beside this, is this patch OK for you?
> >
> > I don't think it's a good idea for libgcc to include large pieces of
> > assembly code generated by a compiler.  Just compile the code with
> > whatever options are needed at the time libgcc is built - possibly
> > with #if conditionals to allow compiling different versions of the code.

Indeed, just compiling the code with the option '-mfloat-abi=soft' at the time
libgcc is built can solve this problem.

> > Indeed, are any special options needed at all?
>
 
The reason is that the current GNU versions of the fp16 conversions are more
efficient than the AEABI versions in this patch (and also more efficient than
the code compiled with option '-mfloat-abi=soft', because no fp registers will
be used to implement these functions, which is allowed in the GNU versions). We
provide an option so that users can choose the version they want (whether or
not they want to follow the AEABI constraint).

> 
> I agree and I don't think it's maintainable in the long run. From my reading 
> of
> this thread I can't see any special options being needed.
> Can we just massage it in C ?
> 

The reason is that the implementations of these helper functions are allowed to 
corrupt the integer core registers permitted to be corrupted by the [AAPCS] 
(r0-r3, ip, lr, and CPSR). To guarantee this if we just massage it in C, as 
Joseph suggested, we can compile the code with whatever options are needed at 
the time libgcc is built. Possibly the option '-mfloat-abi=soft ' can help us 
to guarantee this (seems more strict than the AEABI constraint).

The special option is provided so that users can choose the version they
want (whether or not they want to follow the AEABI constraint), because the
current GNU versions of the fp16 conversions are more efficient than the AEABI
versions in this patch.

Best Regards,
Hale
> 
> regards
> Ramana
> 
> >
> > --
> > Joseph S. Myers
> > jos...@codesourcery.com





RE: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision conversion functions.

2015-04-27 Thread Hale Wang
> -Original Message-
> From: Ramana Radhakrishnan [mailto:ramana@googlemail.com]
> Sent: Wednesday, April 22, 2015 5:00 PM
> To: Hale Wang
> Cc: Ramana Radhakrishnan; Joseph Myers; GCC Patches
> Subject: Re: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision
> conversion functions.
> 
> On Wed, Apr 22, 2015 at 9:32 AM, Hale Wang  wrote:
> >> -Original Message-
> >> From: Ramana Radhakrishnan [mailto:ramana@googlemail.com]
> >> Sent: Wednesday, April 22, 2015 3:50 PM
> >> To: Joseph Myers
> >> Cc: Hale Wang; GCC Patches
> >> Subject: Re: [PATCH] [1/2] [ARM] [libgcc] Support RTABI
> >> half-precision conversion functions.
> >>
> >> On Mon, Apr 13, 2015 at 12:25 PM, Joseph Myers
> >>  wrote:
> >> > On Mon, 13 Apr 2015, Hale Wang wrote:
> >> >
> >> >> Yes, you are right. It's my fault to add the "only" here. Thank
> >> >> you to point out this.
> >> >> Beside this, is this patch OK for you?
> >> >
> >> > I don't think it's a good idea for libgcc to include large pieces
> >> > of assembly code generated by a compiler.  Just compile the code
> >> > with whatever options are needed at the time libgcc is built -
> >> > possibly with #if conditionals to allow compiling different versions of 
> >> > the
> code.
> >
> > Indeed, just compile the code with option '-mfloat-abi=soft' at the time
> libgcc is build which can solve this problem.
> 
> Or why not "conditionally" use the ``pcs'' attribute on the ARM port ?
> That then means you don't need options magic on top ?

OK. I think your suggestion can solve this problem more clearly. I will 
resubmit a patch later. I think we can discard this patch this time.
Thanks a lot.

Hale
> 
> Ramana





RE: [Ping^3] [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m

2015-04-30 Thread Hale Wang
> -Original Message-
> From: Hale Wang [mailto:hale.w...@arm.com]
> Sent: Monday, February 09, 2015 9:54 AM
> To: Richard Earnshaw
> Cc: Hale Wang; gcc-patches; Matthew Gretton-Dann
> Subject: RE: [Ping^2] [PATCH, ARM, libgcc] New aeabi_idiv function for
> armv6-m
> 
> Ping https://gcc.gnu.org/ml/gcc-patches/2014-12/msg01059.html.
> 

Ping for trunk. Is it ok for trunk now?

Thanks,
Hale
> > -Original Message-
> > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> > ow...@gcc.gnu.org] On Behalf Of Hale Wang
> > Sent: Friday, December 12, 2014 9:36 AM
> > To: gcc-patches
> > Subject: RE: [Ping] [PATCH, ARM, libgcc] New aeabi_idiv function for
> > armv6- m
> >
> > Ping? Already applied to arm/embedded-4_9-branch, is it OK for trunk?
> >
> > -Hale
> >
> > > -Original Message-
> > > From: Joey Ye [mailto:joey.ye...@gmail.com]
> > > Sent: Thursday, November 27, 2014 10:01 AM
> > > To: Hale Wang
> > > Cc: gcc-patches
> > > Subject: Re: [PATCH, ARM, libgcc] New aeabi_idiv function for
> > > armv6-m
> > >
> > > OK applying to arm/embedded-4_9-branch, though you still need
> > > maintainer approval into trunk.
> > >
> > > - Joey
> > >
> > > On Wed, Nov 26, 2014 at 11:43 AM, Hale Wang 
> > wrote:
> > > > Hi,
> > > >
> > > > This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
> > > > (https://git.linaro.org/toolchain/cortex-strings.git), which was
> > > > contributed by ARM under Free BSD license.
> > > >
> > > > The new aeabi_idiv routine is used to replace the one in
> > > > libgcc/config/arm/lib1funcs.S. This replacement happens within the
> > > > Thumb1 wrapper. The new routine is under LGPLv3 license.
> > > >
> > > > The main advantage of this version is that it can improve the
> > > > performance of the aeabi_idiv function for Thumb1. This solution
> > > > will also increase the code size. So it will only be used if
> > > > __OPTIMIZE_SIZE__ is
> > > not defined.
> > > >
> > > > Make check passed for armv6-m.
> > > >
> > > > OK for trunk?
> > > >
> > > > Thanks,
> > > > Hale Wang
> > > >
> > > > libgcc/ChangeLog:
> > > >
> > > > 2014-11-26  Hale Wang  
> > > >
> > > > * config/arm/lib1funcs.S: Add new wrapper.
> > > >
> > > > ===
> > > > diff --git a/libgcc/config/arm/lib1funcs.S
> > > > b/libgcc/config/arm/lib1funcs.S index b617137..de66c81 100644
> > > > --- a/libgcc/config/arm/lib1funcs.S
> > > > +++ b/libgcc/config/arm/lib1funcs.S
> > > > @@ -306,34 +306,12 @@ LSYM(Lend_fde):
> > > >  #ifdef __ARM_EABI__
> > > >  .macro THUMB_LDIV0 name signed
> > > >  #if defined(__ARM_ARCH_6M__)
> > > > -   .ifc \signed, unsigned
> > > > -   cmp r0, #0
> > > > -   beq 1f
> > > > -   mov r0, #0
> > > > -   mvn r0, r0  @ 0x
> > > > -1:
> > > > -   .else
> > > > -   cmp r0, #0
> > > > -   beq 2f
> > > > -   blt 3f
> > > > +
> > > > +   push{r0, lr}
> > > > mov r0, #0
> > > > -   mvn r0, r0
> > > > -   lsr r0, r0, #1  @ 0x7fff
> > > > -   b   2f
> > > > -3: mov r0, #0x80
> > > > -   lsl r0, r0, #24 @ 0x8000
> > > > -2:
> > > > -   .endif
> > > > -   push{r0, r1, r2}
> > > > -   ldr r0, 4f
> > > > -   adr r1, 4f
> > > > -   add r0, r1
> > > > -   str r0, [sp, #8]
> > > > -   @ We know we are not on armv4t, so pop pc is safe.
> > > > -   pop {r0, r1, pc}
> > > > -   .align  2
> > > > -4:
> > > > -   .word   __aeabi_idiv0 - 4b
> > > > +   bl  SYM(__aeabi_idiv0)
> > > > +   pop {r1, pc}
> > > > +
> > > >  #elif defined(__thumb2__)
> > > > .syntax unified
> > > > .ifc \signed, unsigned
> > > > @@ -927,7 +905,158 @@ LSYM(Lover7):
> > > > add dividend, w

RE: Small multiplier support in Cortex-M0/1/+

2014-11-12 Thread Hale Wang

> -Original Message-
> From: Hale Wang [mailto:hale.w...@arm.com]
> Sent: Thursday, November 13, 2014 2:16 PM
> To: 'Christophe Lyon'
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: Small multiplier support in Cortex-M0/1/+
> 
> > -Original Message-
> > From: Christophe Lyon [mailto:christophe.l...@linaro.org]
> > Sent: Wednesday, November 12, 2014 9:49 PM
> > To: Hale Wang
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: Small multiplier support in Cortex-M0/1/+
> >
> > On 21 October 2014 12:01, Hale Wang  wrote:
> > > Hi,
> > >
> > > Some configurations of the Cortex-M0 and Cortex-M1 come with a high
> > > latency multiplier. This patch adds support for such configurations.
> > >
> > > Small multiplier means using add/sub/shift instructions to replace
> > > the mul instruction for the MCU that has no fast multiplier.
> > >
> > > The following strategies are adopted in this patch:
> > > 1. Define new CPUs as
> > > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,cortex-m1.
> > > small-
> > > multiply to support small multiplier.
> > > 2. -Os means size is preferred. A threshold of 5 is set which means
> > > > it will prevent splitting if it would end up with more than 5 instructions.
> > > > For optimization levels other than -Os, there is no such limit.
> > >
> > > Some test cases are also added in the testsuite to verify this function.
> > >
> > > Is it ok for trunk?
> > >
> > > Thanks and Best Regards,
> > > Hale Wang
> > >
> > > gcc/ChangeLog:
> > >
> > > 2014-08-29  Hale Wang  
> > >
> > > * config/arm/arm-cores.def: Add support for
> > > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > > cortex-m1.small-multiply.
> > > * config/arm/arm-tables.opt: Regenerate.
> > > * config/arm/arm-tune.md: Regenerate.
> > > * config/arm/arm.c: Update the rtx-costs for MUL.
> > > * config/arm/bpabi.h: Handle
> > > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > > cortex-m1.small-multiply.
> > > * doc/invoke.texi: Document
> > > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > > cortex-m1.small-multiply.
> > > * testsuite/gcc.target/arm/small-multiply-m0-1.c: New test case.
> > > * testsuite/gcc.target/arm/small-multiply-m0-2.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m0-3.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m0plus-1.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m0plus-2.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m0plus-3.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m1-1.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m1-2.c: Likewise.
> > > * testsuite/gcc.target/arm/small-multiply-m1-3.c: Likewise.
> > >
> > >
> >
> ==
> > =
> > > diff --git a/gcc/config/arm/arm-cores.def
> > > b/gcc/config/arm/arm-cores.def index a830a83..af4b373 100644
> > > --- a/gcc/config/arm/arm-cores.def
> > > +++ b/gcc/config/arm/arm-cores.def
> > > @@ -137,6 +137,11 @@ ARM_CORE("cortex-m1",  cortexm1,
> > cortexm1,
> > > 6M, FL_LDSCHED, v6m)
> > >  ARM_CORE("cortex-m0",  cortexm0, cortexm0, 6M,
> > > FL_LDSCHED, v6m)
> > >  ARM_CORE("cortex-m0plus",  cortexm0plus, cortexm0plus, 6M,
> > > FL_LDSCHED, v6m)
> > >
> > > +/* V6M Architecture Processors for small-multiply implementations.  */
> > > +ARM_CORE("cortex-m1.small-multiply",   cortexm1smallmultiply,
> > cortexm1,
> > > 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > > +ARM_CORE("cortex-m0.small-multiply",   cortexm0smallmultiply,
> > cortexm0,
> > > 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > > +ARM_CORE("cortex-m0plus.small-multiply",cortexm0plussmallmultiply,
> > > cortexm0plus,6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > > +
> > >  /* V7 Architecture Processors */
> > >  ARM_CORE("generic-armv7-a",genericv7a, genericv7a, 7A,
> > > FL_LDSCHED, cortex)
> > >  ARM_CORE("cortex-a5",  cortexa5, cortexa5, 7A,
>

[PATCH, ARM] Constrain the small multiply test cases to be more restrictive.

2014-11-16 Thread Hale Wang
Hi,

Refer to the previous small multiply patch (r217175).

The conditions in the small multiply test cases are not restrictive enough.
If -march=armv4t/armv5t is forced, these cases will fail.
These cases can be used only if
"-mcpu=cortex-m0/m1/m0plus.small-multiply" is specified.

This patch is used to fix this issue.

These cases will be skipped unless
"-mcpu=cortex-m0/m1/m0plus.small-multiply" is specified, so other targets
are not affected.

Build gcc passed. Is it OK for trunk?

Thanks and Best Regards,
Hale Wang

gcc/testsuite/ChangeLog:

2014-11-13  Hale Wang  

* gcc.target/arm/small-multiply-m0-1.c: Only apply when
" -mcpu=cortex-m0/m1/m0plus.small-multiply ".
* gcc.target/arm/small-multiply-m0-2.c: Likewise.
* gcc.target/arm/small-multiply-m0-3.c: Likewise.
* gcc.target/arm/small-multiply-m0plus-1.c: Likewise.
* gcc.target/arm/small-multiply-m0plus-2.c: Likewise.
* gcc.target/arm/small-multiply-m0plus-3.c: Likewise.
* gcc.target/arm/small-multiply-m1-1.c: Likewise.
* gcc.target/arm/small-multiply-m1-2.c: Likewise.
* gcc.target/arm/small-multiply-m1-3.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/arm/small-multiply-m0-1.c
b/gcc/testsuite/gcc.target/arm/small-multiply-m0-1.c
index 77ec603..49132e3 100644
--- a/gcc/testsuite/gcc.target/arm/small-multiply-m0-1.c
+++ b/gcc/testsuite/gcc.target/arm/small-multiply-m0-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
-/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "-mcpu=*" } { "-mcpu=cortex-m0.small-multiply" } } */
+/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "*" } { "-mcpu=cortex-m0.small-multiply" } } */
 /* { dg-options "-mcpu=cortex-m0.small-multiply -mthumb -O2" } */
 
 int
diff --git a/gcc/testsuite/gcc.target/arm/small-multiply-m0-2.c
b/gcc/testsuite/gcc.target/arm/small-multiply-m0-2.c
index c89b3ba..7f1bf7b 100644
--- a/gcc/testsuite/gcc.target/arm/small-multiply-m0-2.c
+++ b/gcc/testsuite/gcc.target/arm/small-multiply-m0-2.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
-/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "-mcpu=*" } { "-mcpu=cortex-m0.small-multiply" } } */
+/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "*" } { "-mcpu=cortex-m0.small-multiply" } } */
 /* { dg-options "-mcpu=cortex-m0.small-multiply -mthumb -Os" } */
 
 int
diff --git a/gcc/testsuite/gcc.target/arm/small-multiply-m0-3.c
b/gcc/testsuite/gcc.target/arm/small-multiply-m0-3.c
index b2df109..aca39d7 100644
--- a/gcc/testsuite/gcc.target/arm/small-multiply-m0-3.c
+++ b/gcc/testsuite/gcc.target/arm/small-multiply-m0-3.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
-/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "-mcpu=*" } { "-mcpu=cortex-m0.small-multiply" } } */
+/* { dg-skip-if "Test is specific to cortex-m0.small-multiply" { arm*-*-* }
{ "*" } { "-mcpu=cortex-m0.small-multiply" } } */
 /* { dg-options "-mcpu=cortex-m0.small-multiply -mthumb -Os" } */
 
 int
diff --git a/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-1.c
b/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-1.c
index 08a450b..12e8839 100644
--- a/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-1.c
+++ b/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
-/* { dg-skip-if "Test is specific to cortex-m0plus.small-multiply" {
arm*-*-* } { "-mcpu=*" } { "-mcpu=cortex-m0plus.small-multiply" } } */
+/* { dg-skip-if "Test is specific to cortex-m0plus.small-multiply" {
arm*-*-* } { "*" } { "-mcpu=cortex-m0plus.small-multiply" } } */
 /* { dg-options "-mcpu=cortex-m0plus.small-multiply -mthumb -O2" } */
 
 int
diff --git a/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-2.c
b/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-2.c
index 17b52d3..3e3c9b2 100644
--- a/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-2.c
+++ b/gcc/testsuite/gcc.target/arm/small-multiply-m0plus-2.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_thumb1_ok } */
-/* { dg-skip-if "Test is specific to cortex-m0plus.small-multiply" {
arm*-*-* } { "-mcpu=*" } { "-mcpu=cortex-m0plus.small-multiply" } } */
+/* { dg-skip-if "Test is specific to cortex-m0plus.small-multiply" {
arm*-*-* } { "*" } { "-

[PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m

2014-11-25 Thread Hale Wang
Hi,

This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
(https://git.linaro.org/toolchain/cortex-strings.git), which was contributed
by ARM under Free BSD license. 

The new aeabi_idiv routine is used to replace the one in
libgcc/config/arm/lib1funcs.S. This replacement happens within the Thumb1
wrapper. The new routine is under LGPLv3 license.

The main advantage of this version is that it can improve the performance of
the aeabi_idiv function for Thumb1. This solution will also increase the
code size. So it will only be used if __OPTIMIZE_SIZE__ is not defined.

Make check passed for armv6-m.

OK for trunk?

Thanks,
Hale Wang

libgcc/ChangeLog:

2014-11-26  Hale Wang  

* config/arm/lib1funcs.S: Add new wrapper.

===
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
index b617137..de66c81 100644
--- a/libgcc/config/arm/lib1funcs.S
+++ b/libgcc/config/arm/lib1funcs.S
@@ -306,34 +306,12 @@ LSYM(Lend_fde):
 #ifdef __ARM_EABI__
 .macro THUMB_LDIV0 name signed
 #if defined(__ARM_ARCH_6M__)
-   .ifc \signed, unsigned
-   cmp r0, #0
-   beq 1f
-   mov r0, #0
-   mvn r0, r0  @ 0xffffffff
-1:
-   .else
-   cmp r0, #0
-   beq 2f
-   blt 3f
+
+   push{r0, lr}
mov r0, #0
-   mvn r0, r0
-   lsr r0, r0, #1  @ 0x7fffffff
-   b   2f
-3: mov r0, #0x80
-   lsl r0, r0, #24 @ 0x80000000
-2:
-   .endif
-   push{r0, r1, r2}
-   ldr r0, 4f
-   adr r1, 4f
-   add r0, r1
-   str r0, [sp, #8]
-   @ We know we are not on armv4t, so pop pc is safe.
-   pop {r0, r1, pc}
-   .align  2
-4:
-   .word   __aeabi_idiv0 - 4b
+   bl  SYM(__aeabi_idiv0)
+   pop {r1, pc}
+
 #elif defined(__thumb2__)
.syntax unified
.ifc \signed, unsigned
@@ -927,7 +905,158 @@ LSYM(Lover7):
add dividend, work
   .endif
 LSYM(Lgot_result):
-.endm  
+.endm
+
+#if defined(__prefer_thumb__) && !defined(__OPTIMIZE_SIZE__)
+.macro BranchToDiv n, label
+   lsr curbit, dividend, \n
+   cmp curbit, divisor
+   blo \label
+.endm
+
+.macro DoDiv n
+   lsr curbit, dividend, \n
+   cmp curbit, divisor
+   bcc 1f
+   lsl curbit, divisor, \n
+   sub dividend, dividend, curbit
+
+1: adc result, result
+.endm
+
+.macro THUMB1_Div_Positive
+   mov result, #0
+   BranchToDiv #1, LSYM(Lthumb1_div1)
+   BranchToDiv #4, LSYM(Lthumb1_div4)
+   BranchToDiv #8, LSYM(Lthumb1_div8)
+   BranchToDiv #12, LSYM(Lthumb1_div12)
+   BranchToDiv #16, LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_large_positive):
+   mov result, #0xff
+   lsl divisor, divisor, #8
+   rev result, result
+   lsr curbit, dividend, #16
+   cmp curbit, divisor
+   blo 1f
+   asr result, #8
+   lsl divisor, divisor, #8
+   beq LSYM(Ldivbyzero_waypoint)
+
+1: lsr curbit, dividend, #12
+   cmp curbit, divisor
+   blo LSYM(Lthumb1_div12)
+   b   LSYM(Lthumb1_div16)
+LSYM(Lthumb1_div_loop):
+   lsr divisor, divisor, #8
+LSYM(Lthumb1_div16):
+   Dodiv   #15
+   Dodiv   #14
+   Dodiv   #13
+   Dodiv   #12
+LSYM(Lthumb1_div12):
+   Dodiv   #11
+   Dodiv   #10
+   Dodiv   #9
+   Dodiv   #8
+   bcs LSYM(Lthumb1_div_loop)
+LSYM(Lthumb1_div8):
+   Dodiv   #7
+   Dodiv   #6
+   Dodiv   #5
+LSYM(Lthumb1_div5):
+   Dodiv   #4
+LSYM(Lthumb1_div4):
+   Dodiv   #3
+LSYM(Lthumb1_div3):
+   Dodiv   #2
+LSYM(Lthumb1_div2):
+   Dodiv   #1
+LSYM(Lthumb1_div1):
+   sub divisor, dividend, divisor
+   bcs 1f
+   cpy divisor, dividend
+
+1: adc result, result
+   cpy dividend, result
+   RET
+
+LSYM(Ldivbyzero_waypoint):
+   b   LSYM(Ldiv0)
+.endm
+
+.macro THUMB1_Div_Negative
+   lsr result, divisor, #31
+   beq 1f
+   neg divisor, divisor
+
+1: asr curbit, dividend, #32
+   bcc 2f
+   neg dividend, dividend
+
+2: eor curbit, result
+   mov result, #0
+   cpy ip, curbit
+   BranchToDiv #4, LSYM(Lthumb1_div_negative4)
+   BranchToDiv #8, LSYM(Lthumb1_div_negative8)
+LSYM(Lthumb1_div_large):
+   mov result, #0xfc
+   lsl divisor, divisor, #6
+   rev result, result
+   lsr curbit, dividend, #8
+   cmp curbit, divisor
+   blo LSYM(Lthumb1_div_negative8)
+
+   lsl divisor, divisor, #6
+   asr result, result, #6
+   cmp curbit, divisor
+   blo LSYM(Lthumb1_div_negative8)
+
+   lsl divisor, divisor, #6
+   asr result, result, #6
+   cmp curbit, divisor
+   blo LSYM(Lthumb1_div_negative8)
+
+  

RE: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision conversion functions.

2015-04-13 Thread Hale Wang
> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Joseph Myers
> Sent: Saturday, April 11, 2015 4:30 AM
> To: Hale Wang
> Cc: 'GCC Patches'
> Subject: Re: [PATCH] [1/2] [ARM] [libgcc] Support RTABI half-precision
> conversion functions.
> 
> On Fri, 10 Apr 2015, Hale Wang wrote:
> 
> > According to the run-time ABI for ARM architecture, this function is
> > allowed to corrupt only the integer core registers permitted to be
> > corrupted by the [AAPCS] (r0-r3, ip, lr, and CPSR). So we can't just
> > simply use the existing GNU conversion functions to implement this
> function.
> 
> I don't see that "only".  Indeed, at the bottom of page 15 in IHI0043D,
"If the
> FP instruction set is available, implementations of these functions may
use it.
> Consequently, FP hardware-using code that calls one of these helper
> functions directly, or indirectly by calling a function with a
base-standard
> interface, must assume that the FP parameter, result, scratch, and status
> registers might be altered by a call to it.".

Yes, you are right. It was my mistake to add "only" here. Thank you for
pointing this out.
Besides this, is the patch OK with you?

Hale
> 
> (Furthermore, the ideal result on implementations with VFP but not
> hardware half-precision support would be that the software conversions
> interact properly with hardware exceptions and rounding modes (so using
> FPSCR).)
> 
> --
> Joseph S. Myers
> jos...@codesourcery.com





Ping^2 : [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-04-15 Thread Hale Wang
Ping for trunk?

Hale

> -Original Message-
> From: Richard Sandiford [mailto:rdsandif...@googlemail.com]
> Sent: Friday, February 27, 2015 4:04 AM
> To: Terry Guo
> Cc: Segher Boessenkool; Richard Sandiford; GCC Patches; Hale Wang
> Subject: Re: Ping : [PATCH] [gcc, combine] PR46164: Don't combine the
insns
> if a volatile register is contained.
> 
> Terry Guo  writes:
> > On Thu, Feb 26, 2015 at 1:55 PM, Segher Boessenkool
> >  wrote:
> >> On Tue, Feb 17, 2015 at 11:39:34AM +0800, Terry Guo wrote:
> >>> On Sun, Feb 15, 2015 at 7:35 PM, Segher Boessenkool
> >>>  wrote:
> >>> > Hi Terry,
> >>> >
> >>> > I still think this is stage1 material.
> >>> >
> >>> >> + /* Don't combine if dest contains a user specified register and
> >>> >> i3 contains
> >>> >> + ASM_OPERANDS, because the user specified register (same with
> >>> >> dest) in i3
> >>> >> + would be replaced by the src of insn which might be different
> with
> >>> >> + the user's expectation.  */
> >>> >
> >>> > "Do not eliminate a register asm in an asm input" or similar?
> >>> > Text explaining why REG_USERVAR_P && HARD_REGISTER_P works
> here
> >>> > would be good to have, too.
> >>
> >>> diff --git a/gcc/combine.c b/gcc/combine.c index f779117..aeb2854
> >>> 100644
> >>> --- a/gcc/combine.c
> >>> +++ b/gcc/combine.c
> >>> @@ -1779,7 +1779,7 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> >>> rtx_insn *pred ATTRIBUTE_UNUSED,  {
> >>>int i;
> >>>const_rtx set = 0;
> >>> -  rtx src, dest;
> >>> +  rtx src, dest, asm_op;
> >>>rtx_insn *p;
> >>>  #ifdef AUTO_INC_DEC
> >>>rtx link;
> >>> @@ -1914,6 +1914,14 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> rtx_insn *pred ATTRIBUTE_UNUSED,
> >>>set = expand_field_assignment (set);
> >>>src = SET_SRC (set), dest = SET_DEST (set);
> >>>
> >>> +  /* Use REG_USERVAR_P and HARD_REGISTER_P to check whether
> DEST is a user
> >>> + specified register, and do not eliminate such register if it is
in an
> >>> + asm input because we may end up with something different with
> user's
> >>> + expectation.  */
> >>
> >> That doesn't explain why this will hit (almost) only on register asms.
> >> The user's expectation doesn't matter that much either: GCC would
> >> violate its own documentation / promises, that matters more ;-)
> >>
> >>> +  if (REG_P (dest) && REG_USERVAR_P (dest) && HARD_REGISTER_P
> (dest)
> >>> +  && ((asm_op = extract_asm_operands (PATTERN (i3))) != NULL))
> >>
> >> You do not need the temporary variable, nor the != 0 or the extra
> >> parens; just write
> >>
> >>  && extract_asm_operands (PATTERN (i3))
> >>
> >> Cheers,
> >>
> >>
> >> Segher
> >
> > Thanks for comments. Patch is updated now. Please review again.
> 
> Looks good to me FWIW.
> 
> Thanks,
> Richard





[PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-01-26 Thread Hale Wang
Hi,

The GCC combine pass combines the insns even though they contain volatile
registers. This doesn't make sense.

The test case listed in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=46164
shows that the expected asm instruction "mov r1, r1" is not generated. "r1" is
defined as a volatile register, and there are three insns related to r1:

(insn 98 97 40 3 (set (reg/v:SI 1 r1 [ b ]) (reg:SI 154 [ b ])) 
(insn 41 40 43 3 (set (reg/f:SI 148)(reg/v:SI 1 r1 [ b ]))
(insn 43 41 45 3 (parallel [
(set (reg/v:SI 0 r0 [ ret ])
(asm_operands/v:SI ("mov %2, %2  mov %3, %3  mov %4, %4")
("=r") 0 [
(reg/v:SI 0 r0 [ a ])
(reg/v:SI 1 r1 [ b ])
(reg/v:SI 2 r2 [ c ])
(mem/c:QI (reg/f:SI 148) [0 MEM[(char *)&temp]+0 S1
A8])

The combine pass combines these insns:

(note 98 97 40 3 NOTE_INSN_DELETED)
(note 41 40 43 3 NOTE_INSN_DELETED)
(insn 43 41 45 3 (parallel [
(set (reg/v:SI 0 r0 [ ret ])
(asm_operands/v:SI ("mov %2, %2  mov %3, %3  mov %4, %4")
("=r") 0 [
(reg/v:SI 0 r0 [ a ])
(reg:SI 154 [ b ])
(reg/v:SI 2 r2 [ c ])
(mem/c:QI (reg:SI 154 [ b ]) [0 MEM[(char *)&temp]+0
S1 A8])


The volatile register "r1" has completely disappeared from the asm_operands,
and the generated asm code is unexpected.

This patch is used to disable the combine operation if the insns contain
volatile registers. A new test case is also added in this patch.

Is it OK for trunk?

BR,
Hale Wang

ChangeLog:

2015-01-22  Hale Wang  

PR middle-end/46164
* combine.c (can_combine_p): Don't combine the insns if
a volatile register is contained.

2015-01-22  Hale Wang  

PR middle-end/46164
* gcc.target/arm/pr46164.c: New test.


diff --git a/gcc/combine.c b/gcc/combine.c
index 5c763b4..cf48666 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -2004,6 +2004,13 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn
*pred ATTRIBUTE_UNUSED,
  return 0;
}
 
+  /* If src contains a volatile register, reject, because the register may
+ possibly be used in a asm operand.  The combined insn may cause the
asm
+ operand to be generated unexpectly.  */
+
+  if (REG_P (src) && REG_USERVAR_P (src))
+return 0;
+
   /* If INSN contains anything volatile, or is an `asm' (whether volatile
  or not), reject, unless nothing volatile comes between it and I3 */
 
diff --git a/gcc/testsuite/gcc.target/arm/pr46164.c
b/gcc/testsuite/gcc.target/arm/pr46164.c
new file mode 100644
index 0000000..ad3b7cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr46164.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=cortex-m3 -mthumb -O1" } */
+
+char temp[16];
+extern int foo1 (void);
+
+void foo (void)
+{
+  int i;
+  int len;
+
+  while (1)
+  {
+len = foo1 ();
+register char *a asm ("r1") = temp;
+asm volatile ("mov %[r1], %[r1]\n " :: [r1]"r"(a), "m"(*a));
+
+for (i = 0; i < len; i++)
+{
+  if (temp[i] == 10)
+  return;
+}
+  }
+}
+
+/* { dg-final { scan-assembler "\[\\t \]+mov\ r1,\ r1" } } */

pr46164-combine-volatile-register.patch-3
Description: Binary data


RE: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-01-26 Thread Hale Wang


> -Original Message-
> From: Andrew Pinski [mailto:pins...@gmail.com]
> Sent: Monday, January 26, 2015 6:03 PM
> To: Hale Wang
> Cc: GCC Patches
> Subject: Re: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a
> volatile register is contained.
> 
> 
> I think it is allowed to the second combining, just not the first.
> Also it is not about volatile registers here but rather user specified 
> registers
> into inline-asm.
> Also I thought can_combine_p would reject combing into an inline-asm to
> prevent this issue.
> 

Hi Andrew,

Thanks a lot.
As you suggested, I changed the patch to reject combining into an inline-asm.
I have attached the patch.
Is it OK with you?

BR,
Hale

> Thanks,
> Andrew
> 
> >
> > This patch is used to disable the combine operation if the insns
> > contain volatile registers. A new test case is also added in this patch.
> >
> > Is it OK for trunk?
> >
> > BR,
> > Hale Wang
> >
> > ChangeLog:
> >
> > 2015-01-22  Hale Wang  
> >
> > PR middle-end/46164
> > * combine.c (can_combine_p): Don't combine the insns if
> > a volatile register is contained.
> >
> > 2015-01-22  Hale Wang  
> >
> > PR middle-end/46164
> > * gcc.target/arm/pr46164.c: New test.

diff --git a/gcc/combine.c b/gcc/combine.c
index 5c763b4..2e8290a 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -1769,7 +1769,7 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn 
*pred ATTRIBUTE_UNUSED,
 {
   int i;
   const_rtx set = 0;
-  rtx src, dest;
+  rtx src, dest, asm_op;
   rtx_insn *p;
 #ifdef AUTO_INC_DEC
   rtx link;
@@ -1983,6 +1983,10 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn 
*pred ATTRIBUTE_UNUSED,
   else if (GET_CODE (dest) != CC0)
 return 0;
 
+  /* If i3 contains an inline-asm operand, reject, because the user specified
+ registers in the inline-asm maybe removed by the combining.  */
+  if ((asm_op = extract_asm_operands (PATTERN (i3))) != NULL)
+return 0;
 
   if (GET_CODE (PATTERN (i3)) == PARALLEL)
 for (i = XVECLEN (PATTERN (i3), 0) - 1; i >= 0; i--)
diff --git a/gcc/testsuite/gcc.target/arm/pr46164.c 
b/gcc/testsuite/gcc.target/arm/pr46164.c
new file mode 100644
index 0000000..ad3b7cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr46164.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=cortex-m3 -mthumb -O1" } */
+
+char temp[16];
+extern int foo1 (void);
+
+void foo (void)
+{
+  int i;
+  int len;
+
+  while (1)
+  {
+len = foo1 ();
+register char *a asm ("r1") = temp;
+asm volatile ("mov %[r1], %[r1]\n " :: [r1]"r"(a), "m"(*a));
+
+for (i = 0; i < len; i++)
+{
+  if (temp[i] == 10)
+  return;
+}
+  }
+}
+
+/* { dg-final { scan-assembler "\[\\t \]+mov\ r1,\ r1" } } */

pr46164-combine-volatile-register.patch-4
Description: Binary data


RE: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-01-26 Thread Hale Wang
> -Original Message-
> From: Segher Boessenkool [mailto:seg...@kernel.crashing.org]
> Sent: Tuesday, January 27, 2015 3:07 AM
> To: Hale Wang
> Cc: GCC Patches
> Subject: Re: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a
> volatile register is contained.
> 
> On Mon, Jan 26, 2015 at 05:55:52PM +0800, Hale Wang wrote:
> > diff --git a/gcc/combine.c b/gcc/combine.c index 5c763b4..cf48666
> > 100644
> > --- a/gcc/combine.c
> > +++ b/gcc/combine.c
> > @@ -2004,6 +2004,13 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> > rtx_insn *pred ATTRIBUTE_UNUSED,
> >   return 0;
> > }
> >
> > +  /* If src contains a volatile register, reject, because the register
may
> > + possibly be used in a asm operand.  The combined insn may cause
> > + the
> > asm
> > + operand to be generated unexpectly.  */
> > +
> > +  if (REG_P (src) && REG_USERVAR_P (src))
> > +return 0;
> > +
> >/* If INSN contains anything volatile, or is an `asm' (whether
volatile
> >   or not), reject, unless nothing volatile comes between it and I3
> > */
> 
> 
> > diff --git a/gcc/testsuite/gcc.target/arm/pr46164.c
> > b/gcc/testsuite/gcc.target/arm/pr46164.c
> > new file mode 100644
> > index 000..ad3b7cb
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/pr46164.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mcpu=cortex-m3 -mthumb -O1" } */
> 
> Just "-O1" reproduces the problem here, FWIW.
> 

You are correct. Just "-O1" reproduces this problem.
However, it's a combine bug related to combining a user-specified
register into an inline-asm.

> 
> Could you try this patch please?
>
 
Your patch rejected the combine 98+43, which is correct. However, Jakub
pointed out that preventing that combination would be a serious
regression in code quality.

Andrew Pinski suggested that can_combine_p should reject combining into an
inline-asm to prevent this issue, and I have updated the patch accordingly.
What do you think about this change?

BR,
Hale


@@ -1983,6 +1983,10 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn
*pred ATTRIBUTE_UNUSED,
   else if (GET_CODE (dest) != CC0)
 return 0;
 
+  /* If i3 contains an inline-asm operand, reject, because the user
specified
+ registers in the inline-asm maybe removed by the combining.  */  
+ if ((asm_op = extract_asm_operands (PATTERN (i3))) != NULL)
+return 0;
 
   if (GET_CODE (PATTERN (i3)) == PARALLEL)
 for (i = XVECLEN (PATTERN (i3), 0) - 1; i >= 0; i--)

> 
> Segher
> 
> 





RE: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-01-26 Thread Hale Wang


> -Original Message-
> From: Segher Boessenkool [mailto:seg...@kernel.crashing.org]
> Sent: Tuesday, January 27, 2015 12:52 PM
> To: Hale Wang
> Cc: GCC Patches
> Subject: Re: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a
> volatile register is contained.
> 
> On Tue, Jan 27, 2015 at 11:49:55AM +0800, Hale Wang wrote:
> 
> Hi Hale,
> 
> > > > diff --git a/gcc/testsuite/gcc.target/arm/pr46164.c
> > > > b/gcc/testsuite/gcc.target/arm/pr46164.c
> > > > new file mode 100644
> > > > index 000..ad3b7cb
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/arm/pr46164.c
> > > > @@ -0,0 +1,26 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-mcpu=cortex-m3 -mthumb -O1" } */
> > >
> > > Just "-O1" reproduces the problem here, FWIW.
> >
> > You are correct. Just "-O1" reproduces this problem.
> > However it's a combine bug which is related to the combing user
> > specified register into inline-asm.
> 
> Yes, it is.  But the registers the testcase uses exist on any ARM version
there
> is as far as I know, so not specifying specific model and ABI should give
wider
> test coverage (if anyone actually builds and/or tests more than the
default,
> of course :-) )
> 
> > > Could you try this patch please?
> >
> > Your patch rejected the combine 98+43, that's correct.
> 
> Excellent, thanks for testing.
> 
> > However, Jakub
> > pointed out that preventing that to be combined would be a serious
> > regression on code quality.
> 
> I know; I needed to think of some good way to detect register variables
(they
> aren't marked specially in RTL).  I think I found one, for combine that
is; if we
> need to detect it in other passes too, we probably need to put another
flag
> on it, or something.
> 
> > Andrew Pinski suggested: can_combine_p would reject combing into an
> > inline-asm to prevent this issue. And I have updated the patch. What
> > do you think about this change?
> 
> That will regress combining anything else into an asm.  It will disallow
> combining asms _at all_, if we really wanted that we should simply not
build
> LOG_LINKS for them.  But it hurts optimisation (for simple "r" constraints
it is
> not a real problem, RA should take care of it, but for anything else it
is).
> 
> Updated patch below.  A user variable that is also a hard register can
only
> happen in a few cases: 1) a register variable, the case we are after; 2)
an
> argument for the current function that was propagated into a user variable
> (something combine should not do at all, it hinders good register
allocation,
> but it does anyway on most targets).
> 
> Do you want to take this or shall I?  This is not a regression, so it
probably
> should wait for stage1 :-(
> 

Your solution is very good. I will test this patch locally and send out the
result ASAP. 
Thanks,

Hale

> 
> Segher
> 
> 
> diff --git a/gcc/combine.c b/gcc/combine.c index 58de157..9cba594 100644
> --- a/gcc/combine.c
> +++ b/gcc/combine.c
> @@ -1928,6 +1928,10 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> rtx_insn *pred ATTRIBUTE_UNUSED,
>set = expand_field_assignment (set);
>src = SET_SRC (set), dest = SET_DEST (set);
> 
> +  /* Don't eliminate a register variable.  */  if (REG_P (dest) &&
> + REG_USERVAR_P (dest) && HARD_REGISTER_P (dest))
> +return 0;
> +
>/* Don't eliminate a store in the stack pointer.  */
>if (dest == stack_pointer_rtx
>/* Don't combine with an insn that sets a register to itself if it
has






RE: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-01-28 Thread Hale Wang
Hi Segher,

I have updated the patch as you suggested. Both the patch and the changelog
are attached.

By the way, the test case provided by Tim Pambor in PR46164 exposes a
different bug from PR46164 itself, so I filed it separately as
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64818.
This patch only fixes that new bug. Is it OK with you?

Thanks,
Hale

gcc/ChangeLog:
2015-01-27  Segher Boessenkool  
Hale Wang  

PR rtl-optimization/64818
* combine.c (can_combine_p): Don't combine the insn if
the dest of insn is a user specified register.

gcc/testsuite/ChangeLog:
2015-01-27  Segher Boessenkool  
    Hale Wang  

PR rtl-optimization/64818
* gcc.target/arm/pr64818.c: New test.


diff --git a/gcc/combine.c b/gcc/combine.c
index 5c763b4..6901ac2 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -1904,6 +1904,12 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3, rtx_insn
*pred ATTRIBUTE_UNUSED,
   set = expand_field_assignment (set);
   src = SET_SRC (set), dest = SET_DEST (set);
 
+  /* Don't combine if dest contains a user specified register, because the
+ user specified register (same with dest) in i3 would be replaced by
the
+ src of insn which might be different with the user's expectation.  */
+  if (REG_P (dest) && REG_USERVAR_P (dest) && HARD_REGISTER_P (dest))
+return 0;
+
   /* Don't eliminate a store in the stack pointer.  */
   if (dest == stack_pointer_rtx
   /* Don't combine with an insn that sets a register to itself if it
has
diff --git a/gcc/testsuite/gcc.target/arm/pr64818.c
b/gcc/testsuite/gcc.target/arm/pr64818.c
new file mode 100644
index 0000000..bddd846
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr64818.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O1" } */
+
+char temp[16];
+extern int foo1 (void);
+
+void foo (void)
+{
+  int i;
+  int len;
+
+  while (1)
+  {
+len = foo1 ();
+register int a asm ("r0") = 5;
+register char *b asm ("r1") = temp;
+register int c asm ("r2") = len;
+asm volatile ("mov %[r0], %[r0]\n  mov %[r1], %[r1]\n  mov %[r2],
%[r2]\n"
+  : "+m"(*b)
+  : [r0]"r"(a), [r1]"r"(b), [r2]"r"(c));
+
+for (i = 0; i < len; i++)
+{
+  if (temp[i] == 10)
+  return;
+    }
+  }
+}
+
+/* { dg-final { scan-assembler "\[\\t \]+mov\ r1,\ r1" } } */


> > On Tue, Jan 27, 2015 at 11:49:55AM +0800, Hale Wang wrote:
> >
> > Hi Hale,
> > > You are correct. Just "-O1" reproduces this problem.
> > > However, it's a combine bug related to combining a user-specified
> > > register into an inline asm.
> >
> > Yes, it is.  But the registers the testcase uses exist on any ARM
> > version
> there
> > is as far as I know, so not specifying specific model and ABI should
> > give
> wider
> > test coverage (if anyone actually builds and/or tests more than the
> default,
> > of course :-) )
> >
> > > > Could you try this patch please?
> > >
> > > Your patch rejected the combine 98+43, that's correct.
> >
> > Excellent, thanks for testing.
> >
> > > However, Jakub
> > > pointed out that preventing that to be combined would be a serious
> > > regression on code quality.
> >
> > I know; I needed to think of some good way to detect register
> > variables
> (they
> > aren't marked specially in RTL).  I think I found one, for combine
> > that
> is; if we
> > need to detect it in other passes too, we probably need to put another
> flag
> > on it, or something.
> >
> > > Andrew Pinski suggested that can_combine_p should reject combining into
> > > an inline asm to prevent this issue, and I have updated the patch
> > > accordingly. What do you think about this change?
> >
> > That will regress combining anything else into an asm.  It will
> > disallow combining asms _at all_, if we really wanted that we should
> > simply not
> build
> > LOG_LINKS for them.  But it hurts optimisation (for simple "r"
> > constraints
> it is
> > not a real problem, RA should take care of it, but for anything else
> > it
> is).
> >
> > Updated patch below.  A user variable that is also a hard register can
> only
> > happen in a few cases: 1) a register variable, the case we are after;
> > 2)
> an
> > argument for the current function that was propagated into a user
> > variable (something combine should not do at all, it hinders good
> > register
> allocation,
> > but it does anyway on most targets).
> >
> > Do you want to take this or shall I?  This is not a regression, so it
> probably
> > should wait for stage1 :-(
> >
> 
> Your solution is very good. I will test this patch locally and send out the
> result ASAP.
> Thanks,
> 
> Hale
> 
> >
> > Segher
> >

pr64818-combine-user-specified-register.changelog
Description: Binary data


pr64818-combine-user-specified-register.patch-2
Description: Binary data


Ping : [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-02-08 Thread Hale Wang
Ping?

> -Original Message-
> From: Hale Wang [mailto:hale.w...@arm.com]
> Sent: Thursday, January 29, 2015 9:58 AM
> To: Hale Wang; 'Segher Boessenkool'
> Cc: GCC Patches
> Subject: RE: [PATCH] [gcc, combine] PR46164: Don't combine the insns if a
> volatile register is contained.
> 
> Hi Segher,
> 
> I have updated the patch as you suggested. Both the patch and the
> changelog are attached.
> 
> By the way, the test case provided by Tim Pambor in PR46164 exposes a
> different bug from PR46164 itself. So I resubmitted the bug in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64818.
> And this patch is just used to fix this bug. Is it OK for you?
> 
> Thanks,
> Hale
> 
> gcc/ChangeLog:
> 2015-01-27  Segher Boessenkool  
>   Hale Wang  
> 
>   PR rtl-optimization/64818
>   * combine.c (can_combine_p): Don't combine the insn if
>   the dest of insn is a user specified register.
> 
> gcc/testsuite/ChangeLog:
> 2015-01-27  Segher Boessenkool  
>   Hale Wang  
> 
>   PR rtl-optimization/64818
>   * gcc.target/arm/pr64818.c: New test.
> 
> 
> diff --git a/gcc/combine.c b/gcc/combine.c index 5c763b4..6901ac2 100644
> --- a/gcc/combine.c
> +++ b/gcc/combine.c
> @@ -1904,6 +1904,12 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> rtx_insn *pred ATTRIBUTE_UNUSED,
>set = expand_field_assignment (set);
>src = SET_SRC (set), dest = SET_DEST (set);
> 
> +  /* Don't combine if dest contains a user specified register, because
the
> + user specified register (same with dest) in i3 would be replaced by
the
> + src of insn which might be different with the user's expectation.
> + */  if (REG_P (dest) && REG_USERVAR_P (dest) && HARD_REGISTER_P
> (dest))
> +return 0;
> +
>/* Don't eliminate a store in the stack pointer.  */
>if (dest == stack_pointer_rtx
>/* Don't combine with an insn that sets a register to itself if it
has diff --git
> a/gcc/testsuite/gcc.target/arm/pr64818.c
> b/gcc/testsuite/gcc.target/arm/pr64818.c
> new file mode 100644
> index 000..bddd846
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/pr64818.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1" } */
> +
> +char temp[16];
> +extern int foo1 (void);
> +
> +void foo (void)
> +{
> +  int i;
> +  int len;
> +
> +  while (1)
> +  {
> +len = foo1 ();
> +register int a asm ("r0") = 5;
> +register char *b asm ("r1") = temp;
> +register int c asm ("r2") = len;
> +asm volatile ("mov %[r0], %[r0]\n  mov %[r1], %[r1]\n
> mov %[r2], %[r2]\n"
> +: "+m"(*b)
> +: [r0]"r"(a), [r1]"r"(b), [r2]"r"(c));
> +
> +for (i = 0; i < len; i++)
> +{
> +  if (temp[i] == 10)
> +  return;
> +}
> +  }
> +}
> +
> +/* { dg-final { scan-assembler "\[\\t \]+mov\ r1,\ r1" } } */
> 
> 
> > > On Tue, Jan 27, 2015 at 11:49:55AM +0800, Hale Wang wrote:
> > >
> > > Hi Hale,
> > > > You are correct. Just "-O1" reproduces this problem.
> > > > > However, it's a combine bug related to combining a user-specified
> > > > > register into an inline asm.
> > >
> > > Yes, it is.  But the registers the testcase uses exist on any ARM
> > > version
> > there
> > > is as far as I know, so not specifying specific model and ABI should
> > > give
> > wider
> > > test coverage (if anyone actually builds and/or tests more than the
> > default,
> > > of course :-) )
> > >
> > > > > Could you try this patch please?
> > > >
> > > > Your patch rejected the combine 98+43, that's correct.
> > >
> > > Excellent, thanks for testing.
> > >
> > > > However, Jakub
> > > > pointed out that preventing that to be combined would be a serious
> > > > regression on code quality.
> > >
> > > I know; I needed to think of some good way to detect register
> > > variables
> > (they
> > > aren't marked specially in RTL).  I think I found one, for combine
> > > that
> > is; if we
> > > need to detect it in other passes too, we probably need to put
> > > another
> > flag
> > > on it, or something.
> > >
> > > > > Andrew Pinski suggested: can_combine_p would reject combining into
> > > > an inline-asm to prevent this issue. And I h

RE: [Ping^2] [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m

2015-02-08 Thread Hale Wang
Ping https://gcc.gnu.org/ml/gcc-patches/2014-12/msg01059.html.

> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Hale Wang
> Sent: Friday, December 12, 2014 9:36 AM
> To: gcc-patches
> Subject: RE: [Ping] [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-
> m
> 
> Ping? Already applied to arm/embedded-4_9-branch, is it OK for trunk?
> 
> -Hale
> 
> > -Original Message-
> > From: Joey Ye [mailto:joey.ye...@gmail.com]
> > Sent: Thursday, November 27, 2014 10:01 AM
> > To: Hale Wang
> > Cc: gcc-patches
> > Subject: Re: [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m
> >
> > OK applying to arm/embedded-4_9-branch, though you still need
> > maintainer approval into trunk.
> >
> > - Joey
> >
> > On Wed, Nov 26, 2014 at 11:43 AM, Hale Wang 
> wrote:
> > > Hi,
> > >
> > > This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
> > > (https://git.linaro.org/toolchain/cortex-strings.git), which was
> > > contributed by ARM under Free BSD license.
> > >
> > > The new aeabi_idiv routine is used to replace the one in
> > > libgcc/config/arm/lib1funcs.S. This replacement happens within the
> > > Thumb1 wrapper. The new routine is under LGPLv3 license.
> > >
> > > The main advantage of this version is that it can improve the
> > > performance of the aeabi_idiv function for Thumb1. This solution
> > > will also increase the code size. So it will only be used if
> > > __OPTIMIZE_SIZE__ is
> > not defined.
> > >
> > > Make check passed for armv6-m.
> > >
> > > OK for trunk?
> > >
> > > Thanks,
> > > Hale Wang
> > >
> > > libgcc/ChangeLog:
> > >
> > > 2014-11-26  Hale Wang  
> > >
> > > * config/arm/lib1funcs.S: Add new wrapper.
> > >
> > > ===
> > > diff --git a/libgcc/config/arm/lib1funcs.S
> > > b/libgcc/config/arm/lib1funcs.S index b617137..de66c81 100644
> > > --- a/libgcc/config/arm/lib1funcs.S
> > > +++ b/libgcc/config/arm/lib1funcs.S
> > > @@ -306,34 +306,12 @@ LSYM(Lend_fde):
> > >  #ifdef __ARM_EABI__
> > >  .macro THUMB_LDIV0 name signed
> > >  #if defined(__ARM_ARCH_6M__)
> > > -   .ifc \signed, unsigned
> > > -   cmp r0, #0
> > > -   beq 1f
> > > -   mov r0, #0
> > > -   mvn r0, r0  @ 0x
> > > -1:
> > > -   .else
> > > -   cmp r0, #0
> > > -   beq 2f
> > > -   blt 3f
> > > +
> > > +   push{r0, lr}
> > > mov r0, #0
> > > -   mvn r0, r0
> > > -   lsr r0, r0, #1  @ 0x7fff
> > > -   b   2f
> > > -3: mov r0, #0x80
> > > -   lsl r0, r0, #24 @ 0x8000
> > > -2:
> > > -   .endif
> > > -   push{r0, r1, r2}
> > > -   ldr r0, 4f
> > > -   adr r1, 4f
> > > -   add r0, r1
> > > -   str r0, [sp, #8]
> > > -   @ We know we are not on armv4t, so pop pc is safe.
> > > -   pop {r0, r1, pc}
> > > -   .align  2
> > > -4:
> > > -   .word   __aeabi_idiv0 - 4b
> > > +   bl  SYM(__aeabi_idiv0)
> > > +   pop {r1, pc}
> > > +
> > >  #elif defined(__thumb2__)
> > > .syntax unified
> > > .ifc \signed, unsigned
> > > @@ -927,7 +905,158 @@ LSYM(Lover7):
> > > add dividend, work
> > >.endif
> > >  LSYM(Lgot_result):
> > > -.endm
> > > +.endm
> > > +
> > > +#if defined(__prefer_thumb__)
> > && !defined(__OPTIMIZE_SIZE__) .macro
> > > +BranchToDiv n, label
> > > +   lsr curbit, dividend, \n
> > > +   cmp curbit, divisor
> > > +   blo \label
> > > +.endm
> > > +
> > > +.macro DoDiv n
> > > +   lsr curbit, dividend, \n
> > > +   cmp curbit, divisor
> > > +   bcc 1f
> > > +   lsl curbit, divisor, \n
> > > +   sub dividend, dividend, curbit
> > > +
> > > +1: adc result, result
> > > +.endm
> > > +
> > > +.macro THUMB1_Div_Positive
> > > +   mov

RE: Ping : [PATCH] [gcc, combine] PR46164: Don't combine the insns if a volatile register is contained.

2015-02-12 Thread Hale Wang
> -Original Message-
> From: Segher Boessenkool [mailto:seg...@kernel.crashing.org]
> Sent: Friday, February 13, 2015 6:16 AM
> To: Hale Wang; 'GCC Patches'; Richard Sandiford
> Subject: Re: Ping : [PATCH] [gcc, combine] PR46164: Don't combine the
insns
> if a volatile register is contained.
> 
> On Thu, Feb 12, 2015 at 03:54:21PM +0000, Richard Sandiford wrote:
> > "Hale Wang"  writes:
> > > Ping?
> 
> It's not a regression (or is it?), so it is not appropriate for stage4.
> 
> 
> > >> diff --git a/gcc/combine.c b/gcc/combine.c index 5c763b4..6901ac2
> > >> 100644
> > >> --- a/gcc/combine.c
> > >> +++ b/gcc/combine.c
> > >> @@ -1904,6 +1904,12 @@ can_combine_p (rtx_insn *insn, rtx_insn *i3,
> > >> rtx_insn *pred ATTRIBUTE_UNUSED,
> > >>set = expand_field_assignment (set);
> > >>src = SET_SRC (set), dest = SET_DEST (set);
> > >>
> > >> +  /* Don't combine if dest contains a user specified register,
> > >> + because
> > > the
> > >> + user specified register (same with dest) in i3 would be
> > >> + replaced by
> > > the
> > >> + src of insn which might be different with the user's
expectation.
> > >> + */  if (REG_P (dest) && REG_USERVAR_P (dest) &&
> HARD_REGISTER_P
> > >> (dest))
> > >> +return 0;
> >
> > I suppose this is similar to Andrew's comment, but I think the rule is
> > that it's invalid to replace a REG_USERVAR_P operand in an inline asm.
> 
> Why not?  You probably mean register asm, not all user variables?
> 
> > Outside of an inline asm we make no guarantee about whether something
> > is stored in a particular register or not.
> >
> > So IMO we should be checking whether either INSN or I3 is an asm as
> > well as the above.
> 
> [ INSN can never be an asm, that is already refused by can_combine_p. ]
> 

Indeed. If INSN is an asm operand, it's already refused by can_combine_p.

Hale.

> We do not guarantee things will end up in the specified reg (except for
asm),
> but will it hurt to leave things in the reg the user said it should be in,
even if
> we do not guarantee this behaviour?
> 
> 
> Segher






[Ping] [PATCH] Add arm_cortex_m7_tune.

2014-10-28 Thread Hale Wang
Ping?
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg02025.html

Thanks
Hale Wang

> -Original Message-
> From: Hale Wang [mailto:hale.w...@arm.com]
> Sent: Tuesday, October 21, 2014 5:57 PM
> To: Hale Wang; gcc-patches@gcc.gnu.org
> Subject: RE: [PATCH] Add arm_cortex_m7_tune.
> 
> Attach the patch.
> 
> > -Original Message-
> > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> > ow...@gcc.gnu.org] On Behalf Of Hale Wang
> > Sent: Tuesday, October 21, 2014 5:49 PM
> > To: gcc-patches@gcc.gnu.org
> > Subject: [PATCH] Add arm_cortex_m7_tune.
> >
> > Hi,
> >
> > This patch is used to tune the gcc for Cortex-M7.
> >
> > The performance of Dhrystone can be improved by 1%.
> > The performance of Coremark can be improved by 2.3%.
> >
> > Patch also attached for convenience.
> >
> > Is it ok for trunk?
> >
> > Thanks and Best Regards,
> > Hale Wang
> >
> > gcc/ChangeLog
> > 2014-10-11  Hale Wang  
> >
> > * config/arm/arm.c: Add cortex-m7 tune.
> > * config/arm/arm-cores.def: Use cortex-m7 tune.
> >
> > diff --git a/gcc/config/arm/arm-cores.def
> > b/gcc/config/arm/arm-cores.def index 56ec7fd..3b34173 100644
> > --- a/gcc/config/arm/arm-cores.def
> > +++ b/gcc/config/arm/arm-cores.def
> > @@ -149,7 +149,7 @@ ARM_CORE("cortex-r4",
> > cortexr4, cortexr4,  7R,  FL_LDSCHED, cortex)
> > ARM_CORE("cortex-r4f",cortexr4f, cortexr4f,
> > 7R,  FL_LDSCHED, cortex)
> > ARM_CORE("cortex-r5",  cortexr5, cortexr5,
> > 7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
> > ARM_CORE("cortex-r7",  cortexr7, cortexr7,
> > 7R,  FL_LDSCHED | FL_ARM_DIV, cortex)
> > -ARM_CORE("cortex-m7",   cortexm7, cortexm7,
> > 7EM, FL_LDSCHED, v7m)
> > +ARM_CORE("cortex-m7",  cortexm7, cortexm7,
> > 7EM, FL_LDSCHED, cortex_m7)
> > ARM_CORE("cortex-m4",   cortexm4, cortexm4,
> > 7EM, FL_LDSCHED, v7m)
> > ARM_CORE("cortex-m3",   cortexm3, cortexm3,
> > 7M,  FL_LDSCHED, v7m)
> > ARM_CORE("marvell-pj4", marvell_pj4,
marvell_pj4,
> > 7A,  FL_LDSCHED, 9e)
> > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index
> > 93b989d..834b13a 100644
> > --- a/gcc/config/arm/arm.c
> > +++ b/gcc/config/arm/arm.c
> > @@ -2003,6 +2003,27 @@ const struct tune_params arm_v7m_tune =
> >8
> > /* Maximum insns to inline memset.  */ };
> >
> > +/* Cortex-M7 tuning.  */
> > +
> > +const struct tune_params arm_cortex_m7_tune = {
> > +  arm_9e_rtx_costs,
> > +  &v7m_extra_costs,
> > +  NULL,
> > /* Sched adj cost.  */
> > +  0,
> > /* Constant limit.  */
> > +  0,
> > /* Max cond insns.  */
> > +  ARM_PREFETCH_NOT_BENEFICIAL,
> > +  true,
> > /* Prefer constant pool.  */
> > +  arm_cortex_m_branch_cost,
> > +  false,
> > /* Prefer LDRD/STRD.  */
> > +  {true, true},
> > /* Prefer non short circuit.  */
> > +  &arm_default_vec_cost,/* Vectorizer costs.
*/
> > +  false,/* Prefer Neon for
64-bits
> > bitops.  */
> > +  false, false, /* Prefer 32-bit
encodings.
> > */
> > +  false,
> > /* Prefer Neon for stringops.  */
> > +  8
> > /* Maximum insns to inline memset.  */
> > +};
> > +
> > /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
> > arm_v6t2_tune. It is used for cortex-m0, cortex-m1 and
cortex-m0plus.
> > */
> > const struct tune_params arm_v6m_tune =
> >
> >
> >


cortex-m7-tune-2.patch
Description: Binary data


[Ping] [PATCH] Small multiplier support in Cortex-M0/1/+

2014-10-28 Thread Hale Wang
Ping?
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg02027.html

Thanks,
Hale Wang

> -Original Message-
> From: Hale Wang [mailto:hale.w...@arm.com]
> Sent: Tuesday, October 21, 2014 6:02 PM
> To: 'gcc-patches@gcc.gnu.org'
> Subject: Small multiplier support in Cortex-M0/1/+
> 
> Hi,
> 
> Some configurations of the Cortex-M0 and Cortex-M1 come with a high
> latency multiplier. This patch adds support for such configurations.
> 
> Small multiplier means using add/sub/shift instructions to replace the mul
> instruction for the MCU that has no fast multiplier.
> 
> The following strategies are adopted in this patch:
> 1. Define new CPUs as -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-
> multiply,cortex-m1.small-multiply to support small multiplier.
> 2. -Os means size is preferred. A threshold of 5 is set, which means
> splitting is prevented if it would end up with more than 5 instructions.
> Without -Os, there is no such limit.
> 
> Some test cases are also added in the testsuite to verify this function.
> 
> Is it ok for trunk?
> 
> Thanks and Best Regards,
> Hale Wang
> 
> gcc/ChangeLog:
> 
> 2014-08-29  Hale Wang  
> 
>   * config/arm/arm-cores.def: Add support for
>   -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
>   cortex-m1.small-multiply.
>   * config/arm/arm-tables.opt: Regenerate.
>   * config/arm/arm-tune.md: Regenerate.
>   * config/arm/arm.c: Update the rtx-costs for MUL.
>   * config/arm/bpabi.h: Handle
>   -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
>   cortex-m1.small-multiply.
>   * doc/invoke.texi: Document
>   -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
>   cortex-m1.small-multiply.
>   * testsuite/gcc.target/arm/small-multiply-m0-1.c: New test case.
>   * testsuite/gcc.target/arm/small-multiply-m0-2.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m0-3.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m0plus-1.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m0plus-2.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m0plus-3.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m1-1.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m1-2.c: Likewise.
>   * testsuite/gcc.target/arm/small-multiply-m1-3.c: Likewise.
> 
> ==
> =
> diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
> index a830a83..af4b373 100644
> --- a/gcc/config/arm/arm-cores.def
> +++ b/gcc/config/arm/arm-cores.def
> @@ -137,6 +137,11 @@ ARM_CORE("cortex-m1",cortexm1,
> cortexm1, 6M, FL_LDSCHED, v6m)
>  ARM_CORE("cortex-m0",cortexm0, cortexm0, 6M,
> FL_LDSCHED, v6m)
>  ARM_CORE("cortex-m0plus",cortexm0plus, cortexm0plus, 6M,
> FL_LDSCHED, v6m)
> 
> +/* V6M Architecture Processors for small-multiply implementations.  */
> +ARM_CORE("cortex-m1.small-multiply", cortexm1smallmultiply,
> cortexm1, 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> +ARM_CORE("cortex-m0.small-multiply", cortexm0smallmultiply,
> cortexm0, 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> +ARM_CORE("cortex-m0plus.small-multiply",cortexm0plussmallmultiply,
> +cortexm0plus,6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> +
>  /* V7 Architecture Processors */
>  ARM_CORE("generic-armv7-a",  genericv7a, genericv7a,
>   7A,  FL_LDSCHED, cortex)
>  ARM_CORE("cortex-a5",cortexa5, cortexa5, 7A,
> FL_LDSCHED, cortex_a5)
> diff --git a/gcc/config/arm/arm-tables.opt b/gcc/config/arm/arm-tables.opt
> index bc046a0..bd65bd2 100644
> --- a/gcc/config/arm/arm-tables.opt
> +++ b/gcc/config/arm/arm-tables.opt
> @@ -241,6 +241,15 @@ EnumValue
>  Enum(processor_type) String(cortex-m0plus) Value(cortexm0plus)
> 
>  EnumValue
> +Enum(processor_type) String(cortex-m1.small-multiply)
> +Value(cortexm1smallmultiply)
> +
> +EnumValue
> +Enum(processor_type) String(cortex-m0.small-multiply)
> +Value(cortexm0smallmultiply)
> +
> +EnumValue
> +Enum(processor_type) String(cortex-m0plus.small-multiply)
> +Value(cortexm0plussmallmultiply)
> +
> +EnumValue
>  Enum(processor_type) String(generic-armv7-a) Value(genericv7a)
> 
>  EnumValue
> diff --git a/gcc/config/arm/arm-tune.md b/gcc/config/arm/arm-tune.md
> index 954cab8..8b5c778 100644
> --- a/gcc/config/arm/arm-tune.md
> +++ b/gcc/config/arm/arm-tune.md
> @@ -25,6 +25,7 @@
>   arm1176jzs,arm1176jzfs,mpcorenovfp,
>   mpcore,arm1156t2s,arm1156t2fs,

RE: Small multiplier support in Cortex-M0/1/+

2014-11-12 Thread Hale Wang
> -Original Message-
> From: Christophe Lyon [mailto:christophe.l...@linaro.org]
> Sent: Wednesday, November 12, 2014 9:49 PM
> To: Hale Wang
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: Small multiplier support in Cortex-M0/1/+
> 
> On 21 October 2014 12:01, Hale Wang  wrote:
> > Hi,
> >
> > Some configurations of the Cortex-M0 and Cortex-M1 come with a high
> > latency multiplier. This patch adds support for such configurations.
> >
> > Small multiplier means using add/sub/shift instructions to replace the
> > mul instruction for the MCU that has no fast multiplier.
> >
> > The following strategies are adopted in this patch:
> > 1. Define new CPUs as
> > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,cortex-m1.
> > small-
> > multiply to support small multiplier.
> > 2. -Os means size is preferred. A threshold of 5 is set, which means
> > splitting is prevented if it would end up with more than 5 instructions.
> > Without -Os, there is no such limit.
> >
> > Some test cases are also added in the testsuite to verify this function.
> >
> > Is it ok for trunk?
> >
> > Thanks and Best Regards,
> > Hale Wang
> >
> > gcc/ChangeLog:
> >
> > 2014-08-29  Hale Wang  
> >
> > * config/arm/arm-cores.def: Add support for
> > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > cortex-m1.small-multiply.
> > * config/arm/arm-tables.opt: Regenerate.
> > * config/arm/arm-tune.md: Regenerate.
> > * config/arm/arm.c: Update the rtx-costs for MUL.
> > * config/arm/bpabi.h: Handle
> > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > cortex-m1.small-multiply.
> > * doc/invoke.texi: Document
> > -mcpu=cortex-m0.small-multiply,cortex-m0plus.small-multiply,
> > cortex-m1.small-multiply.
> > * testsuite/gcc.target/arm/small-multiply-m0-1.c: New test case.
> > * testsuite/gcc.target/arm/small-multiply-m0-2.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m0-3.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m0plus-1.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m0plus-2.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m0plus-3.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m1-1.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m1-2.c: Likewise.
> > * testsuite/gcc.target/arm/small-multiply-m1-3.c: Likewise.
> >
> >
> ==
> =
> > diff --git a/gcc/config/arm/arm-cores.def
> > b/gcc/config/arm/arm-cores.def index a830a83..af4b373 100644
> > --- a/gcc/config/arm/arm-cores.def
> > +++ b/gcc/config/arm/arm-cores.def
> > @@ -137,6 +137,11 @@ ARM_CORE("cortex-m1",  cortexm1,
> cortexm1,
> > 6M, FL_LDSCHED, v6m)
> >  ARM_CORE("cortex-m0",  cortexm0, cortexm0, 6M,
> > FL_LDSCHED, v6m)
> >  ARM_CORE("cortex-m0plus",  cortexm0plus, cortexm0plus, 6M,
> > FL_LDSCHED, v6m)
> >
> > +/* V6M Architecture Processors for small-multiply implementations.  */
> > +ARM_CORE("cortex-m1.small-multiply",   cortexm1smallmultiply,
> cortexm1,
> > 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > +ARM_CORE("cortex-m0.small-multiply",   cortexm0smallmultiply,
> cortexm0,
> > 6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > +ARM_CORE("cortex-m0plus.small-multiply",cortexm0plussmallmultiply,
> > cortexm0plus,6M, FL_LDSCHED | FL_SMALLMUL, v6m)
> > +
> >  /* V7 Architecture Processors */
> >  ARM_CORE("generic-armv7-a",genericv7a, genericv7a, 7A,
> > FL_LDSCHED, cortex)
> >  ARM_CORE("cortex-a5",  cortexa5, cortexa5, 7A,
> > FL_LDSCHED, cortex_a5)
> > diff --git a/gcc/config/arm/arm-tables.opt
> > b/gcc/config/arm/arm-tables.opt index bc046a0..bd65bd2 100644
> > --- a/gcc/config/arm/arm-tables.opt
> > +++ b/gcc/config/arm/arm-tables.opt
> > @@ -241,6 +241,15 @@ EnumValue
> >  Enum(processor_type) String(cortex-m0plus) Value(cortexm0plus)
> >
> >  EnumValue
> > +Enum(processor_type) String(cortex-m1.small-multiply)
> > Value(cortexm1smallmultiply)
> > +
> > +EnumValue
> > +Enum(processor_type) String(cortex-m0.small-multiply)
> > Value(cortexm0smallmultiply)
> > +
> > +EnumValue
> > +Enum(processor_type) String(cort

RE: [Ping] [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m

2014-12-11 Thread Hale Wang
Ping? Already applied to arm/embedded-4_9-branch, is it OK for trunk?

-Hale

> -Original Message-
> From: Joey Ye [mailto:joey.ye...@gmail.com]
> Sent: Thursday, November 27, 2014 10:01 AM
> To: Hale Wang
> Cc: gcc-patches
> Subject: Re: [PATCH, ARM, libgcc] New aeabi_idiv function for armv6-m
> 
> OK applying to arm/embedded-4_9-branch, though you still need maintainer
> approval into trunk.
> 
> - Joey
> 
> On Wed, Nov 26, 2014 at 11:43 AM, Hale Wang  wrote:
> > Hi,
> >
> > This patch ports the aeabi_idiv routine from Linaro Cortex-Strings
> > (https://git.linaro.org/toolchain/cortex-strings.git), which was
> > contributed by ARM under Free BSD license.
> >
> > The new aeabi_idiv routine is used to replace the one in
> > libgcc/config/arm/lib1funcs.S. This replacement happens within the
> > Thumb1 wrapper. The new routine is under LGPLv3 license.
> >
> > The main advantage of this version is that it can improve the
> > performance of the aeabi_idiv function for Thumb1. This solution will
> > also increase the code size. So it will only be used if __OPTIMIZE_SIZE__ is
> not defined.
> >
> > Make check passed for armv6-m.
> >
> > OK for trunk?
> >
> > Thanks,
> > Hale Wang
> >
> > libgcc/ChangeLog:
> >
> > 2014-11-26  Hale Wang  
> >
> > * config/arm/lib1funcs.S: Add new wrapper.
> >
> > ===
> > diff --git a/libgcc/config/arm/lib1funcs.S
> > b/libgcc/config/arm/lib1funcs.S index b617137..de66c81 100644
> > --- a/libgcc/config/arm/lib1funcs.S
> > +++ b/libgcc/config/arm/lib1funcs.S
> > @@ -306,34 +306,12 @@ LSYM(Lend_fde):
> >  #ifdef __ARM_EABI__
> >  .macro THUMB_LDIV0 name signed
> >  #if defined(__ARM_ARCH_6M__)
> > -   .ifc \signed, unsigned
> > -   cmp r0, #0
> > -   beq 1f
> > -   mov r0, #0
> > -   mvn r0, r0  @ 0x
> > -1:
> > -   .else
> > -   cmp r0, #0
> > -   beq 2f
> > -   blt 3f
> > +
> > +   push{r0, lr}
> > mov r0, #0
> > -   mvn r0, r0
> > -   lsr r0, r0, #1  @ 0x7fff
> > -   b   2f
> > -3: mov r0, #0x80
> > -   lsl r0, r0, #24 @ 0x8000
> > -2:
> > -   .endif
> > -   push{r0, r1, r2}
> > -   ldr r0, 4f
> > -   adr r1, 4f
> > -   add r0, r1
> > -   str r0, [sp, #8]
> > -   @ We know we are not on armv4t, so pop pc is safe.
> > -   pop {r0, r1, pc}
> > -   .align  2
> > -4:
> > -   .word   __aeabi_idiv0 - 4b
> > +   bl  SYM(__aeabi_idiv0)
> > +   pop {r1, pc}
> > +
> >  #elif defined(__thumb2__)
> > .syntax unified
> > .ifc \signed, unsigned
> > @@ -927,7 +905,158 @@ LSYM(Lover7):
> > add dividend, work
> >.endif
> >  LSYM(Lgot_result):
> > -.endm
> > +.endm
> > +
> > +#if defined(__prefer_thumb__)
> && !defined(__OPTIMIZE_SIZE__) .macro
> > +BranchToDiv n, label
> > +   lsr curbit, dividend, \n
> > +   cmp curbit, divisor
> > +   blo \label
> > +.endm
> > +
> > +.macro DoDiv n
> > +   lsr curbit, dividend, \n
> > +   cmp curbit, divisor
> > +   bcc 1f
> > +   lsl curbit, divisor, \n
> > +   sub dividend, dividend, curbit
> > +
> > +1: adc result, result
> > +.endm
> > +
> > +.macro THUMB1_Div_Positive
> > +   mov result, #0
> > +   BranchToDiv #1, LSYM(Lthumb1_div1)
> > +   BranchToDiv #4, LSYM(Lthumb1_div4)
> > +   BranchToDiv #8, LSYM(Lthumb1_div8)
> > +   BranchToDiv #12, LSYM(Lthumb1_div12)
> > +   BranchToDiv #16, LSYM(Lthumb1_div16)
> > +LSYM(Lthumb1_div_large_positive):
> > +   mov result, #0xff
> > +   lsl divisor, divisor, #8
> > +   rev result, result
> > +   lsr curbit, dividend, #16
> > +   cmp curbit, divisor
> > +   blo 1f
> > +   asr result, #8
> > +   lsl divisor, divisor, #8
> > +   beq LSYM(Ldivbyzero_waypoint)
> > +
> > +1: lsr curbit, dividend, #12
> > +   cmp curbit, divisor
> > +   blo LSYM(Lthumb1_div12)
> > +   b   LSYM(Lthumb1_div16)
> > +LSYM(Lthumb1_div_loop):
> > +   

RE: patch to fix PR64110

2014-12-15 Thread Hale Wang
Hi,

This commit causes another GCC build failure for ARM targets. The details are
described in the following Bugzilla entry:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64323.

Could you help me to have a look?
Thanks,

Hale.

> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Vladimir Makarov
> Sent: Saturday, December 13, 2014 4:12 AM
> To: GCC Patches
> Subject: patch to fix PR64110
> 
> The following patch fixes
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64110
> 
> The patch was successfully tested and bootstrapped on x86/x86-64.
> 
> Committed as rev. 218688.
> 
> 2014-12-12  Vladimir Makarov  
> 
>  PR target/64110
>  * lra-constraints.c (process_alt_operands): Refuse alternative
>  when reload pseudo of given class can not hold value of given
>  mode.
> 
> 2014-12-12  Vladimir Makarov  
> 
>  PR target/64110
>  * gcc.target/i386/pr64110.c: New.





RE: patch to fix PR64110

2014-12-16 Thread Hale Wang
This issue is already fixed by your commit  r218760. 
Thanks.

Hale.

> -Original Message-
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Hale Wang
> Sent: Tuesday, December 16, 2014 10:17 AM
> To: 'Vladimir Makarov'; GCC Patches
> Subject: RE: patch to fix PR64110
> 
> Hi,
> 
> This commit causes another GCC build failure for ARM targets. The details are
> described in the following Bugzilla entry:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64323.
> 
> Could you help me to have a look?
> Thanks,
> 
> Hale.
> 
> > -Original Message-
> > From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> > ow...@gcc.gnu.org] On Behalf Of Vladimir Makarov
> > Sent: Saturday, December 13, 2014 4:12 AM
> > To: GCC Patches
> > Subject: patch to fix PR64110
> >
> > The following patch fixes
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64110
> >
> > The patch was successfully tested and bootstrapped on x86/x86-64.
> >
> > Committed as rev. 218688.
> >
> > 2014-12-12  Vladimir Makarov  
> >
> >  PR target/64110
> >  * lra-constraints.c (process_alt_operands): Refuse alternative
> >  when reload pseudo of given class can not hold value of given
> >  mode.
> >
> > 2014-12-12  Vladimir Makarov  
> >
> >  PR target/64110
> >  * gcc.target/i386/pr64110.c: New.
> 
> 
> 






[PATCH] [ARM] Tune the max_cond_insns/branch_cost for Cortex-M7

2015-01-14 Thread Hale Wang
Hi,

This patch tunes max_cond_insns and branch_cost specifically for benchmark
performance on Cortex-M7.
Tested with GCC regression test, no regressions. Is it ok for trunk?

BR,
Hale Wang

gcc/ChangeLog
2014-12-24  Hale Wang  

* config/arm/arm.c: Tune the max_cond_insns/branch_cost for
Cortex-M7.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 8193bf1..d52fcbd 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -287,6 +287,7 @@ static unsigned int arm_autovectorize_vector_sizes
(void);
 static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
 static int arm_cortex_m_branch_cost (bool, bool);
+static int arm_cortex_m7_branch_cost (bool, bool);
 
 static bool arm_vectorize_vec_perm_const_ok (machine_mode vmode,
 const unsigned char *sel);
@@ -1967,10 +1968,10 @@ const struct tune_params arm_cortex_m7_tune =
   &v7m_extra_costs,
   NULL,/* Sched adj cost.
*/
   0,   /* Constant limit.  */
-  0,   /* Max cond insns.  */
+  1,   /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,/* Prefer constant
pool.  */
-  arm_cortex_m_branch_cost,
+  arm_cortex_m7_branch_cost,
   false,   /* Prefer LDRD/STRD.  */
   {true, true},/* Prefer non short
circuit.  */
   &arm_default_vec_cost,/* Vectorizer costs.  */
@@ -12015,6 +12016,12 @@ arm_cortex_m_branch_cost (bool speed_p, bool
predictable_p)
  : arm_default_branch_cost (speed_p, predictable_p);
 }
 
+static int
+arm_cortex_m7_branch_cost (bool speed_p, bool predictable_p)
+{
+  return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p);
+}
+
 static bool fp_consts_inited = false;
 
 static REAL_VALUE_TYPE value_fp0;

cortex-m7-branch-cost.patch-3
Description: Binary data


Back porting the LTO fix to upstream gcc 4.9 branch

2014-08-05 Thread Hale Wang
Hi,

I submitted the patch to fix the ABI mismatch error caused by LTO
on 18th June 2014.

Refer to : https:// gcc.gnu.org/ml/gcc-patches/2014-06/msg01429.html  for
details.

This fix was done for trunk. We need this fix included in the gcc 4.9 branch.
So could we backport this fix to the upstream gcc 4.9 branch?

Thanks and Best Regards,
Hale Wang





RE: Back porting the LTO fix to upstream gcc 4.9 branch

2014-08-05 Thread Hale Wang
Refer to: https://gcc.gnu.org/ml/gcc-patches/2014-06/msg01429.html.

Sorry for the extra whitespace.

> -----Original Message-----
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-
> ow...@gcc.gnu.org] On Behalf Of Hale Wang
> Sent: 2014年8月6日 13:50
> To: GCC Patches
> Cc: Mike Stump; Richard Biener
> Subject: Back porting the LTO fix to upstream gcc 4.9 branch
> 
> Hi,
> 
> I have submitted the patch to fix the ABI mis-matching error caused by LTO on
> 18th June 2014.
> 
> Refer to : https://gcc.gnu.org/ml/gcc-patches/2014-06/msg01429.html  for
> details.
> 
> This fix was done for trunk. We need this fix included for gcc 4.9 branch.
> So could we back porting this fix to upstream gcc 4.9 branch?
> 
> Thanks and Best Regards,
> Hale Wang
> 
> 
>