[PATCH, TSAN] Fix missing __tsan_func_exit instrumentation
Hi,

this patch fixes two cases where __tsan_func_entry is present but
__tsan_func_exit is missing.  This results in bogus call stacks and memory
leaks.  See PR 65400 for stripped-down code samples where this was first
discovered.

Bootstrapped and regression-tested on x86_64-linux-gnu.
OK for trunk?

Thanks
Bernd.

2015-03-14  Bernd Edlinger

	PR sanitizer/65400
	* ipa-split.c (split_function): Insert a call to TSAN_FUNC_EXIT again.
	* tsan.c (instrument_gimple): Reset the tail call flag on each call
	statement.

patch-tsan.diff
Description: Binary data
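For readers without the PR handy, here is a minimal sketch of the kind of
function affected (my own illustration, not one of the PR 65400 reproducers):

/* Compiled with -fsanitize=thread.  The call to bar sits in tail
   position, so GIMPLE marks it as a tail call; unless the tsan pass
   resets that flag, the exit instrumentation emitted after the call --
   including __tsan_func_exit -- can be bypassed, leaving a
   __tsan_func_entry without a matching exit.  */
int bar (int);

int
foo (int x)
{
  return bar (x);
}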
[committed] Fix make_field_assignment on big endian (PR rtl-optimization/65401)
Hi! The following testcase is miscompiled on s390x-linux, because make_field_assignment considers the actual byte swap as a field assignment. The problem is in the widening of the MEM mode, in the testcase from original QI to HI, that only works for little-endian, for big endian we need to adjust the offset. Bootstrapped/regtested on {x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, preapproved by Jeff on IRC, committed to trunk. 2015-03-14 Jakub Jelinek PR rtl-optimization/65401 * combine.c (rtx_equal_for_field_assignment_p): Add widen_x argument. If true, adjust_address_nv of x with big-endian correction for the mode widening to GET_MODE (y). (make_field_assignment): Don't do MEM mode widening here. Use MEM_P instead of GET_CODE == MEM. * gcc.c-torture/execute/pr65401.c: New test. --- gcc/combine.c.jj2015-02-03 10:38:46.0 +0100 +++ gcc/combine.c 2015-03-13 18:46:45.710940306 +0100 @@ -475,7 +475,7 @@ static rtx force_to_mode (rtx, machine_m unsigned HOST_WIDE_INT, int); static rtx if_then_else_cond (rtx, rtx *, rtx *); static rtx known_cond (rtx, enum rtx_code, rtx, rtx); -static int rtx_equal_for_field_assignment_p (rtx, rtx); +static int rtx_equal_for_field_assignment_p (rtx, rtx, bool = false); static rtx make_field_assignment (rtx); static rtx apply_distributive_law (rtx); static rtx distribute_and_simplify_rtx (rtx, int); @@ -9184,8 +9184,23 @@ known_cond (rtx x, enum rtx_code cond, r assignment as a field assignment. */ static int -rtx_equal_for_field_assignment_p (rtx x, rtx y) +rtx_equal_for_field_assignment_p (rtx x, rtx y, bool widen_x) { + if (widen_x && GET_MODE (x) != GET_MODE (y)) +{ + if (GET_MODE_SIZE (GET_MODE (x)) > GET_MODE_SIZE (GET_MODE (y))) + return 0; + if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN) + return 0; + /* For big endian, adjust the memory offset. */ + if (BYTES_BIG_ENDIAN) + x = adjust_address_nv (x, GET_MODE (y), + -subreg_lowpart_offset (GET_MODE (x), + GET_MODE (y))); + else + x = adjust_address_nv (x, GET_MODE (y), 0); +} + if (x == y || rtx_equal_p (x, y)) return 1; @@ -9339,16 +9354,15 @@ make_field_assignment (rtx x) /* The second SUBREG that might get in the way is a paradoxical SUBREG around the first operand of the AND. We want to pretend the operand is as wide as the destination here. We - do this by creating a new MEM in the wider mode for the sole + do this by adjusting the MEM to wider mode for the sole purpose of the call to rtx_equal_for_field_assignment_p. Also note this trick only works for MEMs. */ else if (GET_CODE (rhs) == AND && paradoxical_subreg_p (XEXP (rhs, 0)) - && GET_CODE (SUBREG_REG (XEXP (rhs, 0))) == MEM + && MEM_P (SUBREG_REG (XEXP (rhs, 0))) && CONST_INT_P (XEXP (rhs, 1)) - && rtx_equal_for_field_assignment_p (gen_rtx_MEM (GET_MODE (dest), -XEXP (SUBREG_REG (XEXP (rhs, 0)), 0)), - dest)) + && rtx_equal_for_field_assignment_p (SUBREG_REG (XEXP (rhs, 0)), + dest, true)) c1 = INTVAL (XEXP (rhs, 1)), other = lhs; else if (GET_CODE (lhs) == AND && CONST_INT_P (XEXP (lhs, 1)) @@ -9357,16 +9371,15 @@ make_field_assignment (rtx x) /* The second SUBREG that might get in the way is a paradoxical SUBREG around the first operand of the AND. We want to pretend the operand is as wide as the destination here. We - do this by creating a new MEM in the wider mode for the sole + do this by adjusting the MEM to wider mode for the sole purpose of the call to rtx_equal_for_field_assignment_p. Also note this trick only works for MEMs. 
*/ else if (GET_CODE (lhs) == AND && paradoxical_subreg_p (XEXP (lhs, 0)) - && GET_CODE (SUBREG_REG (XEXP (lhs, 0))) == MEM + && MEM_P (SUBREG_REG (XEXP (lhs, 0))) && CONST_INT_P (XEXP (lhs, 1)) - && rtx_equal_for_field_assignment_p (gen_rtx_MEM (GET_MODE (dest), -XEXP (SUBREG_REG (XEXP (lhs, 0)), 0)), - dest)) + && rtx_equal_for_field_assignment_p (SUBREG_REG (XEXP (lhs, 0)), + dest, true)) c1 = INTVAL (XEXP (lhs, 1)), other = rhs; else return x; --- gcc/testsuite/gcc.c-torture/execute/pr65401.c.jj2015-03-13 18:36:30.639817393 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65401.c 2015-03-13 18:42:02.693485127 +0100 @@ -0,0 +1,59 @@ +/* PR rtl-optimization/65401 */ + +struct S { unsigned short s[64]; }; + +__attr
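To make the big-endian correction in the patch above concrete, here is a
worked byte layout for a QImode-to-HImode widening (my own illustration,
not taken from the patch or the PR):

/* A QImode MEM at address A, widened to HImode on a big-endian target:

     address:    A-1      A
     HImode:    [ high ][ low ]    <- low-order byte sits at A
     QImode:             [ x  ]

   subreg_lowpart_offset (QImode, HImode) is 1 here, so the HImode MEM
   that overlays the same low-order byte must start at A - 1; that is
   what adjust_address_nv (x, GET_MODE (y), -subreg_lowpart_offset (...))
   computes.  On little endian the low-order byte is already at A, the
   offset stays 0, and that is why the old widening code happened to work
   there.  */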
[PATCH] Fix dr_explicit_realign vectorization (PR tree-optimization/65369)
Hi! This issue is practically the same as PR63341, except in this case it is for dr_explicit_realign rather than dr_explicit_realign_optimized, and the bump isn't passed through multiple functions and thus is easier to fix. Without the patch we use (dataptr & -16) for the first load and ((dataptr + 12) & -16) for the second load, which works just fine if the elements are properly aligned (4 byte at least), but in this case we have underaligned accesses (coming from folding of memcpy in this testcase, and from 4 byte loads combined together recognized by bswap pass in the original source), and so we really want to use ((dataptr + 15) & -16), otherwise if we are unlucky we might read the same memory twice even when dataptr is not 16 byte aligned. Bootstrapped/regtested on {x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk? 2015-03-14 Jakub Jelinek PR tree-optimization/65369 * tree-vect-stmts.c (vectorizable_load) : Set bump to vs * TYPE_SIZE_UNIT (elem_type) - 1 instead of (vs - 1) * TYPE_SIZE_UNIT (elem_type). * gcc.c-torture/execute/pr65369.c: New test. --- gcc/tree-vect-stmts.c.jj2015-03-09 08:05:13.0 +0100 +++ gcc/tree-vect-stmts.c 2015-03-13 17:27:30.613529768 +0100 @@ -6468,9 +6468,8 @@ vectorizable_load (gimple stmt, gimple_s case dr_explicit_realign: { tree ptr, bump; - tree vs_minus_1; - vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); + tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); if (compute_in_loop) msq = vect_setup_realignment (first_stmt, gsi, @@ -6499,8 +6498,9 @@ vectorizable_load (gimple stmt, gimple_s vect_finish_stmt_generation (stmt, new_stmt, gsi); msq = new_temp; - bump = size_binop (MULT_EXPR, vs_minus_1, + bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type)); + bump = size_binop (MINUS_EXPR, bump, size_one_node); ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump); new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR, ptr, --- gcc/testsuite/gcc.c-torture/execute/pr65369.c.jj2015-03-13 17:37:10.926175685 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65369.c 2015-03-13 17:35:40.0 +0100 @@ -0,0 +1,45 @@ +/* PR tree-optimization/65369 */ + +static const char data[] = + "12345678901234567890123456789012345678901234567890" + "123456789012345678901234567890"; + +__attribute__ ((noinline)) +static void foo (const unsigned int *buf) +{ + if (__builtin_memcmp (buf, data, 64)) +__builtin_abort (); +} + +__attribute__ ((noinline)) +static void bar (const unsigned char *block) +{ + unsigned int buf[16]; + __builtin_memcpy (buf + 0, block + 0, 4); + __builtin_memcpy (buf + 1, block + 4, 4); + __builtin_memcpy (buf + 2, block + 8, 4); + __builtin_memcpy (buf + 3, block + 12, 4); + __builtin_memcpy (buf + 4, block + 16, 4); + __builtin_memcpy (buf + 5, block + 20, 4); + __builtin_memcpy (buf + 6, block + 24, 4); + __builtin_memcpy (buf + 7, block + 28, 4); + __builtin_memcpy (buf + 8, block + 32, 4); + __builtin_memcpy (buf + 9, block + 36, 4); + __builtin_memcpy (buf + 10, block + 40, 4); + __builtin_memcpy (buf + 11, block + 44, 4); + __builtin_memcpy (buf + 12, block + 48, 4); + __builtin_memcpy (buf + 13, block + 52, 4); + __builtin_memcpy (buf + 14, block + 56, 4); + __builtin_memcpy (buf + 15, block + 60, 4); + foo (buf); +} + +int +main () +{ + unsigned char input[sizeof data + 16] __attribute__((aligned (16))); + __builtin_memset (input, 0, sizeof input); + __builtin_memcpy (input + 1, data, sizeof data); + bar (input + 1); + return 0; +} Jakub
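A worked example of the two bump values (my own numbers, not from the
patch): take a V4SI vector, so vs = 4 and TYPE_SIZE_UNIT (elem_type) = 4,
and suppose dataptr = 0x1003, i.e. only 1-byte aligned.

/* first realigned load:           0x1003 & -16        = 0x1000
   old bump (vs - 1) * 4 = 12:     (0x1003 + 12) & -16 = 0x1000   <- same chunk
   new bump vs * 4 - 1   = 15:     (0x1003 + 15) & -16 = 0x1010   <- next chunk

   With the old bump both realigned loads fetch the same 16 bytes, so the
   data between 0x1010 and 0x1012 is never read and the realignment
   permute cannot produce the right result.  With properly aligned
   (4-byte) elements this cannot happen, which is why only underaligned
   accesses like the memcpy-folded loads in the testcase are affected.  */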
[PATCH] Fix reassoc bit test optimization (PR tree-optimization/65418)
Hi! The first testcase shows a bug in my bit test reassoc optimization, extract_bit_test_mask is (intentionally) stripping nops, but was setting *totallowp and operating with tbias in the type of unstripped expression, which then results in different signedness of types used and confusing the optimization. In particular, -218 and -216 are already folded into (x is signed int) (((unsigned) x + 218U) & -2U) == 0 and thus without the patch we set lowi in the parent to -218U. Then -146 and -132 are just x == -146 and x == -132 thus we were comparing -218U to -146 or -132. But we really want to use -218 instead, as that is the type of x. Fixed thusly, bootstrapped/regtested on {x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk? 2015-03-14 Jakub Jelinek PR tree-optimization/65418 * tree-ssa-reassoc.c (extract_bit_test_mask): If there are casts in the first PLUS_EXPR operand, ensure tbias and *totallowp are in the inner type. * gcc.c-torture/execute/pr65418-1.c: New test. * gcc.c-torture/execute/pr65418-2.c: New test. --- gcc/tree-ssa-reassoc.c.jj 2015-02-26 22:02:39.0 +0100 +++ gcc/tree-ssa-reassoc.c 2015-03-13 16:22:50.506295252 +0100 @@ -2439,26 +2439,25 @@ extract_bit_test_mask (tree exp, int pre && TREE_CODE (exp) == PLUS_EXPR && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) { + tree ret = TREE_OPERAND (exp, 0); + STRIP_NOPS (ret); widest_int bias = wi::neg (wi::sext (wi::to_widest (TREE_OPERAND (exp, 1)), TYPE_PRECISION (TREE_TYPE (low; - tree tbias = wide_int_to_tree (TREE_TYPE (low), bias); + tree tbias = wide_int_to_tree (TREE_TYPE (ret), bias); if (totallowp) { *totallowp = tbias; - exp = TREE_OPERAND (exp, 0); - STRIP_NOPS (exp); - return exp; + return ret; } else if (!tree_int_cst_lt (totallow, tbias)) return NULL_TREE; + bias = wi::to_widest (tbias); bias -= wi::to_widest (totallow); if (wi::ges_p (bias, 0) && wi::lts_p (bias, prec - max)) { *mask = wi::lshift (*mask, bias); - exp = TREE_OPERAND (exp, 0); - STRIP_NOPS (exp); - return exp; + return ret; } } } --- gcc/testsuite/gcc.c-torture/execute/pr65418-1.c.jj 2015-03-13 16:49:07.973604649 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65418-1.c 2015-03-13 16:48:28.0 +0100 @@ -0,0 +1,19 @@ +/* PR tree-optimization/65418 */ + +__attribute__((noinline, noclone)) int +foo (int x) +{ + if (x == -216 || x == -132 || x == -218 || x == -146) + return 1; + return 0; +} + +int +main () +{ + volatile int i; + for (i = -230; i < -120; i++) +if (foo (i) != (i == -216 || i == -132 || i == -218 || i == -146)) + __builtin_abort (); + return 0; +} --- gcc/testsuite/gcc.c-torture/execute/pr65418-2.c.jj 2015-03-13 16:49:10.992556110 +0100 +++ gcc/testsuite/gcc.c-torture/execute/pr65418-2.c 2015-03-13 16:48:44.0 +0100 @@ -0,0 +1,19 @@ +/* PR tree-optimization/65418 */ + +__attribute__((noinline, noclone)) int +foo (int x) +{ + if (x == -216 || x == -211 || x == -218 || x == -205 || x == -223) + return 1; + return 0; +} + +int +main () +{ + volatile int i; + for (i = -230; i < -200; i++) +if (foo (i) != (i == -216 || i == -211 || i == -218 || i == -205 || i == -223)) + __builtin_abort (); + return 0; +} Jakub
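To put concrete numbers on the signedness mix-up described above (my own
illustration, assuming 32-bit int):

/* Low bound recorded after the -218/-216 pair has been folded:
     in x's signed type : -218
     as unsigned int    : 4294967078   <- what the unpatched code stored

   The remaining candidates x == -146 and x == -132 are still expressed
   in x's signed type, so comparing them against 4294967078 instead of
   -218 cannot answer "which candidate is the lowest?" correctly, and the
   bit-test merge goes wrong.  Recording tbias / *totallowp in the inner
   (signed) type keeps all the values comparable.  */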
Re: [PATCH] Fix reassoc bit test optimization (PR tree-optimization/65418)
On March 14, 2015 10:10:34 AM GMT+01:00, Jakub Jelinek wrote: >Hi! > >The first testcase shows a bug in my bit test reassoc optimization, >extract_bit_test_mask is (intentionally) stripping nops, but was >setting >*totallowp and operating with tbias in the type of unstripped >expression, >which then results in different signedness of types used and confusing >the >optimization. In particular, -218 and -216 are already folded into (x >is >signed int) >(((unsigned) x + 218U) & -2U) == 0 >and thus without the patch we set lowi in the parent to -218U. >Then -146 and -132 are just >x == -146 >and >x == -132 >thus we were comparing -218U to -146 or -132. But we really want >to use -218 instead, as that is the type of x. > >Fixed thusly, bootstrapped/regtested on >{x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk? OK. Thanks, Richard. >2015-03-14 Jakub Jelinek > > PR tree-optimization/65418 > * tree-ssa-reassoc.c (extract_bit_test_mask): If there > are casts in the first PLUS_EXPR operand, ensure tbias and > *totallowp are in the inner type. > > * gcc.c-torture/execute/pr65418-1.c: New test. > * gcc.c-torture/execute/pr65418-2.c: New test. > >--- gcc/tree-ssa-reassoc.c.jj 2015-02-26 22:02:39.0 +0100 >+++ gcc/tree-ssa-reassoc.c 2015-03-13 16:22:50.506295252 +0100 >@@ -2439,26 +2439,25 @@ extract_bit_test_mask (tree exp, int pre > && TREE_CODE (exp) == PLUS_EXPR > && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) > { >+tree ret = TREE_OPERAND (exp, 0); >+STRIP_NOPS (ret); > widest_int bias > = wi::neg (wi::sext (wi::to_widest (TREE_OPERAND (exp, 1)), >TYPE_PRECISION (TREE_TYPE (low; >-tree tbias = wide_int_to_tree (TREE_TYPE (low), bias); >+tree tbias = wide_int_to_tree (TREE_TYPE (ret), bias); > if (totallowp) > { > *totallowp = tbias; >-exp = TREE_OPERAND (exp, 0); >-STRIP_NOPS (exp); >-return exp; >+return ret; > } > else if (!tree_int_cst_lt (totallow, tbias)) > return NULL_TREE; >+bias = wi::to_widest (tbias); > bias -= wi::to_widest (totallow); > if (wi::ges_p (bias, 0) && wi::lts_p (bias, prec - max)) > { > *mask = wi::lshift (*mask, bias); >-exp = TREE_OPERAND (exp, 0); >-STRIP_NOPS (exp); >-return exp; >+return ret; > } > } > } >--- gcc/testsuite/gcc.c-torture/execute/pr65418-1.c.jj 2015-03-13 >16:49:07.973604649 +0100 >+++ gcc/testsuite/gcc.c-torture/execute/pr65418-1.c2015-03-13 >16:48:28.0 +0100 >@@ -0,0 +1,19 @@ >+/* PR tree-optimization/65418 */ >+ >+__attribute__((noinline, noclone)) int >+foo (int x) >+{ >+ if (x == -216 || x == -132 || x == -218 || x == -146) >+ return 1; >+ return 0; >+} >+ >+int >+main () >+{ >+ volatile int i; >+ for (i = -230; i < -120; i++) >+if (foo (i) != (i == -216 || i == -132 || i == -218 || i == -146)) >+ __builtin_abort (); >+ return 0; >+} >--- gcc/testsuite/gcc.c-torture/execute/pr65418-2.c.jj 2015-03-13 >16:49:10.992556110 +0100 >+++ gcc/testsuite/gcc.c-torture/execute/pr65418-2.c2015-03-13 >16:48:44.0 +0100 >@@ -0,0 +1,19 @@ >+/* PR tree-optimization/65418 */ >+ >+__attribute__((noinline, noclone)) int >+foo (int x) >+{ >+ if (x == -216 || x == -211 || x == -218 || x == -205 || x == -223) >+ return 1; >+ return 0; >+} >+ >+int >+main () >+{ >+ volatile int i; >+ for (i = -230; i < -200; i++) >+if (foo (i) != (i == -216 || i == -211 || i == -218 || i == -205 >|| i == -223)) >+ __builtin_abort (); >+ return 0; >+} > > Jakub
Re: [PATCH] Fix dr_explicit_realign vectorization (PR tree-optimization/65369)
On March 14, 2015 10:04:53 AM GMT+01:00, Jakub Jelinek wrote: >Hi! > >This issue is practically the same as PR63341, except in this case it >is for >dr_explicit_realign rather than dr_explicit_realign_optimized, and the >bump >isn't passed through multiple functions and thus is easier to fix. > >Without the patch we use (dataptr & -16) for the first load and >((dataptr + 12) & -16) for the second load, which works just fine if >the >elements are properly aligned (4 byte at least), but in this case we >have >underaligned accesses (coming from folding of memcpy in this testcase, >and >from 4 byte loads combined together recognized by bswap pass in the >original >source), and so we really want to use ((dataptr + 15) & -16), otherwise >if we are unlucky we might read the same memory twice even when dataptr >is not 16 byte aligned. > >Bootstrapped/regtested on >{x86_64,i686,aarch64,powerpc64{,le},s390{,x}}-linux, ok for trunk? OK. Thanks, Richard. >2015-03-14 Jakub Jelinek > > PR tree-optimization/65369 > * tree-vect-stmts.c (vectorizable_load) : > Set bump to vs * TYPE_SIZE_UNIT (elem_type) - 1 instead of > (vs - 1) * TYPE_SIZE_UNIT (elem_type). > > * gcc.c-torture/execute/pr65369.c: New test. > >--- gcc/tree-vect-stmts.c.jj 2015-03-09 08:05:13.0 +0100 >+++ gcc/tree-vect-stmts.c 2015-03-13 17:27:30.613529768 +0100 >@@ -6468,9 +6468,8 @@ vectorizable_load (gimple stmt, gimple_s > case dr_explicit_realign: > { > tree ptr, bump; >- tree vs_minus_1; > >- vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); >+ tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > > if (compute_in_loop) > msq = vect_setup_realignment (first_stmt, gsi, >@@ -6499,8 +6498,9 @@ vectorizable_load (gimple stmt, gimple_s > vect_finish_stmt_generation (stmt, new_stmt, gsi); > msq = new_temp; > >- bump = size_binop (MULT_EXPR, vs_minus_1, >+ bump = size_binop (MULT_EXPR, vs, > TYPE_SIZE_UNIT (elem_type)); >+ bump = size_binop (MINUS_EXPR, bump, size_one_node); > ptr = bump_vector_ptr (dataref_ptr, NULL, gsi, stmt, bump); > new_stmt = gimple_build_assign >(NULL_TREE, BIT_AND_EXPR, ptr, >--- gcc/testsuite/gcc.c-torture/execute/pr65369.c.jj 2015-03-13 >17:37:10.926175685 +0100 >+++ gcc/testsuite/gcc.c-torture/execute/pr65369.c 2015-03-13 >17:35:40.0 +0100 >@@ -0,0 +1,45 @@ >+/* PR tree-optimization/65369 */ >+ >+static const char data[] = >+ "12345678901234567890123456789012345678901234567890" >+ "123456789012345678901234567890"; >+ >+__attribute__ ((noinline)) >+static void foo (const unsigned int *buf) >+{ >+ if (__builtin_memcmp (buf, data, 64)) >+__builtin_abort (); >+} >+ >+__attribute__ ((noinline)) >+static void bar (const unsigned char *block) >+{ >+ unsigned int buf[16]; >+ __builtin_memcpy (buf + 0, block + 0, 4); >+ __builtin_memcpy (buf + 1, block + 4, 4); >+ __builtin_memcpy (buf + 2, block + 8, 4); >+ __builtin_memcpy (buf + 3, block + 12, 4); >+ __builtin_memcpy (buf + 4, block + 16, 4); >+ __builtin_memcpy (buf + 5, block + 20, 4); >+ __builtin_memcpy (buf + 6, block + 24, 4); >+ __builtin_memcpy (buf + 7, block + 28, 4); >+ __builtin_memcpy (buf + 8, block + 32, 4); >+ __builtin_memcpy (buf + 9, block + 36, 4); >+ __builtin_memcpy (buf + 10, block + 40, 4); >+ __builtin_memcpy (buf + 11, block + 44, 4); >+ __builtin_memcpy (buf + 12, block + 48, 4); >+ __builtin_memcpy (buf + 13, block + 52, 4); >+ __builtin_memcpy (buf + 14, block + 56, 4); >+ __builtin_memcpy (buf + 15, block + 60, 4); >+ foo (buf); >+} >+ >+int >+main () >+{ >+ unsigned char input[sizeof data + 16] __attribute__((aligned (16))); >+ 
__builtin_memset (input, 0, sizeof input); >+ __builtin_memcpy (input + 1, data, sizeof data); >+ bar (input + 1); >+ return 0; >+} > > Jakub
Re: [PING^2] [PATCH] [AArch64, NEON] Improve vmulX intrinsics
Hi, Kyrill Thank you for your suggestion. I fixed it and regtested with aarch64-linux-gnu on QEMU. This patch has no regressions for aarch64_be-linux-gnu big-endian target too. OK for the trunk? Thanks. Jiang jiji Index: gcc/ChangeLog === --- gcc/ChangeLog (revision 221393) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,38 @@ +2015-03-14 Felix Yang + Jiji Jiang + + * config/aarch64/aarch64-simd.md (aarch64_mul_n, + aarch64_mull_n, aarch64_mull, + aarch64_simd_mull2_n, aarch64_mull2_n, + aarch64_mull_lane, aarch64_mull2_lane_internal, + aarch64_mull_laneq, aarch64_mull2_laneq_internal, + aarch64_smull2_lane, aarch64_umull2_lane, + aarch64_smull2_laneq, aarch64_umull2_laneq, + aarch64_fmulx, aarch64_fmulx, aarch64_fmulx_lane, + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns. + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_, + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n, + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull, + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2, + smull2_lane): New builtins. + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32, + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16, + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16, + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32, + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16, + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32, + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8, + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16, + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16, + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16, + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32, + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16, + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32, + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite + using builtin functions. + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE, + VDQF_Q): New unspec and int iterator. 
+ 2015-03-12 Kyrylo Tkachov PR rtl-optimization/65235 Index: gcc/config/aarch64/arm_neon.h === --- gcc/config/aarch64/arm_neon.h (revision 221393) +++ gcc/config/aarch64/arm_neon.h (working copy) @@ -7580,671 +7580,6 @@ vmovn_u64 (uint64x2_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmul_n_f32 (float32x2_t a, float32_t b) -{ - float32x2_t result; - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmul_n_s16 (int16x4_t a, int16_t b) -{ - int16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmul_n_s32 (int32x2_t a, int32_t b) -{ - int32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmul_n_u16 (uint16x4_t a, uint16_t b) -{ - uint16x4_t result; - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmul_n_u32 (uint32x2_t a, uint32_t b) -{ - uint32x2_t result; - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -#define vmull_high_lane_s16(a, b, c)\ - __extension__ \ -({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result;\ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ -: "=w"(result) \ -: "w"(a_), "x"(b_), "i"(c) \ -: /* No clobbers */);
RE: [PATCH] Fix another wrong-code bug with -fstrict-volatile-bitfields
Bernd Edlinger writes: > Hi, > > are there any more comments on this? > > I would like to apply the patch as is, unless we find a > a way to get to a test case, maybe with a cross-compiler, > where the MODE_ALIGNMENT is different from MODE_BITSIZE. > > Currently, I think that does not happen. On m68k-linux GET_MODE_ALIGNMENT (SImode) == 16 while GET_MODE_BITSIZE (SImode) == 32. I don't know what that means for your patch, just wanted to inform you that such targets do exist. /Mikael > > Thanks > Bernd. > > > Date: Tue, 10 Mar 2015 14:40:52 +0100 > > > > Hi Richard and Eric, > > > > On Mon, 9 Mar 2015 15:30:31, Richard Biener wrote: > >>> Reg-tested on x86_64 successfully and ARM is still running. > > > > ARM completed without regressions meanwhile. > > > >>> > >>> Is it OK for trunk? > >> > >> Looks ok to me apart from > >> > >> /* Check for cases of unaligned fields that must be split. */ > >> - if (bitnum % BITS_PER_UNIT + bitsize> modesize > >> - || (STRICT_ALIGNMENT > >> - && bitnum % GET_MODE_ALIGNMENT (fieldmode) + bitsize> modesize)) > >> + if (bitnum % (STRICT_ALIGNMENT ? modesize : BITS_PER_UNIT) > >> + + bitsize> modesize > >> + || (STRICT_ALIGNMENT && MEM_ALIGN (op0) < modesize)) > >> return false; > >> > >> where I'd use GET_MODE_ALIGNMENT (fieldmode) rather than modesize > >> (in both places). > >> > >> Please leave Eric the chance to comment. > >> > > > > Just to clarify a few things here: > > > > I try to make the checks in strict_volatile_bitfield_p > > to be consistent with the strict volatile bit-field code path > > that follows if we return true here. > > > > I would summarize the current implementation of the > > strict volatile bit-field code as follows: > > > > For strict alignment, we access the structure as > > if it were an array of fieldmode. A multiple of modesize > > is added to the base address, and one single read and/or > > write access must be sufficient to do the job. The access > > must be Ok regarding the target's alignment restrictions. > > That does not change, what changed with the previous patch > > is a missed optimization with the EP_insv code pattern. > > > > For non strict alignment, a multiple of modesize is added > > to the base address, but if the range [bitnum:bitnum+bitsize-1] > > spans two fieldmode words, which should only happen if we > > use packed structures, a byte offset is added to the base address. > > The byte offset is chosen as small as possible, to not reach beyond > > the bit field region. That is new. This change is irrelevant for the use > > case > > of accessing a device register, but the generated code is more compact. > > > > Usually we have GET_MODE_ALIGNMENT(fieldmode)==modesize > > for all SCALAR_INT_MODE_P(fieldmode). > > > > The only exceptions are complex numbers, and targets where > > ADJUST_ALIGNMENT is used in the modes.def, right? > > > > The targets that do that are few, and the modes are mostly vector modes. > > So I did not find any target where the MODE_ALIGNMENT would make > > a difference here. Therefore I think it is more or less a matter of taste. > > But please correct me if I am wrong. > > > > If there are cases, where MODE_ALIGNMENT > changing the second condition MEM_ALIGN(op0) > MEM_ALIGN(op0) > still be consistent, and probably be more correct. > > > > But I think changing the first condition would allow cases where this > > assertion in the patch does no longer hold: > > gcc_assert (bitnum + bitsize <= GET_MODE_BITSIZE (fieldmode)); > > > > > > > > Thanks > > Bernd. > > > --
Fix for PRs 36043, 58744 and 65408
This is Richi's prototype patch in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36043#c23 with fixes for blocks larger than one reg, big-endian, and BLOCK_REG_PADDING. I also removed the operand_subword_force since we may as well let narrow_bit_field_mem in extract_bit_field do that for us. It is necessary to do the BLOCK_REG_PADDING shift after we've loaded the block or else repeat the bit-field extraction in that case. Bootstrapped and regression tested (-m32 and -m64) x86_64-linux and powerpc64-linux. OK to apply? I'll also throw together a testcase or three. For execute tests I'm thinking of using sbrk to locate an odd sized struct such that access past the end segfaults, rather than mmap/munmap as was done in the pr36043 testcase. Does that sound reasonable? PR target/65408 PR target/58744 PR middle-end/36043 * calls.c (load_register_parameters): Don't load past end of mem unless suitably aligned. Index: gcc/calls.c === --- gcc/calls.c (revision 221435) +++ gcc/calls.c (working copy) @@ -2090,6 +2090,26 @@ load_register_parameters (struct arg_data *args, i (XEXP (args[i].value, 0), size))) *sibcall_failure = 1; + if (size % UNITS_PER_WORD == 0 + || MEM_ALIGN (mem) % BITS_PER_WORD == 0) + move_block_to_reg (REGNO (reg), mem, nregs, args[i].mode); + else + { + if (nregs > 1) + move_block_to_reg (REGNO (reg), mem, nregs - 1, + args[i].mode); + rtx dest = gen_rtx_REG (word_mode, REGNO (reg) + nregs - 1); + unsigned int bitoff = (nregs - 1) * BITS_PER_WORD; + unsigned int bitsize = size * BITS_PER_UNIT - bitoff; + rtx x = extract_bit_field (mem, bitsize, bitoff, 1, +dest, word_mode, word_mode); + if (BYTES_BIG_ENDIAN) + x = expand_shift (LSHIFT_EXPR, word_mode, x, + BITS_PER_WORD - bitsize, dest, 1); + if (x != dest) + emit_move_insn (dest, x); + } + /* Handle a BLKmode that needs shifting. */ if (nregs == 1 && size < UNITS_PER_WORD #ifdef BLOCK_REG_PADDING @@ -2097,22 +2117,18 @@ load_register_parameters (struct arg_data *args, i #else && BYTES_BIG_ENDIAN #endif -) + ) { - rtx tem = operand_subword_force (mem, 0, args[i].mode); - rtx ri = gen_rtx_REG (word_mode, REGNO (reg)); - rtx x = gen_reg_rtx (word_mode); + rtx dest = gen_rtx_REG (word_mode, REGNO (reg)); int shift = (UNITS_PER_WORD - size) * BITS_PER_UNIT; - enum tree_code dir = BYTES_BIG_ENDIAN ? RSHIFT_EXPR - : LSHIFT_EXPR; + enum tree_code dir = (BYTES_BIG_ENDIAN + ? RSHIFT_EXPR : LSHIFT_EXPR); + rtx x; - emit_move_insn (x, tem); - x = expand_shift (dir, word_mode, x, shift, ri, 1); - if (x != ri) - emit_move_insn (ri, x); + x = expand_shift (dir, word_mode, dest, shift, dest, 1); + if (x != dest) + emit_move_insn (dest, x); } - else - move_block_to_reg (REGNO (reg), mem, nregs, args[i].mode); } /* When a parameter is a block, and perhaps in other cases, it is -- Alan Modra Australia Development Lab, IBM
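For reference, the failure mode being fixed looks like the following (my
own minimal sketch modelled on the PR 58744 structure; the real
reproducers are in the PRs):

/* A 3-byte BLKmode argument passed in a register.  */
struct pix { unsigned char r, g, b; } __attribute__ ((packed));

void use (struct pix);

void
call (struct pix *p)
{
  use (*p);	/* Loading *p into the argument register with a full
		   word read also touches the byte at p + 3; if *p sits
		   at the very end of a mapped page, that read faults.
		   With the patch, the final partial word is fetched with
		   extract_bit_field over just the 24 valid bits.  */
}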
Re: Fix for PRs 36043, 58744 and 65408
On Sat, Mar 14, 2015 at 6:02 AM, Alan Modra wrote:
> This is Richi's prototype patch in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36043#c23 with fixes for
> blocks larger than one reg, big-endian, and BLOCK_REG_PADDING.
> I also removed the operand_subword_force since we may as well let
> narrow_bit_field_mem in extract_bit_field do that for us.  It is
> necessary to do the BLOCK_REG_PADDING shift after we've loaded the
> block or else repeat the bit-field extraction in that case.
>
> Bootstrapped and regression tested (-m32 and -m64) x86_64-linux and
> powerpc64-linux.  OK to apply?
>
> I'll also throw together a testcase or three.  For execute tests I'm
> thinking of using sbrk to locate an odd sized struct such that access
> past the end segfaults, rather than mmap/munmap as was done in the
> pr36043 testcase.  Does that sound reasonable?
>
> 	PR target/65408
> 	PR target/58744
> 	PR middle-end/36043
> 	* calls.c (load_register_parameters): Don't load past end of
> 	mem unless suitably aligned.

Can you add a testcase in

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36043

-- 
H.J.
Re: Fix for PRs 36043, 58744 and 65408
On Sat, Mar 14, 2015 at 06:14:40AM -0700, H.J. Lu wrote:
> On Sat, Mar 14, 2015 at 6:02 AM, Alan Modra wrote:
> > I'll also throw together a testcase or three.  For execute tests I'm
> > thinking of using sbrk to locate an odd sized struct such that access
> > past the end segfaults, rather than mmap/munmap as was done in the
> > pr36043 testcase.  Does that sound reasonable?
>
> Can you add a testcase in
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36043

I was thinking that mmap/munmap is less portable than sbrk.  Hmm, a grep
over the testsuite says mmap is already used and dg-do run { target mmap }
is available.  OK, I'm happy to jump that way too.

-- 
Alan Modra
Australia Development Lab, IBM
Re: Fix for PRs 36043, 58744 and 65408
On March 14, 2015 2:02:38 PM GMT+01:00, Alan Modra wrote:
>I'll also throw together a testcase or three.  For execute tests I'm
>thinking of using sbrk to locate an odd sized struct such that access
>past the end segfaults, rather than mmap/munmap as was done in the
>pr36043 testcase.  Does that sound reasonable?

Well since sbrk was marked LEGACY in SUSv2 and was removed in SUSv3 (and
still is in 1003.1-2008) I'm not sure it is wise to use it in new code..
Using it will bite testing on legacy-free setups, fwiw.

Cheers,
Re: [patch, fortran] Bug 64432 - [5 Regression] SYSTEM_CLOCK(COUNT_RATE=rate) wrong result for integer(4)::rate
On 03/08/2015 04:58 PM, Steve Kargl wrote:
> On Mon, Mar 09, 2015 at 01:07:25AM +0200, Janne Blomqvist wrote:
>> So I would prefer if we just hardcode the error values in the frontend
>> (-HUGE, 0, 0), in case somebody tries to use the kind=1,2 versions,
>> thus also removing the need for the new library functions, keeping the
>> existing simpler ones instead.  AFAICT this would be standards
>> conforming.
>
> Any other opinions on this?

Revised patch attached as requested.  Regression tested on x86_64 linux.
Typical results are shown below.  I will provide a test case for the
test-suite.

$ ./a.out
 KIND=1:  -127  0  0
 KIND=1:  -127  0  0
 KIND=1:  -127  .  0
---
 KIND=2:  -32767  0  0
 KIND=2:  -32767  .  0
---
 KIND=4:  57496123  1000  2147483647
 KIND=4:  57496123  1000.0  2147483647
---
 KIND=8:  57496123484138  10  9223372036854775807
 KIND=8:  57496123522116  10.000  9223372036854775807
---
 KIND=10:  57496123575504  10  9223372036854775807
 KIND=10:  57496123612377  10.000  9223372036854775807
---
 KIND=16:  57496123669210  10  9223372036854775807
 KIND=16:  57496123698413  10.00  9223372036854775807

OK for trunk?

Regards,

Jerry

2015-03-14  Jerry DeLisle

	PR fortran/64432
	* trans-intrinsic.c (conv_intrinsic_system_clock): Check the
	smallest kind passed in user arguments and hard-code results for
	KIND=1 or KIND=2 to indicate no clock available.

2015-03-14  Jerry DeLisle

	PR libgfortran/64432
	* intrinsics/system_clock.c (system_clock4, system_clock8):
	Cleanup some whitespace.
Re: [patch, fortran] Bug 64432 - [5 Regression] SYSTEM_CLOCK(COUNT_RATE=rate) wrong result for integer(4)::rate
Attachment on this one. On 03/14/2015 07:22 AM, Jerry DeLisle wrote: On 03/08/2015 04:58 PM, Steve Kargl wrote: On Mon, Mar 09, 2015 at 01:07:25AM +0200, Janne Blomqvist wrote: So I would prefer if we just hardcode the error values in the frontend (-HUGE, 0, 0), in case somebody tries to use the kind=1,2 versions, thus also removing the need for the new library functions, keeping the existing simpler ones instead. AFAICT this would be standards conforming. Any other opinions on this? Revised patch attached as requested. Regression tested on x86_64 linux. Typical results are shown below. I will provide a test case for the test-suite. $ ./a.out KIND=1: -127 0 0 KIND=1: -127 0 0 KIND=1: -127 . 0 --- KIND=2: -32767 0 0 KIND=2: -32767 . 0 --- KIND=4: 57496123 1000 2147483647 KIND=4: 57496123 1000.0 2147483647 --- KIND=8: 57496123484138 10 9223372036854775807 KIND=8: 57496123522116 10.000 9223372036854775807 --- KIND=10: 57496123575504 10 9223372036854775807 KIND=10: 57496123612377 10.000 9223372036854775807 --- KIND=16: 57496123669210 10 9223372036854775807 KIND=16: 57496123698413 10.00 9223372036854775807 OK for trunk? Regards, Jerry 2015-03-14 Jerry DeLisle PR fortran/64432 *trans-intrinisic.c (conv_intrinsic_system_clock): Check the smallest kind passed in user arguments and hard-code results for KIND=1 or KIND=2 to indicate no clock available. 2015-03-14 Jerry DeLisle PR libgfortran/64432 * intrinsics/system_clock.c (system_clock4, system_clock8): Cleanup some whitespace. Index: gcc/fortran/trans-intrinsic.c === --- gcc/fortran/trans-intrinsic.c (revision 221405) +++ gcc/fortran/trans-intrinsic.c (working copy) @@ -2671,22 +2671,13 @@ conv_intrinsic_system_clock (gfc_code *code) stmtblock_t block; gfc_se count_se, count_rate_se, count_max_se; tree arg1 = NULL_TREE, arg2 = NULL_TREE, arg3 = NULL_TREE; - tree type, tmp; - int kind; + tree tmp; + int least; gfc_expr *count = code->ext.actual->expr; gfc_expr *count_rate = code->ext.actual->next->expr; gfc_expr *count_max = code->ext.actual->next->next->expr; - /* The INTEGER(8) version has higher precision, it is used if both COUNT - and COUNT_MAX can hold 64-bit values, or are absent. */ - if ((!count || count->ts.kind >= 8) - && (!count_max || count_max->ts.kind >= 8)) -kind = 8; - else -kind = gfc_default_integer_kind; - type = gfc_get_int_type (kind); - /* Evaluate our arguments. */ if (count) { @@ -2706,37 +2697,104 @@ conv_intrinsic_system_clock (gfc_code *code) gfc_conv_expr (&count_max_se, count_max); } - /* Prepare temporary variables if we need them. */ - if (count && count->ts.kind != kind) -arg1 = gfc_create_var (type, "count"); - else if (count) -arg1 = count_se.expr; + /* Find the smallest kind found of the arguments. */ + least = 16; + least = (count && count->ts.kind < least) ? count->ts.kind : least; + least = (count_rate && count_rate->ts.kind < least) ? count_rate->ts.kind + : least; + least = (count_max && count_max->ts.kind < least) ? count_max->ts.kind + : least; - if (count_rate && (count_rate->ts.kind != kind - || count_rate->ts.type != BT_INTEGER)) -arg2 = gfc_create_var (type, "count_rate"); - else if (count_rate) -arg2 = count_rate_se.expr; + /* Prepare temporary variables. 
*/ - if (count_max && count_max->ts.kind != kind) -arg3 = gfc_create_var (type, "count_max"); - else if (count_max) -arg3 = count_max_se.expr; + if (count) +{ + if (least >= 8) + arg1 = gfc_create_var (gfc_get_int_type (8), "count"); + else if (least == 4) + arg1 = gfc_create_var (gfc_get_int_type (4), "count"); + else if (count->ts.kind == 1) +arg1 = gfc_conv_mpz_to_tree (gfc_integer_kinds[0].pedantic_min_int, + count->ts.kind); + else +arg1 = gfc_conv_mpz_to_tree (gfc_integer_kinds[1].pedantic_min_int, + count->ts.kind); +} + if (count_rate) +{ + if (least >= 8) + arg2 = gfc_create_var (gfc_get_int_type (8), "count_rate"); + else if (least == 4) + arg2 = gfc_create_var (gfc_get_int_type (4), "count_rate"); + else +arg2 = integer_zero_node; +} + + if (count_max) +{ + if (least >= 8) + arg3 = gfc_create_var (gfc_get_int_type (8), "count_max"); + else if (least == 4) + arg3 = gfc_create_var (gfc_get_int_type (4), "count_max"); + else +arg3 = integer_zero_node; +} +
Re: [PATCH] pr 63354 - gcc -pg -mprofile-kernel creates unused stack frames on leaf functions on ppc64le
On Fri, Mar 13, 2015 at 03:54:57PM -0600, Martin Sebor wrote: > Attached is a patch that eliminates the unused stack frame > allocated by gcc 5 with -pg -mprofile-kernel on powepc64le > and brings the code into parity with previous gcc versions. > > The patch doesn't do anything to change the emitted code > when -mprofile-kernel is used without -pg. Since the former > option isn't fully documented (as noted in pr 65372) it's > unclear what effect it should be expected to have without > -pg. -mprofile-kernel does nothing without profiling enabled. Maybe it should just have been called -pk or something horrid like that. The effect it should have is to do what the only user of the option (the 64-bit PowerPC Linux kernel) wants. The effect it does have is to make the 64-bit ABI more like the 32-bit ABI for mcount. > 2015-03-13 Anton Blanchard > > PR target/63354 > * gcc/config/rs6000/linux64.h (ARGET_KEEP_LEAF_WHEN_PROFILED): Define. ^ typo > * cc/config/rs6000/rs6000.c (rs6000_keep_leaf_when_profiled). New ^ typo^ typo It shouldn't have "gcc/" in the path names at all, actually. > +/* -mprofile-kernel code calls mcount before the function prolog, "prologue". > + so a profiled leaf function should stay a leaf function. */ > + > +static bool > +rs6000_keep_leaf_when_profiled (void) > +{ > + return TARGET_PROFILE_KERNEL; > +} Something like switch (DEFAULT_ABI) { case ABI_AIX: case ABI_ELFv2: return TARGET_PROFILE_KERNEL; default: return true; } although I'm not sure about Darwin here. More conservative is to return false for anything untested, of course. > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr63354.c > @@ -0,0 +1,10 @@ > +/* { dg-do compile { target { powerpc*-*-* } } } */ > +/* { dg-options "-O2 -pg -mprofile-kernel" } */ > + > +int foo (void) > +{ > + return 1; > +} > + > +/* { dg-final { scan-assembler "bl _mcount" } } */ > +/* { dg-final { scan-assembler-not "\(addi|stdu\) 1," } } */ Either you should run this only on AIX/ELFv2 ABIs, or you want to test for "stwu" as well. Bare "1" does not work for all assemblers (only Darwin again?) Segher
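Spelled out, the hook shape being suggested here would look roughly like
this (combining the function name from the quoted ChangeLog with the
switch sketched above; whether Darwin should also return true is left
open in the review, so the default arm is an assumption):

static bool
rs6000_keep_leaf_when_profiled (void)
{
  switch (DEFAULT_ABI)
    {
    case ABI_AIX:
    case ABI_ELFv2:
      return TARGET_PROFILE_KERNEL;
    default:
      return true;
    }
}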
Re: Fix for PRs 36043, 58744 and 65408
On Mar 14, 2015, at 6:58 AM, Bernhard Reutner-Fischer wrote:
> On March 14, 2015 2:02:38 PM GMT+01:00, Alan Modra wrote:
>
>> I'll also throw together a testcase or three.  For execute tests I'm
>> thinking of using sbrk to locate an odd sized struct such that access
>> past the end segfaults, rather than mmap/munmap as was done in the
>> pr36043 testcase.  Does that sound reasonable?
>
> Well since sbrk was marked LEGACY in SUSv2 and was removed in SUSv3 (and
> still is in 1003.1-2008) I'm not sure it is wise to use it in new code..
> Using it will bite testing on legacy-free setups, fwiw.

newlib doesn’t have mmap.  Indeed, some machines will never have mmap.
newlib has sbrk.
Re: Fix for PRs 36043, 58744 and 65408
On Sat, Mar 14, 2015 at 10:51:28AM -0700, Mike Stump wrote:
> On Mar 14, 2015, at 6:58 AM, Bernhard Reutner-Fischer wrote:
> > On March 14, 2015 2:02:38 PM GMT+01:00, Alan Modra wrote:
> >
> >> I'll also throw together a testcase or three.  For execute tests I'm
> >> thinking of using sbrk to locate an odd sized struct such that access
> >> past the end segfaults, rather than mmap/munmap as was done in the
> >> pr36043 testcase.  Does that sound reasonable?
> >
> > Well since sbrk was marked LEGACY in SUSv2 and was removed in SUSv3 (and
> > still is in 1003.1-2008) I'm not sure it is wise to use it in new code..
> > Using it will bite testing on legacy-free setups, fwiw.
>
> newlib doesn't have mmap.  Indeed, some machines will never have mmap.
> newlib has sbrk.

Still, I think it is preferable to test with mmap...

	Jakub
Re: Fix for PRs 36043, 58744 and 65408
On Mar 14, 2015, at 10:56 AM, Jakub Jelinek wrote:
>> newlib doesn't have mmap.  Indeed, some machines will never have mmap.
>> newlib has sbrk.
>
> Still, I think it is preferable to test with mmap…

I don't see anything wrong with going the target mmap direction… my post
was just to provide information… not decide which is better.  I'd rather
leave the issue to those with an opinion.
RE: [PATCH] Fix another wrong-code bug with -fstrict-volatile-bitfields
Hi,

On Sat, 14 Mar 2015 13:24:33, Mikael Pettersson wrote:
>
> Bernd Edlinger writes:
>> Hi,
>>
>> are there any more comments on this?
>>
>> I would like to apply the patch as is, unless we find
>> a way to get to a test case, maybe with a cross-compiler,
>> where the MODE_ALIGNMENT is different from MODE_BITSIZE.
>>
>> Currently, I think that does not happen.
>
> On m68k-linux GET_MODE_ALIGNMENT (SImode) == 16 while
> GET_MODE_BITSIZE (SImode) == 32.
>
> I don't know what that means for your patch, just wanted
> to inform you that such targets do exist.
>

Oh I see, thanks.  This is due to BIGGEST_ALIGNMENT=16,
STRICT_ALIGNMENT=1 by default on that architecture.

If I change this check as suggested:

  if (bitnum % (STRICT_ALIGNMENT ? GET_MODE_ALIGNMENT (fieldmode)
				 : BITS_PER_UNIT) + bitsize > modesize
      || (STRICT_ALIGNMENT
	  && MEM_ALIGN (op0) < GET_MODE_ALIGNMENT (fieldmode)))
    return false;

then I can get the assertion failure in store_bit_field:

  gcc_assert (bitnum + bitsize <= GET_MODE_BITSIZE (fieldmode));

with this example:

cat test.c
struct s
{
  short y:16;
  int x:31;
};

void f(volatile struct s* z, int x)
{
  z->x=x;
}

m68k-elf-gcc -fstrict-volatile-bitfields -mstrict-align -mno-align-int -O2 -S test.c
test.c: In function 'f':
test.c:10:7: internal compiler error: in store_bit_field, at expmed.c:1005
   z->x=x;
       ^

As I said before, without the patch the test case generates just invalid
code which assigns only 16 bits.

There is also a problem in this check.  I had to make "short y:16" a bit
field to bypass it; initially I wrote just "short y;":

  /* Check for cases where the C++ memory model applies.  */
  if (bitregion_end != 0
      && (bitnum - bitnum % modesize < bitregion_start
	  || bitnum - bitnum % modesize + modesize - 1 > bitregion_end))
    return false;

This also assumes that the access is at a modesize boundary.

If we have BIGGEST_ALIGNMENT=16, that means we likely have a 16-bit
architecture.  I doubt that the strict alignment code makes any sense for
modesize > BIGGEST_ALIGNMENT.

I think I should change this check

  /* The bit size must not be larger than the field mode, and
     the field mode must not be larger than a word.  */
  if (bitsize > modesize || modesize > BITS_PER_WORD)
    return false;

to this:

  if (bitsize > modesize || modesize > BITS_PER_WORD
      || modesize > BIGGEST_ALIGNMENT)
    return false;

This should avoid these oddities.


Bernd.
Re: Fix for PRs 36043, 58744 and 65408
On Sat, Mar 14, 2015 at 11:32:38PM +1030, Alan Modra wrote: > I'll also throw together a testcase or three. * gcc.dg/pr65408.c: New. Index: gcc/testsuite/gcc.dg/pr65408.c === --- gcc/testsuite/gcc.dg/pr65408.c (revision 0) +++ gcc/testsuite/gcc.dg/pr65408.c (working copy) @@ -0,0 +1,112 @@ +/* PR middle-end/36043 target/58744 target/65408 */ +/* { dg-do run { target mmap } } */ +/* { dg-options "-O2" } */ + +#include +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#ifndef MAP_ANON +#define MAP_ANON 0 +#endif +#ifndef MAP_FAILED +#define MAP_FAILED ((void *)-1) +#endif + +typedef struct +{ + unsigned char r; + unsigned char g; + unsigned char b; +} __attribute__((packed)) pr58744; + +typedef struct +{ + unsigned short r; + unsigned short g; + unsigned short b; +} pr36043; + +typedef struct +{ + int r; + int g; + int b; +} pr65408; + +__attribute__ ((noinline, noclone)) +void +f1a (pr58744 x) +{ + if (x.r != 1 || x.g != 2 || x.b != 3) +__builtin_abort(); +} + +__attribute__ ((noinline, noclone)) +void +f1 (pr58744 *x) +{ + f1a (*x); +} + +__attribute__ ((noinline, noclone)) +void +f2a (pr36043 x) +{ + if (x.r != 1 || x.g != 2 || x.b != 3) +__builtin_abort(); +} + +__attribute__ ((noinline, noclone)) +void +f2 (pr36043 *x) +{ + f2a (*x); +} + +__attribute__ ((noinline, noclone)) +void +f3a (pr65408 x) +{ + if (x.r != 1 || x.g != 2 || x.b != 3) +__builtin_abort(); +} + +__attribute__ ((noinline, noclone)) +void +f3 (pr65408 *x) +{ + f3a (*x); +} + +int +main () +{ + char *p = mmap ((void *) 0, 131072, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) +return 0; + char *endp = p + 65536; + if (munmap (endp, 65536) < 0) +return 0; + + pr58744 *s1 = (pr58744 *) endp - 1; + s1->r = 1; + s1->g = 2; + s1->b = 3; + f1 (s1); + + pr36043 *s2 = (pr36043 *) endp - 1; + s2->r = 1; + s2->g = 2; + s2->b = 3; + f2 (s2); + + pr65408 *s3 = (pr65408 *) endp - 1; + s3->r = 1; + s3->g = 2; + s3->b = 3; + f3 (s3); + + return 0; +} -- Alan Modra Australia Development Lab, IBM
[PATCH, nds32] Committed: Rename some variables to explicitly represent general purpose register
Hi, all, This patch is just to rename some variables so that one can easily tell that those variables are used to describe general purpose registers. No functionality changes. Committed as Rev.221306: https://gcc.gnu.org/r221306 Best regards, jasonwucj diff --git a/gcc/config/nds32/nds32-md-auxiliary.c b/gcc/config/nds32/nds32-md-auxiliary.c index 2f49277..0a3e773 100644 --- a/gcc/config/nds32/nds32-md-auxiliary.c +++ b/gcc/config/nds32/nds32-md-auxiliary.c @@ -604,8 +604,8 @@ nds32_output_stack_push (rtx par_rtx) + NDS32_MAX_GPR_REGS_FOR_ARGS - 1; /* Pick up callee-saved first regno and last regno for further use. */ - int rb_callee_saved = cfun->machine->callee_saved_regs_first_regno; - int re_callee_saved = cfun->machine->callee_saved_regs_last_regno; + int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno; + int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno; /* First we need to check if we are pushing argument registers not used for the named arguments. If so, we have to create 'smw.adm' (push.s) @@ -644,7 +644,7 @@ nds32_output_stack_push (rtx par_rtx) otherwise, generate 'push25 Re,0'. */ sp_adjust = cfun->machine->local_size + cfun->machine->out_args_size - + cfun->machine->callee_saved_area_padding_bytes; + + cfun->machine->callee_saved_area_gpr_padding_bytes; if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust)) && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust)) operands[1] = GEN_INT (sp_adjust); @@ -712,8 +712,8 @@ nds32_output_stack_pop (rtx par_rtx ATTRIBUTE_UNUSED) /* The operands array which will be used in output_asm_insn(). */ rtx operands[3]; /* Pick up callee-saved first regno and last regno for further use. */ - int rb_callee_saved = cfun->machine->callee_saved_regs_first_regno; - int re_callee_saved = cfun->machine->callee_saved_regs_last_regno; + int rb_callee_saved = cfun->machine->callee_saved_first_gpr_regno; + int re_callee_saved = cfun->machine->callee_saved_last_gpr_regno; /* If we step here, we are going to do v3pop or multiple pop operation. */ @@ -742,7 +742,7 @@ nds32_output_stack_pop (rtx par_rtx ATTRIBUTE_UNUSED) and then use 'pop25 Re,0'. */ sp_adjust = cfun->machine->local_size + cfun->machine->out_args_size - + cfun->machine->callee_saved_area_padding_bytes; + + cfun->machine->callee_saved_area_gpr_padding_bytes; if (satisfies_constraint_Iu08 (GEN_INT (sp_adjust)) && NDS32_DOUBLE_WORD_ALIGN_P (sp_adjust) && !cfun->calls_alloca) diff --git a/gcc/config/nds32/nds32.c b/gcc/config/nds32/nds32.c index 1845bc2..b79e33e 100644 --- a/gcc/config/nds32/nds32.c +++ b/gcc/config/nds32/nds32.c @@ -224,12 +224,12 @@ nds32_compute_stack_frame (void) cfun->machine->lp_size = (df_regs_ever_live_p (LP_REGNUM)) ? 4 : 0; /* Initially there is no padding bytes. */ - cfun->machine->callee_saved_area_padding_bytes = 0; + cfun->machine->callee_saved_area_gpr_padding_bytes = 0; /* Calculate the bytes of saving callee-saved registers on stack. */ - cfun->machine->callee_saved_regs_size = 0; - cfun->machine->callee_saved_regs_first_regno = SP_REGNUM; - cfun->machine->callee_saved_regs_last_regno = SP_REGNUM; + cfun->machine->callee_saved_gpr_regs_size = 0; + cfun->machine->callee_saved_first_gpr_regno = SP_REGNUM; + cfun->machine->callee_saved_last_gpr_regno = SP_REGNUM; /* Currently, there is no need to check $r28~$r31 because we will save them in another way. */ for (r = 0; r < 28; r++) @@ -240,10 +240,10 @@ nds32_compute_stack_frame (void) (only need to set it once). If first regno == SP_REGNUM, we can tell that it is the first time to be here. 
*/ - if (cfun->machine->callee_saved_regs_first_regno == SP_REGNUM) - cfun->machine->callee_saved_regs_first_regno = r; + if (cfun->machine->callee_saved_first_gpr_regno == SP_REGNUM) + cfun->machine->callee_saved_first_gpr_regno = r; /* Mark the last required callee-saved register. */ - cfun->machine->callee_saved_regs_last_regno = r; + cfun->machine->callee_saved_last_gpr_regno = r; } } @@ -262,8 +262,8 @@ nds32_compute_stack_frame (void) condition 3: There is no local_size, which means we do not need to adjust $sp. */ if (lookup_attribute ("naked", DECL_ATTRIBUTES (current_function_decl)) - || (cfun->machine->callee_saved_regs_first_regno == SP_REGNUM - && cfun->machine->callee_saved_regs_last_regno == SP_REGNUM + || (cfun->machine->callee_saved_first_gpr_regno == SP_REGNUM + && cfun->machine->callee_saved_last_gpr_regno == SP_REGNUM && !df_regs_ever_live_p (FP_REGNUM) && !df_regs_ever_live_p (LP_REGNUM)