Hi, the attached patch fixes two issues with the TX backend optimization trying to get rid of FPR save/restore operations in some cases.
The first is that the stack pointer might get clobbered on 64 bit if the backend was able to get rid of all the FPR saves/restores and these were the only things requiring stack space. For more details please see the testcase comment. The second is that the optimization did not work as expected on 31 bit since the loop setting the fpr save bits only for the 64 bit call-saved FPRs made use of the call_clobbered information. Both issues are covered by the new testcase. Bootstrapped and regtested on s390 and s390x with --with-arch=zEC12. No regressions. I'm going to apply the patch to 4.8 and mainline after waiting a few days for comments. Bye, -Andreas- 2013-10-04 Andreas Krebbel <andreas.kreb...@de.ibm.com> * config/s390/s390.c (s390_register_info): Make the call-saved FPR loop to work also for 31bit ABI. Save the stack pointer for frame_size > 0. 2013-10-04 Andreas Krebbel <andreas.kreb...@de.ibm.com> * gcc.target/s390/htm-nofloat-2.c: New testcase. --- gcc/config/s390/s390.c | 21 ----!!!! gcc/testsuite/gcc.target/s390/htm-nofloat-2.c | 55 ++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 8 deletions(-), 11 modifications(!) Index: gcc/config/s390/s390.c =================================================================== *** gcc/config/s390/s390.c.orig --- gcc/config/s390/s390.c *************** s390_register_info (int clobbered_regs[] *** 7509,7516 **** { cfun_frame_layout.fpr_bitmap = 0; cfun_frame_layout.high_fprs = 0; ! if (TARGET_64BIT) ! for (i = FPR8_REGNUM; i <= FPR15_REGNUM; i++) /* During reload we have to use the df_regs_ever_live infos since reload is marking FPRs used as spill slots there as live before actually making the code changes. Without --- 7509,7519 ---- { cfun_frame_layout.fpr_bitmap = 0; cfun_frame_layout.high_fprs = 0; ! ! for (i = FPR0_REGNUM; i <= FPR15_REGNUM; i++) ! { ! if (call_really_used_regs[i]) ! continue; /* During reload we have to use the df_regs_ever_live infos since reload is marking FPRs used as spill slots there as live before actually making the code changes. Without *************** s390_register_info (int clobbered_regs[] *** 7523,7530 **** && !global_regs[i]) { cfun_set_fpr_save (i); ! cfun_frame_layout.high_fprs++; } } for (i = 0; i < 16; i++) --- 7526,7536 ---- && !global_regs[i]) { cfun_set_fpr_save (i); ! ! if (i >= FPR8_REGNUM) ! cfun_frame_layout.high_fprs++; } + } } for (i = 0; i < 16; i++) *************** s390_register_info (int clobbered_regs[] *** 7554,7559 **** --- 7560,7566 ---- || TARGET_TPF_PROFILING || cfun_save_high_fprs_p || get_frame_size () > 0 + || (reload_completed && cfun_frame_layout.frame_size > 0) || cfun->calls_alloca || cfun->stdarg); *************** s390_register_info (int clobbered_regs[] *** 7652,7665 **** cfun_set_fpr_save (i + FPR0_REGNUM); } } - - if (!TARGET_64BIT) - { - if (df_regs_ever_live_p (FPR4_REGNUM) && !global_regs[FPR4_REGNUM]) - cfun_set_fpr_save (FPR4_REGNUM); - if (df_regs_ever_live_p (FPR6_REGNUM) && !global_regs[FPR6_REGNUM]) - cfun_set_fpr_save (FPR6_REGNUM); - } } /* Fill cfun->machine with info about frame of current function. */ --- 7659,7664 ---- Index: gcc/testsuite/gcc.target/s390/htm-nofloat-2.c =================================================================== *** /dev/null --- gcc/testsuite/gcc.target/s390/htm-nofloat-2.c *************** *** 0 **** --- 1,55 ---- + /* { dg-do run } */ + /* { dg-options "-O3 -mhtm -Wa,-march=zEC12 --save-temps" } */ + + /* __builtin_tbegin has to emit clobbers for all FPRs since the tbegin + instruction does not automatically preserves them. If the + transaction body is fully contained in a function the backend tries + after reload to get rid of the FPR save/restore operations + triggered by the clobbers. This testcase failed since the backend + was able to get rid of all FPR saves/restores and since these were + the only stack operations also of the entire stack space. So even + the save/restore of the stack pointer was omitted in the end. + However, since the frame layout has been fixed before, the prologue + still generated the stack pointer decrement making foo return with + a modified stack pointer. */ + + void abort(void); + + void __attribute__((noinline)) + foo (int a) + { + /* This is just to prevent the tbegin code from actually being + executed. That way the test may even run on machines prior to + zEC12. */ + if (a == 42) + return; + + if (__builtin_tbegin (0) == 0) + __builtin_tend (); + } + + #ifdef __s390x__ + #define GET_STACK_POINTER(SP) \ + asm volatile ("stg %%r15, %0" : "=QRST" (SP)); + #else + #define GET_STACK_POINTER(SP) \ + asm volatile ("st %%r15, %0" : "=QR" (SP)); + #endif + + int main(void) + { + unsigned long new_sp, old_sp; + + GET_STACK_POINTER (old_sp); + foo(42); + GET_STACK_POINTER (new_sp); + + if (old_sp != new_sp) + abort (); + + return 0; + } + + /* Make sure no FPR saves/restores are emitted. */ + /* { dg-final { scan-assembler-not "\tstd\t" } } */ + /* { dg-final { scan-assembler-not "\tld\t" } } */