https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118460
Christophe Lyon <clyon at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |clyon at gcc dot gnu.org

--- Comment #5 from Christophe Lyon <clyon at gcc dot gnu.org> ---
Adding a thumb2_movhfcc_vfp pattern similar to thumb2_movsfcc_vfp does the
trick:
============================================
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 379f5f7b3dc..23a1e38bb92 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -816,6 +816,29 @@ (define_insn "*thumb2_movsfcc_vfp"
    (set_attr "type" "fmov,fmov,fmov,f_mcr,f_mcr,f_mcr,f_mrc,f_mrc,f_mrc")]
 )
 
+(define_insn "*thumb2_movhfcc_vfp"
+  [(set (match_operand:HF 0 "s_register_operand" "=t,t,t,t,t,t,?r,?r,?r")
+       (if_then_else:HF
+         (match_operator 3 "arm_comparison_operator"
+           [(match_operand 4 "cc_register" "") (const_int 0)])
+         (match_operand:HF 1 "s_register_operand" "0,t,t,0,?r,?r,0,t,t")
+         (match_operand:HF 2 "s_register_operand" "t,0,t,?r,0,?r,t,0,t")))]
+  "TARGET_VFP_FP16INST && !arm_restrict_it"
+  "@
+   it\\t%D3\;vmov%D3.f16\\t%0, %2
+   it\\t%d3\;vmov%d3.f16\\t%0, %1
+   ite\\t%D3\;vmov%D3.f16\\t%0, %2\;vmov%d3.f16\\t%0, %1
+   it\\t%D3\;vmov%D3\\t%0, %2
+   it\\t%d3\;vmov%d3\\t%0, %1
+   ite\\t%D3\;vmov%D3\\t%0, %2\;vmov%d3\\t%0, %1
+   it\\t%D3\;vmov%D3\\t%0, %2
+   it\\t%d3\;vmov%d3\\t%0, %1
+   ite\\t%D3\;vmov%D3\\t%0, %2\;vmov%d3\\t%0, %1"
+   [(set_attr "conds" "use")
+    (set_attr "length" "6,6,10,6,6,10,6,6,10")
+    (set_attr "type" "fmov,fmov,fmov,f_mcr,f_mcr,f_mcr,f_mrc,f_mrc,f_mrc")]
+)
+
 (define_insn "*movdfcc_vfp"
   [(set (match_operand:DF 0 "s_register_operand" "=w,w,w,w,w,w,?r,?r,?r")
        (if_then_else:DF
================================================

With that patch, I generate:
bar:
        vcvtb.f32.f16   s14, s0 @ 7     [c=4 l=4]  extendhfsf2
        vcvtb.f32.f16   s15, s1 @ 8     [c=4 l=4]  extendhfsf2
        vcmpe.f32       s14, s15        @ 20    [c=4 l=4]  *cmpsf_trap_vfp/0
        vmrs    APSR_nzcv, FPSCR        @ 21    [c=4 l=4]  *movcc_vfp
        it      pl      @ 15    [c=4 l=6]  *thumb2_movhfcc_vfp/0
        vmovpl.f16      s0, s1
        bx      lr

for the reduced test case, which is similar to what we generate for float.