[RFC ARM] Use vcvt.f32.s32 with immediate bits to do fixed to floating point conversions.

Ramana Radhakrishnan Tue, 04 Oct 2011 09:26:40 -0700

Hi,

One of the things Michael pointed out in today's call was that the
ARM backend doesn't generate vcvt.f32.s<type> for the idiomatic
fixed-point to floating-point conversion shown in the example below.
I've implemented this in the backend as shown in the patch, using the
interfaces from real.c. I've chosen not to allow this transformation
when flag_rounding_math is true because this instruction always rounds
using round-to-nearest rather than obeying what's in the FPSCR, and is
thus not safe for programs that want to set their rounding modes
dynamically.


The benefits are quite obvious: we eliminate a load from the constant
pool and a floating point multiply, essentially shaving the latency
of a floating point multiply plus a load off these sequences. This
instruction can only write its output into the same register as its
input register, which is why I've modelled it as below by tying op1
to op0.

If there's a simpler way of using the interfaces in real.c then I'm all ears.

Thoughts? I believe such idioms are used in libav, from where the
original report appears to have come, and thus it's a worthwhile gain
where we can have it. Are there any other places where folks might
have noticed this?

I will post upstream as well once I finish testing this patch. I'm
posting it here to get some feedback, and also to let anyone who is
really keen on trying this out have a go, given that I'm out
tomorrow.

(I took a quick look at the short -> f32 case as well, but the fact
remains that loads either zero- or sign-extend anyway, so there's
probably not much gain in modelling that right away; the win really
is in getting rid of that fp mul and the constant pool load. There's
probably some gain in going from i64 -> f64 as well, so those patterns
need to be written up at some point for completeness.)

cheers
Ramana



2011-10-04  Ramana Radhakrishnan  <ramana.radhakrish...@linaro.org>

        * config/arm/arm.c (vfp3_const_double_for_fract_bits): Define.
        * config/arm/arm-protos.h (vfp3_const_double_for_fract_bits): Declare.
        * config/arm/constraints.md ("Dt"): New constraint.
        * config/arm/predicates.md (const_double_vcvt_power_of_two_reciprocal):
        New.
        * config/arm/vfp.md (*arm_combine_vcvt_f32_s32): New.
        (*arm_combine_vcvt_f32_u32): New.

For the following testcases I see the code below when compiling with
-mfloat-abi=hard -mfpu=vfpv3 and -mcpu=cortex-a9:

float foo (int i)
{
 float v = (float)i / (1 << 11);
 return v;
}
float foa_unsigned (unsigned int i)
{
 float v = (float)i / (1 << 5);
 return v;
}


After the patch:

foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        fmsr    s0, r0  @ int
        vcvt.f32.s32    s0, s0, #11
        bx      lr
        .size   foo, .-foo
        .align  2
        .global foa_unsigned
        .type   foa_unsigned, %function
foa_unsigned:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        fmsr    s0, r0  @ int
        vcvt.f32.u32    s0, s0, #5
        bx      lr
        .size   foa_unsigned, .-foa_unsigned
        .align  2
        .global foo1
        .type   foo1, %function

rather than (before the patch):
        .type   foo, %function
foo:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        fmsr    s15, r0 @ int
        fsitos  s0, s15
        flds    s15, .L2
        fmuls   s0, s0, s15
        bx      lr
.L3:
        .align  2
.L2:
        .word   973078528
        .size   foo, .-foo
        .align  2
        .global foa_unsigned
        .type   foa_unsigned, %function
foa_unsigned:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        fmsr    s15, r0 @ int
        fuitos   s0, s15
        flds    s15, .L5
        fmuls   s0, s0, s15
        bx      lr
.L6:
        .align  2
.L5:
        .word   1023410176

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 23a29c6..c933704 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,7 @@ struct tune_params
 };
 
 extern const struct tune_params *current_tune;
+extern int vfp3_const_double_for_fract_bits (rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 5161439..e4459d7 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -17260,6 +17260,11 @@ arm_print_operand (FILE *stream, rtx x, int code)
       }
       return;
 
+    case 'v':
+       gcc_assert (GET_CODE (x) == CONST_DOUBLE);
+       fprintf (stream, "#%d", vfp3_const_double_for_fract_bits (x));
+       return;
+
     /* Register specifier for vld1.16/vst1.16.  Translate the S register
        number into a D register number and element index.  */
     case 'z':
@@ -24525,4 +24530,27 @@ arm_count_output_move_double_insns (rtx *operands)
   return count;
 }
 
+int
+vfp3_const_double_for_fract_bits (rtx operand)
+{
+  REAL_VALUE_TYPE r0;
+  
+  if (GET_CODE (operand) != CONST_DOUBLE)
+    return 0;
+  
+  REAL_VALUE_FROM_CONST_DOUBLE (r0, operand);
+  if (exact_real_inverse (DFmode, &r0))
+    {
+      if (exact_real_truncate (DFmode, &r0))
+       {
+         HOST_WIDE_INT value = real_to_integer (&r0);
+         value = value & 0xffffffff;
+         if ((value != 0) && ( (value & (value - 1)) == 0))
+           return int_log2 (value);
+       }
+    }
+  return 0;
+}
+
 #include "gt-arm.h"
+
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index d8ce982..7d0269a 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -29,7 +29,7 @@
 ;; in Thumb-1 state: I, J, K, L, M, N, O
 
 ;; The following multi-letter normal constraints have been used:
-;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di, Dz
+;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di, Dt, Dz
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd
 ;; in Thumb-2 state: Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py
 
@@ -291,6 +291,12 @@
  (and (match_code "const_double")
       (match_test "TARGET_32BIT && TARGET_VFP_DOUBLE && vfp3_const_double_rtx 
(op)")))
 
+(define_constraint "Dt" 
+ "@internal
+  In ARM/ Thumb2 a const_double which can be used with a vcvt.f32.s32 with 
fract bits operation"
+  (and (match_code "const_double")
+       (match_test "TARGET_32BIT && TARGET_VFP && 
vfp3_const_double_for_fract_bits (op)")))
+
 (define_memory_constraint "Ut"
  "@internal
   In ARM/Thumb-2 state an address valid for loading/storing opaque structure
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 27ba603..cf00f88 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -741,6 +741,11 @@
   return true; 
 })
 
+(define_predicate "const_double_vcvt_power_of_two_reciprocal"
+  (and (match_code "const_double")
+       (match_test "TARGET_32BIT && TARGET_VFP 
+                          && vfp3_const_double_for_fract_bits (op)")))
+
 (define_predicate "neon_struct_operand"
   (and (match_code "mem")
        (match_test "TARGET_32BIT && neon_vector_mem_operand (op, 2)")))
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 0c85c46..84dfcc6 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -1144,6 +1144,28 @@
    (set_attr "type" "fcmpd")]
 )
 
+;; Fixed point to floating point conversions. 
+(define_insn "*arm_combine_vcvt_f32_s32"
+  [(set (match_operand:SF 0 "s_register_operand" "=w")
+       (mult:SF (float:SF (match_operand:SI 1 "s_register_operand" "0"))
+                (match_operand 2 "const_double_vcvt_power_of_two_reciprocal" 
"Dt")))]
+  "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP3 && !flag_rounding_math"
+  "vcvt.f32.s32\\t%0, %1, %v2"
+ [(set_attr "predicable" "no")
+  (set_attr "type" "f_cvt")]
+)
+
+(define_insn "*arm_combine_vcvt_f32_u32"
+  [(set (match_operand:SF 0 "s_register_operand" "=w")
+       (mult:SF 
+        (unsigned_float:SF 
+         (match_operand:SI 1 "s_register_operand" "0"))
+        (match_operand 2 "const_double_vcvt_power_of_two_reciprocal" "Dt")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP3 && !flag_rounding_math"
+ "vcvt.f32.u32\\t%0, %1, %v2"
+ [(set_attr "predicable" "no")
+  (set_attr "type" "f_cvt")]
+)
 
 ;; Store multiple insn used in function prologue.

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

[RFC ARM] Use vcvt.f32.s32 with immediate bits to do fixed to floating point conversions.

Reply via email to