RE: [2/2] RISC-V: Constant FP Optimization with 'Zfa'

Jin Ma via Gcc-patches Mon, 14 Aug 2023 05:51:35 -0700

Hi Tsukasa,
  What a coincidence, I also implemented zfa extension, which also includes fli 
related instructions :)


links: https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627294.html

> +  if (!TARGET_HARD_FLOAT || !TARGET_ZFA)
> +    return result;
> +  switch (GET_MODE (x))
> +    {
> +    case HFmode:
> +      /* Not only 'Zfhmin', either 'Zfh' or 'Zvfh' is required.  */
> +      if (!TARGET_ZFH && !TARGET_ZVFH)

When Zvfh means that zfh is also on, so there may be no need to judge
the TARGET_ZVFH here. By the way,the format here seems wrong, maybe 'tab'
is needed for alignment?

> +     return result;
> +      break;
> +    case SFmode: break;
> +    case DFmode: break;

Maybe we still have to judge TARGET_DOUBLE_FLOAT?

> +    default: return result;
> +    }
> +
> +  if (!CONST_DOUBLE_P (x))
> +    return result;

I think it might be better to judge whether x satisfies the CONST_DOUBLE_P
before switch (GET_MODE (x)) above.

> +
> +  r = *CONST_DOUBLE_REAL_VALUE (x);
> +
> +  if (REAL_VALUE_ISNAN (r))
> +    {
> +      long reprs[2] = { 0 };
> +      /* Compare with canonical NaN.  */
> +      switch (GET_MODE (x))
> +     {
> +     case HFmode:
> +       reprs[0] = real_to_target (NULL, &r,
> +                                  float_mode_for_size (16).require ());
> +       /* 0x7e00: Canonical NaN for binary16.  */
> +       if (reprs[0] != 0x7e00)
> +         return result;
> +       break;
> +     case SFmode:
> +       reprs[0] = real_to_target (NULL, &r,
> +                                  float_mode_for_size (32).require ());
> +       /* 0x7fc00000: Canonical NaN for binary32.  */
> +       if (reprs[0] != 0x7fc00000)
> +         return result;
> +       break;
> +     case DFmode:
> +       real_to_target (reprs, &r, float_mode_for_size (64).require ());
> +       if (FLOAT_WORDS_BIG_ENDIAN)
> +         std::swap (reprs[0], reprs[1]);
> +       /* 0x7ff80000_00000000: Canonical NaN for binary64.  */
> +       if (reprs[0] != 0 || reprs[1] != 0x7ff80000)
> +         return result;
> +       break;
> +     default:
> +       gcc_unreachable ();
> +     }
> +      result.type = RISCV_FLOAT_CONST_NAN;
> +      result.valid = true;
> +      return result;
> +    }
> +  else if (REAL_VALUE_ISINF (r))
> +    {
> +      if (REAL_VALUE_NEGATIVE (r))
> +     return result;
> +      result.type = RISCV_FLOAT_CONST_INF;
> +      result.valid = true;
> +      return result;
> +    }
> +
> +  bool sign = REAL_VALUE_NEGATIVE (r);
> +  result.sign = sign;
> +
> +  r = real_value_abs (&r);
> +  /* GCC internally does not use IEEE754-like encoding (where normalized
> +     significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.cc).
> +     So, this exponent_p1 variable equals IEEE754 unbiased exponent + 1.  */
> +  int exponent_p1 = REAL_EXP (&r);
> +
> +  /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
> +     highest (sign) bit, with a fixed binary point at bit point_pos.
> +     m1 holds the low part of the mantissa, m2 the high part.
> +     WARNING: If we ever have a representation using more than 2 * H_W_I - 1
> +     bits for the mantissa, this can fail (low bits will be lost).  */
> +  bool fail = false;
> +  real_ldexp (&m, &r, (2 * HOST_BITS_PER_WIDE_INT - 1) - exponent_p1);
> +  wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
> +  if (fail)
> +    return result;
> +
> +  /* If the low part of the mantissa has bits set we cannot represent
> +     the value.  */
> +  if (w.ulow () != 0)
> +    return result;
> +  /* We have rejected the lower HOST_WIDE_INT, so update our
> +     understanding of how many bits lie in the mantissa and
> +     look only at the high HOST_WIDE_INT.  */
> +  unsigned HOST_WIDE_INT mantissa = w.elt (1);
> +
> +  /* We cannot represent the value 0.0.  */
> +  if (mantissa == 0)
> +    return result;
> +
> +  /* We can only represent values with a mantissa of the form 1.xx.  */
> +  unsigned HOST_WIDE_INT mask
> +      = ((unsigned HOST_WIDE_INT) 1 << (HOST_BITS_PER_WIDE_INT - 4)) - 1;
> +  if ((mantissa & mask) != 0)
> +    return result;
> +  mantissa >>= HOST_BITS_PER_WIDE_INT - 4;
> +  /* Now the lowest 3-bits of mantissa should form (1.xx)b.  */
> +  gcc_assert (mantissa & (1u << 2));
> +  /* Mask out the highest bit.  */
> +  mantissa &= ~(1u << 2);
> +
> +  if (mantissa == 0)
> +    {
> +      /* We cannot represent any values but -1.0.  */
> +      if (exponent_p1 != 1 && sign)
> +     return result;
> +      switch (exponent_p1)
> +     {
> +     case -15: /* 1.0 * 2^(-16)  */
> +     case -14: /* 1.0 * 2^(-15)  */
> +     case -7:  /* 1.0 * 2^(- 8)  */
> +     case -6:  /* 1.0 * 2^(- 7)  */
> +     case 8:   /* 1.0 * 2^(+ 7)  */
> +     case 9:   /* 1.0 * 2^(+ 8)  */
> +     case 16:  /* 1.0 * 2^(+15)  */
> +     case 17:  /* 1.0 * 2^(+16)  */
> +       break;
> +     default:
> +       if (exponent_p1 >= -3 && exponent_p1 <= 5)
> +         /* 1.0 * 2^[-4,4]  */
> +         break;
> +       switch (GET_MODE (x))
> +         {
> +         case HFmode: /* IEEE 754 binary16.  */
> +           /* Minimum positive normal == 1.0 * 2^(-14)  */
> +           if (exponent_p1 != -13) return result;
> +           break;
> +         case SFmode: /* IEEE 754 binary32.  */
> +           /* Minimum positive normal == 1.0 * 2^(-126)  */
> +           if (exponent_p1 != -125) return result;
> +           break;
> +         case DFmode: /* IEEE 754 binary64.  */
> +           /* Minimum positive normal == 1.0 * 2^(-1022)  */
> +           if (exponent_p1 != -1021) return result;
> +           break;
> +         default:
> +           gcc_unreachable ();
> +         }
> +       result.type = RISCV_FLOAT_CONST_MIN;
> +       result.valid = true;
> +       return result;
> +     }
> +    }
> +  else
> +    {
> +      if (sign)
> +     return result;
> +      if (exponent_p1 < -1 || exponent_p1 > 2)
> +     return result;
> +      /* The value is (+1.xx)b * 2^[-2,1].
> +      But we cannot represent (+1.11)b * 2^1 (that is 3.5). */
> +      if (exponent_p1 == 2 && mantissa == 3)
> +     return result;
> +    }
> +
> +  result.valid = true;
> +  result.mantissa_below_point = mantissa;
> +  result.biased_exponent = exponent_p1 + 15;
> +
> +  return result;
> +}
> +

This code is great and completely different from the way I implemented it.
I'm not sure which one is better, but my idea is that the fli instruction
corresponds to three tables (HF, SF and DF), all of which represent
specific values. the library in gcc's real.h can very well convert
the corresponding values into the values in the table, so it is only
necessary to perform a simple binary search to look up the tables.

@@ -1362,17 +1545,14 @@  riscv_const_insns (rtx x)
                   constant incurs a literal-pool access.  Allow this in
                   order to increase vectorization possibilities.  */
                int n = riscv_const_insns (elt);
-               if (CONST_DOUBLE_P (elt))
-                   return 1 + 4; /* vfmv.v.f + memory access.  */
> +             /* We need as many insns as it takes to load the constant
> +                into a GPR and one vmv.v.x.  */
> +             if (n != 0)
> +               return 1 + n;
> +             else if (CONST_DOUBLE_P (elt))
> +               return 1 + 4; /* vfmv.v.f + memory access.  */
                else
-                 {
-                   /* We need as many insns as it takes to load the constant
-                      into a GPR and one vmv.v.x.  */
-                   if (n != 0)
-                     return 1 + n;
-                   else
-                     return 1 + 4; /*vmv.v.x + memory access.  */
-                 }
> +               return 1 + 4; /* vmv.v.x + memory access.  */
              }
          }

I don't seem to understand here, if n = = 0, always return 1 + 4?
If so, it could be
if (n != 0)
   return 1 + n;
else
  return 1 + 4;

@@ -5117,6 +5313,36 @@  riscv_print_operand (FILE *file, rtx op, int letter)
            output_address (mode, XEXP (op, 0));
          break;
 
> +     case CONST_DOUBLE:
> +       {
> +         struct riscv_float_fli_const flt = riscv_get_float_fli_const (op);
> +         if (flt.valid)
> +           {
> +             switch (flt.type)
> +               {
> +               case RISCV_FLOAT_CONST_MIN:
> +                 fputs ("min", file);
> +                 break;
> +               case RISCV_FLOAT_CONST_INF:
> +                 fputs ("inf", file);
> +                 break;
> +               case RISCV_FLOAT_CONST_NAN:
> +                 fputs ("nan", file);
> +                 break;
> +               default:
> +                 /* Use simpler (and bit-perfect) printer.  */
> +                 if (flt.sign)
> +                   fputc ('-', file);
> +                 fprintf (file, "0x1.%cp%+d",
> +                          "048c"[flt.mantissa_below_point],
> +                          (int) flt.biased_exponent - 16);
> +                 break;
> +               }
> +             break;
> +           }
> +       }
> +       /* Fall through.  */

Display floating-point values at the assembly level can refer llvm
https://reviews.llvm.org/D145645. 

It may also be necessary to deal with riscv_split_64bit_move_p
and riscv_legitimize_const_move for rv32, otherwise the mov of
DFmode on rv32 will be split into high 32-bit mov and low 32-bit
mov, thus unable to generate fli instructions.

Thanks,
Jin

RE: [2/2] RISC-V: Constant FP Optimization with 'Zfa'

Reply via email to