https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119130

            Bug ID: 119130
           Summary: Results of vec_pack_to_short_fp32 intrinsic are not in
                    the expected order on big-endian POWER9
           Product: gcc
           Version: 14.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: john_platts at hotmail dot com
  Target Milestone: ---

The POWER9 vec_pack_to_short_fp32(a, b) intrinsic produces unexpected results
on big-endian POWER9.

vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {1, 2, 3, 4, 5,
6, 7, 8} on little-endian POWER9.

vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {5, 6, 7, 8, 1,
2, 3, 4} instead of the expected {1, 2, 3, 4, 5, 6, 7, 8} on big-endian
POWER9.

Here is a C++20 program (compile with -std=c++20 -mcpu=power9) that can be
used to test the results of vec_pack_to_short_fp32:
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <cctype>
#include <charconv>
#include <concepts>
#include <string_view>
#include <type_traits>

// Parses a floating-point value of type TF from the text in `sv`.
//
// Leading characters classified as whitespace by std::isspace are skipped
// before the remainder is handed to std::from_chars. A value-initialized
// (zero) result is returned when `sv` is empty or all whitespace, or when
// std::from_chars reports an error. Characters following a successfully
// parsed number are ignored.
template <std::floating_point TF>
static inline TF ConvCharsToFloat(std::string_view sv) {
  using Value = std::remove_cv_t<TF>;

  const char* cursor = sv.data();
  const char* const end = cursor + sv.length();

  // Advance past leading whitespace; the cast keeps std::isspace away from
  // negative char values.
  while (cursor != end && std::isspace(static_cast<unsigned char>(*cursor))) {
    ++cursor;
  }

  if (cursor == end) {
    return Value{};
  }

  // Only read after std::from_chars reports success, so it may start
  // uninitialized.
  Value value;
  const std::from_chars_result status = std::from_chars(cursor, end, value);
  if (status.ec != std::errc{}) {
    return Value{};
  }
  return value;
}

// Converts a half-precision value, supplied as its raw 16-bit pattern in
// `f16_bits`, to a double by executing the xscvhpdp instruction through
// inline assembly. Returns the converted value.
static inline double VsxConvertF16BitsToF64(uint16_t f16_bits) {
  // Broadcast the 16-bit pattern into every halfword lane of a vector.
  // NOTE(review): presumably done so that whichever halfword xscvhpdp reads
  // holds the value regardless of endianness -- confirm against the Power ISA.
  const __vector unsigned short f16_bits_vec =
      vec_splats(static_cast<unsigned short>(f16_bits));
  double f64_result;
  // Operand 0 is the output and operand 1 the splatted input; both use the
  // "wa" constraint and the %x operand modifier (see the GCC PowerPC
  // machine-constraint documentation for their exact meaning).
  __asm__("xscvhpdp %x0,%x1" : "=wa"(f64_result) : "wa"(f16_bits_vec) :);
  return f64_result;
}

// GCC generic vector of float occupying 16 bytes, declared 16-byte aligned
// and __may_alias__ so that it can be accessed through pointers obtained by
// casting from plain float arrays (as main does below).
typedef float GccF32x4AlignedVec
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
// GCC generic vector of uint16_t occupying 16 bytes, with the same alignment
// and aliasing attributes; used to store the packed result into a plain
// uint16_t array.
typedef uint16_t GccU16x8AlignedVec
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));

// Test driver: consumes the command-line arguments in groups of up to eight,
// parses each as a float, passes the two four-lane halves of the group to
// vec_pack_to_short_fp32, and prints every input lane next to the
// corresponding result lane converted back to double.
int main(int argc, char** argv) {
  // 16-byte aligned scratch buffers so they can be accessed through the
  // aligned vector typedefs.
  alignas(16) float f32_in_lanes[8];
  alignas(16) uint16_t f16_result_lanes[8];

  // `i` is the index of the first argument of the current group; `input_len`
  // is how many arguments this iteration consumes (at least 1, at most 8).
  for (int i = 1, input_len; i < argc; i += input_len) {
    input_len = argc - i;
    if (input_len > 8) input_len = 8;

    // Zero all eight input lanes so lanes without a matching argument are 0.
    *reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes) =
GccF32x4AlignedVec{};
    *reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes + 4) =
        GccF32x4AlignedVec{};

    // Parse this group's arguments into the leading lanes.
    for (int j = 0; j < input_len; j++) {
      f32_in_lanes[j] = ConvCharsToFloat<float>(std::string_view(argv[i + j]));
    }

    // Lanes 0-3 form the first operand, lanes 4-7 the second.
    __vector float vf32_a = reinterpret_cast<__vector float>(
        *reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes));
    __vector float vf32_b = reinterpret_cast<__vector float>(
        *reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes + 4));
    // The intrinsic under test.
    __vector unsigned short result_vec = vec_pack_to_short_fp32(vf32_a,
vf32_b);

    // Store the packed result to memory so each lane can be read by index.
    *reinterpret_cast<GccU16x8AlignedVec*>(f16_result_lanes) = result_vec;

    // Print input lane j beside result lane j; if the intrinsic keeps lane
    // order, each printed pair should describe the same value.
    // NOTE(review): the "xvcvsphp" label is presumably the instruction the
    // intrinsic expands to -- confirm against the GCC/Power ISA documentation.
    for (int j = 0; j < 8; j++) {
      printf("xvcvsphp(%g) = %g\n", f32_in_lanes[j],
             VsxConvertF16BitsToF64(f16_result_lanes[j]));
    }
  }

  return 0;
}

Reply via email to