https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119130
Bug ID: 119130
Summary: Results of vec_pack_to_short_fp32 intrinsic are not in
the expected order on big-endian POWER9
Product: gcc
Version: 14.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---
The POWER9 vec_pack_to_short_fp32(a, b) intrinsic produces unexpected results
on big-endian POWER9.
vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {1, 2, 3, 4, 5,
6, 7, 8} on little-endian POWER9.
vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {5, 6, 7, 8, 1,
2, 3, 4} instead of the expected {1, 2, 3, 4, 5, 6, 7, 8} on big-endian
POWER9.
Here is a C++20 program (compile it with -std=c++20 -mcpu=power9) that can be
used to test the results of vec_pack_to_short_fp32:
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <altivec.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <cctype>
#include <charconv>
#include <concepts>
#include <string_view>
#include <type_traits>
// Parses a floating-point value of type TF from the text in `sv`.
//
// Leading whitespace (as classified by std::isspace) is stripped first, and
// the remaining characters are handed to std::from_chars. A value-initialized
// result (zero) is returned when `sv` is empty or all whitespace, or when
// std::from_chars reports an error.
template <std::floating_point TF>
static inline TF ConvCharsToFloat(std::string_view sv) {
  using Value = std::remove_cv_t<TF>;

  // Strip leading whitespace. The cast to unsigned char keeps the argument
  // to std::isspace within the range it accepts.
  while (!sv.empty() &&
         std::isspace(static_cast<unsigned char>(sv.front()))) {
    sv.remove_prefix(1);
  }
  if (sv.empty()) {
    return Value{};
  }

  const char* const first = sv.data();
  const char* const last = first + sv.size();

  // Only read after std::from_chars has reported success.
  Value value;
  if (std::from_chars(first, last, value).ec != std::errc{}) {
    return Value{};
  }
  return value;
}
static inline double VsxConvertF16BitsToF64(uint16_t f16_bits) {
const __vector unsigned short f16_bits_vec =
vec_splats(static_cast<unsigned short>(f16_bits));
double f64_result;
__asm__("xscvhpdp %x0,%x1" : "=wa"(f64_result) : "wa"(f16_bits_vec) :);
return f64_result;
}
// GCC vector-extension type: a 16-byte vector of float lanes, aligned to 16
// bytes. __may_alias__ lets ordinary float storage be read and written
// through pointers to this type.
typedef float GccF32x4AlignedVec
__attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
// GCC vector-extension type: a 16-byte vector of uint16_t lanes, aligned to
// 16 bytes. __may_alias__ lets ordinary uint16_t storage be read and written
// through pointers to this type.
typedef uint16_t GccU16x8AlignedVec
__attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
int main(int argc, char** argv) {
alignas(16) float f32_in_lanes[8];
alignas(16) uint16_t f16_result_lanes[8];
for (int i = 1, input_len; i < argc; i += input_len) {
input_len = argc - i;
if (input_len > 8) input_len = 8;
*reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes) =
GccF32x4AlignedVec{};
*reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes + 4) =
GccF32x4AlignedVec{};
for (int j = 0; j < input_len; j++) {
f32_in_lanes[j] = ConvCharsToFloat<float>(std::string_view(argv[i + j]));
}
__vector float vf32_a = reinterpret_cast<__vector float>(
*reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes));
__vector float vf32_b = reinterpret_cast<__vector float>(
*reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes + 4));
__vector unsigned short result_vec = vec_pack_to_short_fp32(vf32_a,
vf32_b);
*reinterpret_cast<GccU16x8AlignedVec*>(f16_result_lanes) = result_vec;
for (int j = 0; j < 8; j++) {
printf("xvcvsphp(%g) = %g\n", f32_in_lanes[j],
VsxConvertF16BitsToF64(f16_result_lanes[j]));
}
}
return 0;
}