https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119130
Bug ID: 119130
Summary: Results of vec_pack_to_short_fp32 intrinsic are not in the expected order on big-endian POWER9
Product: gcc
Version: 14.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---

The POWER9 vec_pack_to_short_fp32(a, b) intrinsic produces unexpected results on big-endian POWER9.

vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {1, 2, 3, 4, 5, 6, 7, 8} on little-endian POWER9.

vec_pack_to_short_fp32({1, 2, 3, 4}, {5, 6, 7, 8}) results in {5, 6, 7, 8, 1, 2, 3 , 4} instead of the expected {1, 2, 3, 4, 5, 6, 7, 8} on big-endian POWER9.

Here is a C++20 program (need to compile with -std=c++20 -mcpu=power9) that can be used to test the results of vec_pack_to_short_fp32:

// Save any existing `vector`, `pixel` and `bool` macro definitions, clear
// them, include <altivec.h>, then restore the saved definitions. After the
// pops these three names are back in their pre-include state, so whatever
// <altivec.h> defines under them does not persist; the code below uses the
// `__vector` spelling instead.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <altivec.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <cctype>
#include <charconv>
#include <concepts>
#include <string_view>
#include <type_traits>

// Parses a floating-point value of type TF from the text in `sv`.
//
// Leading whitespace (as classified by std::isspace) is skipped before the
// remainder is handed to std::from_chars, which does not skip whitespace on
// its own.
//
// Returns the parsed value on success. Returns a value-initialized (zero) TF
// when `sv` is empty or all whitespace, or when std::from_chars reports an
// error. Only the error code is inspected: characters left over after the
// parsed number are not checked.
template <std::floating_point TF>
static inline TF ConvCharsToFloat(std::string_view sv) {
  const char* const input_data = sv.data();
  const size_t input_len = sv.length();

  // Advance start_idx past leading whitespace. The cast to unsigned char
  // keeps the argument of std::isspace within its valid domain even for
  // chars with negative values.
  size_t start_idx = 0;
  for (; start_idx < input_len &&
         std::isspace(static_cast<unsigned char>(input_data[start_idx]));
       ++start_idx) {
  }

  // Nothing left to parse: report zero.
  if (start_idx >= input_len) {
    return std::remove_cv_t<TF>{};
  }

  std::remove_cv_t<TF> parsed_val;
  const auto parse_result = std::from_chars(
      input_data + start_idx, input_data + input_len, parsed_val);
  // parsed_val is only read when from_chars reported success.
  return (parse_result.ec == std::errc{}) ? parsed_val
                                          : std::remove_cv_t<TF>{};
}

// Converts a 16-bit pattern to a double by executing the `xscvhpdp`
// instruction through inline assembly.
//
// `f16_bits` is copied into every lane of a vector register first.
// NOTE(review): going by the function name, the bits are an IEEE half-precision
// value; splatting presumably makes the result independent of which halfword
// of the source register the instruction reads -- confirm against the Power
// ISA description of xscvhpdp.
static inline double VsxConvertF16BitsToF64(uint16_t f16_bits) {
  const __vector unsigned short f16_bits_vec =
      vec_splats(static_cast<unsigned short>(f16_bits));
  double f64_result;
  // Both operands use the "wa" register constraint and are printed with the
  // %x operand modifier; nothing is declared as clobbered.
  __asm__("xscvhpdp %x0,%x1" : "=wa"(f64_result) : "wa"(f16_bits_vec) :);
  return f64_result;
}

// GCC vector-extension types, 16 bytes wide and 16-byte aligned, holding
// 4 floats and 8 uint16_t values respectively. They are marked __may_alias__
// because main() uses them to read and write plain float / uint16_t arrays
// through reinterpret_cast'ed pointers.
typedef float GccF32x4AlignedVec
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
typedef uint16_t GccU16x8AlignedVec
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));

// Test driver: treats the command-line arguments as floats, processes them in
// groups of up to eight, runs each group through vec_pack_to_short_fp32 and
// prints every input lane next to the value obtained by converting the
// same-index output lane back to double.
//
// Because input lane j is printed beside output lane j, a result whose halves
// come back in a different order shows up as mismatched pairs in the output.
// Always returns 0.
int main(int argc, char** argv) {
  // 16-byte aligned so they can be accessed through the vector typedefs.
  alignas(16) float f32_in_lanes[8];
  alignas(16) uint16_t f16_result_lanes[8];

  // i indexes the first argument of the current group; input_len is the
  // number of arguments in that group (at most 8) and is also the step.
  for (int i = 1, input_len; i < argc; i += input_len) {
    input_len = argc - i;
    if (input_len > 8) input_len = 8;

    // Zero all eight input lanes so that lanes without a corresponding
    // argument in a short final group hold 0.
    *reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes) =
        GccF32x4AlignedVec{};
    *reinterpret_cast<GccF32x4AlignedVec*>(f32_in_lanes + 4) =
        GccF32x4AlignedVec{};

    // Parse this group's arguments into the leading lanes.
    for (int j = 0; j < input_len; j++) {
      f32_in_lanes[j] = ConvCharsToFloat<float>(std::string_view(argv[i + j]));
    }

    // Lanes 0-3 become the first operand, lanes 4-7 the second.
    __vector float vf32_a = reinterpret_cast<__vector float>(
        *reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes));
    __vector float vf32_b = reinterpret_cast<__vector float>(
        *reinterpret_cast<const GccF32x4AlignedVec*>(f32_in_lanes + 4));

    // The intrinsic under test.
    __vector unsigned short result_vec =
        vec_pack_to_short_fp32(vf32_a, vf32_b);

    // Store the eight 16-bit results back to memory for per-lane access.
    *reinterpret_cast<GccU16x8AlignedVec*>(f16_result_lanes) = result_vec;

    // All eight lanes are printed, including zero-filled ones.
    for (int j = 0; j < 8; j++) {
      printf("xvcvsphp(%g) = %g\n", f32_in_lanes[j],
             VsxConvertF16BitsToF64(f16_result_lanes[j]));
    }
  }
  return 0;
}