Package: g++-8 Version: 8.4.0-7 Severity: important Tags: patch upstream g++-8 generates incorrect code at -O3 when targeting aarch64 (arm64) unless tree vectorisation is disabled. This is quite bad and affects real packages such as VLC and Apache Arrow.
The upstream bug is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98949 and the fix is merged upstream here: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69 This has been applied in gcc-9 9.3.0-22 and gcc-10 10.2.1-6 already (cheers for that). 'PR tree-optimization/97236' Tested with: $ g++-8 -O3 test.cc && ./a.out a.out: test.cc:27: int main(): Assertion `bitmap[7] != 0' failed. Aborted (core dumped) $ g++-8 -O3 -fno-tree-vectorize test.cc && ./a.out (no error, code works) for test case: ==test.cc== #include <cassert> #include <cstdint> int main(void) { uint64_t a[64], b[64]; for (int i = 0; i < 64; ++i) { a[i] = 1; b[i] = 2; } a[63] = b[63]; // only last element is the same uint8_t bitmap[8]; // holds 64 bits, bit_i = 1 if (a[i] == b[i]) else 0, i = 0 ~ 63 int index = 0; // index to a[], b[] for (int byte = 0; byte < 8; ++byte) { uint8_t out_results[8]; // holds 8 comparison results temporarily for (int bit = 0; bit < 8; ++bit) { out_results[bit] = a[index] == b[index]; ++index; } bitmap[byte] = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 | out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 | out_results[6] << 6 | out_results[7] << 7); } // last bitmap should be non-zero, fired on gcc-9.3 aarch64 -O3 assert(bitmap[7] != 0); return 0; } Attached is a tested patch. -- Wookey
diff -u gcc-8-8.4.0/debian/changelog gcc-8-8.4.0/debian/changelog --- gcc-8-8.4.0/debian/changelog +++ gcc-8-8.4.0/debian/changelog @@ -1,3 +1,10 @@ +gcc-8 (8.4.0-7.1) UNRELEASED; urgency=medium + + * Non-maintainer upload. + * Fix PR tree-optimization/97236 (AArch64) + + -- Wookey <woo...@softiron-wookey.cambridge.arm.com> Tue, 09 Feb 2021 13:27:54 +0000 + gcc-8 (8.4.0-7) unstable; urgency=medium * Update to git 20210202 from the gcc-8 branch. diff -u gcc-8-8.4.0/debian/rules.patch gcc-8-8.4.0/debian/rules.patch --- gcc-8-8.4.0/debian/rules.patch +++ gcc-8-8.4.0/debian/rules.patch @@ -83,6 +83,7 @@ verbose-lto-linker \ libstdc++-futex \ pr97528 \ + pr97236 \ ifeq (,$(filter $(distrelease),precise trusty stretch jessie wheezy)) debian_patches += pr90050 only in patch2: unchanged: --- gcc-8-8.4.0.orig/debian/patches/pr97236.diff +++ gcc-8-8.4.0/debian/patches/pr97236.diff @@ -0,0 +1,110 @@ +From: Matthias Klose <d...@ubuntu.com> +Date: Tue, 6 Oct 2020 11:41:37 +0000 (+0200) +Subject: Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS +X-Git-Url: https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff_plain;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69;hp=6f2f4412b9542c0f5fb950e8986ed97b6671806d + +Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS + +This avoids using VMAT_CONTIGUOUS with single-element interleaving +when using V1mode vectors. Instead keep VMAT_ELEMENTWISE but +continue to avoid load-lanes and gathers. + +2020-10-01 Richard Biener <rguent...@suse.de> + + PR tree-optimization/97236 + * tree-vect-stmts.c (get_group_load_store_type): Keep + VMAT_ELEMENTWISE for single-element vectors. + + * gcc.dg/vect/pr97236.c: New testcase. 
+ +(cherry picked from commit 1ab88985631dd2c5a5e3b5c0dce47cf8b6ed2f82) +--- + +diff --git a/src/gcc/testsuite/gcc.dg/vect/pr97236.c b/gcc/testsuite/gcc.dg/vect/pr97236.c +new file mode 100644 +index 00000000000..9d3dc20d953 +--- /dev/null ++++ b/src/gcc/testsuite/gcc.dg/vect/pr97236.c +@@ -0,0 +1,43 @@ ++typedef unsigned char __uint8_t; ++typedef __uint8_t uint8_t; ++typedef struct plane_t { ++ uint8_t *p_pixels; ++ int i_lines; ++ int i_pitch; ++} plane_t; ++ ++typedef struct { ++ plane_t p[5]; ++} picture_t; ++ ++#define N 4 ++ ++void __attribute__((noipa)) ++picture_Clone(picture_t *picture, picture_t *res) ++{ ++ for (int i = 0; i < N; i++) { ++ res->p[i].p_pixels = picture->p[i].p_pixels; ++ res->p[i].i_lines = picture->p[i].i_lines; ++ res->p[i].i_pitch = picture->p[i].i_pitch; ++ } ++} ++ ++int ++main() ++{ ++ picture_t aaa, bbb; ++ uint8_t pixels[10] = {1, 1, 1, 1, 1, 1, 1, 1}; ++ ++ for (unsigned i = 0; i < N; i++) ++ aaa.p[i].p_pixels = pixels; ++ ++ picture_Clone (&aaa, &bbb); ++ ++ uint8_t c = 0; ++ for (unsigned i = 0; i < N; i++) ++ c += bbb.p[i].p_pixels[0]; ++ ++ if (c != N) ++ __builtin_abort (); ++ return 0; ++} +diff --git a/src/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c +index 7472558048c..ea75c7249aa 100644 +--- a/src/gcc/tree-vect-stmts.c ++++ b/src/gcc/tree-vect-stmts.c +@@ -2209,25 +2209,23 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp, + /* First cope with the degenerate case of a single-element + vector. */ + if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)) +- *memory_access_type = VMAT_CONTIGUOUS; ++ ; + + /* Otherwise try using LOAD/STORE_LANES. */ +- if (*memory_access_type == VMAT_ELEMENTWISE +- && (vls_type == VLS_LOAD +- ? vect_load_lanes_supported (vectype, group_size, masked_p) +- : vect_store_lanes_supported (vectype, group_size, +- masked_p))) ++ else if (vls_type == VLS_LOAD ++ ? 
vect_load_lanes_supported (vectype, group_size, masked_p) ++ : vect_store_lanes_supported (vectype, group_size, ++ masked_p)) + { + *memory_access_type = VMAT_LOAD_STORE_LANES; + overrun_p = would_overrun_p; + } + + /* If that fails, try using permuting loads. */ +- if (*memory_access_type == VMAT_ELEMENTWISE +- && (vls_type == VLS_LOAD +- ? vect_grouped_load_supported (vectype, single_element_p, +- group_size) +- : vect_grouped_store_supported (vectype, group_size))) ++ else if (vls_type == VLS_LOAD ++ ? vect_grouped_load_supported (vectype, single_element_p, ++ group_size) ++ : vect_grouped_store_supported (vectype, group_size)) + { + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; + overrun_p = would_overrun_p;