https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115777

            Bug ID: 115777
           Summary: Severe performance regression on insertion sort at -O2
                    or above
           Product: gcc
           Version: 14.1.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: nrk at disroot dot org
  Target Milestone: ---

When compiled with -O2 or above, the following benchmark results in some severe
performance regression.

        $ gcc -o test testcase.c -march=x86-64-v3 -O0 && ./test
        insertion_sort  =>      946
        $ gcc -o test testcase.c -march=x86-64-v3 -O1 && ./test
        insertion_sort  =>      552
        $ gcc -o test testcase.c -march=x86-64-v3 -O2 && ./test
        insertion_sort  =>     1536
        $ gcc -o test testcase.c -march=x86-64-v3 -O3 && ./test
        insertion_sort  =>     1525


Test & benchmark code below:


        #include <stddef.h>
        #include <stdint.h>

        #define T uint32_t
        #define SWAP(A, B) do { T tmp = A; A = B; B = tmp; } while (0)

        __attribute((noinline)) static void
        insertion_sort(T *v, ptrdiff_t n)
        {
                for (ptrdiff_t i = 1; i < n; ++i) {
                        for (ptrdiff_t k = i; k > 0 && v[k-1] > v[k]; --k) {
                                SWAP(v[k-1], v[k]);
                        }
                }
        }

        /// benchmark
        #include <stdio.h>
        #include <time.h>

        int main(void)
        {
                enum {L=1<<10}; T v[L];
                uint64_t rng = 55555;
                for (int i = 0; i < L; ++i) {
                        v[i] = (rng = rng * 1111111111111111111 + 0x1337);
                }
                clock_t beg = clock();
                insertion_sort(v, L);
                clock_t end = clock();
                printf("insertion_sort  => %8ld\n", (long)(end - beg));
        }

Reply via email to