https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70546

            Bug ID: 70546
           Summary: ifconvert if(cond) ++count; to count += cond;
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: glisse at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*

In the example from http://stackoverflow.com/q/36414959/1918193 we fail to
vectorize f0 (using g++ -O3) because of the condition in the loop. However, if
I manually change the condition as in the #else case, the vectorizer is happy
and we get a very nice performance boost. Could ifconvert (or some other phi
optimization pass) handle this transformation? n is a local variable, so we
shouldn't even need -ftree-loop-if-convert-stores. At -O1, it is also a clear
win with the random distribution, but it becomes a slight loss if the vector v
is sorted in increasing order and a big loss for the decreasing order, so I
don't know for sure under which condition this should be done.

#include <algorithm>
#include <chrono>
#include <random>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;
using namespace std::chrono;

vector<int> v(1'000'000);

int f0()
{
  int n = 0;

  for (int i = 1; i < v.size()-1; ++i) {
    int a = v[i-1];
    int b = v[i];
    int c = v[i+1];

#ifndef IMPROVED
    if (a < b  &&  b < c) ++n;
#else
    n += (a < b  &&  b < c);
#endif
  }

  return n;
}


int f1()
{
  int n = 0;

  for (int i = 1; i < v.size()-1; ++i)
    if (v[i-1] < v[i]  &&  v[i] < v[i+1])
      ++n;

  return n;
}


int main()
{
  auto benchmark = [](int (*f)()) {
    const int N = 100;

    volatile long long result = 0;
    vector<long long>  timings(N);

    for (int i = 0; i < N; ++i) {
      auto t0 = high_resolution_clock::now();
      result += f();
      auto t1 = high_resolution_clock::now();

      timings[i] = duration_cast<nanoseconds>(t1-t0).count();
    }

    sort(timings.begin(), timings.end());
    cout << fixed << setprecision(6) << timings.front()/1'000'000.0 << "ms
min\n";
    cout << timings[timings.size()/2]/1'000'000.0 << "ms median\n" << "Result:
" << result/N << "\n\n";
  };

  mt19937                    generator   (31415);   // deterministic seed
  uniform_int_distribution<> distribution(0, 1023);

  for (auto& e: v)
    e = distribution(generator);

  benchmark(f0);
  benchmark(f1);

  cout << "\ndone\n";

  return 0;
}

Reply via email to