https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109811
JuzheZhong <juzhe.zhong at rivai dot ai> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |juzhe.zhong at rivai dot ai --- Comment #7 from JuzheZhong <juzhe.zhong at rivai dot ai> --- (In reply to Jan Hubicka from comment #6) > hottest loop in clang's profile is: > for (size_t y = 0; y < opsin.ysize(); y++) { > for (size_t x = 0; x < opsin.xsize(); x++) { > if (is_background_row[y * is_background_stride + x]) continue; > cc.clear(); > stack.clear(); > stack.emplace_back(x, y); > size_t min_x = x; > size_t max_x = x; > size_t min_y = y; > size_t max_y = y; > std::pair<uint32_t, uint32_t> reference; > bool found_border = false; > bool all_similar = true; > while (!stack.empty()) { > std::pair<uint32_t, uint32_t> cur = stack.back(); > stack.pop_back(); > if (visited_row[cur.second * visited_stride + cur.first]) continue; > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > closed by this continue. > visited_row[cur.second * visited_stride + cur.first] = 1; > if (cur.first < min_x) min_x = cur.first; > if (cur.first > max_x) max_x = cur.first; > if (cur.second < min_y) min_y = cur.second; > if (cur.second > max_y) max_y = cur.second; > if (paint_ccs) { > cc.push_back(cur); > } > for (int dx = -kSearchRadius; dx <= kSearchRadius; dx++) { > for (int dy = -kSearchRadius; dy <= kSearchRadius; dy++) { > if (dx == 0 && dy == 0) continue; > int next_first = static_cast<int32_t>(cur.first) + dx; > int next_second = static_cast<int32_t>(cur.second) + dy; > if (next_first < 0 || next_second < 0 || > static_cast<uint32_t>(next_first) >= opsin.xsize() || > static_cast<uint32_t>(next_second) >= opsin.ysize()) { > continue; > } > std::pair<uint32_t, uint32_t> next{next_first, next_second}; > if (!is_background_row[next.second * is_background_stride + > next.first]) { > stack.push_back(next); > } else { > if (!found_border) { > reference = next; > found_border = true; > } else { > if (!is_similar_b(next, reference)) all_similar = false; > } > } > } > } > } > if (!found_border || !all_similar || max_x - min_x >= kMaxPatchSize || > max_y - min_y >= kMaxPatchSize) { > continue; > } > size_t bpos = background_stride * reference.second + reference.first; > float ref[3] = {background_rows[0][bpos], background_rows[1][bpos], > background_rows[2][bpos]}; > bool has_similar = false; > for (size_t iy = std::max<int>( > static_cast<int32_t>(min_y) - kHasSimilarRadius, 0); > iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); > iy++) { > for (size_t ix = std::max<int>( > static_cast<int32_t>(min_x) - kHasSimilarRadius, 0); > ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize()); > ix++) { > size_t opos = opsin_stride * iy + ix; > float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos], > opsin_rows[2][opos]}; > if (pci.is_similar_v(ref, px, kHasSimilarThreshold)) { > has_similar = true; > } > } > } > if (!has_similar) continue; > info.emplace_back(); > info.back().second.emplace_back(min_x, min_y); > QuantizedPatch& patch = info.back().first; > patch.xsize = max_x - min_x + 1; > patch.ysize = max_y - min_y + 1; > int max_value = 0; > for (size_t c : {1, 0, 2}) { > for (size_t iy = min_y; iy <= max_y; iy++) { > for (size_t ix = min_x; ix <= max_x; ix++) { > size_t offset = (iy - min_y) * patch.xsize + ix - min_x; > patch.fpixels[c][offset] = > opsin_rows[c][iy * opsin_stride + ix] - ref[c]; > int val = pci.Quantize(patch.fpixels[c][offset], c); > patch.pixels[c][offset] = val; > if (std::abs(val) > max_value) max_value = std::abs(val); > } > } > } > if (max_value < kMinPeak) { > info.pop_back(); > continue; > } > if (paint_ccs) { > float cc_color = rng.UniformF(0.5, 1.0); > for (std::pair<uint32_t, uint32_t> p : cc) { > ccs.Row(p.second)[p.first] = cc_color; > } > } > } > } > > I guess such a large loop nest with hottest loop not being the innermost is > bad for register pressure. > Clangs code is : > 0.02 │1196:┌─→cmp %r10,-0xb8(%rbp) > ▒ > │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&, > jxl::PassesEncoderState*, JxlCmsInterface co▒ > │ │while (!stack.empty()) { > ◆ > 1.39 │ │↓ je 1690 > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > │11a3:│ mov -0x8(%r10),%rbx > ▒ > 9.80 │ │ mov -0x230(%rbp),%r14 > ▒ > │ │__gnu_cxx::__normal_iterator<std::pair<unsigned int, unsigned > int>*, std::vector<std::pair<unsigned ▒ > │ │{ return __normal_iterator(_M_current - __n); } > ▒ > 0.86 │ │ add $0xfffffffffffffff8,%r10 > ▒ > │ │jxl::FindBestPatchDictionary(jxl::Image3<float> const&, > jxl::PassesEncoderState*, JxlCmsInterface co▒ > 0.36 │ │ mov %rbx,%r13 > ▒ > 0.01 │ │ shr $0x20,%r13 > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 2.69 │ │ mov %ebx,%eax > ▒ > 0.00 │ │ mov %r13,%rcx > ▒ > 1.12 │ │ imul -0x180(%rbp),%rcx > ▒ > 0.78 │ │ add %rax,%rcx > ▒ > 2.73 │ ├──cmpb $0x0,(%r14,%rcx,1) > ▒ > 19.91 │ └──jne 1196 > ▒ > > While GCC is longer: > Percent│ Percent│ while (!stack.empty()) { > ▒ > │1241:┌─→cmp %rax,-0x370(%rbp) > ▒ > 0.70 │ │↓ je 1481 > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 0.02 │124e:│ mov -0x8(%rax),%rdx > ▒ > │ │std::vector<std::pair<unsigned int, unsigned int>, > std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒ > │ │--this->_M_impl._M_finish; > ▒ > 0.63 │ │ sub $0x8,%rax > ▒ > 0.74 │ │ mov %rax,-0x368(%rbp) > ▒ > │ │FindTextLikePatches(): > ▒ > 0.01 │ │ mov %edx,%esi > ◆ > 0.35 │ │ mov %rdx,%rdi > ▒ > 0.82 │ │ mov %rdx,-0x490(%rbp) > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.00 │ │ mov -0x5f8(%rbp),%rdx > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 1.32 │ │ shr $0x20,%rdi > ▒ > 0.11 │ │ mov %rsi,%r15 > ▒ > 0.09 │ │ mov %edi,%ecx > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.72 │ │ mov -0x5f0(%rbp),%rdi > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 0.03 │ │ mov %rcx,%r12 > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.39 │ │ imul %rcx,%rdx > ▒ > 1.00 │ │ add %rsi,%rdx > ▒ > 1.35 │ │ add %rdi,%rdx > ▒ > 1.37 │ ├──cmpb $0x0,(%rdx) > ▒ > 9.56 │ └──jne 1241 > ▒ while (!stack.empty()) { > ▒ > │1241:┌─→cmp %rax,-0x370(%rbp) > ▒ > 0.70 │ │↓ je 1481 > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 0.02 │124e:│ mov -0x8(%rax),%rdx > ▒ > │ │std::vector<std::pair<unsigned int, unsigned int>, > std::allocator<std::pair<unsigned int, unsigned int> > >::pop_back▒ > │ │--this->_M_impl._M_finish; > ▒ > 0.63 │ │ sub $0x8,%rax > ▒ > 0.74 │ │ mov %rax,-0x368(%rbp) > ▒ > │ │FindTextLikePatches(): > ▒ > 0.01 │ │ mov %edx,%esi > ◆ > 0.35 │ │ mov %rdx,%rdi > ▒ > 0.82 │ │ mov %rdx,-0x490(%rbp) > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.00 │ │ mov -0x5f8(%rbp),%rdx > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 1.32 │ │ shr $0x20,%rdi > ▒ > 0.11 │ │ mov %rsi,%r15 > ▒ > 0.09 │ │ mov %edi,%ecx > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.72 │ │ mov -0x5f0(%rbp),%rdi > ▒ > │ │std::pair<uint32_t, uint32_t> cur = stack.back(); > ▒ > 0.03 │ │ mov %rcx,%r12 > ▒ > │ │if (visited_row[cur.second * visited_stride + cur.first]) > continue; ▒ > 0.39 │ │ imul %rcx,%rdx > ▒ > 1.00 │ │ add %rsi,%rdx > ▒ > 1.35 │ │ add %rdi,%rdx > ▒ > 1.37 │ ├──cmpb $0x0,(%rdx) > ▒ > 9.56 │ └──jne 1241 > ▒ > > > Moreover we have heuristics saying that continue usually closes loops with > low trip count. > -fno-ivopts improves performance with --num_threads=0 however makes number > of instructions worse. Curiously enought with --num_threads=1 and more > ivopts is benefical. Hi, thanks for the analysis. It seems that Clang has better performance than GCC in case of no vectorizer? What about with vectorizer ? Well, we have a lot testing benchmark between our downstream RISCV Clang vs GCC. >From my observation, Clang do much better than GCC in case of scalar code. For example, we found in mlperf, Clang 40% better than GCC when specifying no-vectorizer. However, when enabling vectorization, gcc does better than clang overall from my observation. Thanks