Re: Performance degradation on g++ 4.6

Oleg Smolsky Mon, 22 Aug 2011 18:09:33 -0700

Hey David, these two --param options made no difference to the test.

I've cut the suite down to a single test (attached), which yields the following results:


./simple_types_constant_folding_os (gcc 41)
    test         description   time   operations/s
     0 "int8_t constant add"   1.34 sec   1194.03 M

./simple_types_constant_folding_os (gcc 46)
    test         description   time   operations/s
     0 "int8_t constant add"   2.84 sec   563.38 M

Both compilers fully inline the templated function and the emitted code looks very similar. I am puzzled as to why one of these loops is significantly slower than the other. I've attached disassembled listings - perhaps someone could have a look please? (the body of the loop starts at 0000000000400FD for gcc41 and at 0000000000400D90 for gcc46)


Thanks,
Oleg.


On 2011/8/1 22:48, Xinliang David Li wrote:

Try isolating the int8_t constant folding test from the rest to see
if the slowdown can be reproduced with the isolated case. If the
problem disappears, it is likely due to the following inline
parameters:

large-function-insns, large-function-growth, large-unit-insns,
inline-unit-growth. For instance set

--param large-function-insns=10000
--param large-unit-insns=20000

David

On Mon, Aug 1, 2011 at 11:43 AM, Oleg Smolsky<oleg.smol...@riverbed.com>  wrote:

On 2011/7/29 14:07, Xinliang David Li wrote:

Profiling tools are your best friend here. If you don't have access to
any, the least you can do is to build the program with the -pg option and
use the gprof tool to find out the differences.

The test suite has a bunch of very basic C++ tests that are executed an
enormous number of times. I've built one with the obvious performance
degradation and attached the source, output and reports.

Here are some highlights:
    v4.1:    Total absolute time for int8_t constant folding: 30.42 sec
    v4.6:    Total absolute time for int8_t constant folding: 43.32 sec

Every one of the tests in this section had degraded... the first half more
than the second. I am not sure how much further I can take this - the
benchmarked code is very short and plain. I can post disassembly for one
(some?) of them if anyone is willing to take a look...

Thanks,
Oleg.

/*
    Copyright 2007-2008 Adobe Systems Incorporated
    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
    or a copy at http://stlab.adobe.com/licenses.html )
    
    
    Source file for tests shared among several benchmarks
*/

/******************************************************************************/

// Generic comparison: two values are considered equal when the absolute
// value of their difference (stored back into T) is below 1.0e-6.
// Floating point and 32/64-bit integer types have dedicated specializations.
template<typename T>
inline bool tolerance_equal(T &a, T &b) {
    const T delta = a - b;
    const bool within_tolerance = abs(delta) < 1.0e-6;
    return within_tolerance;
}


// int32_t: no tolerance is applied, the values must match exactly.
template<>
inline bool tolerance_equal(int32_t &a, int32_t &b) {
    const bool identical = (a == b);
    return identical;
}
// uint32_t: no tolerance is applied, the values must match exactly.
template<>
inline bool tolerance_equal(uint32_t &a, uint32_t &b) {
    const bool identical = (a == b);
    return identical;
}
// uint64_t: no tolerance is applied, the values must match exactly.
template<>
inline bool tolerance_equal(uint64_t &a, uint64_t &b) {
    const bool identical = (a == b);
    return identical;
}
// int64_t: no tolerance is applied, the values must match exactly.
template<>
inline bool tolerance_equal(int64_t &a, int64_t &b) {
    const bool identical = (a == b);
    return identical;
}

// double: compares using the difference relative to `a` when |a| > 1.0e-8,
// and the plain difference otherwise; equal when that magnitude is < 1.0e-6.
template<>
inline bool tolerance_equal(double &a, double &b) {
    const double delta = a - b;
    const double scaled = (fabs(a) > 1.0e-8) ? (delta / a) : delta;
    return (fabs(scaled) < 1.0e-6);
}

// float: compares using the difference relative to `a` when |a| > 1.0e-4,
// and the plain difference otherwise; equal when that magnitude is < 1.0e-3.
// The division is carried out in float and only then widened to double.
template<>
inline bool tolerance_equal(float &a, float &b) {
        float diff = a - b;
        double reldiff = diff;
        if (fabs(a) > 1.0e-4)
                reldiff = diff / a;
        // single precision divide test is really imprecise, hence the loose
        // 1.0e-3 bound (this remark must stay inside a comment: when it wrapped
        // onto its own line it was parsed as code and broke the build)
        return (fabs(reldiff) < 1.0e-3);
}

/******************************************************************************/

// Compares `result` against SIZE * Shifter::do_shift(init_value), both
// converted to T, and prints a failure line naming current_test on mismatch.
template <typename T, typename Shifter>
inline void check_shifted_sum(T result) {
    T expected = static_cast<T>(SIZE) * Shifter::do_shift(static_cast<T>(init_value));
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}

// Expects `result` to be zero (within tolerance_equal's bounds) and prints a
// failure line naming current_test otherwise. Shifter is not used here.
template <typename T, typename Shifter>
inline void check_shifted_sum_CSE(T result) {
    T expected = static_cast<T>(0.0);
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}

// One-variable form: compares `result` against
// SIZE * Shifter::do_shift(init_value, var) and prints a failure line naming
// current_test on mismatch.
template <typename T, typename Shifter>
inline void check_shifted_variable_sum(T result, T var) {
    T expected = static_cast<T>(SIZE) * Shifter::do_shift(static_cast<T>(init_value), var);
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}

// Four-variable form: compares `result` against
// SIZE * Shifter::do_shift(init_value, var1, var2, var3, var4) and prints a
// failure line naming current_test on mismatch.
template <typename T, typename Shifter>
inline void check_shifted_variable_sum(T result, T var1, T var2, T var3, T var4) {
    T expected = static_cast<T>(SIZE)
               * Shifter::do_shift(static_cast<T>(init_value), var1, var2, var3, var4);
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}

// One-variable CSE check: expects `result` to be zero and prints a failure
// line naming current_test otherwise. `var` and Shifter are not used here.
template <typename T, typename Shifter>
inline void check_shifted_variable_sum_CSE(T result, T var) {
    T expected = static_cast<T>(0.0);
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}

// Four-variable CSE check: expects `result` to be zero and prints a failure
// line naming current_test otherwise. The variables and Shifter are not used.
template <typename T, typename Shifter>
inline void check_shifted_variable_sum_CSE(T result, T var1, T var2, T var3, T var4) {
    T expected = static_cast<T>(0.0);
    if (tolerance_equal<T>(result, expected))
        return;
    printf("test %i failed\n", current_test);
}


/******************************************************************************/

// Assigns `value` to every element in the half-open range [first, last).
template <typename Iterator, typename T>
void fill(Iterator first, Iterator last, T value) {
    for (; first != last; ++first) {
        *first = value;
    }
}

/******************************************************************************/

// Adds the constant 10 to the input.
template <typename T>
struct custom_constant_add {
    static T do_shift(T input) {
        return input + static_cast<T>(10);
    }
};

/******************************************************************************/

// Adds the constants 1, 2, 3 and 4 to the input, one after another.
template <typename T>
struct custom_multiple_constant_add {
    static T do_shift(T input) {
        return input + static_cast<T>(1) + static_cast<T>(2)
                     + static_cast<T>(3) + static_cast<T>(4);
    }
};

/******************************************************************************/

// Subtracts the constant 10 from the input.
template <typename T>
struct custom_constant_sub {
    static T do_shift(T input) {
        return input - static_cast<T>(10);
    }
};

/******************************************************************************/

// Subtracts the constants 1, 2, 3 and 4 from the input, one after another.
template <typename T>
struct custom_multiple_constant_sub {
    static T do_shift(T input) {
        return input - static_cast<T>(1) - static_cast<T>(2)
                     - static_cast<T>(3) - static_cast<T>(4);
    }
};

/******************************************************************************/

// Multiplies the input by the constant 120.
template <typename T>
struct custom_constant_multiply {
    static T do_shift(T input) {
        return input * static_cast<T>(120);
    }
};

/******************************************************************************/

// Multiplies the input by 2, 3, 4 and 5 in turn.
// After constant folding this should result in a single multiply.
template <typename T>
struct custom_multiple_constant_multiply {
    static T do_shift(T input) {
        return input * static_cast<T>(2) * static_cast<T>(3)
                     * static_cast<T>(4) * static_cast<T>(5);
    }
};

/******************************************************************************/

// Adds the product 2 * 3 * 4 * 5 to the input.
// After constant folding this should result in a single add.
template <typename T>
struct custom_multiple_constant_multiply2 {
    static T do_shift(T input) {
        return input + static_cast<T>(2) * static_cast<T>(3)
                     * static_cast<T>(4) * static_cast<T>(5);
    }
};

/******************************************************************************/

// Divides the input by the constant 5.
template <typename T>
struct custom_constant_divide {
    static T do_shift(T input) {
        return input / static_cast<T>(5);
    }
};

/******************************************************************************/

// Divides the input by 2, then 3, then 4, then 5.
template <typename T>
struct custom_multiple_constant_divide {
    static T do_shift(T input) {
        return (((input / static_cast<T>(2)) / static_cast<T>(3))
                       / static_cast<T>(4)) / static_cast<T>(5);
    }
};

/******************************************************************************/

// Adds ((120 / 3) / 4) / 5 to the input.
// This is more likely to have its constants fused than the chained-divide
// version, because the divisions involve constants only.
template <typename T>
struct custom_multiple_constant_divide2 {
    static T do_shift(T input) {
        return input + (((static_cast<T>(120) / static_cast<T>(3))
                                / static_cast<T>(4)) / static_cast<T>(5));
    }
};

/******************************************************************************/

// Evaluates input + 2 - 3 * 4 / 5 with the usual operator precedence.
template <typename T>
struct custom_multiple_constant_mixed {
    static T do_shift(T input) {
        return input + static_cast<T>(2)
                     - static_cast<T>(3) * static_cast<T>(4) / static_cast<T>(5);
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with the constant 10.
template <typename T>
struct custom_constant_and {
    static T do_shift(T input) {
        return input & static_cast<T>(10);
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with 15, 30, 31 and 63 in turn.
template <typename T>
struct custom_multiple_constant_and {
    static T do_shift(T input) {
        return input & static_cast<T>(15) & static_cast<T>(30)
                     & static_cast<T>(31) & static_cast<T>(63);
    }
};

/******************************************************************************/

// Bitwise-ORs the input with the constant 10.
template <typename T>
struct custom_constant_or {
    static T do_shift(T input) {
        return input | static_cast<T>(10);
    }
};

/******************************************************************************/

// Bitwise-ORs the input with 15, 30, 31 and 63 in turn.
template <typename T>
struct custom_multiple_constant_or {
    static T do_shift(T input) {
        return input | static_cast<T>(15) | static_cast<T>(30)
                     | static_cast<T>(31) | static_cast<T>(63);
    }
};

/******************************************************************************/

// Bitwise-XORs the input with the constant 10.
template <typename T>
struct custom_constant_xor {
    static T do_shift(T input) {
        return input ^ static_cast<T>(10);
    }
};

/******************************************************************************/

// Bitwise-XORs the input with 15, 30, 31 and 63 in turn.
template <typename T>
struct custom_multiple_constant_xor {
    static T do_shift(T input) {
        return input ^ static_cast<T>(15) ^ static_cast<T>(30)
                     ^ static_cast<T>(31) ^ static_cast<T>(63);
    }
};

/******************************************************************************/

// Ignores the input and returns the constant 2.
template <typename T>
struct custom_two {
    static T do_shift(T input) {
        return static_cast<T>(2);
    }
};

/******************************************************************************/
        
// Ignores the input and returns 1 + 2.
template <typename T>
struct custom_add_constants {
    static T do_shift(T input) {
        return static_cast<T>(1) + static_cast<T>(2);
    }
};

/******************************************************************************/

// Ignores the input and returns 2 - 1.
template <typename T>
struct custom_sub_constants {
    static T do_shift(T input) {
        return static_cast<T>(2) - static_cast<T>(1);
    }
};

/******************************************************************************/

// Ignores the input and returns 2 * 3.
template <typename T>
struct custom_multiply_constants {
    static T do_shift(T input) {
        return static_cast<T>(2) * static_cast<T>(3);
    }
};

/******************************************************************************/

// Ignores the input and returns 20 / 10.
template <typename T>
struct custom_divide_constants {
    static T do_shift(T input) {
        return static_cast<T>(20) / static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns 23 % 10.
template <typename T>
struct custom_mod_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) % static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns 23 & 10.
template <typename T>
struct custom_and_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) & static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns 23 | 10.
template <typename T>
struct custom_or_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) | static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns 23 ^ 10.
template <typename T>
struct custom_xor_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) ^ static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 == 10, converted to T.
template <typename T>
struct custom_equal_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) == static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 != 10, converted to T.
template <typename T>
struct custom_notequal_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) != static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 > 10, converted to T.
template <typename T>
struct custom_greaterthan_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) > static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 < 10, converted to T.
template <typename T>
struct custom_lessthan_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) < static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 >= 10, converted to T.
template <typename T>
struct custom_greaterthanequal_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) >= static_cast<T>(10);
    }
};

/******************************************************************************/

// Ignores the input and returns the outcome of 23 <= 10, converted to T.
template <typename T>
struct custom_lessthanequal_constants {
    static T do_shift(T input) {
        return static_cast<T>(23) <= static_cast<T>(10);
    }
};

/******************************************************************************/

// Adds the variable v1 to the input.
template <typename T>
struct custom_add_variable {
    static T do_shift(T input, T v1) {
        return input + v1;
    }
};

/******************************************************************************/

// Subtracts the variable v1 from the input.
template <typename T>
struct custom_sub_variable {
    static T do_shift(T input, T v1) {
        return input - v1;
    }
};

/******************************************************************************/

// Multiplies the input by the variable v1.
template <typename T>
struct custom_multiply_variable {
    static T do_shift(T input, T v1) {
        return input * v1;
    }
};

/******************************************************************************/

// Divides the input by the variable v1.
template <typename T>
struct custom_divide_variable {
    static T do_shift(T input, T v1) {
        return input / v1;
    }
};

/******************************************************************************/

// Adds v1, v2, v3 and v4 to the input, one after another.
template <typename T>
struct custom_add_multiple_variable {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input + v1 + v2 + v3 + v4;
    }
};

/******************************************************************************/

// Subtracts v1, v2, v3 and v4 from the input, one after another.
template <typename T>
struct custom_sub_multiple_variable {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input - v1 - v2 - v3 - v4;
    }
};

/******************************************************************************/

// Multiplies the input by v1, v2, v3 and v4 in turn.
template <typename T>
struct custom_multiply_multiple_variable {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input * v1 * v2 * v3 * v4;
    }
};

/******************************************************************************/

// Adds the product v1 * v2 * v3 * v4 to the input.
// The product does not involve the input, so it is more likely to be moved
// out of loops; it also serves as a sanity check.
template <typename T>
struct custom_multiply_multiple_variable2 {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input + v1 * v2 * v3 * v4;
    }
};

/******************************************************************************/

// Divides the input by v1, then v2, then v3, then v4.
// This can NOT have CSE and loop invariant motion applied in integer math,
// and can only be optimized in float if inexact math is allowed.
template <typename T>
struct custom_divide_multiple_variable {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return (((input / v1) / v2) / v3) / v4;
    }
};

/******************************************************************************/

// Adds ((v1 / v2) / v3) / v4 to the input.
// This can have CSE and loop invariant motion applied in integer math,
// and should be optimizable without inexact math.
template <typename T>
struct custom_divide_multiple_variable2 {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input + (((v1 / v2) / v3) / v4);
    }
};

/******************************************************************************/

// Evaluates input + v1 - v2 * v3 / v4 with the usual operator precedence.
template <typename T>
struct custom_mixed_multiple_variable {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input + v1 - v2 * v3 / v4;
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with the variable v1.
template <typename T>
struct custom_variable_and {
    static T do_shift(T input, T v1) {
        return input & v1;
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with v1, v2, v3 and v4 in turn.
template <typename T>
struct custom_multiple_variable_and {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input & v1 & v2 & v3 & v4;
    }
};

/******************************************************************************/

// Bitwise-ORs the input with the variable v1.
template <typename T>
struct custom_variable_or {
    static T do_shift(T input, T v1) {
        return input | v1;
    }
};

/******************************************************************************/

// Bitwise-ORs the input with v1, v2, v3 and v4 in turn.
template <typename T>
struct custom_multiple_variable_or {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input | v1 | v2 | v3 | v4;
    }
};

/******************************************************************************/

// Bitwise-XORs the input with the variable v1.
template <typename T>
struct custom_variable_xor {
    static T do_shift(T input, T v1) {
        return input ^ v1;
    }
};

/******************************************************************************/

// Bitwise-XORs the input with v1, v2, v3 and v4 in turn.
template <typename T>
struct custom_multiple_variable_xor {
    static T do_shift(T input, T v1, T v2, T v3, T v4) {
        return input ^ v1 ^ v2 ^ v3 ^ v4;
    }
};


/******************************************************************************/

// Returns the input unchanged.
template <typename T>
struct custom_identity {
    static T do_shift(T input) {
        return input;
    }
};

/******************************************************************************/

// Adds the constant 0 to the input.
template <typename T>
struct custom_add_zero {
    static T do_shift(T input) {
        return input + static_cast<T>(0);
    }
};

/******************************************************************************/

// Subtracts the constant 0 from the input.
template <typename T>
struct custom_sub_zero {
    static T do_shift(T input) {
        return input - static_cast<T>(0);
    }
};

/******************************************************************************/

// Returns the arithmetic negation of the input.
template <typename T>
struct custom_negate {
    static T do_shift(T input) {
        return -input;
    }
};

/******************************************************************************/

// Negates the input and then negates the outcome again.
template <typename T>
struct custom_negate_twice {
    static T do_shift(T input) {
        return -(-input);
    }
};

/******************************************************************************/

// Subtracts the input from the constant 0.
template <typename T>
struct custom_zero_minus {
    static T do_shift(T input) {
        return static_cast<T>(0) - input;
    }
};

/******************************************************************************/

// Multiplies the input by the constant 1.
template <typename T>
struct custom_times_one {
    static T do_shift(T input) {
        return input * static_cast<T>(1);
    }
};

/******************************************************************************/

// Divides the input by the constant 1.
template <typename T>
struct custom_divideby_one {
    static T do_shift(T input) {
        return input / static_cast<T>(1);
    }
};

/******************************************************************************/

// Chains several identity operations (+0, -0, /1, 0 - x, negate, *1) on the
// input in a single expression.
template <typename T>
struct custom_algebra_mixed {
    static T do_shift(T input) {
        return (-(static_cast<T>(0)
                  - (((input + static_cast<T>(0)) - static_cast<T>(0)) / static_cast<T>(1))))
               * static_cast<T>(1);
    }
};

/******************************************************************************/

// Ignores the input and returns the constant 0.
template <typename T>
struct custom_zero {
    static T do_shift(T input) {
        return static_cast<T>(0);
    }
};

/******************************************************************************/

// Multiplies the input by the constant 0.
template <typename T>
struct custom_times_zero {
    static T do_shift(T input) {
        return input * static_cast<T>(0);
    }
};

/******************************************************************************/

// Subtracts the input from itself.
template <typename T>
struct custom_subtract_self {
    static T do_shift(T input) {
        return input - input;
    }
};
/******************************************************************************/

// Subtracts from the input a chain of identity operations (+0, /1, -0,
// 0 - x, negate, *1) applied to that same input, in a single expression.
template <typename T>
struct custom_algebra_mixed_constant {
    static T do_shift(T input) {
        return input
               - (-(static_cast<T>(0)
                    - (((input + static_cast<T>(0)) / static_cast<T>(1)) - static_cast<T>(0))))
                 * static_cast<T>(1);
    }
};

/******************************************************************************/

// Returns v1 * (v2 - v3).
template <typename T>
struct custom_cse1 {
    static T do_shift(T v1, T v2, T v3) {
        return v1 * (v2 - v3);
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with itself.
template <typename T>
struct custom_and_self {
    static T do_shift(T input) {
        return input & input;
    }
};

/******************************************************************************/

// Bitwise-ORs the input with itself.
template <typename T>
struct custom_or_self {
    static T do_shift(T input) {
        return input | input;
    }
};

/******************************************************************************/

// Bitwise-XORs the input with itself.
template <typename T>
struct custom_xor_self {
    static T do_shift(T input) {
        return input ^ input;
    }
};

/******************************************************************************/

// Bitwise-ORs the input with the constant 0.
template <typename T>
struct custom_or_zero {
    static T do_shift(T input) {
        return input | static_cast<T>(0);
    }
};

/******************************************************************************/

// Bitwise-XORs the input with the constant 0.
template <typename T>
struct custom_xor_zero {
    static T do_shift(T input) {
        return input ^ static_cast<T>(0);
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with the complement of the constant 0.
template <typename T>
struct custom_andnot_zero {
    static T do_shift(T input) {
        return input & ~static_cast<T>(0);
    }
};

/******************************************************************************/

// Bitwise-ANDs the input with the constant 0.
template <typename T>
struct custom_and_zero {
    static T do_shift(T input) {
        return input & static_cast<T>(0);
    }
};

/******************************************************************************/

// Takes the input modulo the constant 1.
template <typename T>
struct custom_mod_one {
    static T do_shift(T input) {
        return input % static_cast<T>(1);
    }
};

/******************************************************************************/

// Returns the outcome of input == input, converted to T.
template <typename T>
struct custom_equal_self {
    static T do_shift(T input) {
        return input == input;
    }
};

/******************************************************************************/

// Returns the outcome of input != input, converted to T.
template <typename T>
struct custom_notequal_self {
    static T do_shift(T input) {
        return input != input;
    }
};

/******************************************************************************/

// Returns the outcome of input > input, converted to T.
template <typename T>
struct custom_greaterthan_self {
    static T do_shift(T input) {
        return input > input;
    }
};

/******************************************************************************/

// Returns the outcome of input < input, converted to T.
template <typename T>
struct custom_lessthan_self {
    static T do_shift(T input) {
        return input < input;
    }
};

/******************************************************************************/

// Returns the outcome of input >= input, converted to T.
template <typename T>
struct custom_greaterthanequal_self {
    static T do_shift(T input) {
        return input >= input;
    }
};

/******************************************************************************/

// Returns the outcome of input <= input, converted to T.
template <typename T>
struct custom_lessthanequal_self {
    static T do_shift(T input) {
        return input <= input;
    }
};

/******************************************************************************/

// Timed run of a one-argument Shifter.
// For each of `iterations` passes, sums Shifter::do_shift(first[n]) over the
// first `count` elements and hands the sum to check_shifted_sum. The elapsed
// time is recorded under `label`.
template <typename T, typename Shifter>
void test_constant(T* first, int count, const char *label) {
    start_timer();

    for (int pass = 0; pass < iterations; ++pass) {
        T result = 0;
        for (int idx = 0; idx < count; ++idx) {
            result += Shifter::do_shift( first[idx] );
        }
        check_shifted_sum<T, Shifter>(result);
    }

    record_result( timer(), label );
}

/******************************************************************************/

// Timed run of a Shifter taking one extra variable.
// For each of `iterations` passes, sums Shifter::do_shift(first[n], v1) over
// the first `count` elements and hands the sum to check_shifted_variable_sum.
// The elapsed time is recorded under `label`.
template <typename T, typename Shifter>
void test_variable1(T* first, int count, T v1, const char *label) {
    start_timer();

    for (int pass = 0; pass < iterations; ++pass) {
        T result = 0;
        for (int idx = 0; idx < count; ++idx) {
            result += Shifter::do_shift( first[idx], v1 );
        }
        check_shifted_variable_sum<T, Shifter>(result, v1);
    }

    record_result( timer(), label );
}

/******************************************************************************/

// Timed run of a Shifter taking four extra variables.
// For each of `iterations` passes, sums
// Shifter::do_shift(first[n], v1, v2, v3, v4) over the first `count` elements
// and hands the sum to check_shifted_variable_sum. The elapsed time is
// recorded under `label`.
template <typename T, typename Shifter>
void test_variable4(T* first, int count, T v1, T v2, T v3, T v4, const char *label) {
    start_timer();

    for (int pass = 0; pass < iterations; ++pass) {
        T result = 0;
        for (int idx = 0; idx < count; ++idx) {
            result += Shifter::do_shift( first[idx], v1, v2, v3, v4 );
        }
        check_shifted_variable_sum<T, Shifter>(result, v1, v2, v3, v4);
    }

    record_result( timer(), label );
}

/******************************************************************************/

// Timed run of the hand-optimized common-subexpression variant: the Shifter
// value for each neighbouring pair is computed once, doubled, and then reused
// for both the add and the subtract.
// first[0] and first[1] are read unconditionally before the loop, so the
// caller must supply at least two elements. Each pass's sum goes to
// check_shifted_variable_sum_CSE; the elapsed time is recorded under `label`.
template <typename T, typename Shifter>
void test_CSE_opt(T* first, int count, T v1, const char *label) {
    start_timer();

    for (int pass = 0; pass < iterations; ++pass) {
        T result = 0;

        // leading pair, handled ahead of the loop
        T temp = Shifter::do_shift( v1, first[0], first[1] );
        temp += temp;
        result += first[0] + temp;
        result -= first[1] + temp;

        for (int idx = 1; idx < count; ++idx) {
            temp = Shifter::do_shift( v1, first[idx-1], first[idx] );
            temp += temp;
            result += first[idx-1] + temp;
            result -= first[idx] + temp;
        }
        check_shifted_variable_sum_CSE<T, Shifter>(result, v1);
    }

    record_result( timer(), label );
}

/******************************************************************************/

// Timed run of the unoptimized common-subexpression variant: the same
// Shifter::do_shift call is deliberately written out four times per pair
// (twice in the add, twice in the subtract).
// first[0] and first[1] are read unconditionally before the loop, so the
// caller must supply at least two elements. Each pass's sum goes to
// check_shifted_variable_sum_CSE; the elapsed time is recorded under `label`.
template <typename T, typename Shifter>
void test_CSE(T* first, int count, T v1, const char *label) {
    start_timer();

    for (int pass = 0; pass < iterations; ++pass) {
        T result = 0;

        // leading pair, handled ahead of the loop
        result += first[0]
                + Shifter::do_shift( v1, first[0], first[1] )
                + Shifter::do_shift( v1, first[0], first[1] );
        result -= first[1]
                + Shifter::do_shift( v1, first[0], first[1] )
                + Shifter::do_shift( v1, first[0], first[1] );

        for (int idx = 1; idx < count; ++idx) {
            result += first[idx-1]
                    + Shifter::do_shift( v1, first[idx-1], first[idx] )
                    + Shifter::do_shift( v1, first[idx-1], first[idx] );
            result -= first[idx]
                    + Shifter::do_shift( v1, first[idx-1], first[idx] )
                    + Shifter::do_shift( v1, first[idx-1], first[idx] );
        }
        check_shifted_variable_sum_CSE<T, Shifter>(result, v1);
    }

    record_result( timer(), label );
}

/******************************************************************************/

/*
    Copyright 2007-2008 Adobe Systems Incorporated
    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
    or a copy at http://stlab.adobe.com/licenses.html )


Goal:  Test compiler optimizations related to constant folding of simple language defined types

Assumptions:

        1) the compiler will combine constant calculations into a single constant for simple types
                aka constant folding
                result = A + B                  ==>             result = constant
                result = A - B                  ==>             result = constant
                result = A * B                  ==>             result = constant
                result = A / B                  ==>             result = constant
                result = A % B                  ==>             result = constant       for integer types
                result = (A == B)               ==>             result = constant       for integer types
                result = (A != B)               ==>             result = constant       for integer types
                result = (A > B)                ==>             result = constant       for integer types
                result = (A < B)                ==>             result = constant       for integer types
                result = (A >= B)               ==>             result = constant       for integer types
                result = (A <= B)               ==>             result = constant       for integer types
                result = (A & B)                ==>             result = constant       for integer types
                result = (A | B)                ==>             result = constant       for integer types
                result = (A ^ B)                ==>             result = constant       for integer types

                result = input + A + B + C + D  ==>             result = input + (A+B+C+D)
                result = input - A - B - C - D  ==>             result = input - (A+B+C+D)
                result = input * A * B * C * D  ==>             result = input * (A*B*C*D)
                result = input + A * B * C * D  ==>             result = input + (A*B*C*D)
                result = ((((input/A) /B) /C) /D)       ==>     result = input / (A*B*C*D)
                result = input + (((A /B) /C) /D)       ==>     result = input + (A/B/C/D)
                result = input & A & B & C & D  ==>             result = input & (A&B&C&D)                      for integer types
                result = input | A | B | C | D  ==>             result = input | (A|B|C|D)                      for integer types
                result = input ^ A ^ B ^ C ^ D  ==>             result = input ^ (A^B^C^D)                      for integer types


NOTE - in some cases, loop invariant code motion might move the constant calculation out of the inner loop,
        making it appear that the constants were folded.
                But in the constant result cases, we want the compiler to recognize the constant and move it out of the loop.

*/

/******************************************************************************/

#include "benchmark_stdint.hpp"
#include <cstddef>
#include <cstdio>
#include <ctime>
#include <cstdlib>
#include <cmath>
#include "benchmark_results.h"
#include "benchmark_timer.h"

/******************************************************************************/

// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
// main() overwrites base_iterations from argv[1] when that argument is given.
int base_iterations = 2000000;
// Pass count read by the timed test loops; main() sets it to
// base_iterations / 10 before running the int8_t test.
int iterations = base_iterations;


// 8000 items, or between 8k and 64k of data
// (8000 one-byte elements up to 8000 eight-byte elements)
// this is intended to remain within the L2 cache of most common CPUs
const int SIZE  = 8000;


// initial value for filling our arrays, may be changed from the command line
// (main() overwrites it from argv[2] when that argument is given)
double init_value = 1.0;

/******************************************************************************/

// our global arrays of numbers to be operated upon
// One array of SIZE elements per element type. Only data8 is used by the
// main() in this cut-down test; the others are declared but not referenced.

// floating point
double dataDouble[SIZE];
float dataFloat[SIZE];

// 64-bit integers
uint64_t data64unsigned[SIZE];
int64_t data64[SIZE];

// 32-bit integers
uint32_t data32unsigned[SIZE];
int32_t data32[SIZE];

// 16-bit integers
uint16_t data16unsigned[SIZE];
int16_t data16[SIZE];

// 8-bit integers
uint8_t data8unsigned[SIZE];
int8_t data8[SIZE];

/******************************************************************************/

#include "benchmark_shared_tests.h"

/******************************************************************************/


int main(int argc, char** argv) {
        
        // output command for documentation:
        int i;
        for (i = 0; i < argc; ++i)
                printf("%s ", argv[i] );
        printf("\n");

        if (argc > 1) base_iterations = atoi(argv[1]);
        if (argc > 2) init_value = (double) atof(argv[2]);



// int8_t
        ::fill(data8, data8+SIZE, int8_t(init_value));
        
        iterations = base_iterations / 10;
        test_constant<int8_t, custom_constant_add<int8_t> >(data8,SIZE,"int8_t 
constant add");

        summarize("int8_t constant folding", SIZE, iterations, kDontShowGMeans, 
kDontShowPenalty );

        return 0;
}

// the end
/******************************************************************************/
/******************************************************************************/

.text:0000000000400EFF ; 
---------------------------------------------------------------------------
.text:0000000000400F04                 align 10h
.text:0000000000400F10
.text:0000000000400F10 ; =============== S U B R O U T I N E =======================================
.text:0000000000400F10
.text:0000000000400F10 ; Attributes: bp-based frame
.text:0000000000400F10
.text:0000000000400F10                 public main
.text:0000000000400F10 main            proc near
.text:0000000000400F10                 push    rbp
.text:0000000000400F11                 mov     rbp, rsp
.text:0000000000400F14                 push    r14
.text:0000000000400F16                 push    r13
.text:0000000000400F18                 push    r12
.text:0000000000400F1A                 push    rbx
.text:0000000000400F1B                 call    _mcount
.text:0000000000400F20                 test    edi, edi
.text:0000000000400F22                 mov     r13d, edi
.text:0000000000400F25                 mov     r14, rsi
.text:0000000000400F28                 jle     short loc_400F4C
.text:0000000000400F2A                 mov     rbx, rsi
.text:0000000000400F2D                 xor     r12d, r12d
.text:0000000000400F30
.text:0000000000400F30 loc_400F30:                             ; CODE XREF: 
main+3Aj
.text:0000000000400F30                 mov     rsi, [rbx]
.text:0000000000400F33                 xor     eax, eax
.text:0000000000400F35                 mov     edi, offset aS  ; "%s "
.text:0000000000400F3A                 add     r12d, 1
.text:0000000000400F3E                 add     rbx, 8
.text:0000000000400F42                 call    _printf
.text:0000000000400F47                 cmp     r12d, r13d
.text:0000000000400F4A                 jnz     short loc_400F30
.text:0000000000400F4C
.text:0000000000400F4C loc_400F4C:                             ; CODE XREF: 
main+18j
.text:0000000000400F4C                 mov     edi, 0Ah        ; c
.text:0000000000400F51                 call    _putchar
.text:0000000000400F56                 cmp     r13d, 1
.text:0000000000400F5A                 jle     short loc_400F74
.text:0000000000400F5C                 mov     rdi, [r14+8]    ; nptr
.text:0000000000400F60                 xor     ecx, ecx        ; group
.text:0000000000400F62                 mov     edx, 0Ah        ; base
.text:0000000000400F67                 xor     esi, esi        ; endptr
.text:0000000000400F69                 call    ___strtol_internal
.text:0000000000400F6E                 mov     cs:base_iterations, eax
.text:0000000000400F74
.text:0000000000400F74 loc_400F74:                             ; CODE XREF: 
main+4Aj
.text:0000000000400F74                 cmp     r13d, 2
.text:0000000000400F78                 jg      loc_40110B
.text:0000000000400F7E
.text:0000000000400F7E loc_400F7E:                             ; CODE XREF: 
main+210j
.text:0000000000400F7E                 movsd   xmm0, cs:init_value
.text:0000000000400F86                 xor     eax, eax
.text:0000000000400F88                 cvttsd2si edx, xmm0
.text:0000000000400F8C                 db      66h, 66h, 66h
.text:0000000000400F8C                 nop
.text:0000000000400F90
.text:0000000000400F90 loc_400F90:                             ; CODE XREF: 
main+90j
.text:0000000000400F90                 mov     ds:data8[rax], dl
.text:0000000000400F96                 add     rax, 1
.text:0000000000400F9A                 cmp     rax, 1F40h
.text:0000000000400FA0                 jnz     short loc_400F90
.text:0000000000400FA2                 mov     ecx, cs:base_iterations
.text:0000000000400FA8                 mov     edx, 66666667h
.text:0000000000400FAD                 mov     eax, ecx
.text:0000000000400FAF                 sar     ecx, 1Fh
.text:0000000000400FB2                 imul    edx
.text:0000000000400FB4                 sar     edx, 2
.text:0000000000400FB7                 sub     edx, ecx
.text:0000000000400FB9                 mov     cs:iterations, edx
.text:0000000000400FBF                 call    _clock
.text:0000000000400FC4                 mov     cs:start_time, rax
.text:0000000000400FCB                 mov     eax, cs:iterations
.text:0000000000400FD1                 test    eax, eax
.text:0000000000400FD3                 jle     short loc_40103B
.text:0000000000400FD5                 xor     ebx, ebx
.text:0000000000400FD7
.text:0000000000400FD7 loc_400FD7:                             ; CODE XREF: main+129j
.text:0000000000400FD7                 xor     ecx, ecx
.text:0000000000400FD9                 xor     edx, edx
.text:0000000000400FDB                 db      66h, 66h
.text:0000000000400FDB                 nop
.text:0000000000400FDE                 db      66h
.text:0000000000400FDE                 nop
.text:0000000000400FE0
.text:0000000000400FE0 loc_400FE0:                             ; CODE XREF: main+E8j
.text:0000000000400FE0                 movzx   eax, ds:data8[rdx]
.text:0000000000400FE7                 add     rdx, 1
.text:0000000000400FEB                 add     eax, 0Ah
.text:0000000000400FEE                 cmp     rdx, 1F40h
.text:0000000000400FF5                 lea     ecx, [rax+rcx]
.text:0000000000400FF8                 jnz     short loc_400FE0
.text:0000000000400FFA                 movsd   xmm0, cs:init_value
.text:0000000000401002                 movsd   xmm1, cs:qword_401260
.text:000000000040100A                 cvttsd2si eax, xmm0
.text:000000000040100E                 add     eax, 0Ah
.text:0000000000401011                 shl     eax, 6
.text:0000000000401014                 sub     cl, al
.text:0000000000401016                 movsx   eax, cl
.text:0000000000401019                 mov     edx, eax
.text:000000000040101B                 sar     edx, 1Fh
.text:000000000040101E                 xor     eax, edx
.text:0000000000401020                 sub     eax, edx
.text:0000000000401022                 cvtsi2sd xmm0, eax
.text:0000000000401026                 ucomisd xmm1, xmm0
.text:000000000040102A                 jbe     loc_4010F4
.text:0000000000401030
.text:0000000000401030 loc_401030:                             ; CODE XREF: 
main+1F6j
.text:0000000000401030                 add     ebx, 1
.text:0000000000401033                 cmp     cs:iterations, ebx
.text:0000000000401039                 jg      short loc_400FD7
.text:000000000040103B
.text:000000000040103B loc_40103B:                             ; CODE XREF: 
main+C3j
.text:000000000040103B                 call    _clock
.text:0000000000401040                 mov     rdi, cs:results ; ptr
.text:0000000000401047                 mov     rbx, rax
.text:000000000040104A                 mov     cs:end_time, rax
.text:0000000000401051                 mov     r12, cs:start_time
.text:0000000000401058                 test    rdi, rdi
.text:000000000040105B                 jz      short loc_40106B
.text:000000000040105D                 mov     edx, cs:current_test
.text:0000000000401063                 cmp     edx, cs:allocated_results
.text:0000000000401069                 jl      short loc_40109C
.text:000000000040106B
.text:000000000040106B loc_40106B:                             ; CODE XREF: 
main+14Bj
.text:000000000040106B                 mov     esi, cs:allocated_results
.text:0000000000401071                 add     esi, 0Ah
.text:0000000000401074                 mov     cs:allocated_results, esi
.text:000000000040107A                 movsxd  rsi, esi
.text:000000000040107D                 shl     rsi, 4          ; size
.text:0000000000401081                 call    _realloc
.text:0000000000401086                 test    rax, rax
.text:0000000000401089                 mov     cs:results, rax
.text:0000000000401090                 jz      loc_401125
.text:0000000000401096                 mov     edx, cs:current_test
.text:000000000040109C
.text:000000000040109C loc_40109C:                             ; CODE XREF: 
main+159j
.text:000000000040109C                 sub     rbx, r12
.text:000000000040109F                 movsxd  rax, edx
.text:00000000004010A2                 xor     r8d, r8d
.text:00000000004010A5                 cvtsi2sd xmm0, rbx
.text:00000000004010AA                 shl     rax, 4
.text:00000000004010AE                 add     rax, cs:results
.text:00000000004010B5                 xor     ecx, ecx
.text:00000000004010B7                 mov     esi, 1F40h
.text:00000000004010BC                 mov     edi, offset aInt8_tConstant ; "int8_t constant folding"
.text:00000000004010C1                 mov     qword ptr [rax+8], 4012BAh
.text:00000000004010C9                 divsd   xmm0, cs:qword_401258
.text:00000000004010D1                 movsd   qword ptr [rax], xmm0
.text:00000000004010D5                 lea     eax, [rdx+1]
.text:00000000004010D8                 mov     edx, cs:iterations
.text:00000000004010DE                 mov     cs:current_test, eax
.text:00000000004010E4                 call    _Z9summarizePKciiii ; 
summarize(char  const*,int,int,int,int)
.text:00000000004010E9                 pop     rbx
.text:00000000004010EA                 pop     r12
.text:00000000004010EC                 pop     r13
.text:00000000004010EE                 pop     r14
.text:00000000004010F0                 leave
.text:00000000004010F1                 xor     eax, eax
.text:00000000004010F3                 retn
.text:00000000004010F4 ; 
---------------------------------------------------------------------------
.text:00000000004010F4
.text:00000000004010F4 loc_4010F4:                             ; CODE XREF: 
main+11Aj
.text:00000000004010F4                 mov     esi, cs:current_test
.text:00000000004010FA                 mov     edi, offset aTestIFailed ; "test %i failed\n"
.text:00000000004010FF                 xor     eax, eax
.text:0000000000401101                 call    _printf
.text:0000000000401106                 jmp     loc_401030
.text:000000000040110B ; 
---------------------------------------------------------------------------
.text:000000000040110B
.text:000000000040110B loc_40110B:                             ; CODE XREF: 
main+68j
.text:000000000040110B                 mov     rdi, [r14+10h]  ; nptr
.text:000000000040110F                 xor     edx, edx        ; group
.text:0000000000401111                 xor     esi, esi        ; endptr
.text:0000000000401113                 call    ___strtod_internal
.text:0000000000401118                 movsd   cs:init_value, xmm0
.text:0000000000401120                 jmp     loc_400F7E
.text:0000000000401125 ; 
---------------------------------------------------------------------------
.text:0000000000401125
.text:0000000000401125 loc_401125:                             ; CODE XREF: 
main+180j
.text:0000000000401125                 mov     esi, cs:allocated_results
.text:000000000040112B                 mov     edi, offset aCouldNotAlloca ; 
"Could not allocate %d results\n"
.text:0000000000401130                 call    _printf
.text:0000000000401135                 mov     edi, 0FFFFFFFFh ; status
.text:000000000040113A                 call    _exit
.text:000000000040113A main            endp

.text:0000000000400C98 ; 
---------------------------------------------------------------------------
.text:0000000000400C99                 align 20h
.text:0000000000400CA0
.text:0000000000400CA0 ; =============== S U B R O U T I N E =======================================
.text:0000000000400CA0
.text:0000000000400CA0
.text:0000000000400CA0                 public main
.text:0000000000400CA0 main            proc near
.text:0000000000400CA0
.text:0000000000400CA0 var_28          = qword ptr -28h
.text:0000000000400CA0
.text:0000000000400CA0                 push    r12
.text:0000000000400CA2                 push    rbp
.text:0000000000400CA3                 mov     rbp, rsi
.text:0000000000400CA6                 push    rbx
.text:0000000000400CA7                 mov     ebx, edi
.text:0000000000400CA9                 sub     rsp, 10h
.text:0000000000400CAD                 test    edi, edi
.text:0000000000400CAF                 jle     loc_400E3B
.text:0000000000400CB5                 xor     r12d, r12d
.text:0000000000400CB8                 db      66h, 66h, 66h
.text:0000000000400CB8                 nop
.text:0000000000400CBC                 db      66h, 66h, 66h
.text:0000000000400CBC                 nop
.text:0000000000400CC0
.text:0000000000400CC0 loc_400CC0:                             ; CODE XREF: 
main+38j
.text:0000000000400CC0                 mov     rsi, [rbp+r12*8+0]
.text:0000000000400CC5                 xor     eax, eax
.text:0000000000400CC7                 mov     edi, offset aS  ; "%s "
.text:0000000000400CCC                 add     r12, 1
.text:0000000000400CD0                 call    _printf
.text:0000000000400CD5                 cmp     ebx, r12d
.text:0000000000400CD8                 jg      short loc_400CC0
.text:0000000000400CDA                 mov     edi, 0Ah        ; c
.text:0000000000400CDF                 call    _putchar
.text:0000000000400CE4                 cmp     ebx, 1
.text:0000000000400CE7                 jle     short loc_400D1B
.text:0000000000400CE9                 mov     rdi, [rbp+8]    ; nptr
.text:0000000000400CED                 xor     ecx, ecx        ; group
.text:0000000000400CEF                 xor     esi, esi        ; endptr
.text:0000000000400CF1                 mov     edx, 0Ah        ; base
.text:0000000000400CF6                 call    ___strtol_internal
.text:0000000000400CFB                 cmp     ebx, 2
.text:0000000000400CFE                 mov     cs:base_iterations, eax
.text:0000000000400D04                 jz      short loc_400D1B
.text:0000000000400D06                 mov     rdi, [rbp+10h]  ; nptr
.text:0000000000400D0A                 xor     edx, edx        ; group
.text:0000000000400D0C                 xor     esi, esi        ; endptr
.text:0000000000400D0E                 call    ___strtod_internal
.text:0000000000400D13                 movsd   cs:init_value, xmm0
.text:0000000000400D1B
.text:0000000000400D1B loc_400D1B:                             ; CODE XREF: 
main+47j
.text:0000000000400D1B                                         ; main+64j ...
.text:0000000000400D1B                 movsd   xmm1, cs:init_value
.text:0000000000400D23                 mov     ecx, 1F4h
.text:0000000000400D28                 cvttsd2si eax, xmm1
.text:0000000000400D2C                 pxor    xmm1, xmm1
.text:0000000000400D30                 movd    xmm0, eax
.text:0000000000400D34                 xor     eax, eax
.text:0000000000400D36                 pshufb  xmm0, xmm1
.text:0000000000400D3B                 db      66h, 66h
.text:0000000000400D3B                 nop
.text:0000000000400D3E                 db      66h
.text:0000000000400D3E                 nop
.text:0000000000400D40
.text:0000000000400D40 loc_400D40:                             ; CODE XREF: 
main+B6j
.text:0000000000400D40                 mov     rdx, rax
.text:0000000000400D43                 add     rax, 1
.text:0000000000400D47                 shl     rdx, 4
.text:0000000000400D4B                 cmp     rcx, rax
.text:0000000000400D4E                 movdqa  xmmword ptr [rdx+5015A0h], xmm0
.text:0000000000400D56                 ja      short loc_400D40
.text:0000000000400D58                 mov     ecx, cs:base_iterations
.text:0000000000400D5E                 mov     edx, 66666667h
.text:0000000000400D63                 xor     ebx, ebx
.text:0000000000400D65                 mov     eax, ecx
.text:0000000000400D67                 sar     ecx, 1Fh
.text:0000000000400D6A                 imul    edx
.text:0000000000400D6C                 sar     edx, 2
.text:0000000000400D6F                 sub     edx, ecx
.text:0000000000400D71                 mov     cs:iterations, edx
.text:0000000000400D77                 call    _Z11start_timerv ; 
start_timer(void)
.text:0000000000400D7C                 mov     ecx, cs:iterations
.text:0000000000400D82                 movsd   xmm1, cs:qword_4010E0
.text:0000000000400D8A                 test    ecx, ecx
.text:0000000000400D8C                 jle     short loc_400DE3
.text:0000000000400D8E                 db      66h
.text:0000000000400D8E                 nop
.text:0000000000400D90
.text:0000000000400D90 loc_400D90:                             ; CODE XREF: main+141j
.text:0000000000400D90                 mov     edx, offset data8
.text:0000000000400D95                 xor     eax, eax
.text:0000000000400D97                 db      66h, 66h
.text:0000000000400D97                 nop
.text:0000000000400D9A                 db      66h, 66h
.text:0000000000400D9A                 nop
.text:0000000000400D9D                 db      66h, 66h
.text:0000000000400D9D                 nop
.text:0000000000400DA0
.text:0000000000400DA0 loc_400DA0:                             ; CODE XREF: main+110j
.text:0000000000400DA0                 add     eax, 0Ah
.text:0000000000400DA3                 add     al, [rdx]
.text:0000000000400DA5                 add     rdx, 1
.text:0000000000400DA9                 cmp     rdx, 5034E0h
.text:0000000000400DB0                 jnz     short loc_400DA0
.text:0000000000400DB2                 movsd   xmm0, cs:init_value
.text:0000000000400DBA                 cvttsd2si edx, xmm0
.text:0000000000400DBE                 add     edx, 0Ah
.text:0000000000400DC1                 shl     edx, 6
.text:0000000000400DC4                 sub     al, dl
.text:0000000000400DC6                 movsx   eax, al
.text:0000000000400DC9                 mov     edx, eax
.text:0000000000400DCB                 sar     edx, 1Fh
.text:0000000000400DCE                 xor     eax, edx
.text:0000000000400DD0                 sub     eax, edx
.text:0000000000400DD2                 cvtsi2sd xmm0, eax
.text:0000000000400DD6                 ucomisd xmm1, xmm0
.text:0000000000400DDA                 jbe     short loc_400E17
.text:0000000000400DDC
.text:0000000000400DDC loc_400DDC:                             ; CODE XREF: 
main+199j
.text:0000000000400DDC                 add     ebx, 1
.text:0000000000400DDF                 cmp     ebx, ecx
.text:0000000000400DE1                 jl      short loc_400D90
.text:0000000000400DE3
.text:0000000000400DE3 loc_400DE3:                             ; CODE XREF: 
main+ECj
.text:0000000000400DE3                 call    _Z5timerv       ; timer(void)
.text:0000000000400DE8                 mov     edi, offset aInt8_tConstant ; "int8_t constant add"
.text:0000000000400DED                 call    _Z13record_resultdPKc ; 
record_result(double,char  const*)
.text:0000000000400DF2                 mov     edx, cs:iterations
.text:0000000000400DF8                 xor     r8d, r8d
.text:0000000000400DFB                 xor     ecx, ecx
.text:0000000000400DFD                 mov     esi, 1F40h
.text:0000000000400E02                 mov     edi, offset aInt8_tConsta_0 ; 
"int8_t constant folding"
.text:0000000000400E07                 call    _Z9summarizePKciiii ; 
summarize(char  const*,int,int,int,int)
.text:0000000000400E0C                 add     rsp, 10h
.text:0000000000400E10                 xor     eax, eax
.text:0000000000400E12                 pop     rbx
.text:0000000000400E13                 pop     rbp
.text:0000000000400E14                 pop     r12
.text:0000000000400E16                 retn
.text:0000000000400E17 ; 
---------------------------------------------------------------------------
.text:0000000000400E17
.text:0000000000400E17 loc_400E17:                             ; CODE XREF: 
main+13Aj
.text:0000000000400E17                 mov     esi, cs:current_test
.text:0000000000400E1D                 mov     edi, offset aTestIFailed ; "test %i failed\n"
.text:0000000000400E22                 xor     eax, eax
.text:0000000000400E24                 movsd   [rsp+28h+var_28], xmm1
.text:0000000000400E29                 call    _printf
.text:0000000000400E2E                 mov     ecx, cs:iterations
.text:0000000000400E34                 movsd   xmm1, [rsp+28h+var_28]
.text:0000000000400E39                 jmp     short loc_400DDC
.text:0000000000400E3B ; 
---------------------------------------------------------------------------
.text:0000000000400E3B
.text:0000000000400E3B loc_400E3B:                             ; CODE XREF: 
main+Fj
.text:0000000000400E3B                 mov     edi, 0Ah        ; c
.text:0000000000400E40                 call    _putchar
.text:0000000000400E45                 jmp     loc_400D1B
.text:0000000000400E45 main            endp

Re: Performance degradation on g++ 4.6

Reply via email to