Memory buffer handling - additional optimization proposal

2017-01-21 Thread daniel

Hi
I have checked how gcc treats temporary buffers allocated in different 
ways (local buffer on the stack, malloced locally in the function, malloced 
outside and passed via an argument) and found that gcc could do a better 
job there. I have a few proposals for how to make things better:


1. Introduce __builtin_assume_temporary(ptr) and/or 
__attribute__((temporary)), which will tell gcc that a given function 
argument or variable is a temporary buffer and that its contents will not 
be used after the function returns. That way gcc could treat it as 
if it were allocated locally in the function and optimize out unnecessary 
memory writes.


2. One of the function versions used malloc() to allocate a buffer of 
constant size at the beginning of the function, and free() to release it 
at the end. When this function was inlined and called twice, its body was 
inserted twice in my code. The other function versions, with the buffer on 
the stack or malloced outside and passed via an argument, were inserted 
once. I am not sure whether this is a bug or a feature; someone familiar 
with this should check it.


3. If there is a valid reason to actually duplicate this code (e.g. loop 
unrolling), gcc could detect that the code calls malloc and free in a loop, 
repeatedly allocating and freeing a buffer of the same size. In such a case 
gcc could optimize this to call malloc and free only once. If someone uses 
calloc, gcc could simply clear the buffer before each loop iteration.


I tested this with gcc 5.4.0, which is shipped by default in 64-bit Cygwin. 
The test code I used is below.


Regards,
Daniel

#include <stdio.h>
#include <stdlib.h>

/* Edge length of the square scratch buffer: the functions below index
 * CNT * CNT ints as buff[row * CNT + col] and read CNT ints from data. */
#define CNT 4

/* Prefix placed in front of every func_N definition: marks it hot and
 * always_inline (GCC attributes) and gives it static inline linkage. */
#define ATTR_INLINE __attribute__((hot, always_inline)) static inline
/* Alternative kept for experiments: an empty ATTR_INLINE drops the attributes
 * and the static inline specifiers. */
//#define ATTR_INLINE

/*
 * Variant 1: the scratch buffer is a local array on the stack.
 *
 * Fills a CNT x CNT grid so that every cell of row r holds data[r], then
 * repeatedly replaces each cell of the leading size x size corner with the
 * sum of the cell below it and the cell to its right, for size = 0..CNT-1.
 *
 * data: at least CNT readable ints.
 * Returns the value left in the top-left cell.
 */
ATTR_INLINE int func_1(int* data)
{
    int scratch[CNT * CNT];

    /* Row-major fill: cell k belongs to row k / CNT. */
    for (int cell = 0; cell < CNT * CNT; ++cell)
    {
        scratch[cell] = data[cell / CNT];
    }

    for (int size = 0; size < CNT; ++size)
    {
        for (int row = 0; row < size; ++row)
        {
            int* cur = scratch + row * CNT;
            int* below = cur + CNT;

            for (int col = 0; col < size; ++col)
            {
                cur[col] = below[col] + cur[col + 1];
            }
        }
    }

    return scratch[0];
}

/*
 * Variant 2: the scratch buffer is allocated with malloc() on entry and
 * released with free() before returning.
 *
 * Fills a CNT x CNT grid so that every cell of row i holds data[i], then
 * repeatedly replaces each cell of the leading n x n corner with the sum of
 * the cell below it and the cell to its right, for n = 0..CNT-1.
 *
 * data: at least CNT readable ints.
 * Returns the value left in the top-left cell. Calls abort() if the scratch
 * buffer cannot be allocated.
 */
ATTR_INLINE int func_2(int* data)
{
    int* buff = (int*)malloc(CNT * CNT * sizeof(int));
    if (buff == NULL)
    {
        /* The int result has no spare value to signal failure, so stop
         * here instead of writing through a null pointer. */
        abort();
    }

    for (int i = 0; i < CNT; ++i)
    {
        for (int j = 0; j < CNT; ++j)
        {
            buff[i * CNT + j] = data[i];
        }
    }

    for (int n = 0; n < CNT; ++n)
    {
        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                buff[i * CNT + j] = buff[(i + 1) * CNT + j] + buff[i * CNT + j + 1];
            }
        }
    }

    /* Copy the result out before the buffer is released. */
    int result = buff[0];
    free(buff);
    return result;
}

/*
 * Variant 3: the scratch buffer is supplied by the caller, and both pointers
 * are declared __restrict__.
 *
 * Fills a CNT x CNT grid in buff so that every cell of a row holds the
 * matching data element, then repeatedly replaces each cell of the leading
 * span x span corner with the sum of the cell below it and the cell to its
 * right, for span = 0..CNT-1.
 *
 * data: at least CNT readable ints.
 * buff: at least CNT * CNT writable ints; its previous contents are overwritten.
 * Returns the value left in buff[0].
 */
ATTR_INLINE int func_3(int* __restrict__ data, int* __restrict__ buff)
{
    for (int row = 0; row < CNT; ++row)
    {
        const int base = row * CNT;

        for (int col = 0; col < CNT; ++col)
        {
            buff[base + col] = data[row];
        }
    }

    for (int span = 0; span < CNT; ++span)
    {
        for (int row = 0; row < span; ++row)
        {
            const int here = row * CNT;
            const int below = here + CNT;

            for (int col = 0; col < span; ++col)
            {
                buff[here + col] = buff[below + col] + buff[here + col + 1];
            }
        }
    }

    return buff[0];
}

/*
 * Variant 4: the scratch buffer is supplied by the caller through plain
 * (non-restrict) pointers.
 *
 * Fills a CNT x CNT grid in buff so that every cell of a row holds the
 * matching data element, then repeatedly replaces each cell of the leading
 * size x size corner with the sum of the cell below it and the cell to its
 * right, for size = 0..CNT-1.
 *
 * data: at least CNT readable ints.
 * buff: at least CNT * CNT writable ints; its previous contents are overwritten.
 * Returns the value left in buff[0].
 */
ATTR_INLINE int func_4(int* data, int* buff)
{
    /* data is re-read for every store: the pointers are not restrict, so
     * the load is deliberately not hoisted out of the loop. */
    for (int cell = 0; cell < CNT * CNT; ++cell)
    {
        buff[cell] = data[cell / CNT];
    }

    for (int size = 0; size < CNT; ++size)
    {
        for (int row = 0; row < size; ++row)
        {
            int* cur = buff + row * CNT;
            int* below = cur + CNT;

            for (int col = 0; col < size; ++col)
            {
                cur[col] = below[col] + cur[col + 1];
            }
        }
    }

    return buff[0];
}

/*
 * Test driver: builds CNT pseudo-random inputs, calls each func_N variant
 * twice, and prints a marker before each pair so the calls can be located
 * in the generated code. The accumulated results are returned as the exit
 * status so the calls cannot be discarded as unused.
 */
int main()
{
    int* data = (int*)malloc(CNT * sizeof(int));
    /* func_3 and func_4 write CNT * CNT ints into buff, so it must be sized
     * for the whole grid, not for a single row. */
    int* buff = (int*)malloc(CNT * CNT * sizeof(int));
    if (data == NULL || buff == NULL)
    {
        fprintf(stderr, "out of memory\n");
        free(data);
        free(buff);
        return EXIT_FAILURE;
    }

    /* Keep the inputs small: each call returns data[0] + 2*data[1] + data[2],
     * and eight such results are added below, which would overflow int with
     * unbounded rand() values. */
    for (int n = 0; n < CNT; ++n)
        data[n] = rand() % 256;

    int sum = 0;
    printf("1");
    sum += func_1(data);
    sum += func_1(data);
    printf("2");
    sum += func_2(data);
    sum += func_2(data);
    printf("3");
    sum += func_3(data, buff);
    sum += func_3(data, buff);
    printf("4");
    sum += func_4(data, buff);
    sum += func_4(data, buff);
    printf("4");

    free(buff);
    free(data);
    return sum;
}


Why does 2nd loop only print values 10-19 and not 0-19?

2017-01-21 Thread L A Walsh




It may be this should go to "gcc-help", due to my
not seeing my error, but



 gcc -v

Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/4.9/lto-wrapper
Target: x86_64-suse-linux
Configured with: ../configure --prefix=/usr --infodir=/usr/share/info 
--mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 
--enable-languages=c,c++,objc,fortran,obj-c++,java,ada,go 
--enable-checking=release --with-gxx-include-dir=/usr/include/c++/4.9 
--enable-ssp --disable-libssp --disable-libvtv --disable-plugin 
--with-bugurl=http://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' 
--disable-libgcj --with-slibdir=/lib64 --with-system-zlib 
--enable-__cxa_atexit --enable-libstdcxx-allocator=new 
--disable-libstdcxx-pch --enable-version-specific-runtime-libs 
--enable-linker-build-id --enable-linux-futex --program-suffix=-4.9 
--without-system-libunwind --enable-multilib --with-arch-32=i586 
--with-tune=generic --build=x86_64-suse-linux --host=x86_64-suse-linux

Thread model: posix
gcc version 4.9.0 (SUSE Linux)


Program:
-

/* why does data printing loop start @ 10? */
/* linda w. (gcc(at)tlinx(dot)org */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>


/* Twenty consecutive integers, 0 through 19; main() uses this array only
 * through sizeof to derive its counts. */
static int source[] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19
};


/*
 * Reproduction program for the question in this thread: allocates a
 * zero-filled buffer with calloc, prints a header row, then prints the
 * buffer contents. The code is kept exactly as posted; the review notes
 * below mark the defects, including the one that makes the data loop
 * start at 10.
 */
int main(int argc, char **argv) {
   int i;
   /* NOTE(review): the first calloc argument is sizeof(source), a byte count,
    * not the element count sizeof(source)/sizeof(source[0]); more elements are
    * requested than the loops below ever index. */
   int * dest= calloc(sizeof(source), sizeof(source[0]));
   /* errno is captured right after calloc so the message below reports the
    * value seen at that point. */
   int err = errno;

   if (!dest) {
   /* NOTE(review): the string literal below is split across two lines,
    * presumably by mail line-wrapping; it has to be rejoined to compile. */
   fprintf(stderr, "Error, dest could not be allocated 
(errno=%d)\n", err);

   exit(1);
   }

   /* NOTE(review): the argument has type size_t but the conversion is %d;
    * %zu is the matching specifier. */
   printf("for(i=0; i < %d)...\n", sizeof(source)/sizeof(source[0]));

   /* header */
   /* NOTE(review): `i==0` is a comparison whose result is discarded, not an
    * assignment, so `i` is read here without ever being initialized. It should
    * read `i=0`. The bound is half the element count. */
   for (i==0; i < sizeof(source)/(2*sizeof(source[0])); ++i) {
   /* The trailing 0 argument has no matching conversion in the format. */
   printf("  %2d--|",  i,0);
   }

   printf("\ndata:\n"); /* shouldn't nxt loop start @ '0' ? */

   /* NOTE(review): same `i==0` problem: `i` is not reset, so this loop resumes
    * from the value the header loop finished with, which is why the posted
    * output starts at 10. dest is never written, so every value printed is the
    * zero from calloc. */
   for (i==0; i < sizeof(source)/sizeof(source[0]); i++) {
   printf(" %d:%2d;",  i, *(dest+i));
   /* Break the line after every tenth value. */
   if (i%10 == 9) printf("\n");
   }
   printf("\n");
}
---
output:

for(i=0; i < 20)...
  0--|   1--|   2--|   3--|   4--|   5--|   6--|   7--|   8--|   9--|
data:
10: 0; 11: 0; 12: 0; 13: 0; 14: 0; 15: 0; 16: 0; 17: 0; 18: 0; 19: 0;







Re: Why does 2nd loop only print values 10-19 and not 0-19?

2017-01-21 Thread Paul Smith
On Sat, 2017-01-21 at 13:25 -0800, L A Walsh wrote:
> It may be this should go to "gcc-help", due to my
> not seeing my error, but

Probably better to start there... if it's really a GCC bug you'll be
quickly redirected here.

>     for (i==0; i < sizeof(source)/(2*sizeof(source[0])); ++i) {
^^

This should be "for (i=0; ...".


Re: Why does 2nd loop only print values 10-19 and not 0-19?

2017-01-21 Thread L A Walsh

Paul Smith wrote:

On Sat, 2017-01-21 at 13:25 -0800, L A Walsh wrote:

It may be this should go to "gcc-help", due to my
not seeing my error, but


Probably better to start there... if it's really a GCC bug you'll be
quickly redirected here.


for (i==0; i < sizeof(source)/(2*sizeof(source[0])); ++i) {

^^

This should be "for (i=0; ...".

---
Sorry for the bother...


Re: Why does 2nd loop only print values 10-19 and not 0-19?

2017-01-21 Thread Andreas Schwab
On Jan 21 2017, L A Walsh  wrote:

> It may be this should go to "gcc-help",

You should be using -Wall.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."