Package: clblas
Version: 2.12-1
Control: tags -1 patch upstream
(The "upstream" tag is based on inspecting the source, not on testing against upstream.)
Some clblas operations fail on beignet-opencl-icd with
stringInput.cl:179:24: error: variables in the local address space can
only be declared in the outermost scope of a kernel function
Checking the source confirms that they are declaring __local variables
in an inner scope, which is not allowed by the OpenCL standard:
https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/local.html
This affects example_chpmv, example_snrm2, example_sspmv, example_stpmv,
example_stpsv, example_strmv, example_strsv.
The attached patch makes the examples run, but has *not* been tested
beyond those examples (which I suspect do not check correctness): the test suite
fails to build with what looks like
https://github.com/clMathLibraries/clBLAS/issues/338.
Description: Move __local declarations to kernel function scope
The OpenCL spec does not allow declaring local variables in scopes
below kernel function scope, and such declarations fail to build
on at least beignet-opencl-icd.
https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/local.html
Author: Rebecca N. Palmer <rebecca_pal...@zoho.com>
Bug-Debian: https://bugs.debian.org/<bugnumber>
Forwarded: no
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/trmv.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/trmv.cl
@@ -75,6 +75,8 @@ __kernel void %PREFIXtrmv_CU_kernel( __g
__local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+ volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
+ volatile __local %TYPE* sData = sDataTemp;
const int gIdx = get_global_id(0);
const int bIdx = get_group_id(0);
@@ -197,8 +199,6 @@ __kernel void %PREFIXtrmv_CU_kernel( __g
}
- volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- volatile __local %TYPE* sData = sDataTemp;
//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -325,6 +325,8 @@ __kernel void %PREFIXtrmv_CL_kernel( __g
#endif
__local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
+ __local %TYPE* sData = sDataTemp;
size_t gIdx = get_global_id(0);
size_t bIdx = get_group_id(0);
@@ -448,8 +450,6 @@ __kernel void %PREFIXtrmv_CL_kernel( __g
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- __local %TYPE* sData = sDataTemp;
//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -584,6 +584,7 @@ __kernel void %PREFIXtrmv_CLT_kernel( __
int threadIdx = get_local_id(0);
__local %TYPE xShared[TARGET_WIDTH];
+ __local %TYPE%V* xSharedTemp;
int startCol = blockIdx * %TARGET_ROWS;
@@ -608,7 +609,7 @@ __kernel void %PREFIXtrmv_CLT_kernel( __
//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
%TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
+ xSharedTemp = (xShared + rowShift);
xData = *(xSharedTemp);
int row = startRow + rowShift;
@@ -761,6 +762,9 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
int threadIdx = get_local_id(0);
__local %TYPE xShared[TARGET_WIDTH];
+ __local %TYPE%V* xSharedTemp;
+ __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
+ __local %TYPE* sData = sDataTemp;
int startRow = 0;
int startCol = N - (blockIdx + 1)* %TARGET_ROWS;
@@ -842,7 +846,7 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
%TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
+ xSharedTemp = (xShared + rowShift);
xData = *(xSharedTemp);
int row = startRow + rowShift;
@@ -861,8 +865,6 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
//__local float4 sData[16][4];
//sData[(threadIdx & 15)][colShift] = acc;
//barrier(CLK_LOCAL_MEM_FENCE);
- __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
- __local %TYPE* sData = sDataTemp;
//sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/trsv_gemv.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/trsv_gemv.cl
@@ -35,6 +35,11 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
+
if ( incx < 0 ) // Goto end of vector
{
@@ -100,7 +105,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol - %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < (lastRow + 1); i++)
{
@@ -108,8 +112,8 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
%TYPE xVal = xnew[ targetRow * incx];
%SUB(sum, xVal, sum);
- xShared = sum;
- xnew[ targetRow * incx ] = xShared;
+ xShared_scalar = sum;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -118,7 +122,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -164,8 +168,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -193,7 +195,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -231,7 +232,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// Solve for first x - Do the rest in loop
%TYPE x[%V];
%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
- xShared[%V - 1] = x[%V - 1];
+ xShared_array[%V - 1] = x[%V - 1];
xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
//#pragma unroll
@@ -256,13 +257,13 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
@@ -270,10 +271,10 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for( int j=0; j < %V; j++)
{
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+ //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared_array[%V - 1 -j];
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[VECTOR_SIZE - 1 -j]);
}
}
@@ -335,6 +336,11 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
if ( incx < 0 ) // Goto end of vector
{
@@ -400,7 +406,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol - %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < (lastRow + 1); i++)
{
@@ -412,9 +417,9 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// Handle diagonal element
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
+ %DIV(xShared_scalar, sum, loadedA);
- xnew[ targetRow * incx ] = xShared;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -423,7 +428,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -443,7 +448,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -469,8 +473,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -498,7 +500,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -538,7 +539,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
%DIV(sumVecReg[VECTOR_SIZE - 1], x[VECTOR_SIZE -1], reg[VECTOR_SIZE - 1][VECTOR_SIZE - 1]);
x[VECTOR_SIZE -1] = sumVecReg[VECTOR_SIZE - 1];
- xShared[%V - 1] = x[%V - 1];
+ xShared_array[%V - 1] = x[%V - 1];
xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
//#pragma unroll
@@ -568,13 +569,13 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
@@ -582,10 +583,10 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for( int j=0; j < %V; j++)
{
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+ //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared_array[%V - 1 -j];
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[VECTOR_SIZE - 1 -j]);
}
}
@@ -649,6 +650,11 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
if ( incx < 0 ) // Goto end of vector
{
@@ -715,7 +721,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol + %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < ((lastRow -startRow) + 1); i++)
{
@@ -726,16 +731,16 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
if( isUnity)
{
- xShared = sum;
+ xShared_scalar = sum;
}
else // Handle diagonal element
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
+ %DIV(xShared_scalar, sum, loadedA);
}
- xnew[ targetRow * incx ] = xShared;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -744,7 +749,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -764,7 +769,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -790,8 +794,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -819,7 +821,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -857,7 +858,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// Solve for first x - Do the rest in loop
%TYPE x[%V];
%SUB(x[0], xVal[0], sumVecReg[0]);
- xShared[0] = x[0];
+ xShared_array[0] = x[0];
xnew[ (targetRowTemp)* incx ] = x[0];
//#pragma unroll
@@ -882,13 +883,13 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
{
@@ -897,7 +898,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[j]);
}
}
@@ -958,6 +959,12 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
+
if ( incx < 0 ) // Goto end of vector
{
@@ -1024,7 +1031,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol + %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < ((lastRow -startRow) + 1); i++)
{
@@ -1036,8 +1042,8 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// Handle diagonal element
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
- xnew[ targetRow * incx ] = xShared;
+ %DIV(xShared_scalar, sum, loadedA);
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1046,7 +1052,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -1066,7 +1072,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -1092,8 +1097,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1121,7 +1124,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -1161,7 +1163,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%SUB(x[0], xVal[0], sumVecReg[0]);
%DIV(sumVecReg[0], x[0], reg[0][0]);
x[0] = sumVecReg[0];
- xShared[0] = sumVecReg[0];
+ xShared_array[0] = sumVecReg[0];
xnew[ (targetRowTemp)* incx ] = sumVecReg[0];
//#pragma unroll
@@ -1191,13 +1193,13 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
//#pragma unroll
for(int m = 1; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
{
@@ -1206,7 +1208,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[j]);
}
}
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/nrm2.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/nrm2.cl
@@ -172,8 +172,14 @@ __kernel void %PREFIXnrm2_ssq_kernel( __
// If scaleOfWG was zero, that means the whole array encountered before was filled with zeroes
// Note: _scale is a local variable, either all enter or none
- if(isnotequal(scaleOfWG, PZERO))
+ if(isequal(scaleOfWG, PZERO))
{
+ if( (get_local_id(0)) == 0 ) {
+ scratchBuff[ get_group_id(0) ] = scaleOfWG;
+ scratchBuff[ numWGs + get_group_id(0) ] = 0.0f;
+ }
+ return;
+ }
for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
{
%TYPE%V vReg1;
@@ -201,7 +207,7 @@ __kernel void %PREFIXnrm2_ssq_kernel( __
}
%REDUCTION_BY_SUM( ssq );
- }
+
if( (get_local_id(0)) == 0 ) {
scratchBuff[ get_group_id(0) ] = scaleOfWG;
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/reduction.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/reduction.cl
@@ -319,8 +319,13 @@ __kernel void %PREFIXred_ssq_kernel( __g
// If scale was zero, that means the whole array encountered before was filled with zeroes
// Note: scale is a local variable, either all enter or none
- if(isnotequal(scaleOfWG, ZERO))
+ if(isequal(scaleOfWG, ZERO))
{
+ if( (get_local_id(0)) == 0 ) {
+ res[0] = 0.0f;
+ }
+ return;
+ }
for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
{
%TYPE%V scale1, ssq1;
@@ -342,7 +347,7 @@ __kernel void %PREFIXred_ssq_kernel( __g
}
%REDUCTION_BY_SUM( ssq );
- }
+
if( (get_local_id(0)) == 0 ) {
res[0] = scaleOfWG * sqrt(ssq);