Package: clblas
Version: 2.12-1
Control: tags -1 patch upstream
(The "upstream" tag is based on inspecting the source, not on testing against upstream.)
Some clblas operations fail on beignet-opencl-icd with
stringInput.cl:179:24: error: variables in the local address space can
only be declared in the outermost scope of a kernel function
Checking the source confirms that they are declaring __local variables
in an inner scope, which is not allowed by the OpenCL standard:
https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/local.html
This affects example_chpmv, example_snrm2, example_sspmv, example_stpmv,
example_stpsv, example_strmv, example_strsv.
The attached patch makes the examples run, but has *not* been tested
beyond those examples (which I suspect do not check correctness): the test suite
fails to build with what looks like
https://github.com/clMathLibraries/clBLAS/issues/338.
Description: Move __local declarations to kernel function scope
The OpenCL spec does not allow declaring local variables in scopes
below kernel function scope, and such declarations fail to build
on at least beignet-opencl-icd.
https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/local.html
Author: Rebecca N. Palmer <rebecca_pal...@zoho.com>
Bug-Debian: https://bugs.debian.org/<bugnumber>
Forwarded: no
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/trmv.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/trmv.cl
@@ -75,6 +75,8 @@ __kernel void %PREFIXtrmv_CU_kernel( __g
__local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+ volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
+ volatile __local %TYPE* sData = sDataTemp;
const int gIdx = get_global_id(0);
const int bIdx = get_group_id(0);
@@ -197,8 +199,6 @@ __kernel void %PREFIXtrmv_CU_kernel( __g
}
- volatile __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- volatile __local %TYPE* sData = sDataTemp;
//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -325,6 +325,8 @@ __kernel void %PREFIXtrmv_CL_kernel( __g
#endif
__local %TYPE sXData[ TARGET_WIDTH ]; // Each column is multiplied with a common x_vector element
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
+ __local %TYPE* sData = sDataTemp;
size_t gIdx = get_global_id(0);
size_t bIdx = get_group_id(0);
@@ -448,8 +450,6 @@ __kernel void %PREFIXtrmv_CL_kernel( __g
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * TARGET_WIDTH];
- __local %TYPE* sData = sDataTemp;
//sDataTemp[(threadIdx & ( TARGET_ROWS_BY_VEC -1 )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -584,6 +584,7 @@ __kernel void %PREFIXtrmv_CLT_kernel( __
int threadIdx = get_local_id(0);
__local %TYPE xShared[TARGET_WIDTH];
+ __local %TYPE%V* xSharedTemp;
int startCol = blockIdx * %TARGET_ROWS;
@@ -608,7 +609,7 @@ __kernel void %PREFIXtrmv_CLT_kernel( __
//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
%TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
+ xSharedTemp = (xShared + rowShift);
xData = *(xSharedTemp);
int row = startRow + rowShift;
@@ -761,6 +762,9 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
int threadIdx = get_local_id(0);
__local %TYPE xShared[TARGET_WIDTH];
+ __local %TYPE%V* xSharedTemp;
+ __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
+ __local %TYPE* sData = sDataTemp;
int startRow = 0;
int startCol = N - (blockIdx + 1)* %TARGET_ROWS;
@@ -842,7 +846,7 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
//float4 xData = (float4)(xShared[ rowShift ], xShared[ rowShift + 1], xShared[ rowShift + 2], xShared[ rowShift + 3]);
%TYPE%V xData;
- __local %TYPE%V* xSharedTemp = (xShared + rowShift);
+ xSharedTemp = (xShared + rowShift);
xData = *(xSharedTemp);
int row = startRow + rowShift;
@@ -861,8 +865,6 @@ __kernel void %PREFIXtrmv_CUT_kernel( __
//__local float4 sData[16][4];
//sData[(threadIdx & 15)][colShift] = acc;
//barrier(CLK_LOCAL_MEM_FENCE);
- __local %TYPE%V sDataTemp[TARGET_WIDTH_BY_VEC * %TARGET_ROWS];
- __local %TYPE* sData = sDataTemp;
//sDataTemp[ ( threadIdx & ( TARGET_WIDTH_BY_VEC -1 ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
sDataTemp[ ( threadIdx % ( TARGET_WIDTH_BY_VEC ) ) + (colShift * TARGET_WIDTH_BY_VEC) ] = acc;
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/trsv_gemv.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/trsv_gemv.cl
@@ -35,6 +35,11 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
+
if ( incx < 0 ) // Goto end of vector
{
@@ -100,7 +105,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol - %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < (lastRow + 1); i++)
{
@@ -108,8 +112,8 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
%TYPE xVal = xnew[ targetRow * incx];
%SUB(sum, xVal, sum);
- xShared = sum;
- xnew[ targetRow * incx ] = xShared;
+ xShared_scalar = sum;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -118,7 +122,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -164,8 +168,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -193,7 +195,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -231,7 +232,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// Solve for first x - Do the rest in loop
%TYPE x[%V];
%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
- xShared[%V - 1] = x[%V - 1];
+ xShared_array[%V - 1] = x[%V - 1];
xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
//#pragma unroll
@@ -256,13 +257,13 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
@@ -270,10 +271,10 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for( int j=0; j < %V; j++)
{
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+ //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared_array[%V - 1 -j];
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[VECTOR_SIZE - 1 -j]);
}
}
@@ -335,6 +336,11 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
if ( incx < 0 ) // Goto end of vector
{
@@ -400,7 +406,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol - %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < (lastRow + 1); i++)
{
@@ -412,9 +417,9 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
// Handle diagonal element
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
+ %DIV(xShared_scalar, sum, loadedA);
- xnew[ targetRow * incx ] = xShared;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -423,7 +428,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -443,7 +448,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -469,8 +473,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -498,7 +500,6 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -538,7 +539,7 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
%SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]);
%DIV(sumVecReg[VECTOR_SIZE - 1], x[VECTOR_SIZE -1], reg[VECTOR_SIZE - 1][VECTOR_SIZE - 1]);
x[VECTOR_SIZE -1] = sumVecReg[VECTOR_SIZE - 1];
- xShared[%V - 1] = x[%V - 1];
+ xShared_array[%V - 1] = x[%V - 1];
xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1];
//#pragma unroll
@@ -568,13 +569,13 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i))
@@ -582,10 +583,10 @@ __kernel void %PREFIXtrsv_CU_ComputeRect
//#pragma unroll
for( int j=0; j < %V; j++)
{
- //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j];
+ //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared_array[%V - 1 -j];
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[VECTOR_SIZE - 1 -j]);
}
}
@@ -649,6 +650,11 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
if ( incx < 0 ) // Goto end of vector
{
@@ -715,7 +721,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol + %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < ((lastRow -startRow) + 1); i++)
{
@@ -726,16 +731,16 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
if( isUnity)
{
- xShared = sum;
+ xShared_scalar = sum;
}
else // Handle diagonal element
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
+ %DIV(xShared_scalar, sum, loadedA);
}
- xnew[ targetRow * incx ] = xShared;
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -744,7 +749,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -764,7 +769,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -790,8 +794,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -819,7 +821,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -857,7 +858,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// Solve for first x - Do the rest in loop
%TYPE x[%V];
%SUB(x[0], xVal[0], sumVecReg[0]);
- xShared[0] = x[0];
+ xShared_array[0] = x[0];
xnew[ (targetRowTemp)* incx ] = x[0];
//#pragma unroll
@@ -882,13 +883,13 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
//#pragma unroll
for(int m = 0; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
{
@@ -897,7 +898,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[j]);
}
}
@@ -958,6 +959,12 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
__global %TYPE* xnew;
__global %TYPE* A = _A + offa;
+ __local %TYPE xData[ %TARGET_WIDTH];
+ __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
+ //__local %TYPE* sData = sDataTemp;
+ __local %TYPE xShared_scalar; // To share solved x value with other threads..
+ __local %TYPE xShared_array[%V];
+
if ( incx < 0 ) // Goto end of vector
{
@@ -1024,7 +1031,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true
targetCol = startCol + %TARGET_ROWS;
- __local %TYPE xShared; // To share solved x value with other threads..
for( int i=0; i < ((lastRow -startRow) + 1); i++)
{
@@ -1036,8 +1042,8 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
// Handle diagonal element
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %DIV(xShared, sum, loadedA);
- xnew[ targetRow * incx ] = xShared;
+ %DIV(xShared_scalar, sum, loadedA);
+ xnew[ targetRow * incx ] = xShared_scalar;
}
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1046,7 +1052,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
loadedA = A((targetRow), (targetCol));
%CONJUGATE(doConj, loadedA);
- %MAD(sum, loadedA, xShared);
+ %MAD(sum, loadedA, xShared_scalar);
}
// Avoid Race
@@ -1066,7 +1072,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%TYPE sumTemp = %MAKEVEC(0.0);
%TYPE%V sum = %VMAKEVEC(sumTemp);
- __local %TYPE xData[ %TARGET_WIDTH];
//#pragma unroll
for( int i=1; i <= %NLOOPS; i++)
@@ -1092,8 +1097,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
barrier(CLK_LOCAL_MEM_FENCE);
}
- __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH];
- //__local %TYPE* sData = sDataTemp;
sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
@@ -1121,7 +1124,6 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
}
}
- __local %TYPE xShared[%V];
int targetRowTemp = rowStart + threadIdx * %V;
int VECTOR_SIZE = %V;
@@ -1161,7 +1163,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
%SUB(x[0], xVal[0], sumVecReg[0]);
%DIV(sumVecReg[0], x[0], reg[0][0]);
x[0] = sumVecReg[0];
- xShared[0] = sumVecReg[0];
+ xShared_array[0] = sumVecReg[0];
xnew[ (targetRowTemp)* incx ] = sumVecReg[0];
//#pragma unroll
@@ -1191,13 +1193,13 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
//#pragma unroll
for(int m = 1; m < %V; m++)
{
- xShared[m] = x[m];
+ xShared_array[m] = x[m];
xnew[ (targetRowTemp + m)* incx ] = x[m];
}
}
- // Sync so that xShared it available to all threads
+ // Sync so that xShared_array is available to all threads
barrier(CLK_LOCAL_MEM_FENCE);
if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) )
{
@@ -1206,7 +1208,7 @@ __kernel void %PREFIXtrsv_CL_ComputeRect
{
%TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j)));
%CONJUGATE(doConj, loadedAVec);
- %VMAD(sumVec, loadedAVec, xShared[j]);
+ %VMAD(sumVec, loadedAVec, xShared_array[j]);
}
}
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/nrm2.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/nrm2.cl
@@ -172,8 +172,14 @@ __kernel void %PREFIXnrm2_ssq_kernel( __
// If scaleOfWG was zero, that means the whole array encountered before was filled with zeroes
// Note: _scale is a local variable, either all enter or none
- if(isnotequal(scaleOfWG, PZERO))
+ if(isequal(scaleOfWG, PZERO))
{
+ if( (get_local_id(0)) == 0 ) {
+ scratchBuff[ get_group_id(0) ] = scaleOfWG;
+ scratchBuff[ numWGs + get_group_id(0) ] = 0.0f;
+ }
+ return;
+ }
for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
{
%TYPE%V vReg1;
@@ -201,7 +207,7 @@ __kernel void %PREFIXnrm2_ssq_kernel( __
}
%REDUCTION_BY_SUM( ssq );
- }
+
if( (get_local_id(0)) == 0 ) {
scratchBuff[ get_group_id(0) ] = scaleOfWG;
--- clblas-2.12.orig/src/library/blas/gens/clTemplates/reduction.cl
+++ clblas-2.12/src/library/blas/gens/clTemplates/reduction.cl
@@ -319,8 +319,13 @@ __kernel void %PREFIXred_ssq_kernel( __g
// If scale was zero, that means the whole array encountered before was filled with zeroes
// Note: scale is a local variable, either all enter or none
- if(isnotequal(scaleOfWG, ZERO))
+ if(isequal(scaleOfWG, ZERO))
{
+ if( (get_local_id(0)) == 0 ) {
+ res[0] = 0.0f;
+ }
+ return;
+ }
for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)<N; gOffset+=( get_global_size(0) * %V ) )
{
%TYPE%V scale1, ssq1;
@@ -342,7 +347,7 @@ __kernel void %PREFIXred_ssq_kernel( __g
}
%REDUCTION_BY_SUM( ssq );
- }
+
if( (get_local_id(0)) == 0 ) {
res[0] = scaleOfWG * sqrt(ssq);