sc/source/core/opencl/formulagroupcl.cxx | 429 +++++++++++++++++++++++++------ sc/source/core/opencl/op_math.cxx | 225 +++++++++++++--- sc/source/core/opencl/op_math.hxx | 4 sc/source/core/opencl/opbase.cxx | 43 +++ sc/source/core/opencl/opbase.hxx | 8 5 files changed, 590 insertions(+), 119 deletions(-)
New commits: commit 483da7cdb5082821541b1897ad81b8ddf55ff1a7 Author: Wei Wei <[email protected]> Date: Fri Nov 15 17:33:19 2013 -0600 GPU Calc: implemented parallel reduction for SUMIFS For now only works for fixed and sliding fixed-sized windows. Change-Id: I25e3f893a86d0e1723ae1e1633ffeeee93926b8d Signed-off-by: I-Jui (Ray) Sung <[email protected]> diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index 3b19886..427dd9e 100644 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -419,42 +419,51 @@ public: bIsStartFixed = mpDVR->IsStartFixed(); bIsEndFixed = mpDVR->IsEndFixed(); } + virtual bool NeedParallelReduction(void) const + { + return GetWindowSize()> 100 && + ( (GetStartFixed() && GetEndFixed()) || + (!GetStartFixed() && !GetEndFixed()) ) ; + } virtual void GenSlidingWindowFunction(std::stringstream &ss) { - std::string name = Base::GetName(); - ss << "__kernel void "<<name; - ss << "_reduction(__global double* A, " - "__global double *result,int arrayLength,int windowSize){\n"; - ss << " double tmp, current_result = 0.0;\n"; - ss << " int writePos = get_group_id(1);\n"; - ss << " int offset = get_group_id(1);\n"; - ss << " int lidx = get_local_id(0);\n"; - ss << " __local double shm_buf[256];\n"; - ss << " if (arrayLength == windowSize)\n"; - ss << " offset = 0;\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " int loop = arrayLength/512 + 1;\n"; - ss << " for (int l=0; l<loop; l++){\n"; - ss << " tmp = 0.0;\n"; - ss << " int loopOffset = l*512;\n"; - ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; - ss << " tmp = A[loopOffset + lidx + offset] + " - "A[loopOffset + lidx + offset + 256];\n"; - ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; - ss << " tmp = A[loopOffset + lidx + offset];\n"; - ss << " shm_buf[lidx] = tmp;\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " for (int i = 128; i >0; i/=2) {\n"; - ss << " if (lidx < i)\n"; - ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " current_result += shm_buf[0];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " result[writePos] = current_result;\n"; - ss << "}\n"; + if (dynamic_cast<OpSum*>(mpCodeGen.get())) + { + std::string name = Base::GetName(); + ss << "__kernel void "<<name; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result = 0.0;\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int offset = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + ss << " if (arrayLength == windowSize)\n"; + ss << " offset = 0;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = 0.0;\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset] + " + "A[loopOffset + lidx + offset + 256];\n"; + ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset];\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result += shm_buf[0];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } } @@ -573,11 +582,16 @@ public: if (CL_SUCCESS != err) throw OpenCLError(err); // reproduce the reduction function name - std::string kernelName = Base::GetName() + "_reduction"; + std::string kernelName; + if (dynamic_cast<OpSum*>(mpCodeGen.get())) + kernelName = Base::GetName() + "_reduction"; + else throw Unhandled(); + cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); if (err != CL_SUCCESS) throw OpenCLError(err); // set kernel arg of reduction kernel + // TODO(Wei Wei): use unique name for kernel err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), (void *)&(Base::mpClmem)); if (CL_SUCCESS != err) @@ -621,6 +635,14 @@ public: } } + size_t GetArrayLength(void) const {return mpDVR->GetArrayLength(); } + + size_t GetWindowSize(void) const {return mpDVR->GetRefRowSize(); } + + size_t GetStartFixed(void) const {return bIsStartFixed; } + + size_t GetEndFixed(void) const {return bIsEndFixed; } + protected: bool bIsStartFixed, bIsEndFixed; const formula::DoubleVectorRefToken *mpDVR; @@ -1001,6 +1023,75 @@ public: { i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram); } + if (OpSumIfs *OpSumCodeGen = dynamic_cast<OpSumIfs*>(mpCodeGen.get())) + { + assert(mpClmem == NULL); + // Obtain cl context + KernelEnv kEnv; + OpenclDevice::setKernelEnv(&kEnv); + cl_int err; + DynamicKernelSlidingArgument<DynamicKernelArgument> *slidingArgPtr = + dynamic_cast< DynamicKernelSlidingArgument<DynamicKernelArgument> *> + (mvSubArguments[0].get()); + cl_mem mpClmem2; + + if (OpSumCodeGen->NeedReductionKernel()) + { + assert(slidingArgPtr); + size_t nInput = slidingArgPtr -> GetArrayLength(); + size_t nCurWindowSize = slidingArgPtr -> GetWindowSize(); + std::vector<cl_mem> vclmem; + + for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e; + ++it) + { + vclmem.push_back((*it)->GetCLBuffer()); + } + mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE, + sizeof(double)*nVectorWidth, NULL, &err); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + std::string kernelName = "SumIfs_reduction"; + cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError(err); + + // set kernel arg of reduction kernel + for (size_t j=0; j< vclmem.size(); j++){ + err = clSetKernelArg(redKernel, j, sizeof(cl_mem), + (void *)&vclmem[j]); + if (CL_SUCCESS != err) + throw OpenCLError(err); + } + err = clSetKernelArg(redKernel, vclmem.size(), sizeof(cl_mem), (void *)&mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, vclmem.size()+1, sizeof(cl_int), (void*)&nInput); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, vclmem.size()+2, sizeof(cl_int), (void*)&nCurWindowSize); + if (CL_SUCCESS != err) + throw OpenCLError(err); + // set work group size and execute + size_t global_work_size[] = {256, (size_t)nVectorWidth }; + size_t local_work_size[] = {256, 1}; + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL, + global_work_size, local_work_size, 0, NULL, NULL); + if (CL_SUCCESS != err) + throw OpenCLError(err); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + // Pass mpClmem2 to the "real" kernel + err = clSetKernelArg(k, argno, sizeof(cl_mem), (void *)&mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError(err); + } + } return i; } diff --git a/sc/source/core/opencl/op_math.cxx b/sc/source/core/opencl/op_math.cxx index aac8661..fb38601 100644 --- a/sc/source/core/opencl/op_math.cxx +++ b/sc/source/core/opencl/op_math.cxx @@ -448,6 +448,142 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, size_t nCurWindowSize = pCurDVR->GetArrayLength() < pCurDVR->GetRefRowSize() ? pCurDVR->GetArrayLength(): pCurDVR->GetRefRowSize() ; + + mNeedReductionKernel = vSubArguments[0]->NeedParallelReduction(); + if (mNeedReductionKernel) + { + // generate reduction functions + ss << "__kernel void "; + ss << "SumIfs_reduction( "; + for (unsigned i = 0; i < vSubArguments.size(); i++) + { + if (i) + ss << ","; + vSubArguments[i]->GenSlidingWindowDecl(ss); + } + ss << ", __global double *result,int arrayLength,int windowSize"; + + ss << ")\n{\n"; + ss << " double tmp =0;\n"; + ss << " int i ;\n"; + + GenTmpVariables(ss,vSubArguments); + ss << " double current_result = 0.0;\n"; + ss << " int writePos = get_group_id(1);\n"; + if (pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) + ss << " int offset = 0;\n"; + else if (!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) + ss << " int offset = get_group_id(1);\n"; + else + throw Unhandled(); + // actually unreachable + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = 0.0;\n"; + ss << " int loopOffset = l*512;\n"; + + ss << " int p1 = loopOffset + lidx + offset, p2 = p1 + 256;\n"; + ss << " if (p2 < min(offset + windowSize, arrayLength)) {\n"; + ss << " tmp0 = 0.0;\n"; + int mm=0; + std::string p1 = "p1"; + std::string p2 = "p2"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p1); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1); + ss << ""; + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << "))"; + ss << "{\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p1); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n\n"; + } + mm=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p2); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p2); + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p2); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j< vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n"; + } + ss << " }\n"; + + ss << " else if (p1 < min(arrayLength, offset + windowSize)) {\n"; + mm=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm++) + { + CheckSubArgumentIsNan2(ss,vSubArguments,j,p1); + CheckSubArgumentIsNan2(ss,vSubArguments,j+1,p1); + + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan2(ss,vSubArguments,0,p1); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<vSubArguments.size();j+=2,mm--) + { + for(int n = 0;n<mm+1;n++) + { + ss << " "; + } + ss<< "}\n\n"; + } + + ss << " }\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result += shm_buf[0];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + }// finish generate reduction code + // generate functions as usual ss << "\ndouble " << sSymName; ss << "_"<< BinFuncName() <<"("; for (unsigned i = 0; i < vSubArguments.size(); i++) @@ -459,48 +595,57 @@ void OpSumIfs::GenSlidingWindowFunction(std::stringstream &ss, ss << ")\n {\n"; ss <<" int gid0=get_global_id(0);\n"; ss << " double tmp =0;\n"; - ss << " int i ;\n"; - GenTmpVariables(ss,vSubArguments); - ss << " for (i = "; - if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) { - ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n"; - } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { - ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n"; - } else { - ss << "0; i < "<< nCurWindowSize <<"; i++)\n"; - } - ss << " {\n"; - if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) - { - ss<< " int doubleIndex =i+gid0;\n"; - }else - { - ss<< " int doubleIndex =i;\n"; - } - ss<< " int singleIndex =gid0;\n"; - int m=0; - for(unsigned j=1;j<vSubArguments.size();j+=2,m++) - { - CheckSubArgumentIsNan(ss,vSubArguments,j); - CheckSubArgumentIsNan(ss,vSubArguments,j+1); - ss <<" if(isequal("; - ss <<"tmp"; - ss <<j; - ss <<" , "; - ss << "tmp"; - ss << j+1; - ss << ")){\n"; - } - CheckSubArgumentIsNan(ss,vSubArguments,0); - ss << " tmp += tmp0;\n"; - for(unsigned j=1;j<=vSubArguments.size();j+=2,m--) - { - for(int n = 0;n<m+1;n++) + if (!mNeedReductionKernel) + { + ss << " int i ;\n"; + GenTmpVariables(ss,vSubArguments); + ss << " for (i = "; + if (!pCurDVR->IsStartFixed() && pCurDVR->IsEndFixed()) { + ss << "gid0; i < "<< nCurWindowSize <<"; i++)\n"; + } else if (pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { + ss << "0; i < gid0+"<< nCurWindowSize <<"; i++)\n"; + } else { + ss << "0; i < "<< nCurWindowSize <<"; i++)\n"; + } + ss << " {\n"; + if(!pCurDVR->IsStartFixed() && !pCurDVR->IsEndFixed()) { - ss << " "; + ss<< " int doubleIndex =i+gid0;\n"; + }else + { + ss<< " int doubleIndex =i;\n"; } - ss<< "}\n"; - } + ss<< " int singleIndex =gid0;\n"; + int m=0; + for(unsigned j=1;j<vSubArguments.size();j+=2,m++) + { + CheckSubArgumentIsNan(ss,vSubArguments,j); + CheckSubArgumentIsNan(ss,vSubArguments,j+1); + ss <<" if(isequal("; + ss <<"tmp"; + ss <<j; + ss <<" , "; + ss << "tmp"; + ss << j+1; + ss << ")){\n"; + } + CheckSubArgumentIsNan(ss,vSubArguments,0); + ss << " tmp += tmp0;\n"; + for(unsigned j=1;j<=vSubArguments.size();j+=2,m--) + { + for(int n = 0;n<m+1;n++) + { + ss << " "; + } + ss<< "}\n"; + } + } + if (mNeedReductionKernel) + { + ss << "tmp ="; + vSubArguments[0]->GenDeclRef(ss); + ss << "[gid0];\n"; + } ss << "return tmp;\n"; ss << "}"; } diff --git a/sc/source/core/opencl/op_math.hxx b/sc/source/core/opencl/op_math.hxx index 01cbc82..7081b00 100644 --- a/sc/source/core/opencl/op_math.hxx +++ b/sc/source/core/opencl/op_math.hxx @@ -33,9 +33,13 @@ public: class OpSumIfs: public CheckVariables { public: + OpSumIfs(void): CheckVariables(), mNeedReductionKernel(false) {} virtual void GenSlidingWindowFunction(std::stringstream &ss, const std::string sSymName, SubArguments &vSubArguments); virtual std::string BinFuncName(void) const { return "SumIfs"; } + bool NeedReductionKernel(void) const { return mNeedReductionKernel; } +protected: + bool mNeedReductionKernel; }; class OpCosh: public Normal { diff --git a/sc/source/core/opencl/opbase.cxx b/sc/source/core/opencl/opbase.cxx index 6f3c339..fe7fc24 100644 --- a/sc/source/core/opencl/opbase.cxx +++ b/sc/source/core/opencl/opbase.cxx @@ -157,6 +157,49 @@ void CheckVariables::CheckSubArgumentIsNan( std::stringstream & ss, ss << vSubArguments[i]->GenSlidingWindowDeclRef(); ss<<";\n"; } + +void CheckVariables::CheckSubArgumentIsNan2( std::stringstream & ss, + SubArguments &vSubArguments, int argumentNum, std::string p) +{ + int i = argumentNum; + if(vSubArguments[i]->GetFormulaToken()->GetType() == formula::svDouble) + { + ss <<" tmp"; + ss <<i; + ss << "="; + vSubArguments[i]->GenDeclRef(ss); + ss<<";\n"; + return; + } + +#ifdef ISNAN + ss<< " tmp"; + ss<< i; + ss<< "= fsum("; + vSubArguments[i]->GenDeclRef(ss); + if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svDoubleVectorRef) + ss<<"["<< p.c_str()<< "]"; + else if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svSingleVectorRef) + ss<<"[get_group_id(1)]"; + ss<<", 0);\n"; + return; +#endif + ss <<" tmp"; + ss <<i; + ss << "="; + vSubArguments[i]->GenDeclRef(ss); + if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svDoubleVectorRef) + ss<<"["<< p.c_str()<< "]"; + else if(vSubArguments[i]->GetFormulaToken()->GetType() == + formula::svSingleVectorRef) + ss<<"[get_group_id(1)]"; + + ss<<";\n"; +} + void CheckVariables::CheckAllSubArgumentIsNan( std::stringstream & ss, SubArguments & vSubArguments) { diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx index 41e4587..6b475df 100644 --- a/sc/source/core/opencl/opbase.hxx +++ b/sc/source/core/opencl/opbase.hxx @@ -104,6 +104,9 @@ public: virtual void DumpInlineFun(std::set<std::string>& , std::set<std::string>& ) const {} const std::string& GetName(void) const { return mSymName; } + cl_mem GetCLBuffer(void) const {return mpClmem; } + virtual bool NeedParallelReduction(void) const { return false; } + protected: const std::string mSymName; FormulaTreeNodeRef mFormulaTree; @@ -157,6 +160,9 @@ public: SubArguments &vSubArguments, int argumentNum); void CheckAllSubArgumentIsNan(std::stringstream &ss, SubArguments &vSubArguments); + // only check isNan + void CheckSubArgumentIsNan2(std::stringstream &ss, + SubArguments &vSubArguments, int argumentNum, std::string p); }; }} commit 4a9c141f5e6250bc4a9cde870b1649a1c3faccae Author: Wei Wei <[email protected]> Date: Fri Nov 15 16:37:10 2013 -0600 GPU Calc: use parallel reduction to implement sum Use reduction kernel when given a large DoubleVectorRef Change-Id: Ifd4977b81be64274733909e43f0e5ef161bb455e Signed-off-by: I-Jui (Ray) Sung <[email protected]> diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index 6d442fc..3b19886 100644 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -44,7 +44,7 @@ namespace sc { namespace opencl { /// Map the buffer used by an argument and do necessary argument setting -size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int) +size_t DynamicKernelArgument::Marshal(cl_kernel k, int argno, int, cl_program) { FormulaToken *ref = mFormulaTree->GetFormulaToken(); assert(mpClmem == NULL); @@ -125,7 +125,7 @@ public: return 1; } /// Pass the 32-bit hash of the string to the kernel - virtual size_t Marshal(cl_kernel k, int argno, int) + virtual size_t Marshal(cl_kernel k, int argno, int, cl_program) { FormulaToken *ref = mFormulaTree->GetFormulaToken(); assert(mpClmem == NULL); @@ -183,7 +183,7 @@ public: return 1; } /// Create buffer and pass the buffer to a given kernel - virtual size_t Marshal(cl_kernel k, int argno, int) + virtual size_t Marshal(cl_kernel k, int argno, int, cl_program) { double tmp = 0.0; // Pass the scalar result back to the rest of the formula kernel @@ -222,7 +222,7 @@ public: return 1; } /// Create buffer and pass the buffer to a given kernel - virtual size_t Marshal(cl_kernel k, int argno, int) + virtual size_t Marshal(cl_kernel k, int argno, int, cl_program) { double tmp = 0.0; // Pass the scalar result back to the rest of the formula kernel @@ -264,7 +264,7 @@ public: return 1; } /// Create buffer and pass the buffer to a given kernel - virtual size_t Marshal(cl_kernel k, int argno, int) + virtual size_t Marshal(cl_kernel k, int argno, int, cl_program) { double tmp = 0.0; // Pass the scalar result back to the rest of the formula kernel @@ -292,11 +292,11 @@ public: { DynamicKernelStringArgument::GenDecl(ss); } - virtual size_t Marshal(cl_kernel, int, int); + virtual size_t Marshal(cl_kernel, int, int, cl_program); }; /// Marshal a string vector reference -size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int) +size_t DynamicKernelStringArgument::Marshal(cl_kernel k, int argno, int, cl_program) { FormulaToken *ref = mFormulaTree->GetFormulaToken(); assert(mpClmem == NULL); @@ -390,10 +390,10 @@ public: ss << ")"; return ss.str(); } - virtual size_t Marshal(cl_kernel k, int argno, int vw) + virtual size_t Marshal(cl_kernel k, int argno, int vw, cl_program p) { - int i = DynamicKernelArgument::Marshal(k, argno, vw); - i += mStringArgument.Marshal(k, argno+i, vw); + int i = DynamicKernelArgument::Marshal(k, argno, vw, p); + i += mStringArgument.Marshal(k, argno+i, vw, p); return i; } protected: @@ -402,24 +402,61 @@ protected: /// Handling a Double Vector that is used as a sliding window input /// to either a sliding window average or sum-of-products +class OpSum; // Forward Declaration template<class Base> class DynamicKernelSlidingArgument: public Base { public: DynamicKernelSlidingArgument(const std::string &s, - FormulaTreeNodeRef ft): - Base(s, ft) + FormulaTreeNodeRef ft, boost::shared_ptr<SlidingFunctionBase> &CodeGen): + Base(s, ft), mpCodeGen(CodeGen), needReductionKernel(true), mpClmem2(NULL) { FormulaToken *t = ft->GetFormulaToken(); if (t->GetType() != formula::svDoubleVectorRef) throw Unhandled(); - const formula::DoubleVectorRefToken* pDVR = - dynamic_cast<const formula::DoubleVectorRefToken *>(t); - assert(pDVR); - bIsStartFixed = pDVR->IsStartFixed(); - bIsEndFixed = pDVR->IsEndFixed(); + mpDVR = dynamic_cast<const formula::DoubleVectorRefToken *>(t); + assert(mpDVR); + bIsStartFixed = mpDVR->IsStartFixed(); + bIsEndFixed = mpDVR->IsEndFixed(); } - virtual void GenSlidingWindowFunction(std::stringstream &) {} + virtual void GenSlidingWindowFunction(std::stringstream &ss) { + std::string name = Base::GetName(); + ss << "__kernel void "<<name; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result = 0.0;\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int offset = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + ss << " if (arrayLength == windowSize)\n"; + ss << " offset = 0;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l<loop; l++){\n"; + ss << " tmp = 0.0;\n"; + ss << " int loopOffset = l*512;\n"; + ss << " if((loopOffset + lidx + offset + 256) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset] + " + "A[loopOffset + lidx + offset + 256];\n"; + ss << " else if ((loopOffset + lidx + offset) < ( offset + windowSize))\n"; + ss << " tmp = A[loopOffset + lidx + offset];\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] += shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result += shm_buf[0];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } + virtual std::string GenSlidingWindowDeclRef(bool=false) const { @@ -430,8 +467,168 @@ public: ss << Base::GetName() << "[i]"; return ss.str(); } + /// Controls how the elements in the DoubleVectorRef are traversed + virtual size_t GenLoop(std::stringstream &ss, bool &needBody) + { + assert(mpDVR); + size_t nCurWindowSize = mpDVR->GetRefRowSize(); + if (dynamic_cast<OpSum*>(mpCodeGen.get())) + { + if (!bIsStartFixed && !bIsEndFixed) + { + // set 100 as a threshold for invoking reduction kernel + if (nCurWindowSize > 100 ) + { + std::string temp = Base::GetName() + "[gid0]"; + ss << "tmp = "; + ss << mpCodeGen->Gen2(temp, "tmp"); + ss << ";\n\t"; + needBody = false; + needReductionKernel = false; + return nCurWindowSize; + } + } + + if (bIsStartFixed && bIsEndFixed) + { + // set 100 as a threshold for invoking reduction kernel + if (nCurWindowSize > 100 ) + { + std::string temp = Base::GetName() + "[0]"; + ss << "tmp = "; + ss << mpCodeGen->Gen2(temp, "tmp"); + ss << ";\n\t"; + needBody = false; + needReductionKernel = false; + return nCurWindowSize; + } + } + } + needBody = true; + needReductionKernel = true; + ss << "for (int i = "; + if (!bIsStartFixed && bIsEndFixed) + { +#ifdef ISNAN + ss << "gid0; i < " << mpDVR->GetArrayLength(); + ss << " && i < " << nCurWindowSize << "; i++){\n\t\t"; +#else + ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t"; +#endif + } + else if (bIsStartFixed && !bIsEndFixed) + { +#ifdef ISNAN + ss << "0; i < " << mpDVR->GetArrayLength(); + ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t"; +#else + ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t"; +#endif + } + else if (!bIsStartFixed && !bIsEndFixed) + { +#ifdef ISNAN + ss << "0; i + gid0 < " << mpDVR->GetArrayLength(); + ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t"; +#else + ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t"; +#endif + } + else + { +#ifdef ISNAN + ss << "0; i < "<< nCurWindowSize << "; i++){\n\t\t"; +#else + ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t"; +#endif + } + + return nCurWindowSize; + } + + virtual size_t Marshal(cl_kernel k, int argno, int w, cl_program mpProgram) + { + if (needReductionKernel) + return Base::Marshal(k, argno, w, mpProgram); + + assert(Base::mpClmem == NULL); + // Obtain cl context + KernelEnv kEnv; + OpenclDevice::setKernelEnv(&kEnv); + cl_int err; + size_t nInput = mpDVR->GetArrayLength(); + size_t nCurWindowSize = mpDVR->GetRefRowSize(); + // create clmem buffer + if (mpDVR->GetArrays()[0].mpNumericArray == NULL) + throw Unhandled(); + double *pHostBuffer = const_cast<double*>( + mpDVR->GetArrays()[0].mpNumericArray); + size_t szHostBuffer = nInput * sizeof(double); + Base::mpClmem = clCreateBuffer(kEnv.mpkContext, + (cl_mem_flags) CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR, + szHostBuffer, + pHostBuffer, &err); + mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_WRITE_ONLY, + sizeof(double)*w, NULL, NULL); + if (CL_SUCCESS != err) + throw OpenCLError(err); + // reproduce the reduction function name + std::string kernelName = Base::GetName() + "_reduction"; + cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError(err); + // set kernel arg of reduction kernel + err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), + (void *)&(Base::mpClmem)); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), (void *)&mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, 2, sizeof(cl_int), (void*)&nInput); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + err = clSetKernelArg(redKernel, 3, sizeof(cl_int), (void*)&nCurWindowSize); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + // set work group size and execute + size_t global_work_size[] = {256, (size_t)w }; + size_t local_work_size[] = {256, 1}; + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, NULL, + global_work_size, local_work_size, 0, NULL, NULL); + if (CL_SUCCESS != err) + throw OpenCLError(err); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError(err); + + // set kernel arg + err = clSetKernelArg(k, argno, sizeof(cl_mem), (void*)&(mpClmem2)); + if (CL_SUCCESS != err) + throw OpenCLError(err); + return 1; + } + ~DynamicKernelSlidingArgument() + { + if (mpClmem2) + { + clReleaseMemObject(mpClmem2); + mpClmem2 = NULL; + } + } + protected: bool bIsStartFixed, bIsEndFixed; + const formula::DoubleVectorRefToken *mpDVR; + // from parent nodes + boost::shared_ptr<SlidingFunctionBase> mpCodeGen; + // controls whether to invoke the reduction kernel during marshaling or not + bool needReductionKernel; + cl_mem mpClmem2; }; /// Abstract class for code generation @@ -439,6 +636,9 @@ protected: class Reduction: public SlidingFunctionBase { public: + typedef DynamicKernelSlidingArgument<DynamicKernelArgument> NumericRange; + typedef DynamicKernelSlidingArgument<DynamicKernelStringArgument> StringRange; + virtual void GenSlidingWindowFunction(std::stringstream &ss, const std::string sSymName, SubArguments &vSubArguments) { @@ -459,65 +659,48 @@ public: size_t nItems = 0; while (i--) { - FormulaToken *pCur = vSubArguments[i]->GetFormulaToken(); - assert(pCur); - if (pCur->GetType() == formula::svDoubleVectorRef) + if (NumericRange *NR = dynamic_cast<NumericRange *> (vSubArguments[i].get())) { - const formula::DoubleVectorRefToken* pDVR = - dynamic_cast<const formula::DoubleVectorRefToken *>(pCur); - size_t nCurWindowSize = pDVR->GetRefRowSize(); - ss << "for (int i = "; - if (!pDVR->IsStartFixed() && pDVR->IsEndFixed()) { -#ifdef ISNAN - ss << "gid0; i < " << pDVR->GetArrayLength(); - ss << " && i < " << nCurWindowSize << "; i++){\n\t\t"; -#else - ss << "gid0; i < "<< nCurWindowSize << "; i++)\n\t\t"; -#endif - } else if (pDVR->IsStartFixed() && !pDVR->IsEndFixed()) { -#ifdef ISNAN - ss << "0; i < " << pDVR->GetArrayLength(); - ss << " && i < gid0+"<< nCurWindowSize << "; i++){\n\t\t"; -#else - ss << "0; i < gid0+"<< nCurWindowSize << "; i++)\n\t\t"; -#endif - } else if (!pDVR->IsStartFixed() && !pDVR->IsEndFixed()){ -#ifdef ISNAN - ss << "0; i + gid0 < " << pDVR->GetArrayLength(); - ss << " && i < "<< nCurWindowSize << "; i++){\n\t\t"; -#else - ss << "0; i < "<< nCurWindowSize << "; i++)\n\t\t"; -#endif - } - else - { - ss << "0; i < "<< pDVR->GetArrayLength() << "; i++){\n\t\t"; - } - nItems += nCurWindowSize; + bool needBody; + nItems += NR->GenLoop(ss, needBody); + if (needBody == false) continue; } - else if (pCur->GetType() == formula::svSingleVectorRef) + else if (StringRange *SR = dynamic_cast<StringRange *> (vSubArguments[i].get())) { + bool needBody; + nItems += SR->GenLoop(ss, needBody); //did not handle yet + if (needBody == false) continue; + } + else + { + FormulaToken *pCur = vSubArguments[i]->GetFormulaToken(); + assert(pCur); + assert(pCur->GetType() != formula::svDoubleVectorRef); + + if (pCur->GetType() == formula::svSingleVectorRef) + { #ifdef ISNAN const formula::SingleVectorRefToken* pSVR = - dynamic_cast< const formula::SingleVectorRefToken* >(pCur); - ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t"; + dynamic_cast< const formula::SingleVectorRefToken* >(pCur); + ss << "if (gid0 < " << pSVR->GetArrayLength() << "){\n\t\t"; #else - nItems += 1; + nItems += 1; #endif - } - else if (pCur->GetType() == formula::svDouble) - { + } + else if (pCur->GetType() == formula::svDouble) + { #ifdef ISNAN - ss << "{\n\t\t"; + ss << "{\n\t\t"; #endif - nItems += 1; - } - else - { + nItems += 1; + } + else + { #ifdef ISNAN - ss << "nCount += 1;\n\t\t"; + ss << "nCount += 1;\n\t\t"; #endif - nItems += 1; + nItems += 1; + } } #ifdef ISNAN if(ocPush==vSubArguments[i]->GetFormulaToken()->GetOpCode()) @@ -810,13 +993,13 @@ public: const std::string &s, const FormulaTreeNodeRef& ft, SlidingFunctionBase* pCodeGen); /// Create buffer and pass the buffer to a given kernel - virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth) + virtual size_t Marshal(cl_kernel k, int argno, int nVectorWidth, cl_program pProgram) { unsigned i = 0; for (SubArgumentsType::iterator it = mvSubArguments.begin(), e= mvSubArguments.end(); it!=e; ++it) { - i += (*it)->Marshal(k, argno + i, nVectorWidth); + i += (*it)->Marshal(k, argno + i, nVectorWidth, pProgram); } return i; } @@ -910,7 +1093,7 @@ public: } private: SubArgumentsType mvSubArguments; - boost::scoped_ptr<SlidingFunctionBase> mpCodeGen; + boost::shared_ptr<SlidingFunctionBase> mpCodeGen; }; boost::shared_ptr<DynamicKernelArgument> SoPHelper( @@ -944,12 +1127,12 @@ DynamicKernelSoPArguments::DynamicKernelSoPArguments( if (pDVR->GetArrays()[0].mpNumericArray) mvSubArguments.push_back( SubArgument(new DynamicKernelSlidingArgument - <DynamicKernelArgument>(ts, ft->Children[i]))); + <DynamicKernelArgument>(ts, ft->Children[i], mpCodeGen))); else mvSubArguments.push_back( SubArgument(new DynamicKernelSlidingArgument <DynamicKernelStringArgument>( - ts, ft->Children[i]))); + ts, ft->Children[i], mpCodeGen))); } else if (pChild->GetType() == formula::svSingleVectorRef) { const formula::SingleVectorRefToken* pSVR = dynamic_cast< const formula::SingleVectorRefToken* >(pChild); @@ -1713,19 +1896,19 @@ public: } /// Memory mapping from host to device and pass buffers to the given kernel as /// arguments - void Marshal(cl_kernel, int); + void Marshal(cl_kernel, int, cl_program); private: unsigned int mCurId; ArgumentMap mSymbols; ArgumentList mParams; }; -void SymbolTable::Marshal(cl_kernel k, int nVectorWidth) +void SymbolTable::Marshal(cl_kernel k, int nVectorWidth, cl_program pProgram) { int i = 1; //The first argument is reserved for results for(ArgumentList::iterator it = mParams.begin(), e= mParams.end(); it!=e; ++it) { - i+=(*it)->Marshal(k, i, nVectorWidth); + i+=(*it)->Marshal(k, i, nVectorWidth, pProgram); } } @@ -1735,7 +1918,7 @@ public: DynamicKernel(FormulaTreeNodeRef r):mpRoot(r), mpProgram(NULL), mpKernel(NULL), mpResClmem(NULL) {} /// Code generation in OpenCL - std::string CodeGen() { + void CodeGen() { // Travese the tree of expression and declare symbols used const DynamicKernelArgument *DK= mSyms.DeclRefArg< DynamicKernelSoPArguments>(mpRoot, new OpNop); @@ -1771,7 +1954,6 @@ public: #if 1 std::cerr<< "Program to be compiled = \n" << mFullProgramSrc << "\n"; #endif - return decl.str(); } /// Produce kernel hash std::string GetMD5(void) @@ -1817,7 +1999,7 @@ public: if (CL_SUCCESS != err) throw OpenCLError(err); // The rest of buffers - mSyms.Marshal(mpKernel, nr); + mSyms.Marshal(mpKernel, nr, mpProgram); size_t global_work_size[] = {nr}; err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, mpKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL); @@ -1993,7 +2175,7 @@ bool FormulaGroupInterpreterOpenCL::interpret( ScDocument& rDoc, mpKernel = new DynamicKernel(Root); try { - std::string kSrc = mpKernel->CodeGen(); + mpKernel->CodeGen(); // Obtain cl context KernelEnv kEnv; OpenclDevice::setKernelEnv(&kEnv); diff --git a/sc/source/core/opencl/opbase.hxx b/sc/source/core/opencl/opbase.hxx index 4898962..41e4587 100644 --- a/sc/source/core/opencl/opbase.hxx +++ b/sc/source/core/opencl/opbase.hxx @@ -92,7 +92,7 @@ public: virtual void GenDeclRef(std::stringstream &ss) const; /// Create buffer and pass the buffer to a given kernel - virtual size_t Marshal(cl_kernel, int, int); + virtual size_t Marshal(cl_kernel, int, int, cl_program); virtual ~DynamicKernelArgument(); _______________________________________________ Libreoffice-commits mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
