LGTM, pushed with the other two patches, thanks.
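
For anyone reading along, the transformation is roughly the following (an illustrative sketch only, assuming the usual five-operand llvm.memcpy form of this LLVM generation; the suffix letters come from convertSpaceToName in the pass: address space 0 -> p, 1 -> g, 3 -> l):

  ; what clang emits for a global-to-global copy
  call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %n, i32 1, i1 false)

  ; after the lowering pass: the helper comes from the pre-compiled beignet.bc
  ; module and is inlined afterwards by createFunctionInliningPass
  call void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %n)
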
On Thu, Jan 16, 2014 at 03:38:30PM +0800, Yang Rong wrote:
> SPIR 1.2 requires llvm.memcpy support, and LLVM will sometimes emit
> llvm.memset as well, so add a pass to lower these two intrinsics and then
> inline the resulting calls.
>
> In the intrinsic lowering pass, find every llvm.memset and llvm.memcpy and
> replace it with a call to __gen_memset_x or __gen_memcpy_xx, where x and xx
> encode the address spaces.
>
> Because this pass runs after clang, and by that point unused functions have
> already been stripped, implement the __gen_memset_x and __gen_memcpy_xx
> functions in the pre-compiled module and link them in.
>
> Signed-off-by: Yang Rong <[email protected]>
> ---
>  backend/src/CMakeLists.txt                   |   3 +-
>  backend/src/llvm/llvm_gen_backend.hpp        |   4 +
>  backend/src/llvm/llvm_intrinsic_lowering.cpp | 172 ++++++++++++++
>  backend/src/llvm/llvm_passes.cpp             |   2 +-
>  backend/src/llvm/llvm_to_gen.cpp             |   2 +
>  backend/src/ocl_memcpy.ll                    | 336 +++++++++++++++++++++++++++
>  backend/src/ocl_memset.ll                    | 127 ++++++++++
>  7 files changed, 644 insertions(+), 2 deletions(-)
>  create mode 100644 backend/src/llvm/llvm_intrinsic_lowering.cpp
>  create mode 100644 backend/src/ocl_memcpy.ll
>  create mode 100644 backend/src/ocl_memset.ll
>
> diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
> index b93133f..10bf67b 100644
> --- a/backend/src/CMakeLists.txt
> +++ b/backend/src/CMakeLists.txt
> @@ -136,6 +136,7 @@ else (GBE_USE_BLOB)
>      llvm/llvm_gen_backend.cpp
>      llvm/llvm_passes.cpp
>      llvm/llvm_scalarize.cpp
> +    llvm/llvm_intrinsic_lowering.cpp
>      llvm/llvm_to_gen.cpp
>      llvm/llvm_gen_backend.hpp
>      llvm/llvm_gen_ocl_function.hxx
> @@ -165,7 +166,7 @@ add_library (gbe STATIC ${GBE_SRC})
>
>  # for pre compiled module library.
>  set (pcm_lib "beignet.bc")
> -set (pcm_sources ocl_barrier.ll)
> +set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
>  ll_add_library (${pcm_lib} pcm_sources)
>
>  ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
> diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
> index 55079f5..389d5f3 100644
> --- a/backend/src/llvm/llvm_gen_backend.hpp
> +++ b/backend/src/llvm/llvm_gen_backend.hpp
> @@ -84,8 +84,12 @@ namespace gbe
>    /*! Remove the GEP instructions */
>    llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
>
> +  /*! Scalarize all vector op instructions */
>    llvm::FunctionPass* createScalarizePass();
>
> +  /*! Convert the Intrinsic call to gen function */
> +  llvm::BasicBlockPass *createIntrinsicLoweringPass();
> +
>  } /* namespace gbe */
>
>  #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
> diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> new file mode 100644
> index 0000000..1942860
> --- /dev/null
> +++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
> @@ -0,0 +1,172 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/**
> + * \file llvm_intrinisc_lowering.cpp
> + * \author Yang Rong <[email protected]>
> + */
> +
> +#include "llvm/Config/config.h"
> +#if LLVM_VERSION_MINOR <= 2
> +#include "llvm/Function.h"
> +#include "llvm/InstrTypes.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/Module.h"
> +#else
> +#include "llvm/IR/Function.h"
> +#include "llvm/IR/InstrTypes.h"
> +#include "llvm/IR/Instructions.h"
> +#include "llvm/IR/IntrinsicInst.h"
> +#include "llvm/IR/Module.h"
> +#endif /* LLVM_VERSION_MINOR <= 2 */
> +#include "llvm/Pass.h"
> +#if LLVM_VERSION_MINOR <= 1
> +#include "llvm/Support/IRBuilder.h"
> +#elif LLVM_VERSION_MINOR == 2
> +#include "llvm/IRBuilder.h"
> +#else
> +#include "llvm/IR/IRBuilder.h"
> +#endif /* LLVM_VERSION_MINOR <= 1 */
> +#include "llvm/Support/CallSite.h"
> +#include "llvm/Support/CFG.h"
> +#include "llvm/Support/raw_ostream.h"
> +
> +#include "llvm/llvm_gen_backend.hpp"
> +#include "sys/map.hpp"
> +
> +
> +using namespace llvm;
> +
> +namespace gbe {
> +  class InstrinsicLowering : public BasicBlockPass
> +  {
> +  public:
> +    static char ID;
> +    InstrinsicLowering() :
> +      BasicBlockPass(ID) {}
> +
> +    void getAnalysisUsage(AnalysisUsage &AU) const {
> +
> +    }
> +
> +    virtual const char *getPassName() const {
> +      return "SPIR backend: lowering instrinsics";
> +    }
> +    static char convertSpaceToName(Value *val) {
> +      const uint32_t space = val->getType()->getPointerAddressSpace();
> +      switch(space) {
> +        case 0:
> +          return 'p';
> +        case 1:
> +          return 'g';
> +        case 3:
> +          return 'l';
> +        default:
> +          assert("Non support address space");
> +          return '\0';
> +      }
> +    }
> +    static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
> +                                     Value **ArgBegin, Value **ArgEnd,
> +                                     Type *RetTy)
> +    {
> +      // If we haven't already looked up this function, check to see if the
> +      // program already contains a function with this name.
> +      Module *M = CI->getParent()->getParent()->getParent();
> +      // Get or insert the definition now.
> +      std::vector<Type *> ParamTys;
> +      for (Value** I = ArgBegin; I != ArgEnd; ++I)
> +        ParamTys.push_back((*I)->getType());
> +      Constant* FCache = M->getOrInsertFunction(NewFn,
> +                                  FunctionType::get(RetTy, ParamTys, false));
> +
> +      IRBuilder<> Builder(CI->getParent(), CI);
> +      SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
> +      CallInst *NewCI = Builder.CreateCall(FCache, Args);
> +      NewCI->setName(CI->getName());
> +      if (!CI->use_empty())
> +        CI->replaceAllUsesWith(NewCI);
> +      CI->eraseFromParent();
> +      return NewCI;
> +    }
> +    virtual bool runOnBasicBlock(BasicBlock &BB)
> +    {
> +      bool changedBlock = false;
> +      Module *M = BB.getParent()->getParent();
> +
> +      DataLayout TD(M);
> +      LLVMContext &Context = BB.getContext();
> +      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
> +        Instruction *Inst = DI++;
> +        CallInst* CI = dyn_cast<CallInst>(Inst);
> +        if(CI == NULL)
> +          continue;
> +
> +        IRBuilder<> Builder(&BB, CI);
> +        // only support memcpy and memset
> +        if (Function *F = CI->getCalledFunction()) {
> +          const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
> +          if (intrinsicID == 0)
> +            continue;
> +          switch (intrinsicID) {
> +            case Intrinsic::memcpy: {
> +              Type *IntPtr = TD.getIntPtrType(Context);
> +              Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
> +                                                  /* isSigned */ false);
> +              Value *Ops[3];
> +              Ops[0] = CI->getArgOperand(0);
> +              Ops[1] = CI->getArgOperand(1);
> +              Ops[2] = Size;
> +              char name[16] = "__gen_memcpy_xx";
> +              name[13] = convertSpaceToName(Ops[0]);
> +              name[14] = convertSpaceToName(Ops[1]);
> +              replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> +              break;
> +            }
> +            case Intrinsic::memset: {
> +              Value *Op0 = CI->getArgOperand(0);
> +              Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
> +                                                 /* isSigned */ false);
> +              Type *IntPtr = TD.getIntPtrType(Op0->getType());
> +              Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
> +                                                  /* isSigned */ false);
> +              Value *Ops[3];
> +              Ops[0] = Op0;
> +              // Extend the amount to i32.
> +              Ops[1] = val;
> +              Ops[2] = Size;
> +              char name[16] = "__gen_memset_x";
> +              name[13] = convertSpaceToName(Ops[0]);
> +              replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
> +              break;
> +            }
> +            default:
> +              continue;
> +          }
> +        }
> +      }
> +      return changedBlock;
> +    }
> +  };
> +
> +  char InstrinsicLowering::ID = 0;
> +
> +  BasicBlockPass *createIntrinsicLoweringPass() {
> +    return new InstrinsicLowering();
> +  }
> +} // end namespace
> diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
> index 3bb6f71..1091dae 100644
> --- a/backend/src/llvm/llvm_passes.cpp
> +++ b/backend/src/llvm/llvm_passes.cpp
> @@ -232,7 +232,7 @@ namespace gbe
>      }
>
>      virtual const char *getPassName() const {
> -      return "PTX backend: insert special ptx instructions";
> +      return "SPIR backend: insert special spir instructions";
>      }
>
>      bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
> diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
> index a9f70d9..b227912 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -175,6 +175,8 @@ namespace gbe
>      // Print the code before further optimizations
>      if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
>        passes.add(createPrintModulePass(&*o));
> +    passes.add(createIntrinsicLoweringPass());
> +    passes.add(createFunctionInliningPass(200000));
>      passes.add(createScalarReplAggregatesPass()); // Break up allocas
>      passes.add(createRemoveGEPPass(unit));
>      passes.add(createConstantPropagationPass());
> diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
> new file mode 100644
> index 0000000..476033e
> --- /dev/null
> +++ b/backend/src/ocl_memcpy.ll
> @@ -0,0 +1,336 @@
> +;The memcpy's source code.
> +; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
> +;   size_t index = 0;
> +;   while((index + 4) >= size) {
> +;     *((uint *)(dst + index)) = *((uint *)(src + index));
> +;     index += 4;
> +;   }
> +;   while(index < size) {
> +;     dst[index] = src[index];
> +;     index++;
> +;   }
> +; }
> +
> +define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
> +entry:
> +  br label %while.cond
> +
> +while.cond:                                      ; preds = %while.body, %entry
> +  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
> +  %add = add i32 %index.0, 4
> +  %cmp = icmp ult i32 %add, %size
> +  br i1 %cmp, label %while.cond3, label %while.body
> +
> +while.body:                                      ; preds = %while.cond
> +  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
> +  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
> +  %1 = load i32 addrspace(1)* %0, align 4
> +  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
> +  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
> +  store i32 %1, i32 addrspace(1)* %2, align 4
> +  br label %while.cond
> +
> +while.cond3:                                     ; preds = %while.cond, %while.body5
> +  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
> +  %cmp4 = icmp ult i32 %index.1, %size
> +  br i1 %cmp4, label %while.body5, label %while.end7
> +
> +while.body5:                                     ; preds = %while.cond3
> +  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
> +  %3 = load i8 addrspace(1)* %arrayidx, align 1
> +  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
> +  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
> +  %inc = add i32 %index.1, 1
> +  br label %while.cond3
> +
> +while.end7:                                      ; preds = %while.cond3
> +  ret void
> +}
> +
> +define void
@__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > + %1 = load i32 addrspace(0)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* > + store i32 %1, i32 addrspace(1)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > + %3 = load i8 addrspace(0)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(1)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > + %1 = load i32 addrspace(3)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)* > + store i32 %1, i32 addrspace(1)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > + %3 = load i8 addrspace(3)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(1)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > + %1 = load i32 addrspace(1)* %0, align 4 > + %add.ptr1 = getelementptr 
inbounds i8 addrspace(0)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)* > + store i32 %1, i32 addrspace(0)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1 > + %3 = load i8 addrspace(1)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > + %1 = load i32 addrspace(0)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)* > + store i32 %1, i32 addrspace(0)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > + %3 = load i8 addrspace(0)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > + %1 = load i32 addrspace(3)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)* > + store i32 %1, i32 addrspace(0)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > + %3 = load i8 addrspace(3)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 
addrspace(0)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(0)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > + %1 = load i32 addrspace(1)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)* > + store i32 %1, i32 addrspace(3)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1 > + %3 = load i8 addrspace(1)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)* > + %1 = load i32 addrspace(0)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)* > + store i32 %1, i32 addrspace(3)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1 > + %3 = load i8 addrspace(0)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > + > +define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, > i32 %size) nounwind alwaysinline { > +entry: > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond3, label %while.body > + > +while.body: ; preds = 
%while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0 > + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > + %1 = load i32 addrspace(3)* %0, align 4 > + %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > + %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)* > + store i32 %1, i32 addrspace(3)* %2, align 4 > + br label %while.cond > + > +while.cond3: ; preds = %while.cond, > %while.body5 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ] > + %cmp4 = icmp ult i32 %index.1, %size > + br i1 %cmp4, label %while.body5, label %while.end7 > + > +while.body5: ; preds = %while.cond3 > + %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1 > + %3 = load i8 addrspace(3)* %arrayidx, align 1 > + %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > + store i8 %3, i8 addrspace(3)* %arrayidx6, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond3 > + > +while.end7: ; preds = %while.cond3 > + ret void > +} > diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll > new file mode 100644 > index 0000000..addf9f5 > --- /dev/null > +++ b/backend/src/ocl_memset.ll > @@ -0,0 +1,127 @@ > +;The memset's source code. > +; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) { > +; size_t index = 0; > +; uint v = (val << 24) | (val << 16) | (val << 8) | val; > +; while((index + 4) >= size) { > +; *((uint *)(dst + index)) = v; > +; index += 4; > +; } > +; while(index < size) { > +; dst[index] = val; > +; index++; > +; } > +; } > + > +define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind > alwaysinline { > +entry: > + %conv = zext i8 %val to i32 > + %shl = shl nuw i32 %conv, 24 > + %shl2 = shl nuw nsw i32 %conv, 16 > + %or = or i32 %shl, %shl2 > + %shl4 = shl nuw nsw i32 %conv, 8 > + %or5 = or i32 %or, %shl4 > + %or7 = or i32 %or5, %conv > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond10, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0 > + %0 = bitcast i8* %add.ptr to i32* > + store i32 %or7, i32* %0, align 4 > + br label %while.cond > + > +while.cond10: ; preds = %while.cond, > %while.body13 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > + %cmp11 = icmp ult i32 %index.1, %size > + br i1 %cmp11, label %while.body13, label %while.end14 > + > +while.body13: ; preds = %while.cond10 > + %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1 > + store i8 %val, i8* %arrayidx, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond10 > + > +while.end14: ; preds = %while.cond10 > + ret void > +} > + > +define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 > %size) nounwind alwaysinline { > +entry: > + %conv = zext i8 %val to i32 > + %shl = shl nuw i32 %conv, 24 > + %shl2 = shl nuw nsw i32 %conv, 16 > + %or = or i32 %shl, %shl2 > + %shl4 = shl nuw nsw i32 %conv, 8 > + %or5 = or i32 %or, %shl4 > + %or7 = or i32 %or5, %conv > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond10, label %while.body > + > +while.body: ; preds = %while.cond > + 
%add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0 > + %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)* > + store i32 %or7, i32 addrspace(1)* %0, align 4 > + br label %while.cond > + > +while.cond10: ; preds = %while.cond, > %while.body13 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > + %cmp11 = icmp ult i32 %index.1, %size > + br i1 %cmp11, label %while.body13, label %while.end14 > + > +while.body13: ; preds = %while.cond10 > + %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1 > + store i8 %val, i8 addrspace(1)* %arrayidx, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond10 > + > +while.end14: ; preds = %while.cond10 > + ret void > +} > + > +define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 > %size) nounwind alwaysinline { > +entry: > + %conv = zext i8 %val to i32 > + %shl = shl nuw i32 %conv, 24 > + %shl2 = shl nuw nsw i32 %conv, 16 > + %or = or i32 %shl, %shl2 > + %shl4 = shl nuw nsw i32 %conv, 8 > + %or5 = or i32 %or, %shl4 > + %or7 = or i32 %or5, %conv > + br label %while.cond > + > +while.cond: ; preds = %while.body, > %entry > + %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ] > + %add = add i32 %index.0, 4 > + %cmp = icmp ult i32 %add, %size > + br i1 %cmp, label %while.cond10, label %while.body > + > +while.body: ; preds = %while.cond > + %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0 > + %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)* > + store i32 %or7, i32 addrspace(3)* %0, align 4 > + br label %while.cond > + > +while.cond10: ; preds = %while.cond, > %while.body13 > + %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ] > + %cmp11 = icmp ult i32 %index.1, %size > + br i1 %cmp11, label %while.body13, label %while.end14 > + > +while.body13: ; preds = %while.cond10 > + %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1 > + store i8 %val, i8 addrspace(3)* %arrayidx, align 1 > + %inc = add i32 %index.1, 1 > + br label %while.cond10 > + > +while.end14: ; preds = %while.cond10 > + ret void > +} > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
