SPIR 1.2 requires llvm.memcpy support, and LLVM will sometimes emit llvm.memset as well. So add a pass that lowers these two intrinsics and then inlines the lowered calls.
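For illustration, here is a minimal sketch of the rewrite for a global-to-global memcpy and a private memset (hypothetical IR, not part of this patch; the value names %dst, %src, %buf, %val and %n are made up, and the five-operand intrinsic form assumes an LLVM 3.x toolchain):

  ; emitted by clang:
  call void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %n, i32 1, i1 false)
  call void @llvm.memset.p0i8.i32(i8* %buf, i8 %val, i32 %n, i32 1, i1 false)

  ; after the lowering pass (align/volatile operands dropped, suffix picked from the
  ; pointer address spaces as described below); the callees are then linked in from
  ; the pre-compiled module and inlined:
  call void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %n)
  call void @__gen_memset_p(i8* %buf, i8 %val, i32 %n)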
The intrinsic lowering pass finds every llvm.memset and llvm.memcpy call and replaces it with a call to __gen_memset_x or __gen_memcpy_xx, where x and xx encode the address spaces of the pointer arguments. Because this pass runs after clang, and by that point unused functions appear to have been stripped, the __gen_memset_x and __gen_memcpy_xx functions are implemented in the pre-compiled module and then linked in.

Signed-off-by: Yang Rong <[email protected]>
---
 backend/src/CMakeLists.txt                   |   3 +-
 backend/src/llvm/llvm_gen_backend.hpp        |   4 +
 backend/src/llvm/llvm_intrinsic_lowering.cpp | 172 ++++++++++++++
 backend/src/llvm/llvm_passes.cpp             |   2 +-
 backend/src/llvm/llvm_to_gen.cpp             |   2 +
 backend/src/ocl_memcpy.ll                    | 336 +++++++++++++++++++++++++++
 backend/src/ocl_memset.ll                    | 127 ++++++++++
 7 files changed, 644 insertions(+), 2 deletions(-)
 create mode 100644 backend/src/llvm/llvm_intrinsic_lowering.cpp
 create mode 100644 backend/src/ocl_memcpy.ll
 create mode 100644 backend/src/ocl_memset.ll

diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index b93133f..10bf67b 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -136,6 +136,7 @@ else (GBE_USE_BLOB)
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
+    llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_to_gen.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
@@ -165,7 +166,7 @@ add_library (gbe STATIC ${GBE_SRC})
 
 # for pre compiled module library.
 set (pcm_lib "beignet.bc")
-set (pcm_sources ocl_barrier.ll)
+set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
 ll_add_library (${pcm_lib} pcm_sources)
 
 ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index 55079f5..389d5f3 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -84,8 +84,12 @@ namespace gbe
   /*! Remove the GEP instructions */
   llvm::BasicBlockPass *createRemoveGEPPass(const ir::Unit &unit);
 
+  /*! Scalarize all vector op instructions */
   llvm::FunctionPass* createScalarizePass();
 
+  /*! Convert the Intrinsic call to gen function */
+  llvm::BasicBlockPass *createIntrinsicLoweringPass();
+
 } /* namespace gbe */
 
 #endif /* __GBE_LLVM_GEN_BACKEND_HPP__ */
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
new file mode 100644
index 0000000..1942860
--- /dev/null
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * \file llvm_intrinisc_lowering.cpp
+ * \author Yang Rong <[email protected]>
+ */
+
+#include "llvm/Config/config.h"
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+  class InstrinsicLowering : public BasicBlockPass
+  {
+  public:
+    static char ID;
+    InstrinsicLowering() :
+      BasicBlockPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+
+    }
+
+    virtual const char *getPassName() const {
+      return "SPIR backend: lowering instrinsics";
+    }
+    static char convertSpaceToName(Value *val) {
+      const uint32_t space = val->getType()->getPointerAddressSpace();
+      switch(space) {
+        case 0:
+          return 'p';
+        case 1:
+          return 'g';
+        case 3:
+          return 'l';
+        default:
+          assert("Non support address space");
+          return '\0';
+      }
+    }
+    static CallInst *replaceCallWith(const char *NewFn, CallInst *CI,
+                                     Value **ArgBegin, Value **ArgEnd,
+                                     Type *RetTy)
+    {
+      // If we haven't already looked up this function, check to see if the
+      // program already contains a function with this name.
+      Module *M = CI->getParent()->getParent()->getParent();
+      // Get or insert the definition now.
+      std::vector<Type *> ParamTys;
+      for (Value** I = ArgBegin; I != ArgEnd; ++I)
+        ParamTys.push_back((*I)->getType());
+      Constant* FCache = M->getOrInsertFunction(NewFn,
+                                      FunctionType::get(RetTy, ParamTys, false));
+
+      IRBuilder<> Builder(CI->getParent(), CI);
+      SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+      CallInst *NewCI = Builder.CreateCall(FCache, Args);
+      NewCI->setName(CI->getName());
+      if (!CI->use_empty())
+        CI->replaceAllUsesWith(NewCI);
+      CI->eraseFromParent();
+      return NewCI;
+    }
+    virtual bool runOnBasicBlock(BasicBlock &BB)
+    {
+      bool changedBlock = false;
+      Module *M = BB.getParent()->getParent();
+
+      DataLayout TD(M);
+      LLVMContext &Context = BB.getContext();
+      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+        Instruction *Inst = DI++;
+        CallInst* CI = dyn_cast<CallInst>(Inst);
+        if(CI == NULL)
+          continue;
+
+        IRBuilder<> Builder(&BB, CI);
+        // only support memcpy and memset
+        if (Function *F = CI->getCalledFunction()) {
+          const Intrinsic::ID intrinsicID = (Intrinsic::ID) F->getIntrinsicID();
+          if (intrinsicID == 0)
+            continue;
+          switch (intrinsicID) {
+            case Intrinsic::memcpy: {
+              Type *IntPtr = TD.getIntPtrType(Context);
+              Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                  /* isSigned */ false);
+              Value *Ops[3];
+              Ops[0] = CI->getArgOperand(0);
+              Ops[1] = CI->getArgOperand(1);
+              Ops[2] = Size;
+              char name[16] = "__gen_memcpy_xx";
+              name[13] = convertSpaceToName(Ops[0]);
+              name[14] = convertSpaceToName(Ops[1]);
+              replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+              break;
+            }
+            case Intrinsic::memset: {
+              Value *Op0 = CI->getArgOperand(0);
+              Value *val = Builder.CreateIntCast(CI->getArgOperand(1), IntegerType::getInt8Ty(Context),
+                                                 /* isSigned */ false);
+              Type *IntPtr = TD.getIntPtrType(Op0->getType());
+              Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                                  /* isSigned */ false);
+              Value *Ops[3];
+              Ops[0] = Op0;
+              // Extend the amount to i32.
+              Ops[1] = val;
+              Ops[2] = Size;
+              char name[16] = "__gen_memset_x";
+              name[13] = convertSpaceToName(Ops[0]);
+              replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
+              break;
+            }
+            default:
+              continue;
+          }
+        }
+      }
+      return changedBlock;
+    }
+  };
+
+  char InstrinsicLowering::ID = 0;
+
+  BasicBlockPass *createIntrinsicLoweringPass() {
+    return new InstrinsicLowering();
+  }
+} // end namespace
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 3bb6f71..1091dae 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -232,7 +232,7 @@ namespace gbe
     }
 
     virtual const char *getPassName() const {
-      return "PTX backend: insert special ptx instructions";
+      return "SPIR backend: insert special spir instructions";
     }
 
     bool simplifyGEPInstructions(GetElementPtrInst* GEPInst);
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index a9f70d9..b227912 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -175,6 +175,8 @@ namespace gbe
     // Print the code before further optimizations
     if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
       passes.add(createPrintModulePass(&*o));
+    passes.add(createIntrinsicLoweringPass());
+    passes.add(createFunctionInliningPass(200000));
     passes.add(createScalarReplAggregatesPass()); // Break up allocas
     passes.add(createRemoveGEPPass(unit));
     passes.add(createConstantPropagationPass());
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/ocl_memcpy.ll
new file mode 100644
index 0000000..476033e
--- /dev/null
+++ b/backend/src/ocl_memcpy.ll
@@ -0,0 +1,336 @@
+;The memcpy's source code.
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+;   size_t index = 0;
+;   while((index + 4) >= size) {
+;     *((uint *)(dst + index)) = *((uint *)(src + index));
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = src[index];
+;     index++;
+;   }
+; }
+
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(1)* %add.ptr1 to i32 addrspace(1)*
+  store i32 %1, i32 addrspace(1)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(1)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(0)* %add.ptr1 to i32 addrspace(0)*
+  store i32 %1, i32 addrspace(0)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(0)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(0)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  %1 = load i32 addrspace(1)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %index.1
+  %3 = load i8 addrspace(1)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(0)* %add.ptr to i32 addrspace(0)*
+  %1 = load i32 addrspace(0)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(0)* %src, i32 %index.1
+  %3 = load i8 addrspace(0)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond3, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  %1 = load i32 addrspace(3)* %0, align 4
+  %add.ptr1 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %2 = bitcast i8 addrspace(3)* %add.ptr1 to i32 addrspace(3)*
+  store i32 %1, i32 addrspace(3)* %2, align 4
+  br label %while.cond
+
+while.cond3:                                      ; preds = %while.cond, %while.body5
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body5 ]
+  %cmp4 = icmp ult i32 %index.1, %size
+  br i1 %cmp4, label %while.body5, label %while.end7
+
+while.body5:                                      ; preds = %while.cond3
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %src, i32 %index.1
+  %3 = load i8 addrspace(3)* %arrayidx, align 1
+  %arrayidx6 = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %3, i8 addrspace(3)* %arrayidx6, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond3
+
+while.end7:                                       ; preds = %while.cond3
+  ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/ocl_memset.ll
new file mode 100644
index 0000000..addf9f5
--- /dev/null
+++ b/backend/src/ocl_memset.ll
@@ -0,0 +1,127 @@
+;The memset's source code.
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+;   size_t index = 0;
+;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
+;   while((index + 4) >= size) {
+;     *((uint *)(dst + index)) = v;
+;     index += 4;
+;   }
+;   while(index < size) {
+;     dst[index] = val;
+;     index++;
+;   }
+; }
+
+define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8* %dst, i32 %index.0
+  %0 = bitcast i8* %add.ptr to i32*
+  store i32 %or7, i32* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8* %dst, i32 %index.1
+  store i8 %val, i8* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(1)* %add.ptr to i32 addrspace(1)*
+  store i32 %or7, i32 addrspace(1)* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(1)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(1)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+entry:
+  %conv = zext i8 %val to i32
+  %shl = shl nuw i32 %conv, 24
+  %shl2 = shl nuw nsw i32 %conv, 16
+  %or = or i32 %shl, %shl2
+  %shl4 = shl nuw nsw i32 %conv, 8
+  %or5 = or i32 %or, %shl4
+  %or7 = or i32 %or5, %conv
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  %add = add i32 %index.0, 4
+  %cmp = icmp ult i32 %add, %size
+  br i1 %cmp, label %while.cond10, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %add.ptr = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.0
+  %0 = bitcast i8 addrspace(3)* %add.ptr to i32 addrspace(3)*
+  store i32 %or7, i32 addrspace(3)* %0, align 4
+  br label %while.cond
+
+while.cond10:                                     ; preds = %while.cond, %while.body13
+  %index.1 = phi i32 [ %index.0, %while.cond ], [ %inc, %while.body13 ]
+  %cmp11 = icmp ult i32 %index.1, %size
+  br i1 %cmp11, label %while.body13, label %while.end14
+
+while.body13:                                     ; preds = %while.cond10
+  %arrayidx = getelementptr inbounds i8 addrspace(3)* %dst, i32 %index.1
+  store i8 %val, i8 addrspace(3)* %arrayidx, align 1
+  %inc = add i32 %index.1, 1
+  br label %while.cond10
+
+while.end14:                                      ; preds = %while.cond10
+  ret void
+}
--
1.8.3.2

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
