diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 84b71ad038422..fa2930655a30c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2871,8 +2871,9 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum( // TODO handle more types llvm::Type *SumType = Val->getType(); assert( - (SumType->isIntegerTy() && (SumType->getPrimitiveSizeInBits() == 32 || - SumType->getPrimitiveSizeInBits() == 64)) && + (SumType->isFloatTy() || SumType->isDoubleTy() || + (SumType->isIntegerTy() && (SumType->getPrimitiveSizeInBits() == 32 || + SumType->getPrimitiveSizeInBits() == 64))) && "Unhandled type"); llvm::Type *Int32Ty = llvm::Type::getInt32Ty(CGM.getLLVMContext()); @@ -2880,7 +2881,9 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum( std::pair RfunPair = getXteamRedFunctionPtrs(CGF, SumType); - llvm::Value *ZeroVal = SumType->getPrimitiveSizeInBits() == 32 + llvm::Value *ZeroVal = (SumType->isFloatTy() || SumType->isDoubleTy()) + ? llvm::ConstantFP::getZero(SumType) + : SumType->getPrimitiveSizeInBits() == 32 ? 
llvm::ConstantInt::get(Int32Ty, 0) : llvm::ConstantInt::get(Int64Ty, 0); @@ -2906,37 +2909,166 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanSum( "XTeam Reduction blocksize must be a power of two"); if (SumType->isIntegerTy()) { + if (SumType->getPrimitiveSizeInBits() == 64) { + if (WarpSize == 64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_4x64), + Args); + else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_32x32), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_l_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } else if (SumType->getPrimitiveSizeInBits() == 32) { + if (WarpSize == 64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_4x64), + Args); 
+ else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_32x32), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_i_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } + } + if (SumType->isDoubleTy()) { + if (WarpSize == 64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_4x64), + Args); + else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_32x32), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_d_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } + if (SumType->isFloatTy()) { + // FIXME: The Xteam Scan Implementation exhibits unpredictable behavior for + // 'float' 
datatype when number of elements to be scanned goes beyond 1 + million. This issue requires further debugging. if (WarpSize == 64) { if (BlockSize == 1024) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_xteams_i_16x64), + OMPRTL___kmpc_xteams_f_16x64), Args); else if (BlockSize == 512) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_xteams_i_8x64), + OMPRTL___kmpc_xteams_f_8x64), Args); else if (BlockSize == 256) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_xteams_i_4x64), + OMPRTL___kmpc_xteams_f_4x64), Args); else - llvm_unreachable("Block size should be 256, 512 or 1024."); + llvm_unreachable("Block size unsupported."); } else if (WarpSize == 32) { - if (BlockSize == 512) + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), + OMPRTL___kmpc_xteams_f_32x32), + Args); + else if (BlockSize == 512) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_xteams_i_16x32), + OMPRTL___kmpc_xteams_f_16x32), Args); else if (BlockSize == 256) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_xteams_i_8x32), + OMPRTL___kmpc_xteams_f_8x32), Args); else - llvm_unreachable("Block size should be 256 or 512."); + llvm_unreachable("Block size unsupported."); } else llvm_unreachable("Warp size should be 32 or 64."); } @@ -2951,8 +3083,9 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanPhaseTwo( // TODO handle more types llvm::Type *SumType = Val->getType(); assert( - (SumType->isIntegerTy() && (SumType->getPrimitiveSizeInBits() == 32 || - SumType->getPrimitiveSizeInBits() == 64)) && + (SumType->isFloatTy() || SumType->isDoubleTy() || + (SumType->isIntegerTy() && (SumType->getPrimitiveSizeInBits() == 32 || + SumType->getPrimitiveSizeInBits() == 64))) && "Unhandled type"); 
llvm::Type *Int32Ty = llvm::Type::getInt32Ty(CGM.getLLVMContext()); @@ -2960,7 +3093,9 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanPhaseTwo( std::pair RfunPair = getXteamRedFunctionPtrs(CGF, SumType); - llvm::Value *ZeroVal = SumType->getPrimitiveSizeInBits() == 32 + llvm::Value *ZeroVal = (SumType->isFloatTy() || SumType->isDoubleTy()) + ? llvm::ConstantFP::getZero(SumType) + : SumType->getPrimitiveSizeInBits() == 32 ? llvm::ConstantInt::get(Int32Ty, 0) : llvm::ConstantInt::get(Int64Ty, 0); @@ -2979,37 +3114,163 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamScanPhaseTwo( "XTeam Reduction blocksize must be a power of two"); if (SumType->isIntegerTy()) { + if (SumType->getPrimitiveSizeInBits() == 64) { + if (WarpSize == 64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_4x64), + Args); + else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_32x32), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_l_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } else if (SumType->getPrimitiveSizeInBits() == 32) { + if (WarpSize == 
64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_4x64), + Args); + else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_32x32), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } + } + if (SumType->isDoubleTy()) { + if (WarpSize == 64) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_16x64), + Args); + else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_8x64), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_4x64), + Args); + else + llvm_unreachable("Block size unsupported."); + } else if (WarpSize == 32) { + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_32x32), + Args); + 
else if (BlockSize == 512) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_16x32), + Args); + else if (BlockSize == 256) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_d_8x32), + Args); + else + llvm_unreachable("Block size unsupported."); + } else + llvm_unreachable("Warp size should be 32 or 64."); + } + if (SumType->isFloatTy()) { if (WarpSize == 64) { if (BlockSize == 1024) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x64), + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_16x64), Args); else if (BlockSize == 512) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x64), + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_8x64), Args); else if (BlockSize == 256) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_4x64), + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_4x64), Args); else - llvm_unreachable("Block size should be 256, 512 or 1024."); + llvm_unreachable("Block size unsupported."); } else if (WarpSize == 32) { - if (BlockSize == 512) + if (BlockSize == 1024) + return CGF.EmitRuntimeCall( + OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_32x32), + Args); + else if (BlockSize == 512) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_16x32), + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_16x32), Args); else if (BlockSize == 256) return CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_xteams_phase2_i_8x32), + CGM.getModule(), OMPRTL___kmpc_xteams_phase2_f_8x32), Args); else - llvm_unreachable("Block size should be 256 or 512."); + llvm_unreachable("Block size 
unsupported."); } else llvm_unreachable("Warp size should be 32 or 64."); } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index bff9aed782328..2de5ed385d043 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -549,6 +549,7 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( assert(Itr != RedVarMap.end() && "Metadata not found"); const CodeGenModule::XteamRedVarInfo &RVI = Itr->second; + llvm::Type *RedVarType = ConvertTypeForMem(XteamVD->getType()); assert(RVI.ArgPos + 1 < Args->size() && "Arg position beyond bounds"); @@ -568,9 +569,9 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( // { // RedVar += TeamVals[TeamID - 1] // } - Address ScanStorageValGEP = Address( - Builder.CreateGEP(Int32Ty, DScanStorage, GlobalGpuThreadId), Int32Ty, + Builder.CreateGEP(RedVarType, DScanStorage, GlobalGpuThreadId), + RedVarType, getContext().getTypeAlignInChars( XteamVD->getType())); // Storage[GlobalTID] Builder.CreateStore(Builder.CreateLoad(ScanStorageValGEP), @@ -586,10 +587,10 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( EmitBlock(IsAfterFirstTeamThenBlock); Address PrevTeamValGEP = Address(Builder.CreateGEP( - Int32Ty, DTeamVals, + RedVarType, DTeamVals, Builder.CreateSub(WorkGroupId, llvm::ConstantInt::get(Int32Ty, 1))), - Int32Ty, + RedVarType, getContext().getTypeAlignInChars( XteamVD->getType())); // TeamVals[TeamID - 1] Builder.CreateStore(Builder.CreateAdd(Builder.CreateLoad(RVI.RedVarAddr), @@ -614,7 +615,7 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( // } // } - Builder.CreateStore(llvm::ConstantInt::get(Int32Ty, 0), + Builder.CreateStore(llvm::ConstantInt::get(RedVarType, 0), RVI.RedVarAddr); // RedVar = 0 llvm::Value *IsNotFirstThread = Builder.CreateICmpUGE( GlobalGpuThreadId, @@ -630,8 +631,8 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( GlobalGpuThreadId, llvm::ConstantInt::get(Int32Ty, 1)); // GlobalTID - 1 Address ScanStoragePrevValGEP = Address( - 
Builder.CreateGEP(Int32Ty, DScanStorage, PrevGlobalGpuThreadId), - Int32Ty, + Builder.CreateGEP(RedVarType, DScanStorage, PrevGlobalGpuThreadId), + RedVarType, getContext().getTypeAlignInChars( XteamVD->getType())); // Storage[GlobalTID - 1] Builder.CreateStore(Builder.CreateLoad(ScanStoragePrevValGEP), @@ -656,10 +657,10 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( EmitBlock(IsNotFirstThreadInTeamThenBlock); Address PrevTeamValGEP = Address(Builder.CreateGEP( - Int32Ty, DTeamVals, + RedVarType, DTeamVals, Builder.CreateSub(WorkGroupId, llvm::ConstantInt::get(Int32Ty, 1))), - Int32Ty, + RedVarType, getContext().getTypeAlignInChars( XteamVD->getType())); // TeamVals[TeamID - 1] Builder.CreateStore(Builder.CreateAdd(Builder.CreateLoad(RVI.RedVarAddr), @@ -676,10 +677,10 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( EmitBlock(IsAfterSecondTeamThenBlock); Address PrevPrevTeamValGEP = Address(Builder.CreateGEP( - Int32Ty, DTeamVals, + RedVarType, DTeamVals, Builder.CreateSub(WorkGroupId, llvm::ConstantInt::get(Int32Ty, 2))), - Int32Ty, + RedVarType, getContext().getTypeAlignInChars( XteamVD->getType())); // TeamVals[TeamID - 2] Builder.CreateStore( @@ -2307,12 +2308,12 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S, llvm::Value *SegmentLoopUB = nullptr; llvm::Value *DSegmentVals = nullptr; - llvm::Value *ThreadLevelRes = nullptr; llvm::Value *GlobalUpperBound = nullptr; const Address *RedVarAddr = nullptr; llvm::BasicBlock *ExecBB = nullptr; llvm::BasicBlock *DoneBB = nullptr; - clang::QualType RedVarType; + const clang::VarDecl *XteamVD; + llvm::Type *RedVarType; if (getLangOpts().OpenMPIsTargetDevice && CGM.isXteamSegmentedScanKernel()) { // Compute Loop trip-count (N) = GlobalUB - GlobalLB + 1 const auto UBLValue = EmitLValue( @@ -2368,19 +2369,19 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S, Builder.CreateMul(SegmentSizeForScan, GlobalGpuThreadId), BigJumpLoopIvAddr); // *iv = GlobalTID * Seg_Size - // Every 
thread loops till just before the SegmentLoopUB = (GlobaTID + 1) * - // Seg_Size + // Every thread loops till just before the SegmentLoopUB: + // SegmentLoopUB = (GlobaTID + 1) * Seg_Size SegmentLoopUB = Builder.CreateMul( SegmentSizeForScan, Builder.CreateAdd(GlobalGpuThreadId, llvm::ConstantInt::get(Int32Ty, 1))); - auto XteamVD = *(CGM.getXteamOrderedRedVar(&S).begin()); + XteamVD = *(CGM.getXteamOrderedRedVar(&S).begin()); + RedVarType = ConvertTypeForMem(XteamVD->getType()); const CodeGenModule::XteamRedVarMap &RedVarMap = CGM.getXteamRedVarMap(&S); const CodeGenModule::XteamRedVarInfo &RVI = (RedVarMap.find(XteamVD))->second; RedVarAddr = &(RVI.RedVarAddr); - RedVarType = XteamVD->getType(); // SegmentValsAddr points to the SegmentVals array which will store the // intermediate scan results computed per segment by a single thread @@ -2520,11 +2521,12 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S, if (!CGM.isXteamScanPhaseOne) { // SegmentVals contains the final scanned results computed for every // element in a segment. 
- Address SegmentValsGEP = Address( - Builder.CreateGEP(Int32Ty, DSegmentVals, - Builder.CreateLoad(BigJumpLoopIvAddr)), - Int32Ty, - getContext().getTypeAlignInChars(RedVarType)); // SegmentVals[*iv] + Address SegmentValsGEP = + Address(Builder.CreateGEP(RedVarType, DSegmentVals, + Builder.CreateLoad(BigJumpLoopIvAddr)), + RedVarType, + getContext().getTypeAlignInChars( + XteamVD->getType())); // SegmentVals[*iv] // emit redvar = SegmentVals[omp.iv] Builder.CreateStore(Builder.CreateLoad(SegmentValsGEP), *RedVarAddr); } @@ -2548,11 +2550,12 @@ void CodeGenFunction::EmitForStmtWithArgs(const ForStmt &S, (CGM.isXteamRedKernel(&S) || CGM.isBigJumpLoopKernel(&S))) { if (CGM.isXteamSegmentedScanKernel()) { EmitBlock(Continue.getBlock()); - Address SegmentValsGEP = Address( - Builder.CreateGEP(Int32Ty, DSegmentVals, - Builder.CreateLoad(BigJumpLoopIvAddr)), - Int32Ty, - getContext().getTypeAlignInChars(RedVarType)); // Segment_Vals[*iv] + Address SegmentValsGEP = + Address(Builder.CreateGEP(RedVarType, DSegmentVals, + Builder.CreateLoad(BigJumpLoopIvAddr)), + RedVarType, + getContext().getTypeAlignInChars( + XteamVD->getType())); // Segment_Vals[*iv] Builder.CreateStore(Builder.CreateLoad(*RedVarAddr), SegmentValsGEP); // Segment_Vals[*iv] = red_var llvm::Value *SegmentScanLoopInc = diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index c8aca7967b26c..b379c5e268f5c 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -415,10 +415,10 @@ void CodeGenFunction::InitializeXteamRedCapturedVars( llvm::Value *DScanStorageInst = Builder.CreateAlloca(RedVarType, nullptr, "d_scan_storage"); Address DScanStorageAddr( - DScanStorageInst, Int32Ty, + DScanStorageInst, RedVarType, Context.getTypeAlignInChars(Context.UnsignedIntTy)); llvm::Value *NullPtrDScanStorage = - llvm::ConstantPointerNull::get(Int32Ty->getPointerTo()); + llvm::ConstantPointerNull::get(RedVarType->getPointerTo()); 
Builder.CreateStore(NullPtrDScanStorage, DScanStorageAddr); assert(DScanStorageInst && "Device scan storage pointer cannot be null"); @@ -428,10 +428,10 @@ void CodeGenFunction::InitializeXteamRedCapturedVars( llvm::Value *DSegmentValsInst = Builder.CreateAlloca(RedVarType, nullptr, "d_segment_vals"); Address DSegmentValsAddr( - DSegmentValsInst, Int32Ty, + DSegmentValsInst, RedVarType, Context.getTypeAlignInChars(Context.UnsignedIntTy)); llvm::Value *NullPtrDSegmentVals = - llvm::ConstantPointerNull::get(Int32Ty->getPointerTo()); + llvm::ConstantPointerNull::get(RedVarType->getPointerTo()); Builder.CreateStore(NullPtrDSegmentVals, DSegmentValsAddr); assert(DSegmentValsInst && "Segment Vals Array pointer cannot be null"); diff --git a/clang/test/OpenMP/xteam_scan_datatypes.cpp b/clang/test/OpenMP/xteam_scan_datatypes.cpp new file mode 100644 index 0000000000000..bd02bab189db8 --- /dev/null +++ b/clang/test/OpenMP/xteam_scan_datatypes.cpp @@ -0,0 +1,3106 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ + +/*---------------------------------------- Test Xteam Segmented Scan ---------------------------------------*/ + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -fopenmp-assume-no-thread-state -fopenmp-target-xteam-scan -emit-llvm-bc %s -o %t-ppc-host1.bc +// RUN: %clang_cc1 -target-cpu gfx90a -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-ignore-env-vars -fopenmp-assume-no-nested-parallelism -fopenmp-assume-no-thread-state -fopenmp-target-xteam-scan -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host1.bc -o - | FileCheck %s + +// 
expected-no-diagnostics + +#include +#define N 1000000 + +template +void run_test() +{ + T sum = T(0); + T *a = new T[N]; + T *b = new T[N]; + + #pragma omp target teams distribute parallel for reduction(inscan, +:sum) map(to: a[0:N]) map(tofrom: b[0:N]) + for(int i = 0; i < N; i++) { + sum += a[i]; + #pragma omp scan inclusive(sum) + b[i] = sum; + } + + sum = T(0); + #pragma omp target teams distribute parallel for reduction(inscan, +:sum) map(to: a[0:N]) map(tofrom: b[0:N]) + for(int i = 0; i < N; i++) { + b[i] = sum; + #pragma omp scan exclusive(sum) + sum += a[i]; + } + + delete[] a; + delete[] b; +} + +int main() { + run_test(); + run_test(); + run_test(); + run_test(); + run_test(); + run_test(); + return 0; +} +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIiEvv_l20 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: 
[[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store 
ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 
4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr 
[[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP38]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP40]], [[TMP39]] +// CHECK-NEXT: store i32 [[TMP41]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP46]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i64 [[IDXPROM9]] +// CHECK-NEXT: store i32 [[TMP44]], ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr 
[[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP49:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_i_16x64(i32 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIiEvv_l20_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: 
[[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store 
ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_i_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_i, i32 0, i64 [[TMP16]], i32 1) +// CHECK-NEXT: 
[[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 +// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], [[TMP46]] +// CHECK-NEXT: store i32 [[TMP48]], ptr 
addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = zext i32 [[TMP49]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP54]], ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP55]] +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP57]], ptr [[TMP56]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP59:%.*]] = add i32 1, [[TMP58]] +// CHECK-NEXT: store i32 [[TMP59]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP51:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIiEvv_l28 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, 
ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// 
CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP39]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i64 [[IDXPROM]] +// CHECK-NEXT: store i32 [[TMP37]], ptr [[ARRAYIDX]], align 4 +// 
CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP43]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i64 [[IDXPROM9]] +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP45]], [[TMP44]] +// CHECK-NEXT: store i32 [[TMP46]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP52:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call 
void @__kmpc_xteams_i_16x64(i32 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIiEvv_l28_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr 
+// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr 
[[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 
@llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_i_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_i, i32 0, i64 [[TMP16]], i32 0) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// 
CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 +// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP46]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP47]] to i64 +// CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 +// CHECK-NEXT: br i1 [[TMP49]], label [[OMP_EXCLUSIVE_COPY_EXIT:%.*]], label [[OMP_EXCLUSIVE_DEC:%.*]] +// CHECK: omp.exclusive.dec: +// CHECK-NEXT: [[TMP50:%.*]] = sub nuw i64 [[TMP48]], 1 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_EXCLUSIVE_COPY_EXIT]] +// CHECK: omp.exclusive.copy.exit: +// CHECK-NEXT: br label 
[[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP55]], [[TMP54]] +// CHECK-NEXT: store i32 [[TMP56]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP57]] +// CHECK-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP59]], ptr [[TMP58]], align 4 +// CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP61:%.*]] = add i32 1, [[TMP60]] +// CHECK-NEXT: store i32 [[TMP61]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP53:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIjEvv_l20 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca 
ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// 
CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 
@__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 
[[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP38]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP37]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP40]], [[TMP39]] +// CHECK-NEXT: store i32 [[TMP41]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] 
+// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP46]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i64 [[IDXPROM9]] +// CHECK-NEXT: store i32 [[TMP44]], ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP54:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_i_16x64(i32 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIjEvv_l20_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: 
[[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: 
store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 
[[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_i_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_i, i32 0, i64 [[TMP16]], i32 1) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 +// CHECK-NEXT: store i32 
[[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP47]], [[TMP46]] +// CHECK-NEXT: store i32 [[TMP48]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = zext i32 [[TMP49]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP54]], ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP55:%.*]] = load i32, 
ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP55]] +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP57]], ptr [[TMP56]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP59:%.*]] = add i32 1, [[TMP58]] +// CHECK-NEXT: store i32 [[TMP59]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP55:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIjEvv_l28 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, 
align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr 
[[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// 
CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP39]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP38]], i64 [[IDXPROM]] +// CHECK-NEXT: store i32 [[TMP37]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP43]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i64 [[IDXPROM9]] +// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP45]], [[TMP44]] +// CHECK-NEXT: store i32 [[TMP46]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr addrspace(5) [[TMP7]], 
align 4 +// CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP56:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_i_16x64(i32 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_i, ptr @__kmpc_rfun_sum_lds_i, i32 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIjEvv_l28_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, 
addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], 
align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 
@__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_i_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_i, i32 0, i64 [[TMP16]], i32 0) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr 
[[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4 +// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP46]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr 
[[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP47]] to i64 +// CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 +// CHECK-NEXT: br i1 [[TMP49]], label [[OMP_EXCLUSIVE_COPY_EXIT:%.*]], label [[OMP_EXCLUSIVE_DEC:%.*]] +// CHECK: omp.exclusive.dec: +// CHECK-NEXT: [[TMP50:%.*]] = sub nuw i64 [[TMP48]], 1 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_EXCLUSIVE_COPY_EXIT]] +// CHECK: omp.exclusive.copy.exit: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = add i32 [[TMP55]], [[TMP54]] +// CHECK-NEXT: store i32 [[TMP56]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = getelementptr i32, ptr [[TMP35]], i32 [[TMP57]] +// CHECK-NEXT: [[TMP59:%.*]] = load i32, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 [[TMP59]], ptr [[TMP58]], align 4 +// CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP61:%.*]] = add i32 1, [[TMP60]] +// CHECK-NEXT: store i32 [[TMP61]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], 
!llvm.loop [[LOOP57:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIlEvv_l20 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void 
@__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: store i64 0, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: 
[[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i64 0, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP38]] to i64 +// CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP37]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP39:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], [[TMP39]] +// CHECK-NEXT: store i64 [[TMP41]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load i64, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP46]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, ptr [[TMP45]], i64 [[IDXPROM9]] +// CHECK-NEXT: store i64 [[TMP44]], ptr [[ARRAYIDX10]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP49]], ptr [[TMP48]], align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP58:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load 
ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_l_16x64(i64 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIlEvv_l20_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i64, 
align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr 
[[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: store i64 0, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: 
[[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_l_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_l, i64 0, i64 [[TMP16]], i32 1) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = 
icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i64, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i64, ptr [[TMP42]], align 8 +// CHECK-NEXT: store i64 [[TMP43]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 0, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP48:%.*]] = add i64 [[TMP47]], [[TMP46]] +// CHECK-NEXT: store i64 [[TMP48]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = zext i32 [[TMP49]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP4]], align 8 +// 
CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP54]], ptr [[ARRAYIDX11]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = getelementptr i64, ptr [[TMP35]], i32 [[TMP55]] +// CHECK-NEXT: [[TMP57:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP57]], ptr [[TMP56]], align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP59:%.*]] = add i32 1, [[TMP58]] +// CHECK-NEXT: store i32 [[TMP59]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP59:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIlEvv_l28 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: 
[[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: 
[[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: store i64 0, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() 
+// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] 
= mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i64 0, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load i64, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP39]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP38]], i64 [[IDXPROM]] +// CHECK-NEXT: store i64 [[TMP37]], ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP43]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = 
getelementptr inbounds i64, ptr [[TMP42]], i64 [[IDXPROM9]] +// CHECK-NEXT: [[TMP44:%.*]] = load i64, ptr [[ARRAYIDX10]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]] +// CHECK-NEXT: store i64 [[TMP46]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP49]], ptr [[TMP48]], align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP60:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_l_16x64(i64 [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_l, ptr @__kmpc_rfun_sum_lds_l, i64 0, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIlEvv_l28_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 
dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast 
ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: store i64 0, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// 
CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = 
udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_l_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_l, i64 0, i64 [[TMP16]], i32 0) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr i64, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load i64, ptr [[TMP42]], align 8 +// CHECK-NEXT: store i64 [[TMP43]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 0, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label 
[[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP46]], ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP47]] to i64 +// CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 +// CHECK-NEXT: br i1 [[TMP49]], label [[OMP_EXCLUSIVE_COPY_EXIT:%.*]], label [[OMP_EXCLUSIVE_DEC:%.*]] +// CHECK: omp.exclusive.dec: +// CHECK-NEXT: [[TMP50:%.*]] = sub nuw i64 [[TMP48]], 1 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw i64, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP4]], align 8 +// CHECK-NEXT: br label [[OMP_EXCLUSIVE_COPY_EXIT]] +// CHECK: omp.exclusive.copy.exit: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load i64, ptr [[ARRAYIDX11]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP54]] +// CHECK-NEXT: store i64 [[TMP56]], ptr 
addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = getelementptr i64, ptr [[TMP35]], i32 [[TMP57]] +// CHECK-NEXT: [[TMP59:%.*]] = load i64, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i64 [[TMP59]], ptr [[TMP58]], align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP61:%.*]] = add i32 1, [[TMP60]] +// CHECK-NEXT: store i32 [[TMP61]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP61:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIdEvv_l20 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = 
alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr 
[[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: store double 0.000000e+00, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store 
i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br 
i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store double 0.000000e+00, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP38]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP37]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP39:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: [[TMP40:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP41:%.*]] = fadd double [[TMP40]], [[TMP39]] +// CHECK-NEXT: store double [[TMP41]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load double, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP46]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[TMP45]], i64 [[IDXPROM9]] +// CHECK-NEXT: store double [[TMP44]], ptr [[ARRAYIDX10]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label 
[[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP49]], ptr [[TMP48]], align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP62:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_d_16x64(double [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIdEvv_l20_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca 
ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: 
[[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: store double 0.000000e+00, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], 
[[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) 
[[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_d_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, double 0.000000e+00, i64 [[TMP16]], i32 1) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load double, ptr [[TMP42]], align 8 +// CHECK-NEXT: store double [[TMP43]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double 0.000000e+00, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP44]], i64 
[[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: [[TMP47:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP48:%.*]] = fadd double [[TMP47]], [[TMP46]] +// CHECK-NEXT: store double [[TMP48]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = zext i32 [[TMP49]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP51]], ptr [[TMP4]], align 8 +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP54]], ptr [[ARRAYIDX11]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = getelementptr double, ptr [[TMP35]], i32 [[TMP55]] +// CHECK-NEXT: [[TMP57:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP57]], ptr [[TMP56]], align 8 +// CHECK-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP59:%.*]] = add i32 1, [[TMP58]] +// CHECK-NEXT: store i32 [[TMP59]], ptr 
[[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP63:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIdEvv_l28 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr 
[[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: store double 0.000000e+00, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = 
mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store double 0.000000e+00, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load double, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// 
CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP39]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP38]], i64 [[IDXPROM]] +// CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP43]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[TMP42]], i64 [[IDXPROM9]] +// CHECK-NEXT: [[TMP44:%.*]] = load double, ptr [[ARRAYIDX10]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = fadd double [[TMP45]], [[TMP44]] +// CHECK-NEXT: store double [[TMP46]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP49]], ptr [[TMP48]], align 8 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP64:![0-9]+]] +// CHECK: for.end: +// 
CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_d_16x64(double [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_d, ptr @__kmpc_rfun_sum_lds_d, double 0.000000e+00, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIdEvv_l28_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, 
addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], 
align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca double, align 8, addrspace(5) +// CHECK-NEXT: store double 0.000000e+00, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// 
CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_d_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_d, double 0.000000e+00, i64 [[TMP16]], i32 0) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: 
[[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load double, ptr [[TMP42]], align 8 +// CHECK-NEXT: store double [[TMP43]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double 0.000000e+00, ptr [[SUM8_ASCAST]], align 8 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP46]], ptr [[ARRAYIDX]], align 8 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP47]] to i64 +// CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 +// CHECK-NEXT: br i1 [[TMP49]], label [[OMP_EXCLUSIVE_COPY_EXIT:%.*]], label [[OMP_EXCLUSIVE_DEC:%.*]] +// CHECK: omp.exclusive.dec: 
+// CHECK-NEXT: [[TMP50:%.*]] = sub nuw i64 [[TMP48]], 1 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw double, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP51]], ptr [[TMP4]], align 8 +// CHECK-NEXT: br label [[OMP_EXCLUSIVE_COPY_EXIT]] +// CHECK: omp.exclusive.copy.exit: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load double, ptr [[ARRAYIDX11]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP56:%.*]] = fadd double [[TMP55]], [[TMP54]] +// CHECK-NEXT: store double [[TMP56]], ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = getelementptr double, ptr [[TMP35]], i32 [[TMP57]] +// CHECK-NEXT: [[TMP59:%.*]] = load double, ptr addrspace(5) [[TMP7]], align 8 +// CHECK-NEXT: store double [[TMP59]], ptr [[TMP58]], align 8 +// CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP61:%.*]] = add i32 1, [[TMP60]] +// CHECK-NEXT: store i32 [[TMP61]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP65:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define 
{{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIfEvv_l20 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: 
store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], 
[[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store float 0.000000e+00, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP38]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP37]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load 
float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = fadd float [[TMP40]], [[TMP39]] +// CHECK-NEXT: store float [[TMP41]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = zext i32 [[TMP42]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP46]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[TMP45]], i64 [[IDXPROM9]] +// CHECK-NEXT: store float [[TMP44]], ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP66:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load float, ptr 
addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_f_16x64(float [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIfEvv_l20_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: 
[[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr 
[[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 
@__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_f_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_f, float 0.000000e+00, i64 [[TMP16]], i32 1) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] 
+// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr float, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4 +// CHECK-NEXT: store float [[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float 0.000000e+00, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = fadd float [[TMP47]], [[TMP46]] +// CHECK-NEXT: store float [[TMP48]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = zext i32 [[TMP49]] to i64 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: 
+// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP54]], ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP56:%.*]] = getelementptr float, ptr [[TMP35]], i32 [[TMP55]] +// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP57]], ptr [[TMP56]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP59:%.*]] = add i32 1, [[TMP58]] +// CHECK-NEXT: store i32 [[TMP59]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP67:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIfEvv_l28 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: 
[[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to 
ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], 
[[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: +// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP29]] +// 
CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], [[TMP30]] +// CHECK-NEXT: [[TMP34:%.*]] = icmp ule i32 [[TMP32]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP35:%.*]] = and i1 [[TMP34]], [[TMP33]] +// CHECK-NEXT: br i1 [[TMP35]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP36]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store float 0.000000e+00, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP39]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP38]], i64 [[IDXPROM]] +// CHECK-NEXT: store float [[TMP37]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP40]] to i64 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: br label [[OMP_AFTER_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP42:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP43]] to i64 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr 
[[TMP42]], i64 [[IDXPROM9]] +// CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = fadd float [[TMP45]], [[TMP44]] +// CHECK-NEXT: store float [[TMP46]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = getelementptr float, ptr [[TMP31]], i32 [[TMP47]] +// CHECK-NEXT: [[TMP49:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP49]], ptr [[TMP48]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP51:%.*]] = add i32 1, [[TMP50]] +// CHECK-NEXT: store i32 [[TMP51]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP68:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load ptr, ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP55:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_f_16x64(float [[TMP55]], ptr [[TMP54]], ptr [[TMP4]], ptr [[TMP52]], ptr [[TMP53]], ptr @__kmpc_rfun_sum_f, ptr @__kmpc_rfun_sum_lds_f, float 0.000000e+00, i64 [[TMP16]], i32 [[TMP15]]) +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z8run_testIfEvv_l28_1 +// CHECK-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[B:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]], ptr noundef [[A:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[SUM1:%.*]], ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// CHECK-NEXT: [[SUM_ADDR2:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR4:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[DOTADDR5:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SUM8:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[VLA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VLA_ADDR]] to ptr +// CHECK-NEXT: [[SUM_ADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR2]] to ptr +// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr +// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr +// CHECK-NEXT: [[DOTADDR4_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[DOTADDR4]] to ptr +// CHECK-NEXT: [[DOTADDR5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR5]] to ptr +// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr +// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr +// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr +// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr +// CHECK-NEXT: [[SUM8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM8]] to ptr +// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[SUM1]], ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR3_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VLA_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[SUM_ADDR2_ASCAST]], align 8 +// CHECK-NEXT: call void @__kmpc_specialized_kernel_init() +// CHECK-NEXT: [[TMP7:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 999999, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr 
[[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[GPU_BLOCK_ID]], [[NVPTX_NUM_THREADS]] +// CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP9]] +// CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 +// CHECK-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[GLOBAL_UPPER_BOUND:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[GLOBAL_UPPER_BOUND]], [[TMP17]] +// CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 1 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() +// CHECK-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() +// CHECK-NEXT: [[GPU_BLOCK_ID7:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +// CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[GPU_BLOCK_ID7]], [[NVPTX_NUM_THREADS6]] +// CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], [[TMP20]] +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_get_hardware_num_blocks() +// CHECK-NEXT: [[TMP24:%.*]] = mul i32 [[NVPTX_NUM_THREADS6]], [[TMP23]] +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP24]] +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_KERNEL_BODY:%.*]], label [[OMP_KERNEL_DONE:%.*]] +// CHECK: omp.kernel.body: 
+// CHECK-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP19]], [[TMP24]] +// CHECK-NEXT: [[PADDED_SEGMENT_SIZE:%.*]] = add i32 [[TMP27]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTADDR4_ASCAST]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_xteams_phase2_f_16x64(ptr [[TMP29]], i32 [[PADDED_SEGMENT_SIZE]], ptr [[TMP28]], ptr [[TMP30]], ptr @__kmpc_rfun_sum_f, float 0.000000e+00, i64 [[TMP16]], i32 0) +// CHECK-NEXT: [[TMP32:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP22]] +// CHECK-NEXT: store i32 [[TMP32]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP22]], 1 +// CHECK-NEXT: [[TMP34:%.*]] = mul i32 [[PADDED_SEGMENT_SIZE]], [[TMP33]] +// CHECK-NEXT: [[TMP35:%.*]] = load ptr, ptr [[DOTADDR5_ASCAST]], align 8 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = icmp ult i32 [[TMP36]], [[TMP34]] +// CHECK-NEXT: [[TMP38:%.*]] = icmp ule i32 [[TMP36]], [[GLOBAL_UPPER_BOUND]] +// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP38]], [[TMP37]] +// CHECK-NEXT: br i1 [[TMP39]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP40]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = getelementptr float, ptr [[TMP35]], i32 [[TMP41]] +// CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4 +// CHECK-NEXT: store float [[TMP43]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float 
0.000000e+00, ptr [[SUM8_ASCAST]], align 4 +// CHECK-NEXT: br label [[OMP_INSCAN_DISPATCH:%.*]] +// CHECK: omp.before.scan.bb: +// CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP45]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP44]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP46:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP46]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.exit.inscan.bb: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE]] +// CHECK: omp.inscan.dispatch: +// CHECK-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP47]] to i64 +// CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[TMP48]], 0 +// CHECK-NEXT: br i1 [[TMP49]], label [[OMP_EXCLUSIVE_COPY_EXIT:%.*]], label [[OMP_EXCLUSIVE_DEC:%.*]] +// CHECK: omp.exclusive.dec: +// CHECK-NEXT: [[TMP50:%.*]] = sub nuw i64 [[TMP48]], 1 +// CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP50]] +// CHECK-NEXT: [[TMP51:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP51]], ptr [[TMP4]], align 4 +// CHECK-NEXT: br label [[OMP_EXCLUSIVE_COPY_EXIT]] +// CHECK: omp.exclusive.copy.exit: +// CHECK-NEXT: br label [[OMP_BEFORE_SCAN_BB:%.*]] +// CHECK: omp.after.scan.bb: +// CHECK-NEXT: [[TMP52:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[I_ASCAST]], align 4 +// CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP53]] to i64 +// CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float, ptr [[TMP52]], i64 [[IDXPROM10]] +// CHECK-NEXT: [[TMP54:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 +// CHECK-NEXT: [[TMP55:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: 
[[TMP56:%.*]] = fadd float [[TMP55]], [[TMP54]] +// CHECK-NEXT: store float [[TMP56]], ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: br label [[OMP_EXIT_INSCAN_BB:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = getelementptr float, ptr [[TMP35]], i32 [[TMP57]] +// CHECK-NEXT: [[TMP59:%.*]] = load float, ptr addrspace(5) [[TMP7]], align 4 +// CHECK-NEXT: store float [[TMP59]], ptr [[TMP58]], align 4 +// CHECK-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: [[TMP61:%.*]] = add i32 1, [[TMP60]] +// CHECK-NEXT: store i32 [[TMP61]], ptr [[DOTOMP_IV_ASCAST]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP69:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: br label [[OMP_KERNEL_DONE]] +// CHECK: omp.kernel.done: +// CHECK-NEXT: ret void +// diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 4222bca707fc4..209ab7ebdc8c9 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -668,13 +668,60 @@ __OMP_RTL(__kmpc_xteams_i_4x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr __OMP_RTL(__kmpc_xteams_i_8x64, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_i_8x32, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_i_16x32, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32) +__OMP_RTL(__kmpc_xteams_i_32x32, false, Void, Int32, Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, VoidPtr, VoidPtr, Int32, Int64, Int32) + +__OMP_RTL(__kmpc_xteams_d_16x64, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) 
+__OMP_RTL(__kmpc_xteams_d_4x64, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_d_8x64, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_d_8x32, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_d_16x32, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_d_32x32, false, Void, Double, DoublePtr, DoublePtr, DoublePtr, Int32Ptr, VoidPtr, VoidPtr, Double, Int64, Int32) + +__OMP_RTL(__kmpc_xteams_f_16x64, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_f_4x64, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_f_8x64, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_f_8x32, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_f_16x32, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_f_32x32, false, Void, Float, FloatPtr, FloatPtr, FloatPtr, Int32Ptr, VoidPtr, VoidPtr, Float, Int64, Int32) + +__OMP_RTL(__kmpc_xteams_l_16x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_l_4x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_l_8x64, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_l_8x32, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, 
Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_l_16x32, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_l_32x32, false, Void, Int64, Int64Ptr, Int64Ptr, Int64Ptr, Int32Ptr, VoidPtr, VoidPtr, Int64, Int64, Int32) + __OMP_RTL(__kmpc_xteams_phase2_i_16x64, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_phase2_i_8x64, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_phase2_i_4x64, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_phase2_i_8x32, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) __OMP_RTL(__kmpc_xteams_phase2_i_16x32, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_i_32x32, false, Void, Int32Ptr, Int32, Int32Ptr, Int32Ptr, VoidPtr, Int32, Int64, Int32) + + +__OMP_RTL(__kmpc_xteams_phase2_d_16x64, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_d_8x64, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_d_4x64, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_d_8x32, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_d_16x32, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_d_32x32, false, Void, DoublePtr, Int32, DoublePtr, DoublePtr, VoidPtr, Double, Int64, Int32) + + +__OMP_RTL(__kmpc_xteams_phase2_f_16x64, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_f_8x64, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, 
Int32) +__OMP_RTL(__kmpc_xteams_phase2_f_4x64, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_f_8x32, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_f_16x32, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_f_32x32, false, Void, FloatPtr, Int32, FloatPtr, FloatPtr, VoidPtr, Float, Int64, Int32) + +__OMP_RTL(__kmpc_xteams_phase2_l_16x64, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_l_8x64, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_l_4x64, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_l_8x32, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_l_16x32, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) +__OMP_RTL(__kmpc_xteams_phase2_l_32x32, false, Void, Int64Ptr, Int32, Int64Ptr, Int64Ptr, VoidPtr, Int64, Int64, Int32) __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL diff --git a/offload/DeviceRTL/include/Xteams.h b/offload/DeviceRTL/include/Xteams.h index e556baed42766..12937f24fcabe 100644 --- a/offload/DeviceRTL/include/Xteams.h +++ b/offload/DeviceRTL/include/Xteams.h @@ -393,16 +393,108 @@ void _INLINE_ATTR_ __kmpc_xteams_phase2_i_4x64(int *storage, int segment_size, void (*rf)(int *, int), const int rnv, const uint64_t k, bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_i_8x32(int *storage, int segment_size, + int *tvs, int *seg_vals, + void (*rf)(int *, int), + const int rnv, const uint64_t k, + bool is_inclusive_scan); void _INLINE_ATTR_ __kmpc_xteams_phase2_i_16x32(int *storage, int segment_size, int *tvs, int *seg_vals, void (*rf)(int *, int), const int rnv, 
const uint64_t k, bool is_inclusive_scan); -void _INLINE_ATTR_ __kmpc_xteams_phase2_i_16x32(int *storage, int segment_size, +void _INLINE_ATTR_ __kmpc_xteams_phase2_i_32x32(int *storage, int segment_size, int *tvs, int *seg_vals, void (*rf)(int *, int), const int rnv, const uint64_t k, bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_16x64( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_8x64( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_4x64( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_8x32( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_16x32( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_d_32x32( + double *storage, int segment_size, double *tvs, double *seg_vals, + void (*rf)(double *, double), const double rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_l_16x64(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_l_8x64(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan); +void 
_INLINE_ATTR_ __kmpc_xteams_phase2_l_4x64(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_l_8x32(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_l_16x32(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_l_32x32(long *storage, int segment_size, + long *tvs, long *seg_vals, + void (*rf)(long *, long), + const long rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_16x64( + float *storage, int segment_size, float *tvs, float *seg_vals, + void (*rf)(float *, float), const float rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_8x64(float *storage, int segment_size, + float *tvs, float *seg_vals, + void (*rf)(float *, float), + const float rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_4x64(float *storage, int segment_size, + float *tvs, float *seg_vals, + void (*rf)(float *, float), + const float rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_8x32(float *storage, int segment_size, + float *tvs, float *seg_vals, + void (*rf)(float *, float), + const float rnv, + const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_16x32( + float *storage, int segment_size, float *tvs, float *seg_vals, + void (*rf)(float *, float), const float rnv, const uint64_t k, + bool is_inclusive_scan); +void _INLINE_ATTR_ __kmpc_xteams_phase2_f_32x32( + float *storage, int segment_size, float *tvs, float *seg_vals, + void (*rf)(float *, float), const 
float rnv, const uint64_t k, + bool is_inclusive_scan); } // end extern C #undef _CD diff --git a/offload/DeviceRTL/src/Xteams.cpp b/offload/DeviceRTL/src/Xteams.cpp index d602527535f73..22a5ddbdd8e22 100644 --- a/offload/DeviceRTL/src/Xteams.cpp +++ b/offload/DeviceRTL/src/Xteams.cpp @@ -219,7 +219,7 @@ _xteam_scan_phase2(T *storage, int segment_size, T *team_vals, T *segment_vals, if (is_inclusive_scan) { for (int i = 0; i < segment_size; i++) - segment_vals[(k * segment_size) + i] += thread_level_result; + (*_rf)(segment_vals + (k * segment_size) + i, thread_level_result); } else { // Exclusive scan // Populate the non-first element in every segment with scanned result for (int i = segment_size - 1; i > 0; i--) @@ -855,6 +855,158 @@ __kmpc_xteams_phase2_i_8x32(int *storage, int segment_size, int *tvs, _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, rnv, k, is_inclusive_scan); } +_EXT_ATTR +__kmpc_xteams_phase2_i_32x32(int *storage, int segment_size, int *tvs, + int *seg_vals, void (*rf)(int *, int), + const int rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, rnv, + k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_16x64(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_8x64(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_4x64(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + 
_xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_8x32(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_16x32(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_d_32x32(double *storage, int segment_size, double *tvs, + double *seg_vals, void (*rf)(double *, double), + const double rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_16x64(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_8x64(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, rnv, + k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_4x64(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, rnv, + k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_8x32(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const 
long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, rnv, + k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_16x32(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_l_32x32(long *storage, int segment_size, long *tvs, + long *seg_vals, void (*rf)(long *, long), + const long rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_16x64(float *storage, int segment_size, float *tvs, + float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_8x64(float *storage, int segment_size, float *tvs, + float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_4x64(float *storage, int segment_size, float *tvs, + float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_8x32(float *storage, int segment_size, float *tvs, + float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_16x32(float *storage, int segment_size, float *tvs, + 
float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} +_EXT_ATTR +__kmpc_xteams_phase2_f_32x32(float *storage, int segment_size, float *tvs, + float *seg_vals, void (*rf)(float *, float), + const float rnv, const uint64_t k, + bool is_inclusive_scan) { + _xteam_scan_phase2(storage, segment_size, tvs, seg_vals, rf, + rnv, k, is_inclusive_scan); +} #undef _CF #undef _UI #undef _UL diff --git a/offload/test/offloading/xteam_scan_3.cpp b/offload/test/offloading/xteam_scan_3.cpp new file mode 100644 index 0000000000000..e796ddf353481 --- /dev/null +++ b/offload/test/offloading/xteam_scan_3.cpp @@ -0,0 +1,241 @@ +// clang-format off +// This test verifies the output of inclusive and exclusive scan computed using the Xteam Scan Kernel +// for various datatypes for the default teamsXthreads combination: 104x1024 +// + +// RUN: %libomptarget-compile-generic -fopenmp-target-ignore-env-vars -fopenmp-target-xteam-scan -fopenmp-assume-no-nested-parallelism -fopenmp-assume-no-thread-state -lm -latomic +// RUN: env LIBOMPTARGET_KERNEL_TRACE=1 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic + +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-unknown-linux-gnu +// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO + +// clang-format on +#include +#include +#include + +#define N 2000000 + +template +void run_test() { + T *in = (T*)malloc(sizeof(T) * N); + T *out1 = (T*)malloc(sizeof(T) * N); // For inclusive scan + T *out2 = (T *)malloc(sizeof(T) * N); // For exclusive scan + + for (int i = 0; i < N; i++) { + in[i] = 10; + out1[i] = 0; + } + + T sum1 = T(0); + +#pragma omp target teams distribute parallel for reduction(inscan, +:sum1) map(tofrom: in[0:N], out1[0:N]) + for (int i = 0; i < N; 
i++) { + sum1 += in[i]; // input phase +#pragma omp scan inclusive(sum1) + out1[i] = sum1; // scan phase + } + + T checksum = T(0); + for (int i = 0; i < N; i++) { + checksum += in[i]; + if (checksum != out1[i]) { + printf("Inclusive Scan: Failure. Wrong Result at %d. Exiting...\n", i); + return; + } + } + free(out1); + printf("Inclusive Scan: Success!\n"); + + T sum2 = T(0); + +#pragma omp target teams distribute parallel for reduction(inscan, +:sum2) map(tofrom: in[0:N], out2[0:N]) + for (int i = 0; i < N; i++) { + out2[i] = sum2; // scan phase +#pragma omp scan exclusive(sum2) + sum2 += in[i]; // input phase + } + + checksum = T(0); + for (int i = 0; i < N; i++) { + if (checksum != out2[i]) { + printf("Exclusive Scan: Failure. Wrong Result at %d. Exiting...\n", i); + return; + } + checksum += in[i]; + } + free(in); + free(out2); + printf("Exclusive Scan: Success!\n"); +} + +int main() { + printf("Testing for datatype int\n"); + run_test(); + + printf("Testing for datatype uint32_t\n"); + run_test(); + + printf("Testing for datatype uint64_t\n"); + run_test(); + + printf("Testing for datatype long\n"); + run_test(); + + printf("Testing for datatype double\n"); + run_test(); + + printf("Testing for datatype float\n"); + run_test(); + return 0; +} +// clang-format off + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*i.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*i.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// 
CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*j.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*j.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*m.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*m.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*l.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*l.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 
teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*d.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:16400B +/// CHECK: n:__omp_offloading_[[MANGLED:.*d.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*f.*]]_l37 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l37_1 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:8200B +/// CHECK: n:__omp_offloading_[[MANGLED:.*f.*]]_l57 + +/// CHECK: DEVID:[[S:[ ]*]][[DEVID:[0-9]+]] SGN:8 +/// CHECK: args:10 teamsXthrds:( 104X1024) +/// CHECK: lds_usage:0B +/// CHECK: n:__omp_offloading_[[MANGLED]]_l57_1 + +/// CHECK: Testing for datatype int +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! + +/// CHECK: Testing for datatype uint32_t +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! + +/// CHECK: Testing for datatype uint64_t +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! + +/// CHECK: Testing for datatype long +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! 
+ +/// CHECK: Testing for datatype double +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! + +/// CHECK: Testing for datatype float +/// CHECK: Inclusive Scan: Success! +/// CHECK: Exclusive Scan: Success! \ No newline at end of file